diff --git a/.bumpversion.cfg b/.bumpversion.cfg deleted file mode 100644 index 97e8d915f..000000000 --- a/.bumpversion.cfg +++ /dev/null @@ -1,6 +0,0 @@ -[bumpversion] -current_version = 3.4.0 -commit = False -tag = False - -[bumpversion:file:VERSION] diff --git a/.gitignore b/.gitignore index ae1ec4e16..746a9461e 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,8 @@ src/21cmFAST.egg-info/ pip-wheel-metadata/ src/py21cmfast\.egg-info/ +.python-version + *.so build/ *.*~ diff --git a/VERSION b/VERSION deleted file mode 100644 index 18091983f..000000000 --- a/VERSION +++ /dev/null @@ -1 +0,0 @@ -3.4.0 diff --git a/build_cffi.py b/build_cffi.py deleted file mode 100755 index 2b132e78c..000000000 --- a/build_cffi.py +++ /dev/null @@ -1,132 +0,0 @@ -"""Build the C code with CFFI.""" - -import os -import sys -import sysconfig -from pathlib import Path - -from cffi import FFI - -# Get the compiler. We support gcc and clang. -# The compiler is determnined from the environment and uses sysconfig as a fallback. -source = "environment variable 'CC'" if "CC" in os.environ else "sysconfig" -_compiler = os.environ.get("CC", sysconfig.get_config_var("CC")) -print(f"Using compiler from {source}: {_compiler}") - -if "gcc" in _compiler: - compiler = "gcc" -elif "clang" in _compiler: - compiler = "clang" -else: - raise ValueError(f"Compiler {_compiler} not supported for 21cmFAST") - -ffi = FFI() - -LOCATION = Path(__file__).resolve().parent -CLOC = LOCATION / "src" / "py21cmfast" / "src" -include_dirs = [str(CLOC)] -c_files = [str(fl.relative_to(LOCATION)) for fl in sorted(CLOC.glob("*.c"))] - -# Set the C-code logging level. -# If DEBUG is set, we default to the highest level, but if not, -# we set it to the level just above no logging at all. 
-log_level = os.environ.get("LOG_LEVEL", 4 if "DEBUG" in os.environ else 1) -available_levels = [ - "NONE", - "ERROR", - "WARNING", - "INFO", - "DEBUG", - "SUPER_DEBUG", - "ULTRA_DEBUG", -] - - -if isinstance(log_level, str) and log_level.upper() in available_levels: - log_level = available_levels.index(log_level.upper()) - -try: - log_level = int(log_level) -except ValueError as e: - # note: for py35 support, can't use f strings. - raise ValueError( - "LOG_LEVEL must be specified as a positive integer, or one " - f"of {available_levels}" - ) from e - -# ================================================== -# Set compilation arguments dependent on environment -# ================================================== - -extra_compile_args = ["-Wall", "--verbose", f"-DLOG_LEVEL={log_level:d}"] - -if "DEBUG" in os.environ: - extra_compile_args += ["-g", "-O0"] -else: - extra_compile_args += ["-Ofast"] - -if sys.platform == "darwin": - extra_compile_args += ["-Xpreprocessor"] - -extra_compile_args += ["-fopenmp"] - -libraries = ["m", "gsl", "gslcblas", "fftw3f_omp", "fftw3f"] - -# stuff for gperftools -if "PROFILE" in os.environ: - libraries += ["profiler", "tcmalloc"] - # we need this even if DEBUG is off - extra_compile_args += ["-g"] - -if compiler == "clang": - libraries += ["omp"] - -library_dirs = [] -for k, v in os.environ.items(): - if "inc" in k.lower(): - include_dirs += [v] - elif "lib" in k.lower(): - library_dirs += [v] - -# ================================================================= -# NOTES FOR DEVELOPERS: -# The CFFI implementation works as follows: -# - All function prototypes, global variables and type definitions *directly* used -# in the python wrapper must be declared via ffi.cdef("""C CODE"""). -# There must be no compiler directives in this code (#include, #define, etc) -# - All implementations of global variables and types present in the cdef() calls -# must also be present in the second argument of set_source. -# This is passed to the compiler. 
-# - The `sources` kwarg then contains all the .c files in the library which are to be compiled - -# This is the overall C code. -ffi.set_source( - "py21cmfast.c_21cmfast", # Name/Location of shared library module - """ - #include "21cmFAST.h" - """, - sources=c_files, - include_dirs=include_dirs, - library_dirs=library_dirs, - libraries=libraries, - extra_compile_args=extra_compile_args, -) - -# Header files containing types, globals and function prototypes -with (CLOC / "_inputparams_wrapper.h").open() as f: - ffi.cdef(f.read()) -with (CLOC / "_outputstructs_wrapper.h").open() as f: - ffi.cdef(f.read()) -with (CLOC / "_functionprototypes_wrapper.h").open() as f: - ffi.cdef(f.read()) - -# CFFI needs to be able to access a free function to make the __del__ method for OutputStruct fields -# This will expose the standard free() function to the wrapper so it can be used -ffi.cdef( - """ - void free(void *ptr); - """ -) - -if __name__ == "__main__": - ffi.compile() diff --git a/bump b/bump deleted file mode 100755 index 1d8fa939d..000000000 --- a/bump +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/bash -set -e - -PART=$1 - -OLDVERSION=$(cat VERSION) -NEWVERSION=$(bump2version --dry-run --list ${PART} | grep new_version | sed -r s,"^.*=",,) -echo "New Version: ${NEWVERSION}" - -# Actually Run The Update -bump2version $PART - -# Now add in stuff to the changelog -python changethelog.py ${NEWVERSION} - -# Now commit -git add . 
-git commit -m "Bump Version: ${OLDVERSION} -> ${NEWVERSION}" diff --git a/environment_dev.yml b/environment_dev.yml index 8a73359c3..3e3b1bb88 100644 --- a/environment_dev.yml +++ b/environment_dev.yml @@ -8,7 +8,6 @@ dependencies: - zlib - pip - libxml2 - - libffi - zipp - click - scipy @@ -47,7 +46,6 @@ dependencies: - qt - packaging - ipython_genutils - - cffi - pytest-remotedata - nomkl - chardet @@ -117,4 +115,4 @@ dependencies: - pytest-plt - questionary - pip: - - pre-commit + - pre-commit diff --git a/install_custom.py b/install_custom.py new file mode 100755 index 000000000..a2a7624d4 --- /dev/null +++ b/install_custom.py @@ -0,0 +1,72 @@ +#!/bin/python +""" +install_custom.py provides a custom installation process for the 21cmFAST package. + +Allows users to specify various build and configuration options via command-line arguments. + +Features: +- Allows setting the log level for the installation process. +- Provides an option to enable debug symbols for the build. +- Enables customization of the optimization level for the build process. + +Command-line Arguments: +- --log-level: Specifies the log level for the build process. Options include: + NO_LOG, ERROR, WARNING, INFO, DEBUG, SUPER_DEBUG, ULTRA_DEBUG. Defaults to WARNING. +- --debug: Enables debug symbols for the build, which can be useful for debugging. +- -o, --optimization: Sets the optimization level for the build (e.g., -O0, -O1, -O2, -O3). + Defaults to 3. + +Usage: +Run the script from the command line to install 21cmFAST with the desired options: + python install_custom.py [options] + +Example: + python install_custom.py --log-level DEBUG --debug -o 2 + +Dependencies: +- Python 3.x +- pip (Python package installer) + +Note: +This script uses the `subprocess` module to invoke the `pip install` command with +custom configuration settings. 
+ +""" + +import argparse +import subprocess + +# Define the command-line arguments +parser = argparse.ArgumentParser(description="Install 21cmFAST with custom options.") +parser.add_argument( + "--log-level", + type=str, + default="WARNING", + help="Set the log level (NO_LOG, ERROR, WARNING, INFO, DEBUG, SUPER_DEBUG, ULTRA_DEBUG)", +) +parser.add_argument("--debug", action="store_true", help="Enable debug symbols") +parser.add_argument( + "-o", + "--optimization", + help="optimisation level (i,e -O0, -O1, -O2, -O3)", + default="3", +) + +args = parser.parse_args() +if args.optimization not in ["0", "g", "1", "2", "3", "s"]: + raise ValueError("Invalid optimization level. Choose from 0, g, 1, 2, 3 or s") + +# Get the LOG_LEVEL environment variable (default to 'WARNING' if not set) +log_level_str = args.log_level +setup_args = [ + f"--config-setting=setup-args=-Dlog_level={log_level_str}", +] + +debug = str(args.debug).lower() + +setup_args += [f"--config-setting=setup-args=-Ddebug={args.debug}"] +setup_args += [f"--config-setting=setup-args=-Doptimization={args.optimization}"] + + +# Run pip install with the specified options +subprocess.run(["pip", "install", ".", *setup_args]) diff --git a/meson.build b/meson.build new file mode 100644 index 000000000..6bc0cfbed --- /dev/null +++ b/meson.build @@ -0,0 +1,36 @@ +#To find the version we need to find python and run setuptools-scm, while keeping the project() call first +project('21cmFAST', ['c', 'cpp'], + version : run_command(find_program('python'), '-c', 'from setuptools_scm import get_version; print(get_version())', check: true).stdout().strip(), + default_options : ['cpp_std=c++17'], +) + +# Check if CUDA should be enabled after project declaration +env_cuda_str = run_command('printenv', 'USE_CUDA', check: false).stdout().strip() +if env_cuda_str == '' + env_cuda_flag = get_option('use_cuda') +else + if env_cuda_str == 'TRUE' + env_cuda_flag = true + else + env_cuda_flag = false + endif +endif + +nvcc_prog = 
find_program('nvcc', required: false) + +# Add CUDA language if both the environment variable is set AND nvcc is available +if env_cuda_flag and nvcc_prog.found() + add_languages('cuda') + # For CUDA, we need to pass the C++ standard as a compiler argument since we're adding the language dynamically + add_project_arguments('--std=c++17', language: 'cuda') + message('CUDA enabled: nvcc found at ' + nvcc_prog.full_path()) +else + if env_cuda_flag and not nvcc_prog.found() + warning('USE_CUDA=TRUE but nvcc not found, building CPU-only version') + endif + message('Building CPU-only version') +endif + +py = import('python').find_installation(pure: false) + +subdir('src') diff --git a/meson.options b/meson.options new file mode 100644 index 000000000..b6db5bbb9 --- /dev/null +++ b/meson.options @@ -0,0 +1,3 @@ +# Define the log level option +option('log_level', type: 'string', value: 'WARNING', description: 'Set the log level (e.g., NO_LOG, ERROR, WARNING, INFO, DEBUG, SUPER_DEBUG, ULTRA_DEBUG)') +option('use_cuda',type: 'boolean', value: false, description: 'Attempt to find and use CUDA in the compilation if set to TRUE') diff --git a/pyproject.toml b/pyproject.toml index 3a17beeaf..d8f8647b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,128 @@ +[project] +name="21cmFAST" +dynamic = ["version"] +license="MIT" +license-files = ["LICENSE"] +description="A semi-numerical cosmological simulation code for the 21cm signal" +# long_description="%s\n%s" +# % ( +# re.compile("^.. start-badges.*^.. 
end-badges", re.M | re.S).sub( +# "", _read("README.rst") +# ), +# re.sub(":[a-z]+:`~?(.*?)`", r"``\1``", _read("CHANGELOG.rst")), +# ) +# long_description_content_type="text/x-rst" +authors=[ {name = "The 21cmFAST coredev team" ,email = "21cmfast.coredev@gmail.com"}] +readme= {file = "README.rst", content-type = "text/x-rst" } +include_package_data=true +requires-python=">=3.10" +classifiers=[ + # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Operating System :: Unix", + "Operating System :: POSIX", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", +] +keywords=["Epoch of Reionization", "Cosmology"] +dependencies=[ + "click", + "numpy>=2.0", + "pyyaml", + "cffi>=1.0", + "scipy", + "astropy>=2.0", + "h5py>=2.8.0", + "matplotlib", + "bidict", + "cosmotile>=0.2.5", + "attrs", + "tqdm", + "classy", + "cyclopts", + "tomlkit", +] + +# [tool.setuptools.packages.find] +# where = ["src/py21cmfast"] + +[project.optional-dependencies] +tests = [ + "clang-format", + "clang-tidy", + "hmf", + "pre-commit", + "pytest>=5.0", + "pytest-cov", + "tox", + "pytest-remotedata>=0.3.2", + "powerbox", + "pytest-plt", + "pytest-benchmark", + "tyro", + "rich", + "pytest-xdist", + "pytest-mock", +] +docs = [ + "nbsphinx", + "numpydoc", + "sphinx >= 1.3", + "furo" +] +# When the min python version supports PEP 735, this can be simplified +# as dev = test_req + doc_req again (as it was implemented in setup.py) +dev = [ + "clang-format", + "clang-tidy", + "hmf", + "pre-commit", + "pytest>=5.0", + "pytest-cov", + "tox", + "pytest-remotedata>=0.3.2", + "powerbox", + "pytest-plt", + "pytest-benchmark", + "tyro", + "rich", + "pytest-xdist", + "pytest-mock", 
+ "nbsphinx", + "numpydoc", + "sphinx>=1.3", + "sphinx-rtd-theme", +] + +# UPDATE THESE +[project.urls] +Homepage="https://github.com/21cmFAST/21cmFAST" +Documentation="https://github.com/21cmFAST/21cmFAST" +Repository="https://github.com/21cmFAST/21cmFAST" +Issues="https://github.com/21cmFAST/21cmFAST" +Changelog="https://github.com/21cmFAST/21cmFAST" + +[project.scripts] +21cmfast = "py21cmfast.cli:main" + [build-system] -requires = ["setuptools>=78.1.0", "setuptools_scm>=8", "cython"] -build-backend = "setuptools.build_meta" +build-backend = 'mesonpy' +requires = ['meson-python', 'nanobind>=2.4.0', 'setuptools-scm>=8'] + +[tool.meson-python.args] +setup = ["-Dbuildtype=release"] [tool.setuptools_scm] +version_file = "src/py21cmfast/_version.py" +fallback_version = "4.0.0b1" + +[tool.pytest.ini_options] +testpaths = ["tests"] [tool.ruff] line-length = 88 @@ -82,10 +202,6 @@ ignore = [ "D103", # ignore missing docstrings "T", # print statements ] -"build_cffi.py" = [ - "T", # print statements -] - [tool.ruff.lint.pydocstyle] convention = 'numpy' diff --git a/setup.py b/setup.py deleted file mode 100644 index 85cf6f497..000000000 --- a/setup.py +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env python -"""Setup the package.""" - -import os -import re -from pathlib import Path - -from setuptools import find_packages, setup - -THISDIR = Path(__file__).parent.resolve() - - -def _read(name: str): - with (THISDIR / name).open(encoding="utf8") as fl: - return fl.read() - - -# Enable code coverage for C code: we can't use CFLAGS=-coverage in tox.ini, since that -# may mess with compiling dependencies (e.g. numpy). Therefore we set SETUPPY_ -# CFLAGS=-coverage in tox.ini and copy it to CFLAGS here (after deps have been safely installed). 
-if "TOXENV" in os.environ and "SETUPPY_CFLAGS" in os.environ: - os.environ["CFLAGS"] = os.environ["SETUPPY_CFLAGS"] - -test_req = [ - "clang-format", - "clang-tidy", - "hmf", - "pre-commit", - "pytest>=5.0", - "pytest-cov", - "tox", - "pytest-remotedata>=0.3.2", - "powerbox", - "pytest-plt", - "pytest-benchmark", - "tyro", - "rich", - "pytest-xdist", - "pytest-mock", -] - -doc_req = ["nbsphinx", "numpydoc", "sphinx >= 1.3", "furo"] - -setup( - name="21cmFAST", - license="MIT license", - description="A semi-numerical cosmological simulation code for the 21cm signal", - long_description="{}\n{}".format( - re.compile("^.. start-badges.*^.. end-badges", re.M | re.S).sub( - "", _read("README.rst") - ), - re.sub(":[a-z]+:`~?(.*?)`", r"``\1``", _read("CHANGELOG.rst")), - ), - long_description_content_type="text/x-rst", - author="The 21cmFAST coredev team", - author_email="21cmfast.coredev@gmail.com", - url="https://github.com/21cmFAST/21cmFAST", - packages=find_packages("src"), - package_dir={"": "src"}, - include_package_data=True, - python_requires=">=3.11", - zip_safe=False, - classifiers=[ - # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", - "Operating System :: Unix", - "Operating System :: POSIX", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: Implementation :: CPython", - ], - keywords=["Epoch of Reionization", "Cosmology"], - install_requires=[ - "click", - "numpy>=2.0", - "pyyaml", - "cffi>=1.0", - "scipy", - "astropy>=2.0", - "h5py>=2.8.0", - "matplotlib", - "bidict", - "cosmotile>=0.2.5", - "attrs", - "tqdm", - "classy", - "cyclopts", - "tomlkit", - ], - extras_require={"tests": test_req, "docs": doc_req, 
"dev": test_req + doc_req}, - setup_requires=["cffi>=1.0", "setuptools_scm"], - entry_points={"console_scripts": ["21cmfast = py21cmfast.cli:app"]}, - cffi_modules=[f"{THISDIR}/build_cffi.py:ffi"], - use_scm_version={ - "write_to": "src/py21cmfast/_version.py", - "parentdir_prefix_version": "21cmFAST-", - "fallback_version": "0.0.0", - }, -) diff --git a/src/meson.build b/src/meson.build new file mode 100644 index 000000000..9b87673d7 --- /dev/null +++ b/src/meson.build @@ -0,0 +1 @@ +subdir('py21cmfast') diff --git a/src/py21cmfast/_cfg.py b/src/py21cmfast/_cfg.py index 69cd925ef..b94393680 100644 --- a/src/py21cmfast/_cfg.py +++ b/src/py21cmfast/_cfg.py @@ -6,9 +6,10 @@ from pathlib import Path from typing import ClassVar +import py21cmfast.c_21cmfast as lib + from . import yaml from ._data import DATA_PATH -from .c_21cmfast import ffi, lib from .wrapper.structs import StructInstanceWrapper @@ -25,6 +26,7 @@ class Config(dict): "ignore_R_BUBBLE_MAX_error": False, "external_table_path": DATA_PATH, "HALO_CATALOG_MEM_FACTOR": 1.2, + "EXTRA_HALOBOX_FIELDS": False, "safe_read": True, } _defaults["wisdoms_path"] = Path(_defaults["direc"]) / "wisdoms" @@ -32,7 +34,7 @@ class Config(dict): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # keep the config settings from the C library here - self._c_config_settings = StructInstanceWrapper(lib.config_settings, ffi) + self._c_config_settings = StructInstanceWrapper(lib.get_config_settings()) for k, v in self._defaults.items(): if k not in self: @@ -59,13 +61,9 @@ def __setitem__(self, key, value): def _pass_to_backend(self, key, value): """Set the value in the backend.""" - # we should possibly do a typemap for the ffi - if isinstance(value, Path | str): - setattr( - self._c_config_settings, key, ffi.new("char[]", str(value).encode()) - ) - else: - setattr(self._c_config_settings, key, value) + if isinstance(value, Path): + value = str(value) + setattr(self._c_config_settings, key, value) 
@contextlib.contextmanager def use(self, **kwargs): diff --git a/src/py21cmfast/drivers/coeval.py b/src/py21cmfast/drivers/coeval.py index eb850e7a9..d357caace 100644 --- a/src/py21cmfast/drivers/coeval.py +++ b/src/py21cmfast/drivers/coeval.py @@ -13,8 +13,9 @@ from rich.console import Console from rich.progress import Progress +import py21cmfast.c_21cmfast as lib + from .. import __version__ -from ..c_21cmfast import lib from ..io import h5 from ..io.caching import CacheConfig, OutputCache, RunCache from ..rsds import apply_rsds, include_dvdr_in_tau21 @@ -30,7 +31,11 @@ PerturbHaloField, TsBox, ) -from ..wrapper.photoncons import _get_photon_nonconservation_data, setup_photon_cons +from ..wrapper.photoncons import ( + _get_photon_nonconservation_data, + _photoncons_state, + setup_photon_cons, +) from . import single_field as sf from ._param_config import high_level_func @@ -625,7 +630,7 @@ def generate_coeval( ): yield coeval, coeval.redshift in out_redshifts - if lib.photon_cons_allocated: + if _photoncons_state.c_memory_allocated: lib.FreePhotonConsMemory() @@ -741,7 +746,7 @@ def _redshift_loop_generator( this_halobox = sf.compute_halo_grid( inputs=inputs, perturbed_halo_list=this_pthalo, - perturbed_field=this_perturbed_field, + redshift=z, previous_ionize_box=getattr(prev_coeval, "ionized_box", None), previous_spin_temp=getattr(prev_coeval, "ts_box", None), write=write.halobox, diff --git a/src/py21cmfast/drivers/lightcone.py b/src/py21cmfast/drivers/lightcone.py index 72cef3774..590cbdefd 100644 --- a/src/py21cmfast/drivers/lightcone.py +++ b/src/py21cmfast/drivers/lightcone.py @@ -14,8 +14,9 @@ from astropy import units from astropy.cosmology import z_at_value +import py21cmfast.c_21cmfast as lib + from .. 
import __version__ -from ..c_21cmfast import lib from ..io import h5 from ..io.caching import CacheConfig, OutputCache from ..lightconers import Lightconer, RectilinearLightconer @@ -31,6 +32,7 @@ PerturbHaloField, TsBox, ) +from ..wrapper.photoncons import _photoncons_state from ._param_config import high_level_func from .coeval import ( _obtain_starting_point_for_scrolling, @@ -513,7 +515,7 @@ def _run_lightcone_from_perturbed_fields( # last redshift things if iz == len(scrollz) - 1: - if lib.photon_cons_allocated: + if _photoncons_state.c_memory_allocated: lib.FreePhotonConsMemory() if include_dvdr_in_tau21: diff --git a/src/py21cmfast/drivers/single_field.py b/src/py21cmfast/drivers/single_field.py index 83248dee8..d2c4c0df7 100644 --- a/src/py21cmfast/drivers/single_field.py +++ b/src/py21cmfast/drivers/single_field.py @@ -203,10 +203,10 @@ def perturb_halo_list( @single_field_func def compute_halo_grid( *, + redshift: float, initial_conditions: InitialConditions, inputs: InputParameters | None = None, perturbed_halo_list: PerturbHaloField | None = None, - perturbed_field: PerturbedField | None = None, previous_spin_temp: TsBox | None = None, previous_ionize_box: IonizedBox | None = None, ) -> HaloBox: @@ -243,29 +243,9 @@ def compute_halo_grid( regenerate, write, cache: See docs of :func:`initial_conditions` for more information. """ - if perturbed_halo_list: - redshift = perturbed_halo_list.redshift - elif perturbed_field: - redshift = perturbed_field.redshift - else: - raise ValueError( - "Either perturbed_field or perturbed_halo_list are required (or both)." 
- ) - box = HaloBox.new(redshift=redshift, inputs=inputs) - if perturbed_field is None: - if ( - inputs.matter_options.FIXED_HALO_GRIDS - or inputs.astro_options.AVG_BELOW_SAMPLER - ): - raise ValueError( - "You must provide the perturbed field if FIXED_HALO_GRIDS is True or AVG_BELOW_SAMPLER is True" - ) - else: - perturbed_field = PerturbedField.dummy() - - elif perturbed_halo_list is None: + if perturbed_halo_list is None: if not inputs.matter_options.FIXED_HALO_GRIDS: raise ValueError( "You must provide the perturbed halo list if FIXED_HALO_GRIDS is False" @@ -302,7 +282,6 @@ def compute_halo_grid( return box.compute( initial_conditions=initial_conditions, pt_halos=perturbed_halo_list, - perturbed_field=perturbed_field, previous_ionize_box=previous_ionize_box, previous_spin_temp=previous_spin_temp, ) diff --git a/src/py21cmfast/lightconers.py b/src/py21cmfast/lightconers.py index ffafd9ee3..675d06d21 100644 --- a/src/py21cmfast/lightconers.py +++ b/src/py21cmfast/lightconers.py @@ -125,7 +125,6 @@ def between_redshifts( d_at_redshift = cosmo.comoving_distance(min_redshift).to_value(Mpc) dmax = cosmo.comoving_distance(max_redshift).to_value(Mpc) res = resolution.to_value(Mpc) - lc_distances = np.arange(d_at_redshift, dmax + res, res) return cls(lc_distances=lc_distances * Mpc, cosmo=cosmo, **kw) diff --git a/src/py21cmfast/meson.build b/src/py21cmfast/meson.build new file mode 100644 index 000000000..c91874ade --- /dev/null +++ b/src/py21cmfast/meson.build @@ -0,0 +1,35 @@ +source_files = [ + '__init__.py', + '_cfg.py', + '_logging.py', + 'cli.py', + 'input_serialization.py', + 'lightconers.py', + 'plotting.py', + 'rsds.py', + '_templates.py', + 'utils.py', + 'yaml.py', +] + +py.install_sources( + source_files, + subdir: 'py21cmfast' +) + +pure_subdirs = [ + 'drivers', + 'io', + 'templates', + 'wrapper', + '_data', +] + +# Install the Python code +install_root = py.get_install_dir() +foreach subdir: pure_subdirs + install_subdir(subdir, install_dir: install_root 
/ 'py21cmfast') +endforeach + +# Build C-extension +subdir('src') diff --git a/src/py21cmfast/plotting.py b/src/py21cmfast/plotting.py index 62dafd912..458a2e3c2 100644 --- a/src/py21cmfast/plotting.py +++ b/src/py21cmfast/plotting.py @@ -170,7 +170,7 @@ def coeval_sliceplot( """ if kind is None: if isinstance(struct, outputs.OutputStruct): - kind = struct.struct.fieldnames[0] + kind = next(iter(struct.arrays.keys())) elif isinstance(struct, Coeval): kind = "brightness_temp" diff --git a/src/py21cmfast/src/BrightnessTemperatureBox.h b/src/py21cmfast/src/BrightnessTemperatureBox.h index c4cf5b8a4..dbd9e3b25 100644 --- a/src/py21cmfast/src/BrightnessTemperatureBox.h +++ b/src/py21cmfast/src/BrightnessTemperatureBox.h @@ -5,7 +5,13 @@ #include "InputParameters.h" #include "OutputStructs.h" +#ifdef __cplusplus +extern "C" { +#endif int ComputeBrightnessTemp(float redshift, TsBox *spin_temp, IonizedBox *ionized_box, PerturbedField *perturb_field, BrightnessTemp *box); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/DeviceConstants.cuh b/src/py21cmfast/src/DeviceConstants.cuh new file mode 100644 index 000000000..759a731f7 --- /dev/null +++ b/src/py21cmfast/src/DeviceConstants.cuh @@ -0,0 +1,15 @@ +#ifndef _DEVICECONSTANTS_CUH +#define _DEVICECONSTANTS_CUH + +#include "InputParameters.h" + +#ifndef _HALOFIELD_CU // double check whether this is necessary + +extern __constant__ UserParams d_user_params; +extern __constant__ CosmoParams d_cosmo_params; +extern __constant__ AstroParams d_astro_params; +extern __constant__ double d_test_params; + +#endif + +#endif diff --git a/src/py21cmfast/src/HaloBox.c b/src/py21cmfast/src/HaloBox.c index 9fb2d49df..6ef8c7c0b 100644 --- a/src/py21cmfast/src/HaloBox.c +++ b/src/py21cmfast/src/HaloBox.c @@ -21,26 +21,24 @@ #include "indexing.h" #include "interp_tables.h" #include "logger.h" +#include "map_mass.h" #include "scaling_relations.h" #include "thermochem.h" -// struct holding each halo property we currently 
need. -// This is only used for both averages over the box/catalogues -// as well as an individual halo's properties -struct HaloProperties { - double halo_mass; - double stellar_mass; - double halo_sfr; - double stellar_mass_mini; - double sfr_mini; - double fescweighted_sfr; - double n_ion; - double halo_xray; - double metallicity; - double m_turn_acg; - double m_turn_mcg; - double m_turn_reion; -}; +// TODO: this should probably be somewhere else +void set_integral_constants(IntegralCondition *consts, double redshift, double M_min, double M_max, + double M_cell) { + consts->redshift = redshift; + consts->growth_factor = dicke(redshift); + consts->M_min = M_min; + consts->M_max = M_max; + consts->lnM_min = log(M_min); + consts->lnM_max = log(M_max); + consts->M_cell = M_cell; + consts->lnM_cell = log(M_cell); + // no table since this should be called once + consts->sigma_cell = sigma_z0(M_cell); +} // calculates halo properties from astro parameters plus the correlated rng // The inputs include all properties with a separate RNG @@ -56,8 +54,7 @@ struct HaloProperties { // representing a smooth transition in halo mass from one set of SFR/emmissivity parameters to the // other. 
void set_halo_properties(double halo_mass, double M_turn_a, double M_turn_m, - struct ScalingConstants *consts, double *input_rng, - struct HaloProperties *output) { + ScalingConstants *consts, double *input_rng, HaloProperties *output) { double n_ion_sample, wsfr_sample; double fesc; double fesc_mini = 0.; @@ -99,8 +96,8 @@ void set_halo_properties(double halo_mass, double M_turn_a, double M_turn_m, // Expected global averages for box quantities for mean adjustment // WARNING: THESE AVERAGE BOXES ARE WRONG, CHECK THEM -int get_box_averages(double M_min, double M_max, double M_turn_a, double M_turn_m, - struct ScalingConstants *consts, struct HaloProperties *averages_out) { +int get_uhmf_averages(double M_min, double M_max, double M_turn_a, double M_turn_m, + ScalingConstants *consts, HaloProperties *averages_out) { LOG_SUPER_DEBUG("Getting Box averages z=%.2f M [%.2e %.2e] Mt [%.2e %.2e]", consts->redshift, M_min, M_max, M_turn_a, M_turn_m); double t_h = consts->t_h; @@ -124,7 +121,7 @@ int get_box_averages(double M_min, double M_max, double M_turn_a, double M_turn_ // NOTE: we use the atomic method for all halo mass/count here mass_intgrl = Fcoll_General(consts->redshift, lnMmin, lnMmax); - struct ScalingConstants consts_sfrd = evolve_scaling_constants_sfr(consts); + ScalingConstants consts_sfrd = evolve_scaling_constants_sfr(consts); intgrl_fesc_weighted = Nion_General(consts->redshift, lnMmin, lnMmax, M_turn_a, consts); intgrl_stars_only = Nion_General(consts->redshift, lnMmin, lnMmax, M_turn_a, &consts_sfrd); @@ -154,70 +151,142 @@ int get_box_averages(double M_min, double M_max, double M_turn_a, double M_turn_ return 0; } +HaloProperties get_halobox_averages(HaloBox *grids) { + int mean_count = 0; + double mean_mass = 0., mean_stars = 0., mean_stars_mini = 0., mean_sfr = 0., mean_sfr_mini = 0.; + double mean_n_ion = 0., mean_xray = 0., mean_wsfr = 0.; + +#pragma omp parallel for reduction(+ : mean_count, mean_mass, mean_stars, mean_stars_mini, \ + mean_sfr, 
mean_sfr_mini) + for (int i = 0; i < HII_TOT_NUM_PIXELS; i++) { + mean_sfr += grids->halo_sfr[i]; + mean_n_ion += grids->n_ion[i]; + if (astro_options_global->USE_TS_FLUCT) { + mean_xray += grids->halo_xray[i]; + } + if (astro_options_global->USE_MINI_HALOS) { + mean_sfr_mini += grids->halo_sfr_mini[i]; + } + if (astro_options_global->INHOMO_RECO) mean_wsfr += grids->whalo_sfr[i]; + + if (config_settings.EXTRA_HALOBOX_FIELDS) { + mean_count += grids->count[i]; + mean_mass += grids->halo_mass[i]; + mean_stars += grids->halo_stars[i]; + if (astro_options_global->USE_MINI_HALOS) mean_stars_mini += grids->halo_stars_mini[i]; + } + } + + HaloProperties averages = { + .count = (double)mean_count / HII_TOT_NUM_PIXELS, + .halo_mass = mean_mass / HII_TOT_NUM_PIXELS, + .stellar_mass = mean_stars / HII_TOT_NUM_PIXELS, + .stellar_mass_mini = mean_stars_mini / HII_TOT_NUM_PIXELS, + .halo_sfr = mean_sfr / HII_TOT_NUM_PIXELS, + .sfr_mini = mean_sfr_mini / HII_TOT_NUM_PIXELS, + .n_ion = mean_n_ion / HII_TOT_NUM_PIXELS, + .halo_xray = mean_xray / HII_TOT_NUM_PIXELS, + .fescweighted_sfr = mean_wsfr / HII_TOT_NUM_PIXELS, + }; + return averages; +} // This takes a HaloBox struct and fixes it's mean to exactly what we expect from the UMF integrals. 
// Generally should only be done for the fixed portion of the grids, since // it will otherwise make the box inconsistent with the input catalogue -void mean_fix_grids(double M_min, double M_max, HaloBox *grids, struct HaloProperties *averages_box, - struct ScalingConstants *consts) { - struct HaloProperties averages_global; - double M_turn_a_global = averages_box->m_turn_acg; - double M_turn_m_global = averages_box->m_turn_mcg; - get_box_averages(M_min, M_max, M_turn_a_global, M_turn_m_global, consts, &averages_global); +void mean_fix_grids(double M_min, double M_max, HaloBox *grids, ScalingConstants *consts) { + HaloProperties averages_global; + // NOTE: requires the mean mcrits to be set on the grids + double M_turn_a_global = pow(10, grids->log10_Mcrit_ACG_ave); + double M_turn_m_global = pow(10, grids->log10_Mcrit_MCG_ave); + get_uhmf_averages(M_min, M_max, M_turn_a_global, M_turn_m_global, consts, &averages_global); + HaloProperties averages_hbox; + averages_hbox = get_halobox_averages(grids); unsigned long long int idx; #pragma omp parallel for num_threads(simulation_options_global->N_THREADS) private(idx) for (idx = 0; idx < HII_TOT_NUM_PIXELS; idx++) { - grids->halo_mass[idx] *= averages_global.halo_mass / averages_box->halo_mass; - grids->halo_stars[idx] *= averages_global.stellar_mass / averages_box->stellar_mass; - grids->halo_sfr[idx] *= averages_global.halo_sfr / averages_box->halo_sfr; - grids->n_ion[idx] *= averages_global.n_ion / averages_box->n_ion; + grids->halo_sfr[idx] *= averages_global.halo_sfr / averages_hbox.halo_sfr; + grids->n_ion[idx] *= averages_global.n_ion / averages_hbox.n_ion; if (astro_options_global->USE_MINI_HALOS) { - grids->halo_stars_mini[idx] *= - averages_global.stellar_mass_mini / averages_box->stellar_mass_mini; - grids->halo_sfr_mini[idx] *= averages_global.sfr_mini / averages_box->sfr_mini; + grids->halo_sfr_mini[idx] *= averages_global.sfr_mini / averages_hbox.sfr_mini; } if (astro_options_global->USE_TS_FLUCT) { - 
grids->halo_xray[idx] *= averages_global.halo_xray / averages_box->halo_xray; + grids->halo_xray[idx] *= averages_global.halo_xray / averages_hbox.halo_xray; } if (astro_options_global->INHOMO_RECO) { grids->whalo_sfr[idx] *= - averages_global.fescweighted_sfr / averages_box->fescweighted_sfr; + averages_global.fescweighted_sfr / averages_hbox.fescweighted_sfr; + } + + if (config_settings.EXTRA_HALOBOX_FIELDS) { + grids->halo_mass[idx] *= averages_global.halo_mass / averages_hbox.halo_mass; + grids->halo_stars[idx] *= averages_global.stellar_mass / averages_hbox.stellar_mass; + if (astro_options_global->USE_MINI_HALOS) { + grids->halo_stars_mini[idx] *= + averages_global.stellar_mass_mini / averages_hbox.stellar_mass_mini; + } } } } +// Evaluate Mass function integrals given information from the cell +void get_cell_integrals(double dens, double l10_mturn_a, double l10_mturn_m, + ScalingConstants *consts, IntegralCondition *int_consts, + HaloProperties *properties) { + double M_min = int_consts->M_min; + double M_max = int_consts->M_max; + double growth_z = int_consts->growth_factor; + double M_cell = int_consts->M_cell; + double sigma_cell = int_consts->sigma_cell; + + // set all fields to zero + memset(properties, 0, sizeof(HaloProperties)); + + // using the properties struct: + // stellar_mass --> no F_esc integral ACG + // stellar_mass_mini --> no F_esc integral MCG + // n_ion --> F_esc integral ACG + // fescweighted_sfr --> F_esc integral MCG + // halo_xray --> Xray integral + // halo_mass --> total mass + properties->n_ion = EvaluateNion_Conditional(dens, l10_mturn_a, growth_z, M_min, M_max, M_cell, + sigma_cell, consts, false); + properties->stellar_mass = + EvaluateSFRD_Conditional(dens, growth_z, M_min, M_max, M_cell, sigma_cell, consts); + // TODO: SFRD tables still assume no reion feedback, this should be fixed + // although it doesn't affect the histories (only used in Ts) it makes outputs wrong + // for post-processing + if 
(astro_options_global->USE_MINI_HALOS) { + properties->stellar_mass_mini = EvaluateSFRD_Conditional_MINI( + dens, l10_mturn_m, growth_z, M_min, M_max, M_cell, sigma_cell, consts); + // re-using field + properties->fescweighted_sfr = EvaluateNion_Conditional_MINI( + dens, l10_mturn_m, growth_z, M_min, M_max, M_cell, sigma_cell, consts, false); + } + + if (astro_options_global->USE_TS_FLUCT) { + properties->halo_xray = + EvaluateXray_Conditional(dens, l10_mturn_m, consts->redshift, growth_z, M_min, M_max, + M_cell, sigma_cell, consts); + } + + if (config_settings.EXTRA_HALOBOX_FIELDS) { + properties->halo_mass = + EvaluateMcoll(dens, growth_z, log(M_min), log(M_max), M_cell, sigma_cell, dens); + } +} + // Fixed halo grids, where each property is set as the integral of the CMF on the EULERIAN cell // scale As per default 21cmfast (strange pretending that the lagrangian density is eulerian and // then *(1+delta)) This outputs the UN-NORMALISED grids (before mean-adjustment) -int set_fixed_grids(double M_min, double M_max, InitialConditions *ini_boxes, - PerturbedField *perturbed_field, TsBox *previous_spin_temp, - IonizedBox *previous_ionize_box, struct ScalingConstants *consts, - HaloBox *grids, struct HaloProperties *averages, const bool eulerian) { +int set_fixed_grids(double M_min, double M_max, InitialConditions *ini_boxes, float *mturn_a_grid, + float *mturn_m_grid, ScalingConstants *consts, HaloBox *grids) { double M_cell = RHOcrit * cosmo_params_global->OMm * VOLUME / HII_TOT_NUM_PIXELS; // mass in cell of mean dens - double growth_z = dicke(consts->redshift); - - double lnMmin = log(M_min); - double lnMcell = log(M_cell); - double lnMmax = log(M_max); - - double sigma_cell = EvaluateSigma(lnMcell); - - double prefactor_mass = RHOcrit * cosmo_params_global->OMm; - double prefactor_stars = RHOcrit * cosmo_params_global->OMb * consts->fstar_10; - double prefactor_stars_mini = RHOcrit * cosmo_params_global->OMb * consts->fstar_7; - double prefactor_sfr = 
prefactor_stars / consts->t_star / consts->t_h; - double prefactor_sfr_mini = prefactor_stars_mini / consts->t_star / consts->t_h; - double prefactor_nion = prefactor_stars * consts->fesc_10 * consts->pop2_ion; - double prefactor_nion_mini = prefactor_stars_mini * consts->fesc_7 * consts->pop3_ion; - double prefactor_wsfr = prefactor_sfr * consts->fesc_10 * consts->pop2_ion; - double prefactor_wsfr_mini = prefactor_sfr_mini * consts->fesc_7 * consts->pop3_ion; - double prefactor_xray = RHOcrit * cosmo_params_global->OMm; - - double hm_sum = 0, nion_sum = 0, wsfr_sum = 0, xray_sum = 0; - double sm_sum = 0, sm_sum_mini = 0, sfr_sum = 0, sfr_sum_mini = 0; - double l10_mlim_m_sum = 0., l10_mlim_a_sum = 0., l10_mlim_r_sum = 0.; + IntegralCondition integral_cond; + set_integral_constants(&integral_cond, consts->redshift, M_min, M_max, M_cell); + double growthf = dicke(consts->redshift); // find grid limits for tables double min_density = 0.; @@ -226,56 +295,27 @@ int set_fixed_grids(double M_min, double M_max, InitialConditions *ini_boxes, double min_log10_mturn_m = log10(M_MAX_INTEGRAL); double max_log10_mturn_a = log10(astro_params_global->M_TURN); double max_log10_mturn_m = log10(astro_params_global->M_TURN); - float *mturn_a_grid = calloc(HII_TOT_NUM_PIXELS, sizeof(float)); - float *mturn_m_grid = calloc(HII_TOT_NUM_PIXELS, sizeof(float)); #pragma omp parallel num_threads(simulation_options_global->N_THREADS) { unsigned long long int i; double dens; - double J21_val, Gamma12_val, zre_val; - double M_turn_r = 0.; double M_turn_m = consts->mturn_m_nofb; double M_turn_a = consts->mturn_a_nofb; - double curr_vcb = consts->vcb_norel; #pragma omp for reduction(min : min_density, min_log10_mturn_a, min_log10_mturn_m) \ - reduction(max : max_density, max_log10_mturn_a, max_log10_mturn_m) \ - reduction(+ : l10_mlim_m_sum, l10_mlim_a_sum, l10_mlim_r_sum) + reduction(max : max_density, max_log10_mturn_a, max_log10_mturn_m) for (i = 0; i < HII_TOT_NUM_PIXELS; i++) { - if 
(eulerian) - dens = perturbed_field->density[i]; - else - dens = ini_boxes->lowres_density[i] * growth_z; + dens = ini_boxes->lowres_density[i] * growthf; if (dens > max_density) max_density = dens; if (dens < min_density) min_density = dens; if (astro_options_global->USE_MINI_HALOS) { - if (!astro_options_global->FIX_VCB_AVG && - matter_options_global->USE_RELATIVE_VELOCITIES) { - curr_vcb = ini_boxes->lowres_vcb[i]; - } - J21_val = Gamma12_val = zre_val = 0.; - if (consts->redshift < simulation_options_global->Z_HEAT_MAX) { - J21_val = previous_spin_temp->J_21_LW[i]; - Gamma12_val = previous_ionize_box->ionisation_rate_G12[i]; - zre_val = previous_ionize_box->z_reion[i]; - } - M_turn_a = consts->mturn_a_nofb; - M_turn_m = lyman_werner_threshold(consts->redshift, J21_val, curr_vcb); - M_turn_r = reionization_feedback(consts->redshift, Gamma12_val, zre_val); - M_turn_a = fmax(M_turn_a, fmax(M_turn_r, astro_params_global->M_TURN)); - M_turn_m = fmax(M_turn_m, fmax(M_turn_r, astro_params_global->M_TURN)); + M_turn_a = mturn_a_grid[i]; + M_turn_m = mturn_m_grid[i]; + if (min_log10_mturn_a > M_turn_a) min_log10_mturn_a = M_turn_a; + if (min_log10_mturn_m > M_turn_m) min_log10_mturn_m = M_turn_m; + if (max_log10_mturn_a < M_turn_a) max_log10_mturn_a = M_turn_a; + if (max_log10_mturn_m < M_turn_m) max_log10_mturn_m = M_turn_m; } - mturn_a_grid[i] = log10(M_turn_a); - mturn_m_grid[i] = log10(M_turn_m); - - if (min_log10_mturn_a > mturn_a_grid[i]) min_log10_mturn_a = mturn_a_grid[i]; - if (min_log10_mturn_m > mturn_m_grid[i]) min_log10_mturn_m = mturn_m_grid[i]; - if (max_log10_mturn_a < mturn_a_grid[i]) max_log10_mturn_a = mturn_a_grid[i]; - if (max_log10_mturn_m < mturn_m_grid[i]) max_log10_mturn_m = mturn_m_grid[i]; - - l10_mlim_a_sum += mturn_a_grid[i]; - l10_mlim_m_sum += mturn_m_grid[i]; - l10_mlim_r_sum += log10(M_turn_r); } } // buffers for table ranges @@ -286,17 +326,14 @@ int set_fixed_grids(double M_min, double M_max, InitialConditions *ini_boxes, 
max_log10_mturn_a = max_log10_mturn_a * 1.001; max_log10_mturn_m = max_log10_mturn_m * 1.001; - LOG_DEBUG("Mean halo boxes || M = [%.2e %.2e] | Mcell = %.2e (s=%.2e) | z = %.2e | D = %.2e", - M_min, M_max, M_cell, sigma_cell, consts->redshift, growth_z); - + LOG_DEBUG("Mean halo boxes || M = [%.2e %.2e] | Mcell = %.2e", M_min, M_max, M_cell); // These tables are coarser than needed, an initial loop for Mturn to find limits may help if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { if (astro_options_global->INTEGRATION_METHOD_ATOMIC == 1 || (astro_options_global->USE_MINI_HALOS && astro_options_global->INTEGRATION_METHOD_MINI == 1)) { - initialise_GL(lnMmin, lnMmax); + initialise_GL(integral_cond.lnM_min, integral_cond.lnM_max); } - // This table assumes no reionisation feedback initialise_SFRD_Conditional_table(consts->redshift, min_density, max_density, M_min, M_max, M_cell, consts); @@ -307,95 +344,26 @@ int set_fixed_grids(double M_min, double M_max, InitialConditions *ini_boxes, M_cell, min_log10_mturn_a, max_log10_mturn_a, min_log10_mturn_m, max_log10_mturn_m, consts, false); - initialise_dNdM_tables(min_density, max_density, lnMmin, lnMmax, growth_z, lnMcell, false); + initialise_dNdM_tables(min_density, max_density, integral_cond.lnM_min, + integral_cond.lnM_max, integral_cond.growth_factor, + integral_cond.lnM_cell, false); if (astro_options_global->USE_TS_FLUCT) { initialise_Xray_Conditional_table(consts->redshift, min_density, max_density, M_min, M_max, M_cell, consts); } } -#pragma omp parallel num_threads(simulation_options_global->N_THREADS) - { - unsigned long long int i; - double dens; - double l10_mturn_a, l10_mturn_m; - double mass_intgrl, h_count; - double intgrl_fesc_weighted, intgrl_stars_only; - double intgrl_fesc_weighted_mini = 0., intgrl_stars_only_mini = 0., integral_xray = 0; - double dens_fac; - -#pragma omp for reduction(+ : hm_sum, sm_sum, sm_sum_mini, sfr_sum, sfr_sum_mini, xray_sum, \ - nion_sum, wsfr_sum) - for (i = 0; i < 
HII_TOT_NUM_PIXELS; i++) { - if (eulerian) { - dens = perturbed_field->density[i]; - dens_fac = (1. + dens); - } else { - dens = ini_boxes->lowres_density[i] * growth_z; - dens_fac = 1.; - } - l10_mturn_a = mturn_a_grid[i]; - l10_mturn_m = mturn_m_grid[i]; - - h_count = EvaluateNhalo(dens, growth_z, lnMmin, lnMmax, M_cell, sigma_cell, dens); - mass_intgrl = EvaluateMcoll(dens, growth_z, lnMmin, lnMmax, M_cell, sigma_cell, dens); - intgrl_fesc_weighted = EvaluateNion_Conditional( - dens, l10_mturn_a, growth_z, M_min, M_max, M_cell, sigma_cell, consts, false); - intgrl_stars_only = - EvaluateSFRD_Conditional(dens, growth_z, M_min, M_max, M_cell, sigma_cell, consts); - // TODO: SFRD tables still assume no reion feedback, this should be fixed - // although it doesn't affect the histories (only used in Ts) it makes outputs wrong - // for post-processing - if (astro_options_global->USE_MINI_HALOS) { - intgrl_stars_only_mini = EvaluateSFRD_Conditional_MINI( - dens, l10_mturn_m, growth_z, M_min, M_max, M_cell, sigma_cell, consts); - intgrl_fesc_weighted_mini = EvaluateNion_Conditional_MINI( - dens, l10_mturn_m, growth_z, M_min, M_max, M_cell, sigma_cell, consts, false); - } - - if (astro_options_global->USE_TS_FLUCT) { - integral_xray = - EvaluateXray_Conditional(dens, l10_mturn_m, consts->redshift, growth_z, M_min, - M_max, M_cell, sigma_cell, consts); - } - - grids->count[i] = (int)(h_count * M_cell * dens_fac); // NOTE: truncated - grids->halo_mass[i] = mass_intgrl * prefactor_mass * dens_fac; - grids->halo_sfr[i] = (intgrl_stars_only * prefactor_sfr) * dens_fac; - grids->n_ion[i] = (intgrl_fesc_weighted * prefactor_nion + - intgrl_fesc_weighted_mini * prefactor_nion_mini) * - dens_fac; - grids->halo_stars[i] = intgrl_stars_only * prefactor_stars * dens_fac; - - hm_sum += grids->halo_mass[i]; - nion_sum += grids->n_ion[i]; - sfr_sum += grids->halo_sfr[i]; - sm_sum += grids->halo_stars[i]; - - if (astro_options_global->USE_TS_FLUCT) { - grids->halo_xray[i] = 
prefactor_xray * integral_xray * dens_fac; - xray_sum += grids->halo_xray[i]; - } - if (astro_options_global->INHOMO_RECO) { - grids->whalo_sfr[i] = (intgrl_fesc_weighted * prefactor_wsfr + - intgrl_fesc_weighted_mini * prefactor_wsfr_mini) * - dens_fac; - wsfr_sum += grids->whalo_sfr[i]; - } - if (astro_options_global->USE_MINI_HALOS) { - grids->halo_stars_mini[i] = - intgrl_stars_only_mini * prefactor_stars_mini * dens_fac; - grids->halo_sfr_mini[i] = intgrl_stars_only_mini * prefactor_sfr_mini * dens_fac; - sm_sum_mini += grids->halo_stars_mini[i]; - sfr_sum_mini += grids->halo_sfr_mini[i]; - } - } - } - - LOG_ULTRA_DEBUG("Cell 0 Totals: HM: %.2e SM: %.2e SF: %.2e, NI: %.2e ct : %d", - grids->halo_mass[HII_R_INDEX(0, 0, 0)], grids->halo_stars[HII_R_INDEX(0, 0, 0)], - grids->halo_sfr[HII_R_INDEX(0, 0, 0)], grids->n_ion[HII_R_INDEX(0, 0, 0)], - grids->count[HII_R_INDEX(0, 0, 0)]); + int grid_dim[3] = {simulation_options_global->HII_DIM, simulation_options_global->HII_DIM, + HII_D_PARA}; + float *vel_pointers[3] = {ini_boxes->lowres_vx, ini_boxes->lowres_vy, ini_boxes->lowres_vz}; + float *vel_pointers_2LPT[3] = {ini_boxes->lowres_vx_2LPT, ini_boxes->lowres_vy_2LPT, + ini_boxes->lowres_vz_2LPT}; + move_grid_galprops(consts->redshift, ini_boxes->lowres_density, grid_dim, vel_pointers, + vel_pointers_2LPT, grid_dim, grids, grid_dim, mturn_a_grid, mturn_m_grid, + consts, &integral_cond); + + LOG_ULTRA_DEBUG("Cell 0 Totals: SF: %.2e, NI: %.2e", grids->halo_sfr[HII_R_INDEX(0, 0, 0)], + grids->n_ion[HII_R_INDEX(0, 0, 0)]); if (astro_options_global->INHOMO_RECO) { LOG_ULTRA_DEBUG("FESC * SF %.2e", grids->whalo_sfr[HII_R_INDEX(0, 0, 0)]); } @@ -405,57 +373,26 @@ int set_fixed_grids(double M_min, double M_max, InitialConditions *ini_boxes, if (astro_options_global->USE_MINI_HALOS) { LOG_ULTRA_DEBUG("MINI SM %.2e SF %.2e", grids->halo_stars_mini[HII_R_INDEX(0, 0, 0)], grids->halo_sfr_mini[HII_R_INDEX(0, 0, 0)]); + LOG_ULTRA_DEBUG("Mturn_a %.2e Mturn_m %.2e", 
mturn_a_grid[HII_R_INDEX(0, 0, 0)], + mturn_m_grid[HII_R_INDEX(0, 0, 0)]); } - LOG_ULTRA_DEBUG("Mturn_a %.2e Mturn_m %.2e", mturn_a_grid[HII_R_INDEX(0, 0, 0)], - mturn_m_grid[HII_R_INDEX(0, 0, 0)]); - - free(mturn_a_grid); - free(mturn_m_grid); free_conditional_tables(); - averages->halo_mass = hm_sum / HII_TOT_NUM_PIXELS; - averages->stellar_mass = sm_sum / HII_TOT_NUM_PIXELS; - averages->stellar_mass_mini = sm_sum_mini / HII_TOT_NUM_PIXELS; - averages->halo_sfr = sfr_sum / HII_TOT_NUM_PIXELS; - averages->sfr_mini = sfr_sum_mini / HII_TOT_NUM_PIXELS; - averages->n_ion = nion_sum / HII_TOT_NUM_PIXELS; - averages->halo_xray = xray_sum / HII_TOT_NUM_PIXELS; - averages->fescweighted_sfr = wsfr_sum / HII_TOT_NUM_PIXELS; - averages->m_turn_acg = pow(10, l10_mlim_a_sum / HII_TOT_NUM_PIXELS); - averages->m_turn_mcg = pow(10, l10_mlim_m_sum / HII_TOT_NUM_PIXELS); - averages->m_turn_reion = pow(10, l10_mlim_r_sum / HII_TOT_NUM_PIXELS); - - // mean-fix the grids - // TODO: put this behind a flag - if (consts->fix_mean) mean_fix_grids(M_min, M_max, grids, averages, consts); - - // assign the log10 average Mturn for the Ts global tables - grids->log10_Mcrit_MCG_ave = l10_mlim_m_sum / HII_TOT_NUM_PIXELS; - grids->log10_Mcrit_ACG_ave = l10_mlim_a_sum / HII_TOT_NUM_PIXELS; + if (consts->fix_mean) mean_fix_grids(M_min, M_max, grids, consts); return 0; } -void halobox_debug_print_avg(struct HaloProperties *averages_box, - struct HaloProperties *averages_subsampler, - struct ScalingConstants *consts, double M_min, double M_max) { +void halobox_debug_print_avg(HaloBox *halobox, ScalingConstants *consts, double M_min, + double M_max) { if (LOG_LEVEL < DEBUG_LEVEL) return; - struct HaloProperties averages_sub_expected, averages_global; + HaloProperties averages_box; + averages_box = get_halobox_averages(halobox); + HaloProperties averages_global; LOG_DEBUG("HALO BOXES REDSHIFT %.2f [%.2e %.2e]", consts->redshift, M_min, M_max); - if (matter_options_global->FIXED_HALO_GRIDS) { - 
get_box_averages(M_min, M_max, averages_box->m_turn_acg, averages_box->m_turn_mcg, consts, - &averages_global); - } else { - get_box_averages(simulation_options_global->SAMPLER_MIN_MASS, M_max, - averages_box->m_turn_acg, averages_box->m_turn_mcg, consts, - &averages_global); - if (astro_options_global->AVG_BELOW_SAMPLER && - M_min < simulation_options_global->SAMPLER_MIN_MASS) { - get_box_averages(M_min, simulation_options_global->SAMPLER_MIN_MASS, - averages_box->m_turn_acg, averages_box->m_turn_mcg, consts, - &averages_sub_expected); - } - } + double mturn_a_avg = pow(10, halobox->log10_Mcrit_ACG_ave); + double mturn_m_avg = pow(10, halobox->log10_Mcrit_MCG_ave); + get_uhmf_averages(M_min, M_max, mturn_a_avg, mturn_m_avg, consts, &averages_global); LOG_DEBUG( "Exp. averages: (HM %11.3e, SM %11.3e SM_MINI %11.3e SFR %11.3e, SFR_MINI %11.3e, XRAY " @@ -466,43 +403,22 @@ void halobox_debug_print_avg(struct HaloProperties *averages_box, LOG_DEBUG( "Box. averages: (HM %11.3e, SM %11.3e SM_MINI %11.3e SFR %11.3e, SFR_MINI %11.3e, XRAY " "%11.3e, NION %11.3e)", - averages_box->halo_mass, averages_box->stellar_mass, averages_box->stellar_mass_mini, - averages_box->halo_sfr, averages_box->sfr_mini, averages_box->halo_xray, - averages_box->n_ion); - - if (!matter_options_global->FIXED_HALO_GRIDS && astro_options_global->AVG_BELOW_SAMPLER && - M_min < simulation_options_global->SAMPLER_MIN_MASS) { - LOG_DEBUG("SUB-SAMPLER"); - LOG_DEBUG( - "Exp. averages: (HM %11.3e, SM %11.3e SM_MINI %11.3e SFR %11.3e, SFR_MINI %11.3e, XRAY " - "%11.3e, NION %11.3e)", - averages_sub_expected.halo_mass, averages_sub_expected.stellar_mass, - averages_sub_expected.stellar_mass_mini, averages_sub_expected.halo_sfr, - averages_sub_expected.sfr_mini, averages_sub_expected.halo_xray, - averages_sub_expected.n_ion); - LOG_DEBUG( - "Box. 
averages: (HM %11.3e, SM %11.3e SM_MINI %11.3e SFR %11.3e, SFR_MINI %11.3e, XRAY " - "%11.3e, NION %11.3e)", - averages_subsampler->halo_mass, averages_subsampler->stellar_mass, - averages_subsampler->stellar_mass_mini, averages_subsampler->halo_sfr, - averages_subsampler->sfr_mini, averages_subsampler->halo_xray, - averages_subsampler->n_ion); - } + averages_box.halo_mass, averages_box.stellar_mass, averages_box.stellar_mass_mini, + averages_box.halo_sfr, averages_box.sfr_mini, averages_box.halo_xray, averages_box.n_ion); } // We need the mean log10 turnover masses for comparison with expected global Nion and SFRD. // Sometimes we don't calculate these on the grid (if we use halos and no sub-sampler) // So this function simply returns the volume-weighted average log10 turnover mass -void get_mean_log10_turnovers(InitialConditions *ini_boxes, TsBox *previous_spin_temp, - IonizedBox *previous_ionize_box, PerturbedField *perturbed_field, - struct ScalingConstants *consts, double turnovers[3]) { +void get_log10_turnovers(InitialConditions *ini_boxes, TsBox *previous_spin_temp, + IonizedBox *previous_ionize_box, float *mturn_a_grid, float *mturn_m_grid, + ScalingConstants *consts, double averages[2]) { + averages[0] = log10(consts->mturn_a_nofb); + averages[1] = log10(consts->mturn_m_nofb); if (!astro_options_global->USE_MINI_HALOS) { - turnovers[0] = log10(consts->mturn_a_nofb); // ACG - turnovers[1] = log10(consts->mturn_m_nofb); // MCG - turnovers[2] = 0.; // reion (log10 so effectively 1 solar mass) return; } - double l10_mturn_a_avg = 0., l10_mturn_m_avg = 0., l10_mturn_r_avg = 0.; + double log10_mturn_m_avg = 0., log10_mturn_a_avg = 0.; #pragma omp parallel num_threads(simulation_options_global->N_THREADS) { @@ -513,7 +429,7 @@ void get_mean_log10_turnovers(InitialConditions *ini_boxes, TsBox *previous_spin double M_turn_a = consts->mturn_a_nofb; double M_turn_r; -#pragma omp for reduction(+ : l10_mturn_m_avg, l10_mturn_a_avg, l10_mturn_r_avg) +#pragma omp for 
reduction(+ : log10_mturn_m_avg, log10_mturn_a_avg) for (i = 0; i < HII_TOT_NUM_PIXELS; i++) { if (!astro_options_global->FIX_VCB_AVG && matter_options_global->USE_RELATIVE_VELOCITIES) { @@ -529,31 +445,27 @@ void get_mean_log10_turnovers(InitialConditions *ini_boxes, TsBox *previous_spin M_turn_r = reionization_feedback(consts->redshift, Gamma12_val, zre_val); M_turn_a = fmax(M_turn_a, fmax(M_turn_r, astro_params_global->M_TURN)); M_turn_m = fmax(M_turn_m, fmax(M_turn_r, astro_params_global->M_TURN)); - l10_mturn_a_avg += log10(M_turn_a); - l10_mturn_m_avg += log10(M_turn_m); - l10_mturn_r_avg += log10(M_turn_r); - } - l10_mturn_a_avg /= HII_TOT_NUM_PIXELS; - l10_mturn_m_avg /= HII_TOT_NUM_PIXELS; - l10_mturn_r_avg /= HII_TOT_NUM_PIXELS; - turnovers[0] = l10_mturn_a_avg; - turnovers[1] = l10_mturn_m_avg; - turnovers[2] = l10_mturn_r_avg; + mturn_a_grid[i] = log10(M_turn_a); + log10_mturn_a_avg += log10(M_turn_a); + mturn_m_grid[i] = log10(M_turn_m); + log10_mturn_m_avg += log10(M_turn_m); + } } + + // NOTE: This average log10 Mturn will be passed onto the spin temperature calculations where + // It is used to perform the frequency integrals (over tau, dependent on ), and possibly + // for mean fixing. 
It is the volume-weighted mean of LOG10 Mturn, although we could do another + // weighting or use Mturn directly None of these are a perfect representation due to the + // nonlinear way turnover mass affects N_ion + log10_mturn_a_avg /= HII_TOT_NUM_PIXELS; + log10_mturn_m_avg /= HII_TOT_NUM_PIXELS; + averages[0] = log10_mturn_a_avg; + averages[1] = log10_mturn_m_avg; } -void sum_halos_onto_grid(InitialConditions *ini_boxes, TsBox *previous_spin_temp, - IonizedBox *previous_ionize_box, PerturbHaloField *halos, - struct ScalingConstants *consts, HaloBox *grids, - struct HaloProperties *averages) { - double redshift = consts->redshift; - // averages - double hm_avg = 0., sm_avg = 0., sfr_avg = 0.; - double sm_avg_mini = 0., sfr_avg_mini = 0.; - double M_turn_a_avg = 0., M_turn_m_avg = 0., M_turn_r_avg = 0.; - double n_ion_avg = 0., wsfr_avg = 0., xray_avg = 0.; - // counts +void sum_halos_onto_grid(InitialConditions *ini_boxes, PerturbHaloField *halos, float *mturn_a_grid, + float *mturn_m_grid, ScalingConstants *consts, HaloBox *grids) { unsigned long long int total_n_halos, n_halos_cut = 0.; double cell_volume = VOLUME / HII_TOT_NUM_PIXELS; @@ -568,20 +480,15 @@ void sum_halos_onto_grid(InitialConditions *ini_boxes, TsBox *previous_spin_temp double halo_pos[3]; int halo_idx[3]; unsigned long long int i_halo, i_cell; - double hmass, nion, sfr, wsfr, sfr_mini, stars_mini, stars, xray; - double J21_val, Gamma12_val, zre_val; + double hmass; - double curr_vcb = consts->vcb_norel; double M_turn_m = consts->mturn_m_nofb; double M_turn_a = consts->mturn_a_nofb; - double M_turn_r = 0.; double in_props[3]; - struct HaloProperties out_props; + HaloProperties out_props; -#pragma omp for reduction(+ : hm_avg, sm_avg, sm_avg_mini, sfr_avg, sfr_avg_mini, n_ion_avg, \ - xray_avg, wsfr_avg, M_turn_a_avg, M_turn_m_avg, M_turn_r_avg, \ - n_halos_cut) +#pragma omp for reduction(+ : n_halos_cut) for (i_halo = 0; i_halo < halos->n_halos; i_halo++) { hmass = halos->halo_masses[i_halo]; 
// It is sometimes useful to make cuts to the halo catalogues before gridding. @@ -604,22 +511,8 @@ void sum_halos_onto_grid(InitialConditions *ini_boxes, TsBox *previous_spin_temp // NOTE: I could easily apply reionization feedback without minihalos but this was not // done previously if (astro_options_global->USE_MINI_HALOS) { - if (!astro_options_global->FIX_VCB_AVG && - matter_options_global->USE_RELATIVE_VELOCITIES) - curr_vcb = ini_boxes->lowres_vcb[i_cell]; - - J21_val = Gamma12_val = zre_val = 0.; - if (consts->redshift < simulation_options_global->Z_HEAT_MAX) { - J21_val = previous_spin_temp->J_21_LW[i_cell]; - Gamma12_val = previous_ionize_box->ionisation_rate_G12[i_cell]; - zre_val = previous_ionize_box->z_reion[i_cell]; - } - - M_turn_a = consts->mturn_a_nofb; - M_turn_m = lyman_werner_threshold(redshift, J21_val, curr_vcb); - M_turn_r = reionization_feedback(redshift, Gamma12_val, zre_val); - M_turn_a = fmax(M_turn_a, fmax(M_turn_r, astro_params_global->M_TURN)); - M_turn_m = fmax(M_turn_m, fmax(M_turn_r, astro_params_global->M_TURN)); + M_turn_a = mturn_a_grid[i_cell]; + M_turn_m = mturn_m_grid[i_cell]; } // these are the halo property RNG sequences @@ -629,22 +522,15 @@ void sum_halos_onto_grid(InitialConditions *ini_boxes, TsBox *previous_spin_temp set_halo_properties(hmass, M_turn_a, M_turn_m, consts, in_props, &out_props); - sfr = out_props.halo_sfr; - sfr_mini = out_props.sfr_mini; - nion = out_props.n_ion; - wsfr = out_props.fescweighted_sfr; - stars = out_props.stellar_mass; - stars_mini = out_props.stellar_mass_mini; - xray = out_props.halo_xray; - #if LOG_LEVEL >= ULTRA_DEBUG_LEVEL if (i_cell == 0) { // LOG_ULTRA_DEBUG("(%d %d %d) i_cell %llu i_halo %llu",x,y,z,i_cell, i_halo); LOG_ULTRA_DEBUG( "Cell 0 Halo: HM: %.2e SM: %.2e (%.2e) SF: %.2e (%.2e) X: %.2e NI: %.2e WS: " "%.2e Z : %.2e ct : %llu", - hmass, stars, stars_mini, sfr, sfr_mini, xray, nion, wsfr, - out_props.metallicity, i_halo); + hmass, out_props.stellar_mass, 
out_props.stellar_mass_mini, out_props.halo_sfr, + out_props.sfr_mini, out_props.halo_xray, out_props.n_ion, + out_props.fescweighted_sfr, out_props.metallicity, i_halo); // LOG_ULTRA_DEBUG("Cell 0 Sums: HM: %.2e SM: %.2e (%.2e) SF: %.2e (%.2e) X: %.2e // NI: %.2e WS: %.2e ct : %d", @@ -660,52 +546,43 @@ void sum_halos_onto_grid(InitialConditions *ini_boxes, TsBox *previous_spin_temp // update the grids #pragma omp atomic update - grids->halo_mass[i_cell] += hmass; -#pragma omp atomic update - grids->halo_stars[i_cell] += stars; -#pragma omp atomic update - grids->n_ion[i_cell] += nion; -#pragma omp atomic update - grids->halo_sfr[i_cell] += sfr; + grids->n_ion[i_cell] += out_props.n_ion; #pragma omp atomic update - grids->count[i_cell] += 1; + grids->halo_sfr[i_cell] += out_props.halo_sfr; if (astro_options_global->USE_MINI_HALOS) { #pragma omp atomic update - grids->halo_stars_mini[i_cell] += stars_mini; -#pragma omp atomic update - grids->halo_sfr_mini[i_cell] += sfr_mini; + grids->halo_sfr_mini[i_cell] += out_props.sfr_mini; } if (astro_options_global->INHOMO_RECO) { #pragma omp atomic update - grids->whalo_sfr[i_cell] += wsfr; + grids->whalo_sfr[i_cell] += out_props.fescweighted_sfr; } if (astro_options_global->USE_TS_FLUCT) { #pragma omp atomic update - grids->halo_xray[i_cell] += xray; + grids->halo_xray[i_cell] += out_props.halo_xray; } - hm_avg += hmass; - sfr_avg += sfr; - sfr_avg_mini += sfr_mini; - sm_avg += stars; - sm_avg_mini += stars_mini; - xray_avg += xray; - n_ion_avg += nion; - wsfr_avg += wsfr; - M_turn_a_avg += M_turn_a; - M_turn_r_avg += M_turn_r; - M_turn_m_avg += M_turn_m; + if (config_settings.EXTRA_HALOBOX_FIELDS) { +#pragma omp atomic update + grids->halo_mass[i_cell] += hmass; +#pragma omp atomic update + grids->halo_stars[i_cell] += out_props.stellar_mass; +#pragma omp atomic update + grids->count[i_cell] += 1; + if (astro_options_global->USE_MINI_HALOS) { +#pragma omp atomic update + grids->halo_stars_mini[i_cell] += 
out_props.stellar_mass_mini; + } + } } #pragma omp for for (i_cell = 0; i_cell < HII_TOT_NUM_PIXELS; i_cell++) { - grids->halo_mass[i_cell] /= cell_volume; - grids->halo_sfr[i_cell] /= cell_volume; - grids->halo_stars[i_cell] /= cell_volume; grids->n_ion[i_cell] /= cell_volume; + grids->halo_sfr[i_cell] /= cell_volume; if (astro_options_global->USE_TS_FLUCT) { grids->halo_xray[i_cell] /= cell_volume; } @@ -714,15 +591,19 @@ void sum_halos_onto_grid(InitialConditions *ini_boxes, TsBox *previous_spin_temp } if (astro_options_global->USE_MINI_HALOS) { grids->halo_sfr_mini[i_cell] /= cell_volume; - grids->halo_stars_mini[i_cell] /= cell_volume; + } + if (config_settings.EXTRA_HALOBOX_FIELDS) { + grids->halo_mass[i_cell] /= cell_volume; + grids->halo_stars[i_cell] /= cell_volume; + if (astro_options_global->USE_MINI_HALOS) { + grids->halo_stars_mini[i_cell] /= cell_volume; + } } } } total_n_halos = halos->n_halos - n_halos_cut; - LOG_SUPER_DEBUG("Cell 0 Totals: HM: %.2e SM: %.2e SF: %.2e NI: %.2e ct : %d", - grids->halo_mass[HII_R_INDEX(0, 0, 0)], grids->halo_stars[HII_R_INDEX(0, 0, 0)], - grids->halo_sfr[HII_R_INDEX(0, 0, 0)], grids->halo_xray[HII_R_INDEX(0, 0, 0)], - grids->n_ion[HII_R_INDEX(0, 0, 0)], grids->count[HII_R_INDEX(0, 0, 0)]); + LOG_SUPER_DEBUG("Cell 0 Totals: SF: %.2e NI: %.2e", grids->halo_sfr[HII_R_INDEX(0, 0, 0)], + grids->n_ion[HII_R_INDEX(0, 0, 0)]); if (astro_options_global->INHOMO_RECO) { LOG_SUPER_DEBUG("FESC * SF %.2e", grids->whalo_sfr[HII_R_INDEX(0, 0, 0)]); } @@ -733,46 +614,11 @@ void sum_halos_onto_grid(InitialConditions *ini_boxes, TsBox *previous_spin_temp LOG_SUPER_DEBUG("MINI SM %.2e SF %.2e", grids->halo_stars_mini[HII_R_INDEX(0, 0, 0)], grids->halo_sfr_mini[HII_R_INDEX(0, 0, 0)]); } - - // NOTE: There is an inconsistency here, the sampled grids use a halo-averaged turnover mass - // whereas the fixed grids / default 21cmfast uses the volume averaged LOG10(turnover mass). 
- // Neither of these are a perfect representation due to the nonlinear way turnover mass - // affects N_ion - if (total_n_halos > 0) { - M_turn_r_avg /= total_n_halos; - M_turn_a_avg /= total_n_halos; - M_turn_m_avg /= total_n_halos; - } else { - // If we have no halos, assume the turnover has no reion feedback & no LW - M_turn_m_avg = consts->mturn_m_nofb; - M_turn_a_avg = consts->mturn_a_nofb; - M_turn_r_avg = 0.; - } - - hm_avg /= VOLUME; - sm_avg /= VOLUME; - sm_avg_mini /= VOLUME; - sfr_avg /= VOLUME; - sfr_avg_mini /= VOLUME; - n_ion_avg /= VOLUME; - xray_avg /= VOLUME; - - averages->halo_mass = hm_avg; - averages->stellar_mass = sm_avg; - averages->halo_sfr = sfr_avg; - averages->stellar_mass_mini = sm_avg_mini; - averages->sfr_mini = sfr_avg_mini; - averages->halo_xray = xray_avg; - averages->n_ion = n_ion_avg; - averages->m_turn_acg = M_turn_a_avg; - averages->m_turn_mcg = M_turn_m_avg; - averages->m_turn_reion = M_turn_r_avg; } // We grid a PERTURBED halofield into the necessary quantities for calculating radiative backgrounds -int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbedField *perturbed_field, - PerturbHaloField *halos, TsBox *previous_spin_temp, - IonizedBox *previous_ionize_box, HaloBox *grids) { +int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbHaloField *halos, + TsBox *previous_spin_temp, IonizedBox *previous_ionize_box, HaloBox *grids) { int status; Try { // get parameters @@ -787,54 +633,58 @@ int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbedField unsigned long long int idx; #pragma omp parallel for num_threads(simulation_options_global->N_THREADS) private(idx) for (idx = 0; idx < HII_TOT_NUM_PIXELS; idx++) { - grids->halo_mass[idx] = 0.0; grids->n_ion[idx] = 0.0; grids->halo_sfr[idx] = 0.0; - grids->halo_stars[idx] = 0.0; - grids->count[idx] = 0; if (astro_options_global->USE_TS_FLUCT) { grids->halo_xray[idx] = 0.0; } if (astro_options_global->USE_MINI_HALOS) { - 
grids->halo_stars_mini[idx] = 0.0; grids->halo_sfr_mini[idx] = 0.0; } if (astro_options_global->INHOMO_RECO) { grids->whalo_sfr[idx] = 0.0; } + if (config_settings.EXTRA_HALOBOX_FIELDS) { + grids->halo_mass[idx] = 0.0; + grids->halo_stars[idx] = 0.0; + grids->count[idx] = 0; + if (astro_options_global->USE_MINI_HALOS) { + grids->halo_stars_mini[idx] = 0.0; + } + } } - struct ScalingConstants hbox_consts; - + ScalingConstants hbox_consts; set_scaling_constants(redshift, &hbox_consts, true); LOG_DEBUG("Gridding %llu halos...", halos->n_halos); double M_min = minimum_source_mass(redshift, false); double M_max_integral; - double cell_volume = VOLUME / HII_TOT_NUM_PIXELS; - - double turnovers[3]; - - struct HaloProperties averages_box, averages_subsampler; init_ps(); if (matter_options_global->USE_INTERPOLATION_TABLES > 0) { - initialiseSigmaMInterpTable( - M_min / 2, - M_MAX_INTEGRAL); // this needs to be initialised above MMax because of Nion_General + // this needs to be initialised above MMax because of Nion_General + initialiseSigmaMInterpTable(M_min / 2, M_MAX_INTEGRAL); } - // do the mean HMF box - // The default 21cmFAST has a strange behaviour where the nonlinear density is used as - // linear, the condition mass is at mean density, but the total cell mass is multiplied by - // delta This part mimics that behaviour Since we need the average turnover masses before we - // can calculate the global means, we do the CMF integrals first Then we calculate the - // expected UMF integrals before doing the adjustment + + float *mturn_a_grid = NULL; + float *mturn_m_grid = NULL; + if (astro_options_global->USE_MINI_HALOS) { + mturn_a_grid = calloc(HII_TOT_NUM_PIXELS, sizeof(float)); + mturn_m_grid = calloc(HII_TOT_NUM_PIXELS, sizeof(float)); + } + double mturn_averages[2]; + get_log10_turnovers(ini_boxes, previous_spin_temp, previous_ionize_box, mturn_a_grid, + mturn_m_grid, &hbox_consts, mturn_averages); + grids->log10_Mcrit_ACG_ave = mturn_averages[0]; + 
grids->log10_Mcrit_MCG_ave = mturn_averages[1]; if (matter_options_global->FIXED_HALO_GRIDS) { M_max_integral = M_MAX_INTEGRAL; - set_fixed_grids(M_min, M_max_integral, ini_boxes, perturbed_field, previous_spin_temp, - previous_ionize_box, &hbox_consts, grids, &averages_box, true); + set_fixed_grids(M_min, M_max_integral, ini_boxes, mturn_a_grid, mturn_m_grid, + &hbox_consts, grids); } else { + sum_halos_onto_grid(ini_boxes, halos, mturn_a_grid, mturn_m_grid, &hbox_consts, grids); // set below-resolution properties if (astro_options_global->AVG_BELOW_SAMPLER) { if (matter_options_global->HALO_STOCHASTICITY) { @@ -844,52 +694,22 @@ int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbedField simulation_options_global->DIM); } if (M_min < M_max_integral) { - set_fixed_grids(M_min, M_max_integral, ini_boxes, perturbed_field, - previous_spin_temp, previous_ionize_box, &hbox_consts, grids, - &averages_subsampler, false); -// This is pretty redundant, but since the fixed grids have density units (X Mpc-3) I have to -// re-multiply before adding the halos. -// I should instead have a flag to output the summed values in cell. 
(2*N_pixel > N_halo so -// generally i don't want to do it in the halo loop) -#pragma omp parallel for num_threads(simulation_options_global->N_THREADS) private(idx) - for (idx = 0; idx < HII_TOT_NUM_PIXELS; idx++) { - grids->halo_mass[idx] *= cell_volume; - grids->halo_stars[idx] *= cell_volume; - grids->n_ion[idx] *= cell_volume; - grids->halo_sfr[idx] *= cell_volume; - if (astro_options_global->USE_TS_FLUCT) { - grids->halo_xray[idx] *= cell_volume; - } - if (astro_options_global->INHOMO_RECO) { - grids->whalo_sfr[idx] *= cell_volume; - } - if (astro_options_global->USE_MINI_HALOS) { - grids->halo_stars_mini[idx] *= cell_volume; - grids->halo_sfr_mini[idx] *= cell_volume; - } - } + set_fixed_grids(M_min, M_max_integral, ini_boxes, mturn_a_grid, mturn_m_grid, + &hbox_consts, grids); LOG_DEBUG("finished subsampler M[%.2e %.2e]", M_min, M_max_integral); } - } else { - // we still need the average turnovers for global values in spintemp, so get them - // here - get_mean_log10_turnovers(ini_boxes, previous_spin_temp, previous_ionize_box, - perturbed_field, &hbox_consts, turnovers); - grids->log10_Mcrit_ACG_ave = turnovers[0]; - grids->log10_Mcrit_MCG_ave = turnovers[1]; } - sum_halos_onto_grid(ini_boxes, previous_spin_temp, previous_ionize_box, halos, - &hbox_consts, grids, &averages_box); } - halobox_debug_print_avg(&averages_box, &averages_subsampler, &hbox_consts, M_min, - M_MAX_INTEGRAL); + halobox_debug_print_avg(grids, &hbox_consts, M_min, M_MAX_INTEGRAL); + if (astro_options_global->USE_MINI_HALOS) { + free(mturn_a_grid); + free(mturn_m_grid); + } // NOTE: the density-grid based calculations (!USE_HALO_FIELD) // use the cell-weighted average of the log10(Mturn) (see issue #369) - LOG_SUPER_DEBUG("log10 Mutrn ACG: log10 cell-weighted %.6e Halo-weighted %.6e", - pow(10, grids->log10_Mcrit_ACG_ave), averages_box.m_turn_acg); - LOG_SUPER_DEBUG("log10 Mutrn MCG: log10 cell-weighted %.6e Halo-weighted %.6e", - pow(10, grids->log10_Mcrit_MCG_ave), 
averages_box.m_turn_mcg); + LOG_SUPER_DEBUG("log10 Mutrn ACG: %.6e", pow(10, grids->log10_Mcrit_ACG_ave)); + LOG_SUPER_DEBUG("log10 Mutrn MCG: %.6e", pow(10, grids->log10_Mcrit_MCG_ave)); if (matter_options_global->USE_INTERPOLATION_TABLES > 0) { freeSigmaMInterpTable(); @@ -903,13 +723,14 @@ int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbedField // test function for getting halo properties from the wrapper, can use a lot of memory for large // catalogs int test_halo_props(double redshift, float *vcb_grid, float *J21_LW_grid, float *z_re_grid, - float *Gamma12_ion_grid, int n_halos, float *halo_masses, float *halo_coords, - float *star_rng, float *sfr_rng, float *xray_rng, float *halo_props_out) { + float *Gamma12_ion_grid, unsigned long long int n_halos, float *halo_masses, + float *halo_coords, float *star_rng, float *sfr_rng, float *xray_rng, + float *halo_props_out) { int status; Try { // get parameters - struct ScalingConstants hbox_consts; + ScalingConstants hbox_consts; set_scaling_constants(redshift, &hbox_consts, true); print_sc_consts(&hbox_consts); @@ -931,7 +752,7 @@ int test_halo_props(double redshift, float *vcb_grid, float *J21_LW_grid, float double M_turn_r = 0.; double in_props[3], halo_pos[3]; - struct HaloProperties out_props; + HaloProperties out_props; #pragma omp for for (i_halo = 0; i_halo < n_halos; i_halo++) { diff --git a/src/py21cmfast/src/HaloBox.h b/src/py21cmfast/src/HaloBox.h index a81f77e25..5ca2e005f 100644 --- a/src/py21cmfast/src/HaloBox.h +++ b/src/py21cmfast/src/HaloBox.h @@ -8,10 +8,57 @@ #include "OutputStructs.h" #include "PerturbHaloField.h" #include "SpinTemperatureBox.h" +#include "scaling_relations.h" -// Compute the HaloBox Object -int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbedField *perturbed_field, - PerturbHaloField *halos, TsBox *previous_spin_temp, - IonizedBox *previous_ionize_box, HaloBox *grids); +#ifdef __cplusplus +extern "C" { +#endif + +// struct holding 
each halo property we currently need. +// This is only used for both averages over the box/catalogues +// as well as an individual halo's properties +typedef struct HaloProperties { + double count; // from integral + double halo_mass; + double stellar_mass; + double halo_sfr; + double stellar_mass_mini; + double sfr_mini; + double fescweighted_sfr; + double n_ion; + double halo_xray; + double metallicity; + double m_turn_acg; + double m_turn_mcg; + double m_turn_reion; +} HaloProperties; + +// TODO: apply this constant struct to the EvaluateX functions in interp_tables.c, +// the integral_wrappers.c functions, and other places where the tables are called +// (probably not hmf.c) +typedef struct IntegralCondition { + double redshift; + double growth_factor; + double M_min; + double lnM_min; + double M_max; + double lnM_max; + double M_cell; + double lnM_cell; + double sigma_cell; +} IntegralCondition; + +void set_integral_constants(IntegralCondition *consts, double redshift, double M_min, double M_max, + double M_cell); +int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbHaloField *halos, + TsBox *previous_spin_temp, IonizedBox *previous_ionize_box, HaloBox *grids); + +void get_cell_integrals(double dens, double l10_mturn_a, double l10_mturn_m, + ScalingConstants *consts, IntegralCondition *int_consts, + HaloProperties *properties); + +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/HaloField.cu b/src/py21cmfast/src/HaloField.cu new file mode 100644 index 000000000..0390316c2 --- /dev/null +++ b/src/py21cmfast/src/HaloField.cu @@ -0,0 +1,22 @@ +#ifndef _HALOFIELD_CU +#define _HALOFIELD_CU + +#include + +#include "DeviceConstants.cuh" +#include "HaloField.cuh" + +// define relevant variables stored in constant memory +__constant__ MatterOptions d_matter_options; +__constant__ SimulationOptions d_simulation_options; +__constant__ CosmoParams d_cosmo_params; +__constant__ AstroParams d_astro_params; + +void 
updateGlobalParams(SimulationOptions *h_simulation_options, MatterOptions * h_matter_options, CosmoParams *h_cosmo_params, AstroParams *h_astro_params){ + cudaMemcpyToSymbol(d_simulation_options, h_simulation_options, sizeof(SimulationOptions), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_matter_options, h_matter_options, sizeof(MatterOptions), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_cosmo_params, h_cosmo_params, sizeof(CosmoParams), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_astro_params, h_astro_params, sizeof(AstroParams), 0, cudaMemcpyHostToDevice); +} + +#endif diff --git a/src/py21cmfast/src/HaloField.cuh b/src/py21cmfast/src/HaloField.cuh new file mode 100644 index 000000000..128f340e4 --- /dev/null +++ b/src/py21cmfast/src/HaloField.cuh @@ -0,0 +1,15 @@ +#ifndef _HALOFIELD_CUH +#define _HALOFIELD_CUH +#include "InputParameters.h" +#include "interpolation_types.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + void updateGlobalParams(SimulationOptions *h_simulation_options, CosmoParams *h_cosmo_params, AstroParams *h_astro_params); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/py21cmfast/src/HaloField.h b/src/py21cmfast/src/HaloField.h index 1eb35b79b..7c78fd6e4 100644 --- a/src/py21cmfast/src/HaloField.h +++ b/src/py21cmfast/src/HaloField.h @@ -5,7 +5,13 @@ #include "InputParameters.h" #include "OutputStructs.h" +#ifdef __cplusplus +extern "C" { +#endif int ComputeHaloField(float redshift_desc, float redshift, InitialConditions *boxes, unsigned long long int random_seed, HaloField *halos_desc, HaloField *halos); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/InitialConditions.c b/src/py21cmfast/src/InitialConditions.c index f75678f86..3a8fff291 100644 --- a/src/py21cmfast/src/InitialConditions.c +++ b/src/py21cmfast/src/InitialConditions.c @@ -94,6 +94,22 @@ int ComputeInitialConditions(unsigned long long random_seed, InitialConditions * int status; + bool use_cuda = false; + if (use_cuda) { + 
printf("Check GPU device ...\n\n"); +#if CUDA_FOUND + // print key device properties + print_key_device_properties(); + + // tmp: ensure hello_world works on GPU + call_cuda(); +#else + LOG_ERROR( + "CUDA function print_key_device_properties() and call_cuda() called but code was not " + "compiled for CUDA."); +#endif + } + Try { // This Try wraps the entire function so we don't indent. // Makes the parameter structs visible to a variety of functions/macros diff --git a/src/py21cmfast/src/InitialConditions.h b/src/py21cmfast/src/InitialConditions.h index 835245926..beb950d22 100644 --- a/src/py21cmfast/src/InitialConditions.h +++ b/src/py21cmfast/src/InitialConditions.h @@ -4,9 +4,17 @@ #include -#include "InputParameters.h" +#ifdef __cplusplus +extern "C" { +#endif #include "OutputStructs.h" int ComputeInitialConditions(unsigned long long random_seed, InitialConditions *boxes); +void seed_rng_threads(gsl_rng *rng_arr[], unsigned long long int seed); +void free_rng_threads(gsl_rng *rng_arr[]); + +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/InputParameters.c b/src/py21cmfast/src/InputParameters.c index 574f6ca72..6abd76fb5 100644 --- a/src/py21cmfast/src/InputParameters.c +++ b/src/py21cmfast/src/InputParameters.c @@ -1,5 +1,8 @@ #include "InputParameters.h" +#include +#include + void Broadcast_struct_global_all(SimulationOptions *simulation_options, MatterOptions *matter_options, CosmoParams *cosmo_params, AstroParams *astro_params, AstroOptions *astro_options) { diff --git a/src/py21cmfast/src/InputParameters.h b/src/py21cmfast/src/InputParameters.h index ce30c5ae5..8ace82b70 100644 --- a/src/py21cmfast/src/InputParameters.h +++ b/src/py21cmfast/src/InputParameters.h @@ -2,9 +2,169 @@ #define _PARAMSTRUCTURES_H #include -// since ffi.cdef() cannot include directives, we store the types and globals in another file -// Since it is unguarded, make sure to ONLY include this file from here -#include "_inputparams_wrapper.h" + +#ifdef __cplusplus 
+extern "C" { +#endif + +typedef struct CosmoParams { + float SIGMA_8; + float hlittle; + float OMm; + float OMl; + float OMb; + float POWER_INDEX; + + float OMn; + float OMk; + float OMr; + float OMtot; + float Y_He; + float wl; + +} CosmoParams; + +typedef struct SimulationOptions { + // Parameters taken from INIT_PARAMS.H + int HII_DIM; + int DIM; + float BOX_LEN; + float NON_CUBIC_FACTOR; + int N_THREADS; + double Z_HEAT_MAX; + double ZPRIME_STEP_FACTOR; + + // Halo Sampler Options + float SAMPLER_MIN_MASS; + double SAMPLER_BUFFER_FACTOR; + int N_COND_INTERP; + int N_PROB_INTERP; + double MIN_LOGPROB; + double HALOMASS_CORRECTION; + double PARKINSON_G0; + double PARKINSON_y1; + double PARKINSON_y2; + + float INITIAL_REDSHIFT; + double DELTA_R_FACTOR; + double DENSITY_SMOOTH_RADIUS; + + double DEXM_OPTIMIZE_MINMASS; + double DEXM_R_OVERLAP; + + double CORR_STAR; + double CORR_SFR; + double CORR_LX; +} SimulationOptions; + +typedef struct MatterOptions { + bool USE_FFTW_WISDOM; + int HMF; + int USE_RELATIVE_VELOCITIES; + int POWER_SPECTRUM; + int USE_INTERPOLATION_TABLES; + bool PERTURB_ON_HIGH_RES; + int PERTURB_ALGORITHM; + bool MINIMIZE_MEMORY; + bool KEEP_3D_VELOCITIES; + bool DEXM_OPTIMIZE; + int FILTER; + int HALO_FILTER; + bool SMOOTH_EVOLVED_DENSITY_FIELD; + + bool USE_HALO_FIELD; + bool HALO_STOCHASTICITY; + bool FIXED_HALO_GRIDS; + int SAMPLE_METHOD; +} MatterOptions; + +typedef struct AstroParams { + float HII_EFF_FACTOR; + + // SHMR + float F_STAR10; + float ALPHA_STAR; + float ALPHA_STAR_MINI; + float SIGMA_STAR; + double UPPER_STELLAR_TURNOVER_MASS; + double UPPER_STELLAR_TURNOVER_INDEX; + float F_STAR7_MINI; + + // SFMS + float t_STAR; + double SIGMA_SFR_INDEX; + double SIGMA_SFR_LIM; + + // L_X/SFR + double L_X; + double L_X_MINI; + double SIGMA_LX; + + // Escape Fraction + float F_ESC10; + float ALPHA_ESC; + float F_ESC7_MINI; + + float T_RE; + + float M_TURN; + float R_BUBBLE_MAX; + float ION_Tvir_MIN; + double F_H2_SHIELD; + float NU_X_THRESH; 
+ float X_RAY_SPEC_INDEX; + float X_RAY_Tvir_MIN; + + double A_LW; + double BETA_LW; + double A_VCB; + double BETA_VCB; + + double FIXED_VAVG; + double POP2_ION; + double POP3_ION; + + double PHOTONCONS_CALIBRATION_END; + double CLUMPING_FACTOR; + double ALPHA_UVB; + + float R_MAX_TS; + int N_STEP_TS; + double DELTA_R_HII_FACTOR; + float R_BUBBLE_MIN; + double MAX_DVDR; + double NU_X_MAX; + double NU_X_BAND_MAX; +} AstroParams; + +typedef struct AstroOptions { + bool USE_MINI_HALOS; + bool USE_CMB_HEATING; // CMB Heating Flag + bool USE_LYA_HEATING; // Lya Heating Flag + bool USE_MASS_DEPENDENT_ZETA; + bool INHOMO_RECO; + bool USE_TS_FLUCT; + bool M_MIN_in_Mass; + bool FIX_VCB_AVG; + bool USE_EXP_FILTER; + bool CELL_RECOMB; + int PHOTON_CONS_TYPE; + bool USE_UPPER_STELLAR_TURNOVER; + bool HALO_SCALING_RELATIONS_MEDIAN; + int HII_FILTER; + int HEAT_FILTER; + bool IONISE_ENTIRE_SPHERE; + bool AVG_BELOW_SAMPLER; + int INTEGRATION_METHOD_ATOMIC; + int INTEGRATION_METHOD_MINI; +} AstroOptions; + +typedef struct ConfigSettings { + double HALO_CATALOG_MEM_FACTOR; + bool EXTRA_HALOBOX_FIELDS; + char external_table_path[200]; + char wisdoms_path[200]; +} ConfigSettings; void Broadcast_struct_global_all(SimulationOptions *simulation_options, MatterOptions *matter_options, CosmoParams *cosmo_params, @@ -12,4 +172,31 @@ void Broadcast_struct_global_all(SimulationOptions *simulation_options, void Broadcast_struct_global_noastro(SimulationOptions *simulation_options, MatterOptions *matter_options, CosmoParams *cosmo_params); +void set_external_table_path(ConfigSettings *params, const char *value); +char *get_external_table_path(ConfigSettings *params); +void set_wisdoms_path(ConfigSettings *params, const char *value); +char *get_wisdoms_path(ConfigSettings *params); + +/* Previously, we had a few structures spread throughout the code e.g simulation_options_ufunc which + were all globally defined and separately broadcast at different times. 
Several of these were used + across different files and some inside #defines (e.g indexing.h), so for now I've combined + the parameter structures to avoid confusion (we shouldn't have the possibility of two files using + different parameters). + + In future we should have a parameter structure in each .c file containing ONLY parameters + relevant to it (look at HaloBox.c), and force the broadcast at each _compute() step (or even + decorate any library call) However this would require us to be very careful about initialising + the globals when ANY function from that file is called */ +// The structs declared here defined in InputParameters.c +extern SimulationOptions *simulation_options_global; +extern MatterOptions *matter_options_global; +extern CosmoParams *cosmo_params_global; +extern AstroParams *astro_params_global; +extern AstroOptions *astro_options_global; + +extern ConfigSettings config_settings; + +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/IonisationBox.c b/src/py21cmfast/src/IonisationBox.c index e55f884a9..2526286b1 100644 --- a/src/py21cmfast/src/IonisationBox.c +++ b/src/py21cmfast/src/IonisationBox.c @@ -54,7 +54,7 @@ struct IonBoxConstants { int hii_filter; // astro parameters - struct ScalingConstants scale_consts; + ScalingConstants scale_consts; double T_re; // astro calculated values @@ -135,7 +135,7 @@ void set_ionbox_constants(double redshift, double prev_redshift, struct IonBoxCo else consts->dz = prev_redshift - redshift; - struct ScalingConstants sc; + ScalingConstants sc; set_scaling_constants(redshift, &sc, true); consts->scale_consts = sc; @@ -206,9 +206,7 @@ void set_ionbox_constants(double redshift, double prev_redshift, struct IonBoxCo pow(1 + redshift, 2) * CMperMPC * SIGMA_HI * astro_params_global->ALPHA_UVB / (astro_params_global->ALPHA_UVB + 2.75) * N_b0 * consts->ion_eff_factor / 1.0e-12; if (matter_options_global->USE_HALO_FIELD) - consts->gamma_prefactor /= - RHOcrit * cosmo_params_global->OMb; // 
TODO: double-check these unit differences, - // HaloBox.halo_wsfr vs Nion_General units + consts->gamma_prefactor /= RHOcrit * cosmo_params_global->OMb; else consts->gamma_prefactor = consts->gamma_prefactor / (sc.t_h * sc.t_star); @@ -447,7 +445,7 @@ void calculate_mcrit_boxes(IonizedBox *prev_ionbox, TsBox *spin_temp, InitialCon void set_mean_fcoll(struct IonBoxConstants *c, IonizedBox *prev_box, IonizedBox *curr_box, double mturn_acg, double mturn_mcg, double *f_limit_acg, double *f_limit_mcg) { double f_coll_curr = 0., f_coll_prev = 0., f_coll_curr_mini = 0., f_coll_prev_mini = 0.; - struct ScalingConstants *sc_ptr = &(c->scale_consts); + ScalingConstants *sc_ptr = &(c->scale_consts); if (astro_options_global->USE_MASS_DEPENDENT_ZETA) { f_coll_curr = Nion_General(c->redshift, c->lnMmin, c->lnMmax_gl, mturn_acg, sc_ptr); *f_limit_acg = Nion_General(simulation_options_global->Z_HEAT_MAX, c->lnMmin, c->lnMmax_gl, @@ -669,7 +667,7 @@ void setup_integration_tables(struct FilteredGrids *fg_struct, struct IonBoxCons double min_density, max_density, prev_min_density = 0., prev_max_density = 0.; double log10Mturn_min = 0., log10Mturn_max = 0., log10Mturn_min_MINI = 0., log10Mturn_max_MINI = 0.; - struct ScalingConstants *sc_ptr = &(consts->scale_consts); + ScalingConstants *sc_ptr = &(consts->scale_consts); // TODO: instead of putting a random upper limit, put a proper flag for switching of one/both // sides of the clipping @@ -741,7 +739,7 @@ void calculate_fcoll_grid(IonizedBox *box, IonizedBox *previous_ionize_box, double f_coll_total = 0., f_coll_MINI_total = 0.; // TODO: make proper error tracking through the parallel region bool error_flag; - struct ScalingConstants *sc_ptr = &(consts->scale_consts); + ScalingConstants *sc_ptr = &(consts->scale_consts); int fc_r_idx; fc_r_idx = (astro_options_global->USE_MINI_HALOS && !matter_options_global->USE_HALO_FIELD) @@ -1379,6 +1377,28 @@ int ComputeIonizedBox(float redshift, float prev_redshift, PerturbedField *pertu int 
n_radii; n_radii = setup_radii(&radii_spec, &ionbox_constants); + fftwf_complex *d_deltax_filtered = NULL; + fftwf_complex *d_xe_filtered = NULL; + float *d_y_arr = NULL; + float *d_Fcoll = NULL; //_outputstructs_wrapper.h + + unsigned int threadsPerBlock; + unsigned int numBlocks; + + // If GPU & flags call init_ionbox_gpu_data() + bool use_cuda = false; // pass this as a parameter later + if (use_cuda && astro_options_global->USE_MASS_DEPENDENT_ZETA && + !astro_options_global->USE_MINI_HALOS && !matter_options_global->USE_HALO_FIELD) { + unsigned int Nion_nbins = get_nbins(); +#if CUDA_FOUND + init_ionbox_gpu_data(&d_deltax_filtered, &d_xe_filtered, &d_y_arr, &d_Fcoll, Nion_nbins, + HII_TOT_NUM_PIXELS, HII_KSPACE_NUM_PIXELS, &threadsPerBlock, + &numBlocks); +#else + LOG_ERROR( + "CUDA function init_ionbox_gpu_data() called but code was not compiled for CUDA."); +#endif + } // CONSTRUCT GRIDS OUTSIDE R LOOP HERE // if we don't have a previous ionised box, make a fake one here if (prev_redshift < 1) @@ -1528,8 +1548,27 @@ int ComputeIonizedBox(float redshift, float prev_redshift, PerturbedField *pertu need_prev_ion); } - calculate_fcoll_grid(box, previous_ionize_box, grid_struct, &ionbox_constants, - &curr_radius); + // If GPU & flags, call gpu version of calculate_fcoll_grid() + bool use_cuda = false; // pass this as a parameter later + if (use_cuda && astro_options_global->USE_MASS_DEPENDENT_ZETA && + !astro_options_global->USE_MINI_HALOS && + !matter_options_global->USE_HALO_FIELD) { +#if CUDA_FOUND + calculate_fcoll_grid_gpu(box, grid_struct->deltax_filtered, + grid_struct->xe_filtered, + &curr_radius.f_coll_grid_mean, d_deltax_filtered, + d_xe_filtered, d_Fcoll, d_y_arr, HII_TOT_NUM_PIXELS, + HII_KSPACE_NUM_PIXELS, &threadsPerBlock, &numBlocks); +#else + LOG_ERROR( + "CUDA function calculate_fcoll_grid_gpu() called but code was not compiled " + "for CUDA."); +#endif + } else { + calculate_fcoll_grid(box, previous_ionize_box, grid_struct, &ionbox_constants, + 
&curr_radius); + } + // To avoid ST_over_PS becoming nan when f_coll = 0, I set f_coll = FRACT_FLOAT_ERR. // TODO: This was the previous behaviour, but is this right? // setting the *total* to the minimum for the adjustment factor, @@ -1554,6 +1593,17 @@ int ComputeIonizedBox(float redshift, float prev_redshift, PerturbedField *pertu LOG_ULTRA_DEBUG("z_reion after R=%f: ", curr_radius.R); debugSummarizeBox(box->z_reion, simulation_options_global->HII_DIM, simulation_options_global->HII_DIM, HII_D_PARA, " "); +#endif + } + // If GPU & flags, call free_ionbox_gpu_data() + if (use_cuda && astro_options_global->USE_MASS_DEPENDENT_ZETA && + !astro_options_global->USE_MINI_HALOS && !matter_options_global->USE_HALO_FIELD) { +#if USE_CUDA + free_ionbox_gpu_data(&d_deltax_filtered, &d_xe_filtered, &d_y_arr, &d_Fcoll); +#else + LOG_ERROR( + "CUDA function free_ionbox_gpu_data() called but code was not compiled for " + "CUDA."); #endif } set_ionized_temperatures(box, perturbed_field, spin_temp, &ionbox_constants); diff --git a/src/py21cmfast/src/IonisationBox.cu b/src/py21cmfast/src/IonisationBox.cu new file mode 100644 index 000000000..f0d473825 --- /dev/null +++ b/src/py21cmfast/src/IonisationBox.cu @@ -0,0 +1,240 @@ +#include "cexcept.h" +#include "exceptions.h" +#include "logger.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// GPU +#include +#include +#include +// We use thrust for reduction +#include +#include // thrust::plus +#include + +#include "Constants.h" +#include "InitialConditions.h" +#include "InputParameters.h" +#include "OutputStructs.h" +#include "bubble_helper_progs.h" +#include "cosmology.h" +#include "debugging.h" +#include "dft.h" +#include "filtering.h" +#include "heating_helper_progs.h" +#include "hmf.h" +#include "indexing.h" +#include "interp_tables.h" +#include "photoncons.h" +#include "recombinations.h" +#include "thermochem.h" + +#include "IonisationBox_gpu.h" +#include "cuda_utils.cuh" + 
+__device__ inline double EvaluateRGTable1D_f_gpu(double x, double x_min, + double x_width, float *y_arr) { + + int idx = (int)floor((x - x_min) / x_width); + + double table_val = x_min + x_width * (float)idx; + double interp_point = (x - table_val) / x_width; + + return y_arr[idx] * (1 - interp_point) + y_arr[idx + 1] * (interp_point); +} + +// template +__global__ void +compute_Fcoll(cuFloatComplex *deltax_filtered, // fg_struct + cuFloatComplex *xe_filtered, // fg_struct + float *y_arr, // Nion_conditional_table1D + double x_min, // Nion_conditional_table1D + double x_width, // Nion_conditional_table1D + double fract_float_err, // FRACT_FLOAT_ERR + bool use_ts_fluct, // flag_options_global->USE_TS_FLUCT + unsigned long long hii_tot_num_pixels, // HII_TOT_NUM_PIXELS + long long hii_d, // HII_D + long long hii_d_para, // HII_D_PARA + long long hii_mid_para, // HII_MID_PARA + float *Fcoll // box +) { + // Get index of grids + unsigned long long idx = blockIdx.x * blockDim.x + threadIdx.x; + + // Bound check + if (idx >= hii_tot_num_pixels) { + return; + } + + // Get x, y, z from idx using HII_R_INDEX macro formula + int z = idx % hii_d_para; + unsigned long long remaining = idx / hii_d_para; + int y = remaining % hii_d; + int x = remaining / hii_d; + + // Get FFT index using HII_R_FFT_INDEX macro formula + unsigned long long fft_idx = z + 2 * (hii_mid_para + 1) * (y + hii_d * x); + + // These clippings could be made in the calling function, using thrust, rather + // than here... + + // Clip the filtered grids to physical values + // delta cannot be less than -1 + *((float *)deltax_filtered + fft_idx) = + fmaxf(*((float *)deltax_filtered + fft_idx), -1. 
+ fract_float_err); + // cannot be less than zero + // x_e has to be between zero and unity + if (use_ts_fluct) { + *((float *)xe_filtered + fft_idx) = + fmaxf(*((float *)xe_filtered + fft_idx), 0.0); + *((float *)xe_filtered + fft_idx) = + fminf(*((float *)xe_filtered + fft_idx), 0.999); + } + + // Compute collapse fraction + Fcoll[idx] = exp(EvaluateRGTable1D_f_gpu( + *((float *)deltax_filtered + fft_idx), x_min, x_width, y_arr)); +} + +void init_ionbox_gpu_data( + fftwf_complex **d_deltax_filtered, // copies of pointers to pointers + fftwf_complex **d_xe_filtered, float **d_y_arr, float **d_Fcoll, + unsigned int nbins, // nbins for Nion_conditional_table1D->y + unsigned long long hii_tot_num_pixels, // HII_TOT_NUM_PIXELS + unsigned long long hii_kspace_num_pixels, // HII_KSPACE_NUM_PIXELS + unsigned int *threadsPerBlock, unsigned int *numBlocks) { + CALL_CUDA(cudaMalloc( + (void **)d_deltax_filtered, + sizeof(fftwf_complex) * + hii_kspace_num_pixels)); // already pointers to pointers (no & needed) + CALL_CUDA(cudaMemset( + *d_deltax_filtered, 0, + sizeof(fftwf_complex) * + hii_kspace_num_pixels)); // dereference the pointer to a pointer (*) + + if (astro_options_global->USE_TS_FLUCT) { + CALL_CUDA(cudaMalloc((void **)d_xe_filtered, + sizeof(fftwf_complex) * hii_kspace_num_pixels)); + CALL_CUDA(cudaMemset(*d_xe_filtered, 0, + sizeof(fftwf_complex) * hii_kspace_num_pixels)); + } + + CALL_CUDA(cudaMalloc((void **)d_y_arr, sizeof(float) * nbins)); + CALL_CUDA(cudaMemset(*d_y_arr, 0, sizeof(float) * nbins)); + + CALL_CUDA(cudaMalloc((void **)d_Fcoll, sizeof(float) * hii_tot_num_pixels)); + CALL_CUDA(cudaMemset(*d_Fcoll, 0, sizeof(float) * hii_tot_num_pixels)); + + LOG_INFO("Ionisation grids allocated on device."); + LOG_INFO("Ionisation grids initialised on device."); + + // Get max threads/block for device + int maxThreadsPerBlock; + CALL_CUDA(cudaDeviceGetAttribute(&maxThreadsPerBlock, + cudaDevAttrMaxThreadsPerBlock, 0)); + + // Set threads/block based on device 
max + if (maxThreadsPerBlock >= 512) { + *threadsPerBlock = 512; + } else if (maxThreadsPerBlock >= 256) { + *threadsPerBlock = 256; + } else if (maxThreadsPerBlock >= 128) { + *threadsPerBlock = 128; + } else if (maxThreadsPerBlock >= 64) { + *threadsPerBlock = 64; + } else if (maxThreadsPerBlock >= 32) { + *threadsPerBlock = 32; + } else { + *threadsPerBlock = 16; + } + + *numBlocks = (hii_tot_num_pixels + *threadsPerBlock - 1) / *threadsPerBlock; +} + +void calculate_fcoll_grid_gpu( + IonizedBox *box, // for box->Fcoll + fftwf_complex *h_deltax_filtered, // members of fg_struct + fftwf_complex *h_xe_filtered, + double *f_coll_grid_mean, // member of rspec + fftwf_complex *d_deltax_filtered, // device pointers + fftwf_complex *d_xe_filtered, float *d_Fcoll, float *d_y_arr, + unsigned long long hii_tot_num_pixels, // HII_TOT_NUM_PIXELS + unsigned long long hii_kspace_num_pixels, // HII_KSPACE_NUM_PIXELS + unsigned int *threadsPerBlock, unsigned int *numBlocks) { + RGTable1D_f *Nion_conditional_table1D = get_Nion_conditional_table1D(); + + // Copy grids from host to device + CALL_CUDA(cudaMemcpy(d_deltax_filtered, h_deltax_filtered, + sizeof(fftwf_complex) * hii_kspace_num_pixels, + cudaMemcpyHostToDevice)); + if (astro_options_global->USE_TS_FLUCT) { + CALL_CUDA(cudaMemcpy(d_xe_filtered, h_xe_filtered, + sizeof(fftwf_complex) * hii_kspace_num_pixels, + cudaMemcpyHostToDevice)); + } + CALL_CUDA(cudaMemcpy(d_y_arr, Nion_conditional_table1D->y_arr, + sizeof(float) * Nion_conditional_table1D->n_bin, + cudaMemcpyHostToDevice)); + LOG_INFO("Ionisation grids copied to device."); + + // TODO: Can I pass these straight to kernel? 
(or access in kernel w/ Tiger's + // method) + double fract_float_err = FRACT_FLOAT_ERR; + bool use_ts_fluct = astro_options_global->USE_TS_FLUCT; + long long hii_d = HII_D; + long long hii_d_para = HII_D_PARA; + long long hii_mid_para = HII_MID_PARA; + + // Invoke kernel + compute_Fcoll<<<*numBlocks, *threadsPerBlock>>>( + reinterpret_cast(d_deltax_filtered), + reinterpret_cast(d_xe_filtered), d_y_arr, + Nion_conditional_table1D->x_min, Nion_conditional_table1D->x_width, + fract_float_err, use_ts_fluct, hii_tot_num_pixels, hii_d, hii_d_para, + hii_mid_para, d_Fcoll); + CALL_CUDA(cudaDeviceSynchronize()); + LOG_INFO("IonisationBox compute_Fcoll kernel called."); + + // Use thrust to reduce computed sums to one value. + // Wrap device pointer in a thrust::device_ptr + thrust::device_ptr d_Fcoll_ptr(d_Fcoll); + // Reduce final buffer sums to one value + double f_coll_grid_total = thrust::reduce( + d_Fcoll_ptr, d_Fcoll_ptr + hii_tot_num_pixels, 0., thrust::plus()); + *f_coll_grid_mean = f_coll_grid_total / (double)hii_tot_num_pixels; + LOG_INFO("Fcoll sum reduced to single value by thrust::reduce operation."); + + // Copy results from device to host + CALL_CUDA(cudaMemcpy(box->Fcoll, d_Fcoll, sizeof(float) * hii_tot_num_pixels, + cudaMemcpyDeviceToHost)); + CALL_CUDA(cudaMemcpy(h_deltax_filtered, d_deltax_filtered, + sizeof(fftwf_complex) * hii_kspace_num_pixels, + cudaMemcpyDeviceToHost)); + if (astro_options_global->USE_TS_FLUCT) { + CALL_CUDA(cudaMemcpy(h_xe_filtered, d_xe_filtered, + sizeof(fftwf_complex) * hii_kspace_num_pixels, + cudaMemcpyDeviceToHost)); + } + LOG_INFO("Grids copied to host."); +} + +void free_ionbox_gpu_data( + fftwf_complex **d_deltax_filtered, // copies of pointers to pointers + fftwf_complex **d_xe_filtered, float **d_y_arr, float **d_Fcoll) { + CALL_CUDA(cudaFree( + *d_deltax_filtered)); // Need to dereference the pointers to pointers (*) + if (astro_options_global->USE_TS_FLUCT) { + CALL_CUDA(cudaFree(*d_xe_filtered)); + } + 
 CALL_CUDA(cudaFree(*d_y_arr)); + CALL_CUDA(cudaFree(*d_Fcoll)); + LOG_INFO("Device memory freed."); +} diff --git a/src/py21cmfast/src/IonisationBox.h b/src/py21cmfast/src/IonisationBox.h index e3a12e0b7..a221a2064 100644 --- a/src/py21cmfast/src/IonisationBox.h +++ b/src/py21cmfast/src/IonisationBox.h @@ -1,12 +1,21 @@ #ifndef _IONBOX_H #define _IONBOX_H +#include +#include + #include "InputParameters.h" #include "OutputStructs.h" +#ifdef __cplusplus +extern "C" { +#endif int ComputeIonizedBox(float redshift, float prev_redshift, PerturbedField *perturbed_field, PerturbedField *previous_perturbed_field, IonizedBox *previous_ionize_box, TsBox *spin_temp, HaloBox *halos, InitialConditions *ini_boxes, IonizedBox *box); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/IonisationBox_gpu.h b/src/py21cmfast/src/IonisationBox_gpu.h new file mode 100644 index 000000000..62a59fcc3 --- /dev/null +++ b/src/py21cmfast/src/IonisationBox_gpu.h @@ -0,0 +1,35 @@ +#ifndef _IONBOX_GPU_H +#define _IONBOX_GPU_H + +#include +#include +// #include + +#include "InputParameters.h" +#include "OutputStructs.h" + +#ifdef __cplusplus +extern "C" { +#endif +void init_ionbox_gpu_data(fftwf_complex **d_deltax_filtered, // copies of pointers to pointers + fftwf_complex **d_xe_filtered, float **d_y_arr, float **d_Fcoll, + unsigned int nbins, // nbins for Nion_conditional_table1D->y + unsigned long long hii_tot_num_pixels, // HII_TOT_NUM_PIXELS + unsigned long long hii_kspace_num_pixels, // HII_KSPACE_NUM_PIXELS + unsigned int *threadsPerBlock, unsigned int *numBlocks); +void calculate_fcoll_grid_gpu(IonizedBox *box, // for box->Fcoll + fftwf_complex *h_deltax_filtered, // members of fg_struct + fftwf_complex *h_xe_filtered, + double *f_coll_grid_mean, // member of rspec + fftwf_complex *d_deltax_filtered, // device pointers + fftwf_complex *d_xe_filtered, float *d_Fcoll, float *d_y_arr, + unsigned long long hii_tot_num_pixels, // HII_TOT_NUM_PIXELS + unsigned long long 
hii_kspace_num_pixels, // HII_KSPACE_NUM_PIXELS + unsigned int *threadsPerBlock, unsigned int *numBlocks); +void free_ionbox_gpu_data(fftwf_complex **d_deltax_filtered, // copies of pointers to pointers + fftwf_complex **d_xe_filtered, float **d_y_arr, float **d_Fcoll); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/src/py21cmfast/src/LuminosityFunction.h b/src/py21cmfast/src/LuminosityFunction.h index 5c8b3049e..1863d756d 100644 --- a/src/py21cmfast/src/LuminosityFunction.h +++ b/src/py21cmfast/src/LuminosityFunction.h @@ -3,7 +3,13 @@ #include "InputParameters.h" +#ifdef __cplusplus +extern "C" { +#endif int ComputeLF(int nbins, int component, int NUM_OF_REDSHIFT_FOR_LF, float *z_LF, float *M_TURNs, double *M_uv_z, double *M_h_z, double *log10phi); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/MapMass_gpu.cu b/src/py21cmfast/src/MapMass_gpu.cu new file mode 100644 index 000000000..d2034ee37 --- /dev/null +++ b/src/py21cmfast/src/MapMass_gpu.cu @@ -0,0 +1,386 @@ +// Re-write of perturb_field.c for being accessible within the MCMC + +#include +#include +#include +#include +#include +#include + +// GPU +#include +#include + +#include "cexcept.h" +#include "exceptions.h" +#include "logger.h" +#include "Constants.h" +#include "indexing.h" +#include "InputParameters.h" +#include "OutputStructs.h" +#include "cosmology.h" +#include "dft.h" +#include "debugging.h" +#include "filtering.h" + +#include "PerturbField.h" + +// #define R_INDEX(x,y,z)((unsigned long long)((z)+D_PARA*((y)+D*(x)))) +__device__ inline unsigned long long compute_R_INDEX(int i, int j, int k, int dim, long long d_para) { + return k + d_para * (j + dim * i); +} + +// #define HII_R_INDEX(x,y,z)((unsigned long long)((z)+HII_D_PARA*((y)+HII_D*(x)))) +__device__ inline unsigned long long compute_HII_R_INDEX(int i, int j, int k, int hii_d, long long hii_d_para) { + return k + hii_d_para * (j + hii_d * i); +} + +// Is const needed as well as __restrict__? 
+__global__ void perturb_density_field_kernel( + double *resampled_box, + // const float* __restrict__ hires_density, + // const float* __restrict__ hires_vx, + // const float* __restrict__ hires_vy, + // const float* __restrict__ hires_vz, + // const float* __restrict__ lowres_vx, + // const float* __restrict__ lowres_vy, + // const float* __restrict__ lowres_vz, + // const float* __restrict__ hires_vx_2LPT, + // const float* __restrict__ hires_vy_2LPT, + // const float* __restrict__ hires_vz_2LPT, + // const float* __restrict__ lowres_vx_2LPT, + // const float* __restrict__ lowres_vy_2LPT, + // const float* __restrict__ lowres_vz_2LPT, + float* hires_density, + float* hires_vx, + float* hires_vy, + float* hires_vz, + float* lowres_vx, + float* lowres_vy, + float* lowres_vz, + float* hires_vx_2LPT, + float* hires_vy_2LPT, + float* hires_vz_2LPT, + float* lowres_vx_2LPT, + float* lowres_vy_2LPT, + float* lowres_vz_2LPT, + int dimension, int DIM, + long long d_para, long long hii_d, long long hii_d_para, + int non_cubic_factor, + float f_pixel_factor, float init_growth_factor, + bool perturb_on_high_res, bool use_2lpt + ) { + + unsigned long long idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < DIM * DIM * d_para) { + + // Get index of density cell + int i = idx / (d_para * DIM); + int j = (idx / d_para) % DIM; + int k = idx % d_para; + + unsigned long long r_index = compute_R_INDEX(i, j, k, DIM, d_para); + + // Map index to location in units of box size + double xf = (i + 0.5) / (DIM + 0.0); + double yf = (j + 0.5) / (DIM + 0.0); + double zf = (k + 0.5) / (d_para + 0.0); + + // Update locations + unsigned long long HII_index; + + if (perturb_on_high_res) { + // xf += __ldg(&hires_vx[r_index]); + // yf += __ldg(&hires_vy[r_index]); + // zf += __ldg(&hires_vz[r_index]); + xf += hires_vx[r_index]; + yf += hires_vy[r_index]; + zf += hires_vz[r_index]; + } + else { + unsigned long long HII_i = (unsigned long long)(i / f_pixel_factor); + unsigned long long 
HII_j = (unsigned long long)(j / f_pixel_factor); + unsigned long long HII_k = (unsigned long long)(k / f_pixel_factor); + HII_index = compute_HII_R_INDEX(HII_i, HII_j, HII_k, hii_d, hii_d_para); + // xf += __ldg(&lowres_vx[HII_index]); + // yf += __ldg(&lowres_vy[HII_index]); + // zf += __ldg(&lowres_vz[HII_index]); + xf += lowres_vx[HII_index]; + yf += lowres_vy[HII_index]; + zf += lowres_vz[HII_index]; + } + + // 2LPT (add second order corrections) + if (use_2lpt) { + if (perturb_on_high_res) { + // xf -= __ldg(&hires_vx_2LPT[r_index]); + // yf -= __ldg(&hires_vy_2LPT[r_index]); + // zf -= __ldg(&hires_vz_2LPT[r_index]); + xf -= hires_vx_2LPT[r_index]; + yf -= hires_vy_2LPT[r_index]; + zf -= hires_vz_2LPT[r_index]; + } + else { + // xf -= __ldg(&lowres_vx_2LPT[HII_index]); + // yf -= __ldg(&lowres_vy_2LPT[HII_index]); + // zf -= __ldg(&lowres_vz_2LPT[HII_index]); + xf -= lowres_vx_2LPT[HII_index]; + yf -= lowres_vy_2LPT[HII_index]; + zf -= lowres_vz_2LPT[HII_index]; + } + } + + // TODO: shared between threads? + // Convert once to reduce overhead of multiple casts + double dimension_double = (double)(dimension); + double dimension_factored_double = dimension_double * (double)(non_cubic_factor); + int dimension_factored = dimension * non_cubic_factor; + + // Scale coordinates back to grid size + xf *= dimension_double; + yf *= dimension_double; + zf *= dimension_factored_double; + + // Wrap coordinates to keep them within valid boundaries + xf = fmod(fmod(xf, dimension_double) + dimension_double, dimension_double); + yf = fmod(fmod(yf, dimension_double) + dimension_double, dimension_double); + zf = fmod(fmod(zf, dimension_factored_double) + dimension_factored_double, dimension_factored_double); + + // FROM NVIDIA DOCS: + // __device__ doublenearbyint(double x) // Round the input argument to the nearest integer. + // There are SO many double-to-int conversion intrinsics. How to know if should use any? 
 + + // Get integer values for indices from double precision values + int xi = xf; + int yi = yf; + int zi = zf; + + // Wrap index coordinates to ensure no out-of-bounds array access will be attempted + xi = ((xi % dimension) + dimension) % dimension; + yi = ((yi % dimension) + dimension) % dimension; + zi = ((zi % dimension_factored) + dimension_factored) % dimension_factored; + + // Determine the fraction of the perturbed cell which overlaps with the 8 nearest grid cells, + // based on the grid cell which contains the centre of the perturbed cell + float d_x = fabs(xf - (double)(xi + 0.5)); // Absolute distances from grid cell centre to perturbed cell centre + float d_y = fabs(yf - (double)(yi + 0.5)); // (also) The fractions of mass which will be moved to neighbouring cells + float d_z = fabs(zf - (double)(zi + 0.5)); + + // 8 neighbour cells-of-interest will be shifted left/down/behind if perturbed midpoint is in left/bottom/back corner of cell. + if (xf < (double)(xi + 0.5)) { + // If perturbed cell centre is less than the mid-point then update fraction + // of mass in the cell and determine the cell centre of neighbour to be the + // lowest grid point index + d_x = 1. - d_x; + xi -= 1; + xi = (xi + dimension) % dimension; // Only this criterion is possible as we iterate back by one (we cannot exceed DIM) + } + if(yf < (double)(yi + 0.5)) { + d_y = 1. - d_y; + yi -= 1; + yi = (yi + dimension) % dimension; + } + if(zf < (double)(zi + 0.5)) { + d_z = 1. - d_z; + zi -= 1; + zi = (zi + (unsigned long long)(non_cubic_factor * dimension)) % (unsigned long long)(non_cubic_factor * dimension); + } + // The fractions of mass which will remain with perturbed cell + float t_x = 1. - d_x; + float t_y = 1. - d_y; + float t_z = 1. - d_z; + + // Determine the grid coordinates of the 8 neighbouring cells. 
+ // Neighbours will be in positive direction; front/right/above cells (-> 2x2 cube, with perturbed cell bottom/left/back) + // Takes into account the offset based on cell centre determined above + int xp1 = (xi + 1) % dimension; + int yp1 = (yi + 1) % dimension; + int zp1 = (zi + 1) % (unsigned long long)(non_cubic_factor * dimension); + + // double scaled_density = 1 + init_growth_factor * __ldg(&hires_density[r_index]); + double scaled_density = 1.0 + init_growth_factor * hires_density[r_index]; + + if (perturb_on_high_res) { + // Redistribute the mass over the 8 neighbouring cells according to cloud in cell + // Cell mass = (1 + init_growth_factor * orig_density) * (proportion of mass to distribute) + atomicAdd(&resampled_box[compute_R_INDEX(xi, yi, zi, DIM, d_para)], scaled_density * t_x * t_y * t_z); + atomicAdd(&resampled_box[compute_R_INDEX(xp1, yi, zi, DIM, d_para)], scaled_density * d_x * t_y * t_z); + atomicAdd(&resampled_box[compute_R_INDEX(xi, yp1, zi, DIM, d_para)], scaled_density * t_x * d_y * t_z); + atomicAdd(&resampled_box[compute_R_INDEX(xp1, yp1, zi, DIM, d_para)], scaled_density * d_x * d_y * t_z); + atomicAdd(&resampled_box[compute_R_INDEX(xi, yi, zp1, DIM, d_para)], scaled_density * t_x * t_y * d_z); + atomicAdd(&resampled_box[compute_R_INDEX(xp1, yi, zp1, DIM, d_para)], scaled_density * d_x * t_y * d_z); + atomicAdd(&resampled_box[compute_R_INDEX(xi, yp1, zp1, DIM, d_para)], scaled_density * t_x * d_y * d_z); + atomicAdd(&resampled_box[compute_R_INDEX(xp1, yp1, zp1, DIM, d_para)], scaled_density * d_x * d_y * d_z); + } + else { + atomicAdd(&resampled_box[compute_HII_R_INDEX(xi, yi, zi, hii_d, hii_d_para)], scaled_density * t_x * t_y * t_z); + atomicAdd(&resampled_box[compute_HII_R_INDEX(xp1, yi, zi, hii_d, hii_d_para)], scaled_density * d_x * t_y * t_z); + atomicAdd(&resampled_box[compute_HII_R_INDEX(xi, yp1, zi, hii_d, hii_d_para)], scaled_density * t_x * d_y * t_z); + atomicAdd(&resampled_box[compute_HII_R_INDEX(xp1, yp1, zi, hii_d, 
hii_d_para)], scaled_density * d_x * d_y * t_z); + atomicAdd(&resampled_box[compute_HII_R_INDEX(xi, yi, zp1, hii_d, hii_d_para)], scaled_density * t_x * t_y * d_z); + atomicAdd(&resampled_box[compute_HII_R_INDEX(xp1, yi, zp1, hii_d, hii_d_para)], scaled_density * d_x * t_y * d_z); + atomicAdd(&resampled_box[compute_HII_R_INDEX(xi, yp1, zp1, hii_d, hii_d_para)], scaled_density * t_x * d_y * d_z); + atomicAdd(&resampled_box[compute_HII_R_INDEX(xp1, yp1, zp1, hii_d, hii_d_para)], scaled_density * d_x * d_y * d_z); + } + } +} + +double* MapMass_gpu( + InitialConditions *boxes, double *resampled_box, + int dimension, float f_pixel_factor, float init_growth_factor +) { + // Box shapes from outputs.py and convenience macros + size_t size_double, size_float; + if(matter_options_global->PERTURB_ON_HIGH_RES) { + size_double = TOT_NUM_PIXELS * sizeof(double); + size_float = TOT_NUM_PIXELS * sizeof(float); + } + else { + size_double = HII_TOT_NUM_PIXELS * sizeof(double); + size_float = HII_TOT_NUM_PIXELS * sizeof(float); + } + + // Allocate device memory for output box and set to 0. 
+ double* d_resampled_box; + cudaMalloc((void**)&d_resampled_box, size_double); + cudaMemset(d_resampled_box, 0, size_double); // fills size_double bytes with byte=0 + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Allocate device memory for density field + float* hires_density; + cudaMalloc(&hires_density, (TOT_NUM_PIXELS * sizeof(float))); // from 21cmFAST.h, outputs.py & indexing.h + cudaMemcpy(hires_density, boxes->hires_density, (TOT_NUM_PIXELS * sizeof(float)), cudaMemcpyHostToDevice); + + err = cudaGetLastError(); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Allocate device memory and copy arrays to device as per user_params + float* hires_vx; // floats as per 21cmFAST.h + float* hires_vy; + float* hires_vz; + float* lowres_vx; + float* lowres_vy; + float* lowres_vz; + float* hires_vx_2LPT; + float* hires_vy_2LPT; + float* hires_vz_2LPT; + float* lowres_vx_2LPT; + float* lowres_vy_2LPT; + float* lowres_vz_2LPT; + + if (matter_options_global->PERTURB_ON_HIGH_RES) { + cudaMalloc(&hires_vx, size_float); + cudaMalloc(&hires_vy, size_float); + cudaMalloc(&hires_vz, size_float); + cudaMemcpy(hires_vx, boxes->hires_vx, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(hires_vy, boxes->hires_vy, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(hires_vz, boxes->hires_vz, size_float, cudaMemcpyHostToDevice); + } + else { + cudaMalloc(&lowres_vx, size_float); + cudaMalloc(&lowres_vy, size_float); + cudaMalloc(&lowres_vz, size_float); + cudaMemcpy(lowres_vx, boxes->lowres_vx, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(lowres_vy, boxes->lowres_vy, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(lowres_vz, boxes->lowres_vz, size_float, cudaMemcpyHostToDevice); + } + if (matter_options_global->USE_2LPT) { + if (matter_options_global->PERTURB_ON_HIGH_RES) { + cudaMalloc(&hires_vx_2LPT, 
size_float); + cudaMalloc(&hires_vy_2LPT, size_float); + cudaMalloc(&hires_vz_2LPT, size_float); + cudaMemcpy(hires_vx_2LPT, boxes->hires_vx_2LPT, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(hires_vy_2LPT, boxes->hires_vy_2LPT, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(hires_vz_2LPT, boxes->hires_vz_2LPT, size_float, cudaMemcpyHostToDevice); + } + else { + cudaMalloc(&lowres_vx_2LPT, size_float); + cudaMalloc(&lowres_vy_2LPT, size_float); + cudaMalloc(&lowres_vz_2LPT, size_float); + cudaMemcpy(lowres_vx_2LPT, boxes->lowres_vx_2LPT, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(lowres_vy_2LPT, boxes->lowres_vy_2LPT, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(lowres_vz_2LPT, boxes->lowres_vz_2LPT, size_float, cudaMemcpyHostToDevice); + } + } + + err = cudaGetLastError(); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Can't pass macro straight to kernel + long long d_para = D_PARA; + long long hii_d = HII_D; + long long hii_d_para = HII_D_PARA; + + // Invoke kernel + int threadsPerBlock = 256; + int numBlocks = (TOT_NUM_PIXELS + threadsPerBlock - 1) / threadsPerBlock; + perturb_density_field_kernel<<>>( + d_resampled_box, hires_density, hires_vx, hires_vy, hires_vz, lowres_vx, lowres_vy, lowres_vz, + hires_vx_2LPT, hires_vy_2LPT, hires_vz_2LPT, lowres_vx_2LPT, lowres_vy_2LPT, lowres_vz_2LPT, + dimension, simulation_options_global->DIM, d_para, hii_d, hii_d_para, simulation_options_global->NON_CUBIC_FACTOR, + f_pixel_factor, init_growth_factor, matter_options_global->PERTURB_ON_HIGH_RES, matter_options_global->USE_2LPT); + + // // Only use during development! 
+ // err = cudaDeviceSynchronize(); + // CATCH_CUDA_ERROR(err); + + err = cudaGetLastError(); + if (err != cudaSuccess) { + LOG_ERROR("Kernel launch error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Copy results from device to host + err = cudaMemcpy(resampled_box, d_resampled_box, size_double, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Deallocate device memory + cudaFree(d_resampled_box); + cudaFree(hires_density); + + if (matter_options_global->PERTURB_ON_HIGH_RES) { + cudaFree(hires_vx); + cudaFree(hires_vy); + cudaFree(hires_vz); + } + else { + cudaFree(lowres_vx); + cudaFree(lowres_vy); + cudaFree(lowres_vz); + } + if (matter_options_global->USE_2LPT) { + if (matter_options_global->PERTURB_ON_HIGH_RES) { + cudaFree(hires_vx_2LPT); + cudaFree(hires_vy_2LPT); + cudaFree(hires_vz_2LPT); + } + else { + cudaFree(lowres_vx_2LPT); + cudaFree(lowres_vy_2LPT); + cudaFree(lowres_vz_2LPT); + } + } + + err = cudaGetLastError(); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + return resampled_box; +} diff --git a/src/py21cmfast/src/OutputStructs.h b/src/py21cmfast/src/OutputStructs.h index 9ebf63167..3f9f183cb 100644 --- a/src/py21cmfast/src/OutputStructs.h +++ b/src/py21cmfast/src/OutputStructs.h @@ -6,8 +6,97 @@ #include "InputParameters.h" -// since ffi.cdef() cannot include directives, we store the types and globals in another file -// Since it is unguarded, make sure to ONLY include this file from here -#include "_outputstructs_wrapper.h" +typedef struct InitialConditions { + float *lowres_density, *lowres_vx, *lowres_vy, *lowres_vz; + float *lowres_vx_2LPT, *lowres_vy_2LPT, *lowres_vz_2LPT; + float *hires_density, *hires_vx, *hires_vy, *hires_vz; + float *hires_vx_2LPT, *hires_vy_2LPT, *hires_vz_2LPT; // cw addition + float *lowres_vcb; +} InitialConditions; + +typedef struct PerturbedField 
{ + float *density, *velocity_x, *velocity_y, *velocity_z; +} PerturbedField; + +typedef struct HaloField { + long long unsigned int n_halos; + long long unsigned int buffer_size; + float *halo_masses; + float *halo_coords; + + // Halo properties for stochastic model + float *star_rng; + float *sfr_rng; + float *xray_rng; +} HaloField; + +typedef struct PerturbHaloField { + long long unsigned int n_halos; + long long unsigned int buffer_size; + float *halo_masses; + float *halo_coords; + + // Halo properties for stochastic model + float *star_rng; + float *sfr_rng; + float *xray_rng; +} PerturbHaloField; + +typedef struct HaloBox { + // Things that aren't used in radiation fields but useful outputs + float *halo_mass; + float *halo_stars; + float *halo_stars_mini; + int *count; + + // For IonisationBox.c and SpinTemperatureBox.c + float *n_ion; // weighted by F_ESC*PopN_ion + float *halo_sfr; // for x-rays and Ts stuff + float *halo_xray; + float *halo_sfr_mini; // for x-rays and Ts stuff + float *whalo_sfr; // SFR weighted by PopN_ion and F_ESC, used for Gamma12 + + // Average volume-weighted log10 Turnover masses are kept in order to compare with the expected + // MF integrals + double log10_Mcrit_ACG_ave; + double log10_Mcrit_MCG_ave; +} HaloBox; + +typedef struct XraySourceBox { + float *filtered_sfr; + float *filtered_xray; + float *filtered_sfr_mini; + + double *mean_log10_Mcrit_LW; + double *mean_sfr; + double *mean_sfr_mini; +} XraySourceBox; + +typedef struct TsBox { + float *spin_temperature; + float *xray_ionised_fraction; + float *kinetic_temp_neutral; + float *J_21_LW; +} TsBox; + +typedef struct IonizedBox { + double mean_f_coll; + double mean_f_coll_MINI; + double log10_Mturnover_ave; + double log10_Mturnover_MINI_ave; + float *neutral_fraction; + float *ionisation_rate_G12; + float *mean_free_path; + float *z_reion; + float *cumulative_recombinations; + float *kinetic_temperature; + float *unnormalised_nion; + float *unnormalised_nion_mini; +} 
IonizedBox; + +typedef struct BrightnessTemp { + float *brightness_temp; + float *tau_21; +} BrightnessTemp; #endif diff --git a/src/py21cmfast/src/PerturbField.c b/src/py21cmfast/src/PerturbField.c index c3b2baf16..715ea93f2 100644 --- a/src/py21cmfast/src/PerturbField.c +++ b/src/py21cmfast/src/PerturbField.c @@ -19,113 +19,7 @@ #include "filtering.h" #include "indexing.h" #include "logger.h" - -static inline void do_cic_interpolation(double *resampled_box, double pos[3], int box_dim[3], - double curr_dens) { - // get the CIC indices and distances - int ipos[3], iposp1[3]; - double dist[3]; - // NOTE: assumes the cell at idx == 0 is *centred* at (0,0,0) - for (int axis = 0; axis < 3; axis++) { - ipos[axis] = (int)floor(pos[axis]); - iposp1[axis] = ipos[axis] + 1; - dist[axis] = pos[axis] - ipos[axis]; - } - - wrap_coord(ipos, box_dim); - wrap_coord(iposp1, box_dim); - - unsigned long long int cic_indices[8] = { - grid_index_general(ipos[0], ipos[1], ipos[2], box_dim), - grid_index_general(iposp1[0], ipos[1], ipos[2], box_dim), - grid_index_general(ipos[0], iposp1[1], ipos[2], box_dim), - grid_index_general(iposp1[0], iposp1[1], ipos[2], box_dim), - grid_index_general(ipos[0], ipos[1], iposp1[2], box_dim), - grid_index_general(iposp1[0], ipos[1], iposp1[2], box_dim), - grid_index_general(ipos[0], iposp1[1], iposp1[2], box_dim), - grid_index_general(iposp1[0], iposp1[1], iposp1[2], box_dim)}; - - double cic_weights[8] = {(1. - dist[0]) * (1. - dist[1]) * (1. - dist[2]), - dist[0] * (1. - dist[1]) * (1. - dist[2]), - (1. - dist[0]) * dist[1] * (1. - dist[2]), - dist[0] * dist[1] * (1. - dist[2]), - (1. - dist[0]) * (1. - dist[1]) * dist[2], - dist[0] * (1. - dist[1]) * dist[2], - (1. 
- dist[0]) * dist[1] * dist[2], - dist[0] * dist[1] * dist[2]}; - - for (int i = 0; i < 8; i++) { -#pragma omp atomic update - resampled_box[cic_indices[i]] += curr_dens * cic_weights[i]; - } -} - -// Function that maps a IC density grid to the perturbed density grid -void move_grid_masses(double redshift, float *dens_pointer, int dens_dim[3], float *vel_pointers[3], - float *vel_pointers_2LPT[3], int vel_dim[3], double *resampled_box, - int out_dim[3]) { - // grid dimension constants - double boxlen = simulation_options_global->BOX_LEN; - double boxlen_z = boxlen * simulation_options_global->NON_CUBIC_FACTOR; - double box_size[3] = {boxlen, boxlen, boxlen_z}; - double dim_ratio_vel = (double)vel_dim[0] / (double)dens_dim[0]; - double dim_ratio_out = (double)out_dim[0] / (double)dens_dim[0]; - - // Setup IC velocity factors - double growth_factor = dicke(redshift); - double displacement_factor_2LPT = -(3.0 / 7.0) * growth_factor * growth_factor; // 2LPT eq. D8 - - double init_growth_factor = dicke(simulation_options_global->INITIAL_REDSHIFT); - double init_displacement_factor_2LPT = - -(3.0 / 7.0) * init_growth_factor * init_growth_factor; // 2LPT eq. 
D8 - - double velocity_displacement_factor[3] = { - (growth_factor - init_growth_factor) / box_size[0] * simulation_options_global->DIM, - (growth_factor - init_growth_factor) / box_size[1] * simulation_options_global->DIM, - (growth_factor - init_growth_factor) / box_size[2] * D_PARA}; - double velocity_displacement_factor_2LPT[3] = { - (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[0] * - simulation_options_global->DIM, - (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[1] * - simulation_options_global->DIM, - (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[2] * D_PARA}; -#pragma omp parallel num_threads(simulation_options_global->N_THREADS) - { - int i, j, k, axis; - double pos[3], curr_dens; - int ipos[3]; - unsigned long long vel_index, dens_index; -#pragma omp for - for (i = 0; i < dens_dim[0]; i++) { - for (j = 0; j < dens_dim[1]; j++) { - for (k = 0; k < dens_dim[2]; k++) { - // Transform position to units of box size - pos[0] = i; - pos[1] = j; - pos[2] = k; - resample_index((int[3]){i, j, k}, dim_ratio_vel, ipos); - wrap_coord(ipos, vel_dim); - vel_index = grid_index_general(ipos[0], ipos[1], ipos[2], vel_dim); - for (axis = 0; axis < 3; axis++) { - pos[axis] += - vel_pointers[axis][vel_index] * velocity_displacement_factor[axis]; - // add 2LPT second order corrections - if (matter_options_global->PERTURB_ALGORITHM == 2) { - pos[axis] -= vel_pointers_2LPT[axis][vel_index] * - velocity_displacement_factor_2LPT[axis]; - } - pos[axis] *= dim_ratio_out; - } - - // CIC interpolation - dens_index = grid_index_general(i, j, k, dens_dim); - curr_dens = 1.0 + dens_pointer[dens_index] * init_growth_factor; - do_cic_interpolation(resampled_box, pos, out_dim, curr_dens); - } - } - } - } -} +#include "map_mass.h" void make_density_grid(float redshift, fftwf_complex *fft_density_grid, InitialConditions *boxes) { int i, j, k; @@ -210,8 +104,18 @@ void make_density_grid(float redshift, fftwf_complex 
*fft_density_grid, InitialC resampled_box = (double *)calloc(HII_TOT_NUM_PIXELS, sizeof(double)); } int hi_dim[3] = {simulation_options_global->DIM, simulation_options_global->DIM, D_PARA}; - move_grid_masses(redshift, boxes->hires_density, hi_dim, vel_pointers, vel_pointers_2LPT, - box_dim, resampled_box, box_dim); + bool use_cuda = false; // pass this as a parameter later + if (use_cuda) { +#if CUDA_FOUND + resampled_box = + MapMass_gpu(boxes, resampled_box, dimension, f_pixel_factor, init_growth_factor); +#else + LOG_ERROR("CUDA version of MapMass() called but code was not compiled for CUDA."); +#endif + } else { + move_grid_masses(redshift, boxes->hires_density, hi_dim, vel_pointers, + vel_pointers_2LPT, box_dim, resampled_box, box_dim); + } LOG_SUPER_DEBUG("resampled_box: "); debugSummarizeBoxDouble(resampled_box, box_dim[0], box_dim[1], box_dim[2], " "); diff --git a/src/py21cmfast/src/PerturbField.h b/src/py21cmfast/src/PerturbField.h index a99e42ac3..6f2fba56d 100644 --- a/src/py21cmfast/src/PerturbField.h +++ b/src/py21cmfast/src/PerturbField.h @@ -1,9 +1,17 @@ #ifndef _PERTURBFIELD_H #define _PERTURBFIELD_H +// #include + #include "InputParameters.h" #include "OutputStructs.h" +#ifdef __cplusplus +extern "C" { +#endif int ComputePerturbField(float redshift, InitialConditions *boxes, PerturbedField *perturbed_field); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/PerturbHaloField.c b/src/py21cmfast/src/PerturbHaloField.c index ca0747f21..0db4a7aed 100644 --- a/src/py21cmfast/src/PerturbHaloField.c +++ b/src/py21cmfast/src/PerturbHaloField.c @@ -94,7 +94,7 @@ int ComputePerturbHaloField(float redshift, InitialConditions *boxes, HaloField halos_perturbed->n_halos = halos->n_halos; // ****************** END INITIALIZATION ******************************** // - int n_exact_dim = 0; + unsigned long long int n_exact_dim = 0; bool error_in_parallel = false; #pragma omp parallel private(i_halo) num_threads(simulation_options_global -> 
N_THREADS) \ reduction(+ : n_exact_dim) @@ -136,7 +136,7 @@ int ComputePerturbHaloField(float redshift, InitialConditions *boxes, HaloField } } // Divide out multiplicative factor to return to pristine state - LOG_SUPER_DEBUG("Number of halos exactly on the box edge = %d of %d", n_exact_dim, + LOG_SUPER_DEBUG("Number of halos exactly on the box edge = %llu of %llu", n_exact_dim, halos->n_halos); if (error_in_parallel) { LOG_ERROR("Error in parallel processing, some halos were out of bounds."); diff --git a/src/py21cmfast/src/PerturbHaloField.h b/src/py21cmfast/src/PerturbHaloField.h index 1868c4b39..3854adf36 100644 --- a/src/py21cmfast/src/PerturbHaloField.h +++ b/src/py21cmfast/src/PerturbHaloField.h @@ -4,7 +4,13 @@ #include "InputParameters.h" #include "OutputStructs.h" +#ifdef __cplusplus +extern "C" { +#endif int ComputePerturbHaloField(float redshift, InitialConditions *boxes, HaloField *halos, PerturbHaloField *halos_perturbed); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/SpinTemperatureBox.c b/src/py21cmfast/src/SpinTemperatureBox.c index ea1e0ce6b..162b99111 100644 --- a/src/py21cmfast/src/SpinTemperatureBox.c +++ b/src/py21cmfast/src/SpinTemperatureBox.c @@ -745,7 +745,7 @@ int UpdateXraySourceBox(HaloBox *halobox, double R_inner, double R_outer, int R_ // NOTE: Frequency integrals are based on PREVIOUS x_e_ave // The x_e tables are not regular, hence the precomputation of indices/interp points void fill_freqint_tables(double zp, double x_e_ave, double filling_factor_of_HI_zp, - double *log10_Mcrit_LW_ave, int R_mm, struct ScalingConstants *sc) { + double *log10_Mcrit_LW_ave, int R_mm, ScalingConstants *sc) { double lower_int_limit; int x_e_ct, R_ct; int R_start, R_end; @@ -867,7 +867,7 @@ int global_reion_properties(double zp, double x_e_ave, double *log10_Mcrit_LW_av double determine_zpp_max, determine_zpp_min; // at z', we need a differenc constant struct - struct ScalingConstants sc; + ScalingConstants sc; 
set_scaling_constants(zp, &sc, false); if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { @@ -930,7 +930,8 @@ int global_reion_properties(double zp, double x_e_ave, double *log10_Mcrit_LW_av void calculate_sfrd_from_grid(int R_ct, float *dens_R_grid, float *Mcrit_R_grid, float *sfrd_grid, float *sfrd_grid_mini, double *ave_sfrd, double *ave_sfrd_mini, - struct ScalingConstants *sc) { + unsigned int threadsPerBlock, float *d_y_arr, float *d_dens_R_grid, + float *d_sfrd_grid, double *d_ave_sfrd_buf, ScalingConstants *sc) { double ave_sfrd_buf = 0; double ave_sfrd_buf_mini = 0; if (astro_options_global->INTEGRATION_METHOD_ATOMIC == 1 || @@ -951,43 +952,61 @@ void calculate_sfrd_from_grid(int R_ct, float *dens_R_grid, float *Mcrit_R_grid, } } + // -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + // If GPU is to be used and flags are ideal, call GPU version of reduction + bool use_cuda = false; // pass this as a parameter later + if (use_cuda && astro_options_global->USE_MASS_DEPENDENT_ZETA && + matter_options_global->USE_INTERPOLATION_TABLES && !astro_options_global->USE_MINI_HALOS) { +#if CUDA_FOUND + RGTable1D_f *SFRD_conditional_table = get_SFRD_conditional_table(); + ave_sfrd_buf = + calculate_sfrd_from_grid_gpu(SFRD_conditional_table, dens_R_grid, zpp_growth, R_ct, + sfrd_grid, HII_TOT_NUM_PIXELS, threadsPerBlock, + // d_data + d_y_arr, d_dens_R_grid, d_sfrd_grid, d_ave_sfrd_buf); +#else + LOG_ERROR("calculate_sfrd_from_grid_gpu() called but code was not compiled for CUDA."); +#endif + } else { #pragma omp parallel num_threads(simulation_options_global->N_THREADS) - { - unsigned long long int box_ct; - double curr_dens; - double curr_mcrit = 0.; - double fcoll, dfcoll; - double fcoll_MINI = 0; + { + unsigned long long int box_ct; + double curr_dens; + double curr_mcrit = 0.; + double fcoll, dfcoll; + double fcoll_MINI = 0; 
#pragma omp for reduction(+ : ave_sfrd_buf, ave_sfrd_buf_mini) - for (box_ct = 0; box_ct < HII_TOT_NUM_PIXELS; box_ct++) { - curr_dens = dens_R_grid[box_ct] * zpp_growth[R_ct]; - if (astro_options_global->USE_MINI_HALOS) curr_mcrit = Mcrit_R_grid[box_ct]; + for (box_ct = 0; box_ct < HII_TOT_NUM_PIXELS; box_ct++) { + curr_dens = dens_R_grid[box_ct] * zpp_growth[R_ct]; + if (astro_options_global->USE_MINI_HALOS) curr_mcrit = Mcrit_R_grid[box_ct]; - if (astro_options_global->USE_MASS_DEPENDENT_ZETA) { - fcoll = EvaluateSFRD_Conditional(curr_dens, zpp_growth[R_ct], M_min_R[R_ct], + if (astro_options_global->USE_MASS_DEPENDENT_ZETA) { + fcoll = + EvaluateSFRD_Conditional(curr_dens, zpp_growth[R_ct], M_min_R[R_ct], M_max_R[R_ct], M_max_R[R_ct], sigma_max[R_ct], sc); - sfrd_grid[box_ct] = (1. + curr_dens) * fcoll; + sfrd_grid[box_ct] = (1. + curr_dens) * fcoll; - if (astro_options_global->USE_MINI_HALOS) { - fcoll_MINI = EvaluateSFRD_Conditional_MINI( - curr_dens, curr_mcrit, zpp_growth[R_ct], M_min_R[R_ct], M_max_R[R_ct], - M_max_R[R_ct], sigma_max[R_ct], sc); - sfrd_grid_mini[box_ct] = (1. + curr_dens) * fcoll_MINI; + if (astro_options_global->USE_MINI_HALOS) { + fcoll_MINI = EvaluateSFRD_Conditional_MINI( + curr_dens, curr_mcrit, zpp_growth[R_ct], M_min_R[R_ct], M_max_R[R_ct], + M_max_R[R_ct], sigma_max[R_ct], sc); + sfrd_grid_mini[box_ct] = (1. + curr_dens) * fcoll_MINI; + } + } else { + fcoll = EvaluateFcoll_delta(curr_dens, zpp_growth[R_ct], sigma_min[R_ct], + sigma_max[R_ct]); + dfcoll = EvaluatedFcolldz(curr_dens, zpp_for_evolve_list[R_ct], sigma_min[R_ct], + sigma_max[R_ct]); + sfrd_grid[box_ct] = (1. + curr_dens) * dfcoll; } - } else { - fcoll = EvaluateFcoll_delta(curr_dens, zpp_growth[R_ct], sigma_min[R_ct], - sigma_max[R_ct]); - dfcoll = EvaluatedFcolldz(curr_dens, zpp_for_evolve_list[R_ct], sigma_min[R_ct], - sigma_max[R_ct]); - sfrd_grid[box_ct] = (1. 
+ curr_dens) * dfcoll; + ave_sfrd_buf += fcoll; + ave_sfrd_buf_mini += fcoll_MINI; } - ave_sfrd_buf += fcoll; - ave_sfrd_buf_mini += fcoll_MINI; } + *ave_sfrd = ave_sfrd_buf / HII_TOT_NUM_PIXELS; + *ave_sfrd_mini = ave_sfrd_buf_mini / HII_TOT_NUM_PIXELS; } - *ave_sfrd = ave_sfrd_buf / HII_TOT_NUM_PIXELS; - *ave_sfrd_mini = ave_sfrd_buf_mini / HII_TOT_NUM_PIXELS; // These functions check for allocation free_conditional_tables(); @@ -1458,10 +1477,49 @@ void ts_main(float redshift, float prev_redshift, float perturbed_field_redshift int R_index; float *delta_box_input; float *Mcrit_box_input = NULL; // may be unused - struct ScalingConstants sc, sc_sfrd; + ScalingConstants sc, sc_sfrd; // if we have stars, fill in the heating term boxes if (!NO_LIGHT) { + // Device pointers that reference GPU memory and need to persist across loop iterations + // ------------------------------------------------------------------------- + float *d_y_arr = NULL; + float *d_dens_R_grid = NULL; + float *d_sfrd_grid = NULL; + double *d_ave_sfrd_buf = NULL; + + // initialise pointer to struct of pointers + // ---------------------------------------------------------------------------------------------------------------------- + // sfrd_gpu_data *device_data; + // sfrd_gpu_data *device_data = (sfrd_gpu_data *)malloc(sizeof(sfrd_gpu_data)); + unsigned int threadsPerBlock = 0; + unsigned int sfrd_nbins = get_nbins(); + + // GPU=True + // if (true) { + // // unsigned int init_sfrd_gpu_data(float *dens_R_grid, float *sfrd_grid, unsigned + // long long num_pixels, + // // unsigned int nbins, sfrd_gpu_data *d_data); + // threadsPerBlock = init_sfrd_gpu_data(delta_box_input, del_fcoll_Rct, + // HII_TOT_NUM_PIXELS, sfrd_nbins, &device_data); + // } + // struct + // --------------------------------------------------------------------------------------------------------------------------------------------------------- + // threadsPerBlock = init_sfrd_gpu_data(delta_box_input, del_fcoll_Rct, 
HII_TOT_NUM_PIXELS, + // sfrd_nbins, &device_data); pointers + // ------------------------------------------------------------------------------------------------------------------------------------------------------- + bool use_cuda = false; // pass this as a parameter later + if (use_cuda) { +#if CUDA_FOUND + threadsPerBlock = + init_sfrd_gpu_data(delta_box_input, del_fcoll_Rct, HII_TOT_NUM_PIXELS, sfrd_nbins, + &d_y_arr, &d_dens_R_grid, &d_sfrd_grid, &d_ave_sfrd_buf); +#else + LOG_ERROR( + "CUDA function init_sfrd_gpu_data() called but code was not compiled for CUDA."); +#endif + } + // --------------------------------------------------------------------------------------------------------------------------------------------------------------- for (R_ct = astro_params_global->N_STEP_TS; R_ct--;) { dzpp_for_evolve = dzpp_list[R_ct]; zpp = zpp_for_evolve_list[R_ct]; @@ -1509,7 +1567,9 @@ void ts_main(float redshift, float prev_redshift, float perturbed_field_redshift Mcrit_box_input = log10_Mcrit_LW[R_index]; } calculate_sfrd_from_grid(R_ct, delta_box_input, Mcrit_box_input, del_fcoll_Rct, - del_fcoll_Rct_MINI, &ave_fcoll, &ave_fcoll_MINI, &sc); + del_fcoll_Rct_MINI, &ave_fcoll, &ave_fcoll_MINI, + threadsPerBlock, d_y_arr, d_dens_R_grid, d_sfrd_grid, + d_ave_sfrd_buf, &sc); avg_fix_term = mean_sfr_zpp[R_ct] / ave_fcoll; if (astro_options_global->USE_MINI_HALOS) avg_fix_term_MINI = mean_sfr_zpp_mini[R_ct] / ave_fcoll_MINI; @@ -1665,6 +1725,21 @@ void ts_main(float redshift, float prev_redshift, float perturbed_field_redshift } } } + // struct + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------ + // free_sfrd_gpu_data(device_data); + // free(device_data); + // pointers + // ---------------------------------------------------------------------------------------------------------------------------------------------------------------- + if (use_cuda) { 
+#if CUDA_FOUND + free_sfrd_gpu_data(&d_y_arr, &d_dens_R_grid, &d_sfrd_grid, &d_ave_sfrd_buf); +#else + LOG_ERROR( + "CUDA function free_sfrd_gpu_data() called but code was not compiled for CUDA."); +#endif + } + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- } // we definitely don't need these tables anymore diff --git a/src/py21cmfast/src/SpinTemperatureBox.cu b/src/py21cmfast/src/SpinTemperatureBox.cu new file mode 100644 index 000000000..c0b9dd49f --- /dev/null +++ b/src/py21cmfast/src/SpinTemperatureBox.cu @@ -0,0 +1,260 @@ +// Most of the following includes likely can be removed. +#include +#include +#include +#include +#include +#include + +// GPU +#include +#include +// We use thrust for reduction +#include +#include +#include // thrust::plus + +#include "cexcept.h" +#include "exceptions.h" +#include "logger.h" +#include "Constants.h" +#include "indexing.h" +#include "InputParameters.h" +#include "OutputStructs.h" +#include "heating_helper_progs.h" +#include "elec_interp.h" +#include "interp_tables.h" +#include "debugging.h" +#include "cosmology.h" +#include "hmf.h" +#include "dft.h" +#include "filtering.h" +#include "thermochem.h" +#include "interpolation.h" + +#include "cuda_utils.cuh" +#include "SpinTemperatureBox.h" + + +__device__ inline double EvaluateRGTable1D_f_gpu(double x, double x_min, double x_width, float *y_arr) { + + int idx = (int)floor((x - x_min) / x_width); + + double table_val = x_min + x_width * (float)idx; + double interp_point = (x - table_val) / x_width; + + return y_arr[idx] * (1 - interp_point) + y_arr[idx + 1] * (interp_point); +} + +template +__device__ void warp_reduce(volatile double *sdata, unsigned int tid) { + // Reduce by half + // No syncing required with threads < 32 + if (threadsPerBlock >= 64) { sdata[tid] += sdata[tid + 32]; } + if (threadsPerBlock >= 32) { sdata[tid] += sdata[tid + 16]; } + if 
(threadsPerBlock >= 16) { sdata[tid] += sdata[tid + 8]; } + if (threadsPerBlock >= 8) { sdata[tid] += sdata[tid + 4]; } + if (threadsPerBlock >= 4) { sdata[tid] += sdata[tid + 2]; } + if (threadsPerBlock >= 2) { sdata[tid] += sdata[tid + 1]; } +} + +// As seen in talk by Mark Harris, NVIDIA. +// https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf +// https://www.youtube.com/watch?v=NrWhZMHrP4w +template +__global__ void compute_and_reduce( + double x_min, // reference + double x_width, // reference + float *y_arr, // reference + float *dens_R_grid, // reference + double zpp_growth_R_ct, // value + float *sfrd_grid, // star formation rate density grid to be updated + double *ave_sfrd_buf, // output buffer of length ceil(n / (threadsPerBlock * 2)) + unsigned long long num_pixels // length of input data +) { + + // An array to store intermediate summations + // Shared between all threads in block + extern __shared__ double sdata[]; + + unsigned int tid = threadIdx.x; // thread within current block + unsigned int i = blockIdx.x * (threadsPerBlock * 2) + tid; // index of input data + unsigned int gridSize = threadsPerBlock * 2 * gridDim.x; + + sdata[tid] = 0; + + // In bounds of gridSize, sum pairs of collapse fraction data together + // And update the star formation rate density grid. + double curr_dens_i; + double curr_dens_j; + double fcoll_i; + double fcoll_j; + + while (i < num_pixels) { + // Compute current density from density grid value * redshift-scaled growth factor + curr_dens_i = dens_R_grid[i] * zpp_growth_R_ct; + + // Compute fraction of mass that has collapsed to form stars/other structures + fcoll_i = exp(EvaluateRGTable1D_f_gpu(curr_dens_i, x_min, x_width, y_arr)); + + // Update the shared buffer with the collapse fractions + sdata[tid] += fcoll_i; + + // Update the relevant cells in the star formation rate density grid + sfrd_grid[i] = (1. 
+ curr_dens_i) * fcoll_i; + + // Repeat for i + threadsPerBlock + if ((i + threadsPerBlock) < num_pixels) { + curr_dens_j = dens_R_grid[i + threadsPerBlock] * zpp_growth_R_ct; + fcoll_j = exp(EvaluateRGTable1D_f_gpu(curr_dens_j, x_min, x_width, y_arr)); + sdata[tid] += fcoll_j; + sfrd_grid[i + threadsPerBlock] = (1. + curr_dens_j) * fcoll_j; + } + + i += gridSize; + } + __syncthreads(); + + // Reduce by half and sync (and repeat) + if (threadsPerBlock >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); } + if (threadsPerBlock >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); } + if (threadsPerBlock >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); } + + // Final reduction by separate kernel + if (tid < 32) { warp_reduce(sdata, tid); } + + // The first thread of each block updates the block totals + if (tid == 0) { ave_sfrd_buf[blockIdx.x] = sdata[0]; } +} + +unsigned int init_sfrd_gpu_data( + float *dens_R_grid, // input data + float *sfrd_grid, // star formation rate density grid to be updated + unsigned long long num_pixels, // length of input data + unsigned int nbins, // nbins for sfrd_grid->y + float **d_y_arr, // copies of pointers to pointers + float **d_dens_R_grid, + float **d_sfrd_grid, + double **d_ave_sfrd_buf +) { + // Allocate device memory + CALL_CUDA(cudaMalloc(d_y_arr, sizeof(float) * nbins)); // already pointers to pointers (no & needed) + CALL_CUDA(cudaMalloc(d_dens_R_grid, sizeof(float) * num_pixels)); + CALL_CUDA(cudaMalloc(d_sfrd_grid, sizeof(float) * num_pixels)); + LOG_INFO("SFRD_conditional_table.y_arr and density and sfrd grids allocated on device."); + + // Initialise sfrd_grid to 0 (fill with byte=0) + CALL_CUDA(cudaMemset(*d_sfrd_grid, 0, sizeof(float) * num_pixels)); // dereference the pointer to a pointer (*) + LOG_INFO("sfrd grid initialised to 0."); + + // Get max threads/block for device + int maxThreadsPerBlock; + 
CALL_CUDA(cudaDeviceGetAttribute(&maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, 0)); + + // Set threads/block based on device max + unsigned int threadsPerBlock; + if (maxThreadsPerBlock >= 512) { + threadsPerBlock = 512; + } else if (maxThreadsPerBlock >= 256) { + threadsPerBlock = 256; + } else if (maxThreadsPerBlock >= 128) { + threadsPerBlock = 128; + } else if (maxThreadsPerBlock >= 64) { + threadsPerBlock = 64; + } else if (maxThreadsPerBlock >= 32) { + threadsPerBlock = 32; + } else { + threadsPerBlock = 16; + } + + // Allocate memory for SFRD sum buffer and initialise to 0 only for initial filter step; + // reuse memory for remaining filter steps. + unsigned int numBlocks = ceil(num_pixels / (threadsPerBlock * 2)); + CALL_CUDA(cudaMalloc(d_ave_sfrd_buf, sizeof(double) * numBlocks)); // already pointer to a pointer (no & needed) + LOG_INFO("SFRD sum reduction buffer allocated on device."); + + // Initialise buffer to 0 (fill with byte=0) + CALL_CUDA(cudaMemset(*d_ave_sfrd_buf, 0, sizeof(double) * numBlocks)); // dereference the pointer to a pointer (*) + LOG_INFO("SFRD sum reduction buffer initialised to 0."); + + return threadsPerBlock; +} + +double calculate_sfrd_from_grid_gpu( + RGTable1D_f *SFRD_conditional_table, // input data + float *dens_R_grid, // input data + double *zpp_growth, // input data + int R_ct, // filter step/loop iteration/spherical annuli (out of 40 by default) + float *sfrd_grid, // star formation rate density grid to be updated + unsigned long long num_pixels, // length of input data + unsigned int threadsPerBlock, // computed in init function + float *d_y_arr, + float *d_dens_R_grid, + float *d_sfrd_grid, + double *d_ave_sfrd_buf +) { + // Get growth factor for current filter step + double zpp_growth_R_ct = zpp_growth[R_ct]; + + // Copy data from host to device + CALL_CUDA(cudaMemcpy(d_y_arr, SFRD_conditional_table->y_arr, sizeof(float) * SFRD_conditional_table->n_bin, cudaMemcpyHostToDevice)); + 
CALL_CUDA(cudaMemcpy(d_dens_R_grid, dens_R_grid, sizeof(float) * num_pixels, cudaMemcpyHostToDevice)); + LOG_INFO("SFRD_conditional_table.y_arr and density grid copied to device."); + + unsigned int numBlocks = ceil(num_pixels / (threadsPerBlock * 2)); + unsigned int smemSize = threadsPerBlock * sizeof(double); // shared memory + + // Invoke kernel + switch (threadsPerBlock) { + case 512: + compute_and_reduce<512><<< numBlocks, threadsPerBlock, smemSize >>>(SFRD_conditional_table->x_min, SFRD_conditional_table->x_width, d_y_arr, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels); + break; + case 256: + compute_and_reduce<256><<< numBlocks, threadsPerBlock, smemSize >>>(SFRD_conditional_table->x_min, SFRD_conditional_table->x_width, d_y_arr, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels); + break; + case 128: + compute_and_reduce<128><<< numBlocks, threadsPerBlock, smemSize >>>(SFRD_conditional_table->x_min, SFRD_conditional_table->x_width, d_y_arr, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels); + break; + case 64: + compute_and_reduce<64><<< numBlocks, threadsPerBlock, smemSize >>>(SFRD_conditional_table->x_min, SFRD_conditional_table->x_width, d_y_arr, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels); + break; + case 32: + compute_and_reduce<32><<< numBlocks, threadsPerBlock, smemSize >>>(SFRD_conditional_table->x_min, SFRD_conditional_table->x_width, d_y_arr, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels); + break; + default: + LOG_WARNING("Thread size invalid; defaulting to 256."); + compute_and_reduce<256><<< numBlocks, 256, 256 * sizeof(double) >>>(SFRD_conditional_table->x_min, SFRD_conditional_table->x_width, d_y_arr, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels); + } + CALL_CUDA(cudaGetLastError()); + // CALL_CUDA(cudaDeviceSynchronize()); // Only use during development + LOG_INFO("SpinTemperatureBox 
compute-and-reduce kernel called."); + + // Use thrust to reduce computed sums to one value. + // Wrap device pointer in a thrust::device_ptr + thrust::device_ptr d_ave_sfrd_buf_ptr(d_ave_sfrd_buf); + // Reduce final buffer sums to one value + double ave_sfrd_buf = thrust::reduce(d_ave_sfrd_buf_ptr, d_ave_sfrd_buf_ptr + numBlocks, 0., thrust::plus()); + CALL_CUDA(cudaGetLastError()); + // CALL_CUDA(cudaDeviceSynchronize()); // Only use during development + LOG_INFO("SFRD sum reduced to single value by thrust::reduce operation."); + + // Copy results from device to host + CALL_CUDA(cudaMemcpy(sfrd_grid, d_sfrd_grid, sizeof(float) * num_pixels, cudaMemcpyDeviceToHost)); + LOG_INFO("SFRD sum copied to host."); + + return ave_sfrd_buf; +} + +void free_sfrd_gpu_data( + float **d_y_arr, // copies of pointers to pointers + float **d_dens_R_grid, + float **d_sfrd_grid, + double **d_ave_sfrd_buf +) { + // Need to dereference the pointers to pointers (*) + CALL_CUDA(cudaFree(*d_y_arr)); + CALL_CUDA(cudaFree(*d_dens_R_grid)); + CALL_CUDA(cudaFree(*d_sfrd_grid)); + CALL_CUDA(cudaFree(*d_ave_sfrd_buf)); + LOG_INFO("Device memory freed."); +} diff --git a/src/py21cmfast/src/SpinTemperatureBox.h b/src/py21cmfast/src/SpinTemperatureBox.h index 2c607ff38..033ea2e99 100644 --- a/src/py21cmfast/src/SpinTemperatureBox.h +++ b/src/py21cmfast/src/SpinTemperatureBox.h @@ -1,8 +1,22 @@ #ifndef _SPINTEMP_H #define _SPINTEMP_H +// #include + #include "InputParameters.h" #include "OutputStructs.h" +#include "interpolation.h" +#include "scaling_relations.h" + +#ifdef __cplusplus +extern "C" { +#endif +// typedef struct sfrd_gpu_data { +// float *d_y_arr; +// float *d_dens_R_grid; +// float *d_sfrd_grid; +// double *d_ave_sfrd_buf; +// } sfrd_gpu_data; int ComputeTsBox(float redshift, float prev_redshift, float perturbed_field_redshift, short cleanup, PerturbedField *perturbed_field, XraySourceBox *source_box, @@ -11,4 +25,44 @@ int ComputeTsBox(float redshift, float prev_redshift, float 
perturbed_field_reds int UpdateXraySourceBox(HaloBox *halobox, double R_inner, double R_outer, int R_ct, XraySourceBox *source_box); +// pointers +// -------------------------------------------------------------------------------------------------------- +void calculate_sfrd_from_grid(int R_ct, float *dens_R_grid, float *Mcrit_R_grid, float *sfrd_grid, + float *sfrd_grid_mini, double *ave_sfrd, double *ave_sfrd_mini, + unsigned int threadsPerBlock, float *d_y_arr, float *d_dens_R_grid, + float *d_sfrd_grid, double *d_ave_sfrd_buf, + struct ScalingConstants *sc); + +unsigned int init_sfrd_gpu_data(float *dens_R_grid, float *sfrd_grid, unsigned long long num_pixels, + unsigned int nbins, float **d_y_arr, float **d_dens_R_grid, + float **d_sfrd_grid, double **d_ave_sfrd_buf); + +double calculate_sfrd_from_grid_gpu(RGTable1D_f *SFRD_conditional_table, float *dens_R_grid, + double *zpp_growth, int R_ct, float *sfrd_grid, + unsigned long long num_pixels, unsigned int threadsPerBlock, + float *d_y_arr, float *d_dens_R_grid, float *d_sfrd_grid, + double *d_ave_sfrd_buf, struct ScalingConstants *sc); + +void free_sfrd_gpu_data(float **d_y_arr, float **d_dens_R_grid, float **d_sfrd_grid, + double **d_ave_sfrd_buf); + +// wrap pointers in struct +// ------------------------------------------------------------------------------------------ void +// calculate_sfrd_from_grid(int R_ct, float *dens_R_grid, float *Mcrit_R_grid, float *sfrd_grid, +// float *sfrd_grid_mini, double *ave_sfrd, double *ave_sfrd_mini, +// unsigned int threadsPerBlock, const sfrd_gpu_data *d_data); + +// unsigned int init_sfrd_gpu_data(float *dens_R_grid, float *sfrd_grid, unsigned long long +// num_pixels, +// unsigned int nbins, sfrd_gpu_data *d_data); + +// double calculate_sfrd_from_grid_gpu(RGTable1D_f *SFRD_conditional_table, float *dens_R_grid, +// double *zpp_growth, int R_ct, float *sfrd_grid, unsigned long long num_pixels, +// unsigned int threadsPerBlock, const sfrd_gpu_data *d_data); + +// 
void free_sfrd_gpu_data(sfrd_gpu_data *d_data); + +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/Stochasticity.c b/src/py21cmfast/src/Stochasticity.c index a64e481c6..5610b4557 100644 --- a/src/py21cmfast/src/Stochasticity.c +++ b/src/py21cmfast/src/Stochasticity.c @@ -3,22 +3,29 @@ * other halo relations.*/ #include "Stochasticity.h" +#include #include #include #include #include #include +#include #include "Constants.h" +#include "HaloField.cuh" #include "InitialConditions.h" #include "InputParameters.h" #include "OutputStructs.h" +#include "Stochasticity.cuh" #include "cexcept.h" #include "cosmology.h" +#include "device_rng.cuh" #include "exceptions.h" #include "hmf.h" #include "indexing.h" +#include "interp_tables.cuh" #include "interp_tables.h" +#include "interpolation.h" #include "logger.h" #include "rng.h" // buffer size (per cell of arbitrary size) in the sampling function @@ -161,6 +168,19 @@ void stoc_set_consts_z(struct HaloSamplingConstants *const_struct, double redshi return; } +double get_max_nhalo(struct HaloSamplingConstants *const_struct, float *halo_masses, int size) { + int idx_max = cblas_isamax(size, halo_masses, 1); + float mass_max = halo_masses[idx_max]; + double ln_mm = log(mass_max); + double sigma_cond = EvaluateSigma(ln_mm); + double delta = get_delta_crit(matter_options_global->HMF, sigma_cond, const_struct->growth_in) / + const_struct->growth_in * const_struct->growth_out; + int n_exp = EvaluateNhalo(ln_mm, const_struct->growth_out, const_struct->lnM_min, + const_struct->lnM_max_tb, mass_max, sigma_cond, delta); + double expected_N = n_exp * mass_max; + return expected_N; +} + // set the constants which are calculated once per condition void stoc_set_consts_cond(struct HaloSamplingConstants *const_struct, double cond_val) { double m_exp, n_exp; @@ -860,7 +880,6 @@ int sample_halo_grids(gsl_rng **rng_arr, double redshift, float *dens_field, // sometimes halos are subtracted from the sample (set to zero) // we do 
not want to save these if (hm_buf[i] < simulation_options_global->SAMPLER_MIN_MASS) continue; - if (count >= arraysize_local) { out_of_buffer = true; continue; @@ -906,14 +925,14 @@ int sample_halo_grids(gsl_rng **rng_arr, double redshift, float *dens_field, nhalo_threads[threadnum] = count; } if (out_of_buffer) { - LOG_ERROR("Halo buffer overflow (allocated %d halos per thread)", arraysize_local); + LOG_ERROR("Halo buffer overflow (allocated %llu halos per thread)", arraysize_local); for (int n_t = 0; n_t < simulation_options_global->N_THREADS; n_t++) { - LOG_ERROR("Thread %d: %d halos", n_t, nhalo_threads[n_t]); + LOG_ERROR("Thread %d: %llu halos", n_t, nhalo_threads[n_t]); } LOG_ERROR( "If you expected to have an above average halo number try raising " "config['HALO_CATALOG_MEM_FACTOR']"); - Throw(ValueError); + Throw(ParallelError); } LOG_SUPER_DEBUG("Total dexm volume %.6e Total volume excluded %.6e (In units of HII_DIM cells)", @@ -951,126 +970,140 @@ int sample_halo_progenitors(gsl_rng **rng_arr, double z_in, double z_out, HaloFi double boxlen[3] = {simulation_options_global->BOX_LEN, simulation_options_global->BOX_LEN, BOXLEN_PARA}; - bool out_of_buffer = false; + // use cuda function if use_cuda is true + bool use_cuda = false; // pass this as a parameter later + if (use_cuda) { +#if CUDA_FOUND + // get parameters needed for sigma calculation + + RGTable1D_f *sigma_table = GetSigmaInterpTable(); + double x_min = sigma_table->x_min; + double x_width = sigma_table->x_width; + int sigma_bin = sigma_table->n_bin; + float *sigma_y_arr = sigma_table->y_arr; + // Create a copy of hs_constants for passing to cuda + struct HaloSamplingConstants d_hs_constants; + d_hs_constants = *hs_constants; + // get in halo data + float *halo_m = halofield_in->halo_masses; + float *halo_star_rng = halofield_in->star_rng; + float *halo_sfr_rng = halofield_in->sfr_rng; + float *halo_xray_rng = halofield_in->xray_rng; + int *halo_c = halofield_in->halo_coords; + + printf("Start 
cuda calculation for progenitors. "); + updateHaloOut(halo_m, halo_star_rng, halo_sfr_rng, halo_xray_rng, halo_c, nhalo_in, + sigma_y_arr, sigma_bin, x_min, x_width, d_hs_constants, arraysize_total, + halofield_out); + printf("End cuda calculation for progenitors. "); + +#else + LOG_ERROR("CUDA function updateHaloOut() called but code was not compiled for CUDA."); + Throw(ValueError); +#endif + } else { // CPU fallback + bool parallel_error = false; #pragma omp parallel num_threads(simulation_options_global->N_THREADS) - { - float prog_buf[MAX_HALO_CELL]; - int n_prog; - double M_prog; + { + float prog_buf[MAX_HALO_CELL]; + int n_prog; - double propbuf_in[3]; - double propbuf_out[3]; + double propbuf_in[3]; + double propbuf_out[3]; - int threadnum = omp_get_thread_num(); - double M2, R2, R1; - int jj; - unsigned long long int ii; - unsigned long long int count = 0; - unsigned long long int istart = threadnum * arraysize_local; - double pos_prog[3], pos_desc[3]; + int threadnum = omp_get_thread_num(); + double M2, R2, R1; + int jj; + unsigned long long int ii; + unsigned long long int count = 0; + unsigned long long int istart = threadnum * arraysize_local; + double pos_prog[3], pos_desc[3]; - // we need a private version - // also the naming convention should be better between structs/struct pointers - struct HaloSamplingConstants hs_constants_priv; - hs_constants_priv = *hs_constants; + // we need a private version + // also the naming convention should be better between structs/struct pointers + struct HaloSamplingConstants hs_constants_priv; + hs_constants_priv = *hs_constants; #pragma omp for - for (ii = 0; ii < nhalo_in; ii++) { - if (out_of_buffer) continue; - M2 = halofield_in->halo_masses[ii]; - R2 = MtoR(M2); - if (M2 < Mmin || M2 > Mmax_tb) { - LOG_ERROR( - "Input Mass = %.2e at %llu of %llu, something went wrong in the input " - "catalogue", - M2, ii, nhalo_in); - Throw(ValueError); - } - // set condition-dependent variables for sampling - 
stoc_set_consts_cond(&hs_constants_priv, M2); - - // Sample the CMF set by the descendant - stoc_sample(&hs_constants_priv, rng_arr[threadnum], &n_prog, prog_buf); - - propbuf_in[0] = halofield_in->star_rng[ii]; - propbuf_in[1] = halofield_in->sfr_rng[ii]; - propbuf_in[2] = halofield_in->xray_rng[ii]; - pos_desc[0] = halofield_in->halo_coords[3 * ii + 0]; - pos_desc[1] = halofield_in->halo_coords[3 * ii + 1]; - pos_desc[2] = halofield_in->halo_coords[3 * ii + 2]; - - // place progenitors in local list - M_prog = 0; - for (jj = 0; jj < n_prog; jj++) { - // sometimes halos are subtracted from the sample (set to zero) - // we do not want to save these - if (prog_buf[jj] < simulation_options_global->SAMPLER_MIN_MASS) continue; - - if (count >= arraysize_local) { - out_of_buffer = true; - continue; + for (ii = 0; ii < nhalo_in; ii++) { + if (parallel_error) continue; + M2 = halofield_in->halo_masses[ii]; + R2 = MtoR(M2); + if (M2 < Mmin || M2 > Mmax_tb) { + LOG_ERROR( + "Input Mass = %.2e at %llu of %llu, something went wrong in the input " + "catalogue", + M2, ii, nhalo_in); + parallel_error = true; + } + // set condition-dependent variables for sampling + stoc_set_consts_cond(&hs_constants_priv, M2); + + // Sample the CMF set by the descendant + stoc_sample(&hs_constants_priv, rng_arr[threadnum], &n_prog, prog_buf); + + propbuf_in[0] = halofield_in->star_rng[ii]; + propbuf_in[1] = halofield_in->sfr_rng[ii]; + propbuf_in[2] = halofield_in->xray_rng[ii]; + pos_desc[0] = halofield_in->halo_coords[3 * ii + 0]; + pos_desc[1] = halofield_in->halo_coords[3 * ii + 1]; + pos_desc[2] = halofield_in->halo_coords[3 * ii + 2]; + + // place progenitors in local list + for (jj = 0; jj < n_prog; jj++) { + // sometimes halos are subtracted from the sample (set to zero) + // we do not want to save these + if (prog_buf[jj] < simulation_options_global->SAMPLER_MIN_MASS) continue; + + if (parallel_error || count >= arraysize_local) { + parallel_error = true; + continue; + } + 
halofield_out->halo_masses[istart + count] = prog_buf[jj]; + + // Place the progenitor in a random position within the condition + // Such that a sphere of the progenitor's Lagrangian radius is placed + // entirely within the descendant's Lagrangian radius, + R1 = MtoR(prog_buf[jj]); + random_point_in_sphere(pos_desc, R2 - R1, rng_arr[threadnum], pos_prog); + wrap_position(pos_prog, boxlen); + + set_prop_rng(rng_arr[threadnum], true, corr_arr, propbuf_in, propbuf_out); + halofield_out->halo_coords[3 * (istart + count) + 0] = pos_prog[0]; + halofield_out->halo_coords[3 * (istart + count) + 1] = pos_prog[1]; + halofield_out->halo_coords[3 * (istart + count) + 2] = pos_prog[2]; + halofield_out->star_rng[istart + count] = propbuf_out[0]; + halofield_out->sfr_rng[istart + count] = propbuf_out[1]; + halofield_out->xray_rng[istart + count] = propbuf_out[2]; + count++; + if (ii == 0) { + LOG_ULTRA_DEBUG( + "Halo %d Prog %d: Mass %.2e Stellar %.2e SFR %.2e XRAY %.2e", ii, jj, + prog_buf[jj], propbuf_out[0], propbuf_out[1], propbuf_out[2]); + } } - - set_prop_rng(rng_arr[threadnum], true, corr_arr, propbuf_in, propbuf_out); - - halofield_out->halo_masses[istart + count] = prog_buf[jj]; - - // Place the progenitor in a random position within the condition - // Such that a sphere of the progenitor's Lagrangian radius is placed - // entirely within the descendant's Lagrangian radius, - R1 = MtoR(prog_buf[jj]); - random_point_in_sphere(pos_desc, R2 - R1, rng_arr[threadnum], pos_prog); - wrap_position(pos_prog, boxlen); - - halofield_out->halo_coords[3 * (istart + count) + 0] = pos_prog[0]; - halofield_out->halo_coords[3 * (istart + count) + 1] = pos_prog[1]; - halofield_out->halo_coords[3 * (istart + count) + 2] = pos_prog[2]; - halofield_out->star_rng[istart + count] = propbuf_out[0]; - halofield_out->sfr_rng[istart + count] = propbuf_out[1]; - halofield_out->xray_rng[istart + count] = propbuf_out[2]; - count++; - if (ii == 0) { - M_prog += prog_buf[jj]; - - LOG_ULTRA_DEBUG( - 
"First Halo Prog %d: Mass %.2e Stellar %.2e SFR %.2e XRAY %.2e e_d %.3f", - jj, prog_buf[jj], propbuf_out[0], propbuf_out[1], propbuf_out[2], - Deltac * hs_constants->growth_out / hs_constants->growth_in); + print_hs_consts(&hs_constants_priv); } } - if (ii == 0) { - LOG_ULTRA_DEBUG( - " HMF %d delta %.3f delta_coll %.3f delta_desc %.3f adjusted %.3f", - matter_options_global->HMF, hs_constants_priv.delta, - get_delta_crit(matter_options_global->HMF, hs_constants_priv.sigma_cond, - hs_constants->growth_out), - get_delta_crit(matter_options_global->HMF, hs_constants_priv.sigma_cond, - hs_constants->growth_in), - get_delta_crit(matter_options_global->HMF, hs_constants_priv.sigma_cond, - hs_constants->growth_in) * - hs_constants->growth_out / hs_constants->growth_in); - print_hs_consts(&hs_constants_priv); - LOG_SUPER_DEBUG( - "First Halo: Mass %.2f | N %d (exp. %.2e) | Total M %.2e (exp. %.2e)", M2, - n_prog, hs_constants_priv.expected_N, M_prog, hs_constants_priv.expected_M); + istart_threads[threadnum] = istart; + nhalo_threads[threadnum] = count; + } + if (parallel_error) { + LOG_ERROR("More than %llu halos (expected %.1e) with buffer size factor %.1f", + arraysize_local, arraysize_local / config_settings.HALO_CATALOG_MEM_FACTOR, + config_settings.HALO_CATALOG_MEM_FACTOR); + for (int n_t = 0; n_t < simulation_options_global->N_THREADS; n_t++) { + LOG_ERROR("Thread %d: %llu halos", n_t, nhalo_threads[n_t]); } + LOG_ERROR( + "If you expected to have an above average halo number try raising " + "config_settings.HALO_CATALOG_MEM_FACTOR"); + Throw(ParallelError); } - istart_threads[threadnum] = istart; - nhalo_threads[threadnum] = count; + condense_sparse_halolist(halofield_out, istart_threads, nhalo_threads); } - if (out_of_buffer) { - LOG_ERROR("Halo buffer overflow (allocated %d halos per thread)", arraysize_local); - for (int n_t = 0; n_t < simulation_options_global->N_THREADS; n_t++) { - LOG_ERROR("Thread %d: %d halos", n_t, nhalo_threads[n_t]); - } - LOG_ERROR( - 
"If you expected to have an above average halo number try raising " - "config['HALO_CATALOG_MEM_FACTOR']"); - Throw(ValueError); - } - condense_sparse_halolist(halofield_out, istart_threads, nhalo_threads); return 0; } @@ -1092,19 +1125,56 @@ int stochastic_halofield(unsigned long long int seed, float redshift_desc, float struct HaloSamplingConstants hs_constants; stoc_set_consts_z(&hs_constants, redshift, redshift_desc); + bool use_cuda = false; + if (use_cuda) { +#if CUDA_FOUND + // get interp tables needed for sampling progenitors + RGTable1D *nhalo_table = GetNhaloTable(); + RGTable1D *mcoll_table = GetMcollTable(); + RGTable2D *nhalo_inv_table = GetNhaloInvTable(); + // copy the tables to the device + copyTablesToDevice(*nhalo_table, *mcoll_table, *nhalo_inv_table); + + // copy global variables to the device + // todo: move the following operation to InitialConditions.c + updateGlobalParams(simulation_options_global, cosmo_params_global, astro_params_global); +#else + LOG_ERROR("CUDA function copyTablesToDevice called but code was not compiled for CUDA."); +#endif + } + // Fill them // NOTE:Halos prev in the first box corresponds to the large DexM halos if (redshift_desc <= 0.) { LOG_DEBUG("building first halo field at z=%.1f", redshift); sample_halo_grids(rng_stoc, redshift, dens_field, halo_overlap_box, halos_desc, halos, &hs_constants); + + if (use_cuda) { + // initiate rand states on the device +#if CUDA_FOUND + unsigned long long int nhalo_first = halos->n_halos; + int buffer_scale = HALO_CUDA_THREAD_FACTOR + 1; + unsigned long long int n_rstates = nhalo_first * buffer_scale; + printf("initializing %llu random states on the device... 
\n", n_rstates); + init_rand_states(seed, n_rstates); + + printf("finish initializing \n"); + + // todo: add a signal to free rand states once all iterations are done +#else + LOG_ERROR( + "CUDA function init_rand_states() called but code was not compiled for CUDA."); + Throw(ValueError); +#endif + } + } else { LOG_DEBUG("Calculating halo progenitors from z=%.1f to z=%.1f | %llu", redshift_desc, redshift, halos_desc->n_halos); sample_halo_progenitors(rng_stoc, redshift_desc, redshift, halos_desc, halos, &hs_constants); } - LOG_DEBUG("Found %llu Halos", halos->n_halos); if (halos->n_halos >= 3) { diff --git a/src/py21cmfast/src/Stochasticity.cu b/src/py21cmfast/src/Stochasticity.cu new file mode 100644 index 000000000..33947691a --- /dev/null +++ b/src/py21cmfast/src/Stochasticity.cu @@ -0,0 +1,1031 @@ +#include +#include + +#include +#include // host-side header file +#include // device-side header file + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Constants.h" +#include "interpolation_types.h" +#include "Stochasticity.h" + +#include "cuda_utils.cuh" +#include "Stochasticity.cuh" +#include "DeviceConstants.cuh" +#include "device_rng.cuh" +#include "hmf.cu" +#include "interp_tables.cu" + + + + +#ifndef MAX_DELTAC_FRAC +#define MAX_DELTAC_FRAC (float)0.99 // max delta/deltac for the mass function integrals +#endif + +#ifndef DELTA_MIN +#define DELTA_MIN -1 // minimum delta for Lagrangian mass function integrals +#endif + +#ifndef MAX_HALO_CELL +#define MAX_HALO_CELL (int)1e5 +#endif + +void validate_thrust() +{ + // Create a host vector with some values + thrust::host_vector h_vec(5); + h_vec[0] = 1; + h_vec[1] = 2; + h_vec[2] = 3; + h_vec[3] = 4; + h_vec[4] = 5; + + // Transfer data from host to device + thrust::device_vector d_vec = h_vec; + + // Calculate the sum of all elements in the device vector + int sum = thrust::reduce(d_vec.begin(), d_vec.end(), 0, thrust::plus()); + + // Print the result 
+ std::cout << "Sum is: " << sum << std::endl; // Should print "Sum is: 15" +} + +void condense_device_vector() +{ + // Step 1: Create a device vector with some elements, including -1 + thrust::device_vector d_vec(10); + d_vec[0] = 1; + d_vec[1] = -1; + d_vec[2] = 3; + d_vec[3] = -1; + d_vec[4] = 5; + d_vec[5] = 6; + d_vec[6] = -1; + d_vec[7] = 7; + d_vec[8] = -1; + d_vec[9] = 9; + + // Step 2: Use thrust::remove_if to remove all occurrences of -1 + thrust::device_vector::iterator new_end = thrust::remove(d_vec.begin(), d_vec.end(), -1); + + // Step 3: Resize the vector to remove the trailing elements after the "new_end" iterator + d_vec.erase(new_end, d_vec.end()); + + // Step 4: Copy the result to the host to check + thrust::host_vector h_vec = d_vec; + + // Step 5: Print the result + std::cout << "Condensed Vector: "; + for (size_t i = 0; i < h_vec.size(); i++) + { + std::cout << h_vec[i] << " "; + } + std::cout << std::endl; +} + +// int condenseDeviceArray(float *d_array, int original_size, float mask_value) +// { +// // Wrap the raw device pointer into a thrust device pointer +// thrust::device_ptr d_array_ptr(d_array); + +// // Remove elements with mask value +// // i.e.move elements not equal to mask value to the beginning of the array without changing order +// auto new_end = thrust::remove(d_array_ptr, d_array_ptr + original_size, mask_value); + +// // Calculate the number of valid elements +// int valid_size = new_end - d_array_ptr; + +// // Fill the remaining space with mask value +// thrust::fill(new_end, d_array_ptr + original_size, mask_value); + +// // Print results (on host side) +// // std::cout << "Valid elements count: " << valid_size << "\n"; +// return valid_size; +// } + +template +int condenseDeviceArray(T *d_array, int original_size, T mask_value) +{ + // Wrap the raw device pointer into a thrust device pointer + thrust::device_ptr d_array_ptr(d_array); + + // Remove elements with mask value + auto new_end = thrust::remove(d_array_ptr, 
d_array_ptr + original_size, mask_value); + + // Calculate the number of valid elements + int valid_size = new_end - d_array_ptr; + + // Fill the remaining space with mask value + thrust::fill(new_end, d_array_ptr + original_size, mask_value); + + return valid_size; +} + +// todo: maybe add python wrapper for test functions +void testCondenseDeviceArray() +{ + // Input data + float h_array[] = {1.0f, 0.0f, 2.0f, 3.0f, 0.0f, 4.0f}; + float mask_value = 0.0f; + int original_size = 6; + + // Expected outputs + float expected_array[] = {1.0f, 2.0f, 3.0f, 4.0f, 0.0f, 0.0f}; + int expected_valid_size = 4; + + // Allocate and copy to device + float *d_array; + cudaMalloc(&d_array, original_size * sizeof(float)); + cudaMemcpy(d_array, h_array, original_size * sizeof(float), cudaMemcpyHostToDevice); + + // Call the function from Stochasticity.cu + int valid_size = condenseDeviceArray(d_array, original_size, mask_value); + + // Copy the results back to the host + float h_result[original_size]; + cudaMemcpy(h_result, d_array, original_size * sizeof(float), cudaMemcpyDeviceToHost); + + // Validate the results + assert(valid_size == expected_valid_size); + for (int i = 0; i < original_size; ++i) + { + assert(h_result[i] == expected_array[i]); + } + + std::cout << "Test passed: condenseDeviceArray\n"; + + // Free device memory + cudaFree(d_array); +} + +// todo: add more tests to check with large number of input; fix the type mismatch (int, ull) +int filterWithMask(float *d_data, int *d_mask, int original_size) +{ + // Wrap the raw pointers into thrust device pointers + thrust::device_ptr d_data_ptr(d_data); + thrust::device_ptr d_mask_ptr(d_mask); + + // Use the mask to select only elements that correspond to a value of 1 in the mask + auto end = thrust::copy_if(d_data_ptr, d_data_ptr + original_size, d_mask_ptr, d_data_ptr, thrust::identity()); + + // Calculate the new valid size after filtering + int valid_size = end - d_data_ptr; + + // Optionally, print the number of valid 
elements + // std::cout << "Valid elements count: " << valid_size << "\n"; + + return valid_size; +} + +void testFilterWithMask() +{ + // Input arrays + float h_data[] = {1.1f, 2.2f, 3.3f, 4.4f, 5.5f}; // Input data + int h_mask[] = {1, 0, 1, 0, 1}; // Mask array + int original_size = 5; + + // Expected outputs + float expected_data[] = {1.1f, 3.3f, 5.5f}; // Expected filtered data + int expected_size = 3; // Number of valid elements + + // Allocate device memory + float *d_data; + int *d_mask; + cudaMalloc(&d_data, original_size * sizeof(float)); + cudaMalloc(&d_mask, original_size * sizeof(int)); + + // Copy data to device + cudaMemcpy(d_data, h_data, original_size * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_mask, h_mask, original_size * sizeof(int), cudaMemcpyHostToDevice); + + // Call the function + int valid_size = filterWithMask(d_data, d_mask, original_size); + + // Copy the filtered data back to host + float h_result[original_size]; + cudaMemcpy(h_result, d_data, original_size * sizeof(float), cudaMemcpyDeviceToHost); + + // Validate the size of the filtered array + assert(valid_size == expected_size); + + // Validate the filtered elements + for (int i = 0; i < valid_size; ++i) + { + assert(h_result[i] == expected_data[i]); + } + + // Print success message + std::cout << "Test passed: filterWithMask\n"; + + // Free device memory + cudaFree(d_data); + cudaFree(d_mask); +} + +void countElements(const int *array, int size, const std::vector &values_to_count) +{ + // Initialize a frequency array to count occurrences + int count[values_to_count.size()] = {0}; + + // Iterate through the input array + for (int i = 0; i < size; ++i) + { + // Find the index of the value in values_to_count + for (size_t j = 0; j < values_to_count.size(); ++j) + { + if (array[i] == values_to_count[j]) + { + count[j]++; + break; + } + } + } + + // Print the results + for (size_t i = 0; i < values_to_count.size(); ++i) + { + std::cout << "Value " << values_to_count[i] << ": 
" << count[i] << " occurrences\n"; + } +} + +// decide the number of sparsity +int getSparsity(int n_buffer, int n_halo){ + if (n_halo > 0){ + int power = floor(log2(n_buffer / n_halo)); + int sparsity = 1 << power; + return sparsity; + } + else{ + return -1; + } + +} + +// initialize device array with given value +void initializeArray(int *d_array, int n_elements, int value){ + thrust::device_ptr d_array_ptr(d_array); + thrust::fill(d_array_ptr, d_array_ptr + n_elements, value); +} + +// void getKernelAttr(){ +// cudaFuncAttributes attr; +// cudaFuncGetAttributes(&attr, myKernel); +// printf("Kernel Shared Memory per Block: %zu bytes\n", attr.sharedSizeBytes); +// printf("Kernel Registers per Thread: %d\n", attr.numRegs); +// printf("Kernel Max Threads per Block: %d\n", attr.maxThreadsPerBlock); +// } + +struct GridLayout{ + int n_threads; + int n_blocks; +}; +// calculate workload +// todo: add more checks on sparsity +GridLayout getWorkload(int sparsity, unsigned long long int n_halos){ + GridLayout res; + int n_threads, n_blocks; + if (sparsity != 0 && 256 % sparsity == 0){ + n_threads = 256; + } + else { + n_threads = std::min(sparsity,512); + } + res.n_threads = n_threads; + n_blocks = (n_halos * sparsity + n_threads -1)/n_threads; + res.n_blocks = n_blocks; + return res; +} + +// 11-30: the following implementation works (before using any global params on gpu) +__device__ void stoc_set_consts_cond(struct HaloSamplingConstants *const_struct, float cond_val, int HMF, double x_min, double x_width, float *d_y_arr, int n_bin, double *expected_mass) +{ + double m_exp, n_exp; + // Here the condition is a mass, volume is the Lagrangian volume and delta_l is set by the + // redshift difference which represents the difference in delta_crit across redshifts + if (const_struct->from_catalog){ + const_struct->M_cond = cond_val; + const_struct->lnM_cond = log(cond_val); + const_struct->sigma_cond = EvaluateSigma(const_struct->lnM_cond, x_min, x_width, d_y_arr, n_bin); 
//todo: update this function using global tables in constant memory + // mean stellar mass of this halo mass, used for stellar z correlations + const_struct->cond_val = const_struct->lnM_cond; + // condition delta is the previous delta crit + const_struct->delta = get_delta_crit(HMF, const_struct->sigma_cond, const_struct->growth_in) / const_struct->growth_in * const_struct->growth_out; //todo: update this function using global variables in constant memory + } + // Here the condition is a cell of a given density, the volume/mass is given by the grid parameters + else + { + // since the condition mass/sigma is already set all we need is delta + const_struct->delta = cond_val; + const_struct->cond_val = cond_val; + } + // Get expected N and M from interptables + // the splines don't work well for cells above Deltac, but there CAN be cells above deltac, since this calculation happens + // before the overlap, and since the smallest dexm mass is M_cell*(1.01^3) there *could* be a cell above Deltac not in a halo + // NOTE: all this does is prevent integration errors below since these cases are also dealt with in stoc_sample + if (const_struct->delta > MAX_DELTAC_FRAC * get_delta_crit(d_matter_options.HMF, const_struct->sigma_cond, const_struct->growth_out)){ + const_struct->expected_M = const_struct->M_cond; + const_struct->expected_N = 1; + } + else if (const_struct->delta <= DELTA_MIN){ + const_struct->expected_M = 0; + const_struct->expected_N = 0; + } + else + { + n_exp = EvaluateNhalo(const_struct->cond_val, const_struct->growth_out, const_struct->lnM_min, + const_struct->lnM_max_tb, const_struct->M_cond, const_struct->sigma_cond, const_struct->delta); + m_exp = EvaluateMcoll(const_struct->cond_val, const_struct->growth_out, const_struct->lnM_min, + const_struct->lnM_max_tb, const_struct->M_cond, const_struct->sigma_cond, const_struct->delta); + const_struct->expected_N = n_exp * const_struct->M_cond; + const_struct->expected_M = m_exp * const_struct->M_cond; + } + 
*expected_mass = const_struct->expected_M; + return; +} + +__device__ double sample_dndM_inverse(double condition, struct HaloSamplingConstants *hs_constants, curandState *state) +{ + double p_in, result; + p_in = curand_uniform_double(state); + // printf("curand uniform random number: %f\n", p_in); + result = EvaluateNhaloInv(condition, p_in); + result = fmin(1.0, fmax(0.0, result)); // clip in case of extrapolation + result = result * hs_constants->M_cond; + return result; +} + +__device__ double remove_random_halo(curandState *state, int n_halo, int *idx, float *M_prog, float *M_out){ + double last_M_del; + int random_idx; + do { + random_idx = (int)(curand_uniform(state) * n_halo); + } while (M_out[random_idx] == 0.0f); + last_M_del = M_out[random_idx]; + *M_prog -= last_M_del; + M_out[random_idx] = 0.0f; // -1 mass halos are skipped and not counted + + *idx = random_idx; + return last_M_del; +} + +__device__ void fix_mass_sample(curandState *state, double exp_M, float *M_prog, float *M_out, int write_limit, int *n_prog){ + // Keep the last halo if it brings us closer to the expected mass + // This is done by addition or subtraction over the limit to balance + // the bias of the last halo being larger + int random_idx; + double last_M_del; + int sel = curand(state) % 2; + // int sel = 1; //tmp: implement the first case + if (sel) + { + if (fabs(*M_prog - M_out[write_limit] - exp_M) < fabs(*M_prog - exp_M)) + { + // *M_tot_pt -= M_out[*n_halo_pt - 1]; + // here we remove by setting the counter one lower so it isn't read + M_out[write_limit] = 0.0f; + (*n_prog)--; + } + } + else + { + do { + // here we remove by setting halo mass to zero, skipping it during the consolidation + last_M_del = remove_random_halo(state, write_limit+1, &random_idx, M_prog, M_out); + } while (*M_prog > exp_M); + + // if the sample with the last subtracted halo is closer to the expected mass, keep it + // LOG_ULTRA_DEBUG("Deciding to keep last halo M %.3e tot %.3e exp 
%.3e",last_M_del,*M_tot_pt,exp_M); + if (fabs(*M_prog + last_M_del - exp_M) < fabs(*M_prog - exp_M)) + { + M_out[random_idx] = last_M_del; + *M_prog += last_M_del; + } + + } +} + +__device__ int stoc_mass_sample(struct HaloSamplingConstants *hs_constants, curandState *state, float *M_out){ + double exp_M = hs_constants->expected_M; + + // The mass-limited sampling as-is has a slight bias to producing too many halos, + // which is independent of density or halo mass, + // this factor reduces the total expected mass to bring it into line with the CMF + // exp_M *= user_params_global->HALOMASS_CORRECTION; + exp_M *= d_matter_options.HALOMASS_CORRECTION; + + // int n_halo_sampled = 0; + // double M_prog = 0; + // double M_sample; + + double tbl_arg = hs_constants->cond_val; + + // tmp (start) + double M_sample = sample_dndM_inverse(tbl_arg, hs_constants, state); + + // M_prog += M_sample; + // tmp (end) + + // while (M_prog < exp_M){ + // M_sample = sample_dndM_inverse(tbl_arg, hs_constants, state); + + // M_prog += M_sample; + // M_out[n_halo_sampled++] = M_sample; + // } + // todo: enable fix_mass_sample + // The above sample is above the expected mass, by up to 100%. 
I wish to make the average mass equal to exp_M + // fix_mass_sample(state, exp_M, &n_halo_sampled, &M_prog, M_out); + + // *n_halo_out = n_halo_sampled; + // if (M_prog < exp_M){ + // *further_process = 1; + // return 1; + // } + *M_out = M_sample; + return 0; +} + +__device__ int stoc_sample(struct HaloSamplingConstants *hs_constants, curandState *state, float *M_out, int *sampleCondition){ + // TODO: really examine the case for number/mass sampling + // The poisson sample fails spectacularly for high delta (from_catalogs or dense cells) + // and excludes the correlation between number and mass (e.g many small halos or few large ones) + // The mass sample underperforms at low exp_M/M_max by excluding stochasticity in the total collapsed fraction + // and excluding larger halos (e.g if exp_M is 0.1*M_max we can effectively never sample the large halos) + // i.e there is some case for a delta cut between these two methods however I have no intuition for the exact levels + + int err; + + // If the expected mass is below our minimum saved mass, don't bother calculating + // NOTE: some of these conditions are redundant with set_consts_cond() + if (hs_constants->delta <= DELTA_MIN || hs_constants->expected_M < d_simulation_options.SAMPLER_MIN_MASS) + { + // *n_halo_out = 0; + *sampleCondition = 0; + return 0; + } + // if delta is above critical, form one big halo + if (hs_constants->delta >= MAX_DELTAC_FRAC * get_delta_crit(d_matter_options.HMF, hs_constants->sigma_cond, hs_constants->growth_out)){ + // *n_halo_out = 1; + + // Expected mass takes into account potential dexm overlap + *M_out = hs_constants->expected_M; + *sampleCondition = 1; + return 0; + } + + // todo: implement callee functions for SAMPLE_METHOD (1,2,3) + // We always use Number-Limited sampling for grid-based cases + if (d_matter_options.SAMPLE_METHOD == 1 || !hs_constants->from_catalog) + { + // err = stoc_halo_sample(hs_constants, rng, n_halo_out, M_out); + return 0; + } + else if 
(d_matter_options.SAMPLE_METHOD == 0) + { + err = stoc_mass_sample(hs_constants, state, M_out); + } + else if (d_matter_options.SAMPLE_METHOD == 2) + { + // err = stoc_partition_sample(hs_constants, rng, n_halo_out, M_out); + return 0; + } + else if (d_matter_options.SAMPLE_METHOD == 3) + { + // err = stoc_split_sample(hs_constants, rng, n_halo_out, M_out); + return 0; + } + else + { + printf("Invalid sampling method \n"); + return 0; + // todo: check how to throw error in cuda + // LOG_ERROR("Invalid sampling method"); + // Throw(ValueError); + } + // if (*n_halo_out > MAX_HALO_CELL) + // { + // printf("too many halos in conditin, buffer overflow\n"); + // // todo: check how to throw error in cuda + // // LOG_ERROR("too many halos in condition, buffer overflow"); + // // Throw(ValueError); + // } + return err; +} + +// todo: implement condense_sparse_halolist +// // todo: just copied the original function here, need to verify it works with cuda +// __device__ void condense_sparse_halolist(HaloField *halofield, unsigned long long int *istart_threads, unsigned long long int *nhalo_threads) +// { +// int i = 0; +// unsigned long long int count_total = 0; +// for (i = 0; i < user_params_global->N_THREADS; i++) +// { +// memmove(&halofield->halo_masses[count_total], &halofield->halo_masses[istart_threads[i]], sizeof(float) * nhalo_threads[i]); +// memmove(&halofield->star_rng[count_total], &halofield->star_rng[istart_threads[i]], sizeof(float) * nhalo_threads[i]); +// memmove(&halofield->sfr_rng[count_total], &halofield->sfr_rng[istart_threads[i]], sizeof(float) * nhalo_threads[i]); +// memmove(&halofield->xray_rng[count_total], &halofield->xray_rng[istart_threads[i]], sizeof(float) * nhalo_threads[i]); +// memmove(&halofield->halo_coords[3 * count_total], &halofield->halo_coords[3 * istart_threads[i]], sizeof(int) * 3 * nhalo_threads[i]); +// LOG_SUPER_DEBUG("Moved array (start,count) (%llu, %llu) to position %llu", istart_threads[i], nhalo_threads[i], count_total); 
+// count_total += nhalo_threads[i]; +// } +// halofield->n_halos = count_total; + +// // replace the rest with zeros for clarity +// memset(&halofield->halo_masses[count_total], 0, (halofield->buffer_size - count_total) * sizeof(float)); +// memset(&halofield->halo_coords[3 * count_total], 0, 3 * (halofield->buffer_size - count_total) * sizeof(int)); +// memset(&halofield->star_rng[count_total], 0, (halofield->buffer_size - count_total) * sizeof(float)); +// memset(&halofield->sfr_rng[count_total], 0, (halofield->buffer_size - count_total) * sizeof(float)); +// memset(&halofield->xray_rng[count_total], 0, (halofield->buffer_size - count_total) * sizeof(float)); +// LOG_SUPER_DEBUG("Set %llu elements beyond %llu to zero", halofield->buffer_size - count_total, count_total); +// } + +__device__ void set_prop_rng(curandState *state, bool from_catalog, double *interp, float *input, float *output) +{ + float rng_star, rng_sfr, rng_xray; + + // Correlate properties by interpolating between the sampled and descendant gaussians + rng_star = d_astro_params.SIGMA_STAR > 0. ? curand_normal(state) : 0.; + rng_sfr = d_astro_params.SIGMA_SFR_LIM > 0. ? curand_normal(state) : 0.; + rng_xray = d_astro_params.SIGMA_LX > 0. ? 
curand_normal(state) : 0.; + + if (from_catalog) + { + // this transforms the sample to one from the multivariate Gaussian, conditioned on the first sample + rng_star = sqrt(1 - interp[0] * interp[0]) * rng_star + interp[0] * input[0]; + rng_sfr = sqrt(1 - interp[1] * interp[1]) * rng_sfr + interp[1] * input[1]; + rng_xray = sqrt(1 - interp[2] * interp[2]) * rng_xray + interp[2] * input[2]; + } + + output[0] = rng_star; + output[1] = rng_sfr; + output[2] = rng_xray; + return; +} + +__global__ void update_halo_constants(float *d_halo_masses, float *d_star_rng_in, float *d_sfr_rng_in, float *d_xray_rng_in, + int *d_halo_coords_in, float *d_y_arr, double x_min, double x_width, + unsigned long long int n_halos, int n_bin, struct HaloSamplingConstants d_hs_constants, + int HMF, + float *d_halo_masses_out, float *d_star_rng_out, + float *d_sfr_rng_out, float *d_xray_rng_out, int *d_halo_coords_out, int *d_sum_check, + int *d_further_process, int *d_nprog_predict, int sparsity, unsigned long long int write_offset, + double *expected_mass, int *d_n_prog, int offset_shared) +{ + // Define shared memory for block-level reduction + extern __shared__ float shared_memory[]; + // __shared__ float shared_mass[256]; + + // partition shared memory + float *shared_mass = shared_memory; + float *shared_prop_rng = shared_memory + offset_shared; + + // get local thread idx + int tid = threadIdx.x; + + // initialize shared_mass + shared_mass[tid] = 0.0f; + + // initialize shared_prop_rng + for (int i=0;i<3;i++){ + shared_prop_rng[tid+i*offset_shared] = 0.0f; + } + + + // get global thread idx + int ind = blockIdx.x * blockDim.x + threadIdx.x; + + // get halo idx + int hid = ind / sparsity; + if (hid >= n_halos) + { + // printf("Out of halo range.\n"); + return; + } + + // get halo mass + float M = d_halo_masses[hid]; + + // get stoc properties from in halo + float prop_in[3] = {d_star_rng_in[hid], d_sfr_rng_in[hid], d_xray_rng_in[hid]}; + + // get correction + double corr_arr[3] = 
{d_hs_constants.corr_star, d_hs_constants.corr_sfr, d_hs_constants.corr_xray}; + + // get coordinate + int coords_in[3] = {d_halo_coords_in[hid*3], d_halo_coords_in[hid*3+1], d_halo_coords_in[hid*3+2]}; + + // idx of d_halo_masses_out and other halo field arrays + int out_id = write_offset + ind; + + // set condition-dependent variables for sampling + stoc_set_consts_cond(&d_hs_constants, M, HMF, x_min, x_width, d_y_arr, n_bin, &expected_mass[hid]); + // if (hid == 1){ + // printf("check here. \n"); + // } + + // if (hid == 2){ + // printf("check here. \n"); + // } + + // tmp: just to verify the tables have been copied correctly + // if (ind == 0) + // { + // printf("The first element of Nhalo y_arr: %e (%e) \n", d_Nhalo_yarr[0], d_Nhalo_table.y_arr[0]); + // printf("The nhalo table n_bin: %d\n", d_Nhalo_table.n_bin); + // printf("The nhalo_inv table nx_bin: %d\n", d_Nhalo_inv_table.nx_bin); + // printf("HII_DIM: %d \n", d_user_params.HII_DIM); + // printf("test params: %f \n", d_test_params); + // printf("A_VCB: %f \n", d_astro_params.A_VCB); + // printf("SIGMA_8: %f \n", d_cosmo_params.SIGMA_8); + // printf("number of rng states: %d\n", g_numRNGStates); + // // tiger tmp: debug (start) + // double res1, res2, res3, res4; + // res1 = EvaluateNhaloInv(18.694414138793945, 0.0046723012881037529); + // printf("tmp res1 on gpu: %.17f \n", res1); + // res2 = EvaluateNhaloInv(20.084152221679688, 0.32153863360286256); + // printf("tmp res2 on gpu: %.17f \n", res2); + // res3 = EvaluateNhaloInv(26.806314468383789, 0.8698794976081996); + // printf("tmp res3 on gpu: %.17f \n", res3); + // res4 = EvaluateNhaloInv(19.00053596496582, 0.83130413049947305); + // printf("tmp res4 on gpu: %.17f \n", res4); + // // tiger tmp: debug (end) + // } + + curandState local_state = d_randStates[ind]; + // if (blockIdx.x > 100000){ + // // printf("check here. 
\n"); + // } + // tmp: for validation only + // sample_dndM_inverse(0.38, &d_hs_constants, &local_state); + // int tmp1 = 20; + // double tmp2 = 681273355217.0; + // float tmp3 = 101976856.0; + // remove_random_halo(&local_state, 59, &tmp1, &tmp2, &tmp3); + + // check sample condition + // condition 0: no sampling; condition 1: use expected_M; condition 2: sampling + int sampleCondition = 2; + stoc_sample(&d_hs_constants, &local_state, &shared_mass[tid], &sampleCondition); + + // get stochastic halo properties + set_prop_rng(&local_state, true, corr_arr, prop_in, &shared_prop_rng[tid*3]); + + + + __syncthreads(); + + if (tid % sparsity == 0){ + if (sampleCondition == 0){ + d_n_prog[hid] = 0; + } + if (sampleCondition == 1){ + if(shared_mass[tid] >= d_simulation_options.SAMPLER_MIN_MASS){ + d_halo_masses_out[out_id] = shared_mass[tid]; + d_n_prog[hid] = 1; + d_star_rng_out[out_id] = shared_prop_rng[3 * tid]; + d_sfr_rng_out[out_id] = shared_prop_rng[3 * tid + 1]; + d_xray_rng_out[out_id] = shared_prop_rng[3 * tid + 2]; + d_halo_coords_out[out_id*3] = coords_in[0]; + d_halo_coords_out[out_id*3+1] = coords_in[1]; + d_halo_coords_out[out_id*3+2] = coords_in[2]; + + } + } + if (sampleCondition == 2){ + float Mprog = 0.0; + int write_limit = 0; + int meetCondition = 0; + + for (int i = 0; i < sparsity; ++i){ + Mprog += shared_mass[tid + i]; + if (Mprog >= d_hs_constants.expected_M) + { + write_limit = i; + meetCondition = 1; + break; + } + } + + if (meetCondition){ + // correct the mass samples + int n_prog = write_limit +1; + + fix_mass_sample(&local_state, d_hs_constants.expected_M, &Mprog, &shared_mass[tid], write_limit, &n_prog); + + // record number of progenitors + d_n_prog[hid] = min(100,n_prog); + + for (int i = 0; i < write_limit + 1; ++i) + { + if(shared_mass[tid + i] < d_simulation_options.SAMPLER_MIN_MASS) continue; + // write the final mass sample to array in global memory + d_halo_masses_out[out_id + i] = shared_mass[tid + i]; + d_star_rng_out[out_id + i] = 
shared_prop_rng[3*(tid +i)]; + d_sfr_rng_out[out_id + i] = shared_prop_rng[3*(tid+i) + 1]; + d_xray_rng_out[out_id + i] = shared_prop_rng[3*(tid+i) + 2]; + d_halo_coords_out[(out_id+i) * 3] = coords_in[0]; + d_halo_coords_out[(out_id+i) * 3 + 1] = coords_in[1]; + d_halo_coords_out[(out_id+i) * 3 + 2] = coords_in[2]; + } + } + else{ + d_further_process[hid] = 1; + d_nprog_predict[hid] = ceil(d_hs_constants.expected_M * sparsity / Mprog); + + } + } + } + + // Perform reduction within the block + // for (int stride = blockDim.x / 2; stride > 0; stride /= 2) + // { + // if (tid < stride) + // { + // shared_check[tid] += shared_check[tid + stride]; + // } + // __syncthreads(); // Ensure all threads have completed each stage of reduction + // } + + // Write the result from each block to the global sum + // if (tid == 0) + // { + // atomicAdd(d_sum_check, shared_check[0]); + // } + + // Sample the CMF set by the descendant + // stoc_sample(&hs_constants, &local_state, &n_prog, prog_buf); + + // double sigma = EvaluateSigma(log(M), x_min, x_width, d_y_arr, n_bin); + // double delta = get_delta_crit(HMF, sigma, d_hs_constants.growth_in)\ + // / d_hs_constants.growth_in * d_hs_constants.growth_out; + + d_randStates[ind] = local_state; + return; +} + +// function to launch kernel grids +int updateHaloOut(float *halo_masses, float *star_rng, float *sfr_rng, float *xray_rng, int *halo_coords, + unsigned long long int n_halos, float *y_arr, int n_bin_y, double x_min, double x_width, + struct HaloSamplingConstants hs_constants, unsigned long long int n_buffer, HaloField *halofield_out) +{ + // allocate memory and copy halo data to the device (halo in) + size_t size_halo = sizeof(float) * n_halos; + float *d_halo_masses; + CALL_CUDA(cudaMalloc(&d_halo_masses, size_halo)); + CALL_CUDA(cudaMemcpy(d_halo_masses, halo_masses, size_halo, cudaMemcpyHostToDevice)); + + float *d_star_rng; + CALL_CUDA(cudaMalloc(&d_star_rng, size_halo)); + CALL_CUDA(cudaMemcpy(d_star_rng, star_rng, 
size_halo, cudaMemcpyHostToDevice)); + + float *d_sfr_rng; + CALL_CUDA(cudaMalloc(&d_sfr_rng, size_halo)); + CALL_CUDA(cudaMemcpy(d_sfr_rng, sfr_rng, size_halo, cudaMemcpyHostToDevice)); + + float *d_xray_rng; + CALL_CUDA(cudaMalloc(&d_xray_rng, size_halo)); + CALL_CUDA(cudaMemcpy(d_xray_rng, xray_rng, size_halo, cudaMemcpyHostToDevice)); + + int *d_halo_coords; + size_t size_halo_coords = 3 * sizeof(int) * n_halos; + CALL_CUDA(cudaMalloc(&d_halo_coords, size_halo_coords)); + CALL_CUDA(cudaMemcpy(d_halo_coords, halo_coords, size_halo_coords, cudaMemcpyHostToDevice)); + + // allocate memory and copy y_arr of sigma_table to the device + size_t size_yarr = sizeof(float) * n_bin_y; + float *d_y_arr; + CALL_CUDA(cudaMalloc(&d_y_arr, size_yarr)); + CALL_CUDA(cudaMemcpy(d_y_arr, y_arr, size_yarr, cudaMemcpyHostToDevice)); + + // allocate memory for d_check_sum (tmp) + int *d_sum_check; + CALL_CUDA(cudaMalloc((void **)&d_sum_check, sizeof(int))); + CALL_CUDA(cudaMemset(d_sum_check, 0, sizeof(int))); + + // allocate memory to store list of halo index need further process + int *d_further_process; + CALL_CUDA(cudaMalloc(&d_further_process, sizeof(int)*n_halos)); + CALL_CUDA(cudaMemset(d_further_process, 0, sizeof(int)*n_halos)); + + // allocate memory to store number of progenitors per halo + int *d_n_prog; + CALL_CUDA(cudaMalloc(&d_n_prog, sizeof(int) * n_halos)); + initializeArray(d_n_prog, n_halos, 32); + + // allocate memory to store estimated n_prog after the first kernel launch + int *d_nprog_predict; + CALL_CUDA(cudaMalloc(&d_nprog_predict, sizeof(int) * n_halos)); + CALL_CUDA(cudaMemset(d_nprog_predict, 0, sizeof(int) * n_halos)); + + // tmp: check expected_M + double *d_expected_mass, *h_expected_mass; + CALL_CUDA(cudaMalloc(&d_expected_mass, sizeof(double) * n_halos)); + CALL_CUDA(cudaMemset(d_expected_mass, 0, sizeof(double) * n_halos)); + CALL_CUDA(cudaHostAlloc((void **)&h_expected_mass, sizeof(double) * n_halos, cudaHostAllocDefault)); + + // get parameters 
needed by the kernel + int HMF = user_params_global->HMF; + + // set buffer size (hard-coded) + int scale = 5; + size_t d_n_buffer = n_halos * scale; + size_t buffer_size = sizeof(float) * d_n_buffer; + + // allocate memory for out halos (just allocate once at each call of this grid launch function) + float *d_halo_masses_out; + CALL_CUDA(cudaMalloc(&d_halo_masses_out, buffer_size)); + CALL_CUDA(cudaMemset(d_halo_masses_out, 0, buffer_size)); + // initializeArray(d_halo_masses_out, d_n_buffer, -1.2f); + + float *d_star_rng_out; + CALL_CUDA(cudaMalloc(&d_star_rng_out, buffer_size)); + CALL_CUDA(cudaMemset(d_star_rng_out, 0, buffer_size)); + // initializeArray(d_halo_masses_out, d_n_buffer, -1.2f); + + float *d_sfr_rng_out; + CALL_CUDA(cudaMalloc(&d_sfr_rng_out, buffer_size)); + CALL_CUDA(cudaMemset(d_sfr_rng_out, 0, buffer_size)); + + float *d_xray_rng_out; + CALL_CUDA(cudaMalloc(&d_xray_rng_out, buffer_size)); + CALL_CUDA(cudaMemset(d_xray_rng_out, 0, buffer_size)); + + int *d_halo_coords_out; + CALL_CUDA(cudaMalloc(&d_halo_coords_out, sizeof(int) * d_n_buffer * 3)); + initializeArray(d_halo_coords_out, d_n_buffer * 3, -1000); + + // initiate n_halo check + // unsigned long long int n_halo_check = n_halos; + + // initiate offset for writing output data + unsigned long long int write_offset = 0; + + // initialize n filter halo + unsigned long long int n_halos_tbp = n_halos; + + // initialize number of progenitors processed + unsigned long long int n_processed_prog; + + // todo: add the following to debug + cudaFuncAttributes attr; + cudaFuncGetAttributes(&attr, update_halo_constants); + // printf("Kernel Shared Memory per Block: %zu bytes\n", attr.sharedSizeBytes); + // printf("Kernel Registers per Thread: %d\n", attr.numRegs); + // printf("Kernel Max Threads per Block: %d\n", attr.maxThreadsPerBlock); + + // start with 4 threads work with one halo + int sparsity = 4; + + // Check if sparsity is smaller than scale + if (sparsity >= scale) + { + throw 
std::runtime_error("'sparsity' must be smaller than 'scale'."); + } + + // initial kernel grid + GridLayout grids = getWorkload(sparsity, n_halos); + + // launch kernel grid + while (n_halos_tbp > 0){ + size_t shared_size = grids.n_threads * sizeof(float) * 4; + int offset_shared = grids.n_threads; + printf("start launching kernel function.\n"); + update_halo_constants<<>>(d_halo_masses, d_star_rng, d_sfr_rng, d_xray_rng, d_halo_coords, + d_y_arr, x_min, x_width, n_halos_tbp, n_bin_y, hs_constants, HMF, d_halo_masses_out, d_star_rng_out, + d_sfr_rng_out, d_xray_rng_out, d_halo_coords_out, d_sum_check, d_further_process, d_nprog_predict, sparsity, write_offset, d_expected_mass, + d_n_prog, offset_shared); + + // Check kernel launch errors + CALL_CUDA(cudaGetLastError()); + + CALL_CUDA(cudaDeviceSynchronize()); + + // filter device halo masses in-place + n_halos_tbp = filterWithMask(d_halo_masses, d_further_process, n_halos_tbp); + printf("The number of halos for further processing: %d \n", n_halos_tbp); + + // // tmp 2025-01-19: check d_halo_masses_out writing out + // float *h_halo_masses_out_check; + // CALL_CUDA(cudaHostAlloc((void **)&h_halo_masses_out_check, buffer_size, cudaHostAllocDefault)); + // CALL_CUDA(cudaMemcpy(h_halo_masses_out_check, d_halo_masses_out, buffer_size, cudaMemcpyDeviceToHost)); + + // number of progenitors per halo + int *h_n_prog; + CALL_CUDA(cudaHostAlloc((void **)&h_n_prog, sizeof(int)*n_halos, cudaHostAllocDefault)); + CALL_CUDA(cudaMemcpy(h_n_prog, d_n_prog, sizeof(int)*n_halos, cudaMemcpyDeviceToHost)); + + // debug only + // // Values to count + // std::vector values_to_count = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100,32}; + + // // Count and display occurrences + // countElements(h_n_prog, n_halos, values_to_count); + + // condense halo mass array on the device + n_processed_prog = condenseDeviceArray(d_halo_masses_out, d_n_buffer, 0.0f); + printf("The number of progenitors written in out halo field so far: %d \n", 
n_processed_prog); + + // condense other halo field arrays on the device + unsigned long long int n_processed_star_rng = condenseDeviceArray(d_star_rng_out, d_n_buffer, 0.0f); + printf("The number of star prop rng written in out halo field so far: %d \n", n_processed_star_rng); + + unsigned long long int n_processed_sfr_rng = condenseDeviceArray(d_sfr_rng_out, d_n_buffer, 0.0f); + printf("The number of sfr prop rng written in out halo field so far: %d \n", n_processed_sfr_rng); + + unsigned long long int n_processed_xray_rng = condenseDeviceArray(d_xray_rng_out, d_n_buffer, 0.0f); + printf("The number of xray prop rng written in out halo field so far: %d \n", n_processed_xray_rng); + + unsigned long long int n_processed_coords = condenseDeviceArray(d_halo_coords_out, d_n_buffer*3, -1000); + printf("The number of halo coords written in out halo field so far: %d \n", n_processed_coords); + + // tmp: the following is just needed for debugging purpose + // float *h_filter_halos; + // CALL_CUDA(cudaHostAlloc((void **)&h_filter_halos, sizeof(float) * n_halos_tbp, cudaHostAllocDefault)); + // CALL_CUDA(cudaMemcpy(h_filter_halos, d_halo_masses, sizeof(float) * n_halos_tbp, cudaMemcpyDeviceToHost)); + + // int *h_nprog_predict; + // CALL_CUDA(cudaHostAlloc((void **)&h_nprog_predict, sizeof(int) * n_halos, cudaHostAllocDefault)); + // CALL_CUDA(cudaMemcpy(h_nprog_predict, d_nprog_predict, sizeof(int) * n_halos, cudaMemcpyDeviceToHost)); + + if (n_halos_tbp > 0){ + // update sparsity value + unsigned long long int available_n_buffer = d_n_buffer - n_processed_prog; + sparsity = getSparsity(available_n_buffer, n_halos_tbp); + + + // sparsity should not exceed the max threads per block + // sparsity = 256; + sparsity = std::min(sparsity, 512); + + // reset grids layout + grids = getWorkload(sparsity, n_halos_tbp); + + // update write offset + write_offset = n_processed_prog; + + // reset mask array + CALL_CUDA(cudaMemset(d_further_process, 0, sizeof(int) * n_halos)); + + // 
copy data from device to host + int h_sum_check; + CALL_CUDA(cudaMemcpy(&h_sum_check, d_sum_check, sizeof(int), cudaMemcpyDeviceToHost)); + } + // tmp: for debug only + // CALL_CUDA(cudaFreeHost(h_filter_halos)); + // CALL_CUDA(cudaFreeHost(h_sum_check)); + + } + + // write data back to the host + halofield_out->n_halos = n_processed_prog; + size_t out_size = sizeof(float) * n_processed_prog; + + // float *h_halo_masses_out; + // CALL_CUDA(cudaHostAlloc((void **)&h_halo_masses_out, out_size, cudaHostAllocDefault)); + CALL_CUDA(cudaGetLastError()); + CALL_CUDA(cudaDeviceSynchronize()); + + CALL_CUDA(cudaMemcpy(halofield_out->halo_masses, d_halo_masses_out, out_size, cudaMemcpyDeviceToHost)); + + + CALL_CUDA(cudaMemcpy(halofield_out->star_rng, d_star_rng_out, out_size, cudaMemcpyDeviceToHost)); + CALL_CUDA(cudaMemcpy(halofield_out->sfr_rng, d_sfr_rng_out, out_size, cudaMemcpyDeviceToHost)); + CALL_CUDA(cudaMemcpy(halofield_out->xray_rng, d_xray_rng_out, out_size, cudaMemcpyDeviceToHost)); + + size_t out_coords_size = sizeof(int) * n_processed_prog * 3; + CALL_CUDA(cudaMemcpy(halofield_out->halo_coords, d_halo_coords_out, out_coords_size, cudaMemcpyDeviceToHost)); + + + // Free device memory + CALL_CUDA(cudaFree(d_halo_masses)); + CALL_CUDA(cudaFree(d_y_arr)); + CALL_CUDA(cudaFree(d_halo_masses_out)); + CALL_CUDA(cudaFree(d_star_rng_out)); + CALL_CUDA(cudaFree(d_sfr_rng_out)); + CALL_CUDA(cudaFree(d_xray_rng_out)); + CALL_CUDA(cudaFree(d_halo_coords_out)); + CALL_CUDA(cudaFree(d_further_process)); + + validate_thrust(); + + condense_device_vector(); + + testCondenseDeviceArray(); + + testFilterWithMask(); + + CALL_CUDA(cudaGetLastError()); + CALL_CUDA(cudaDeviceSynchronize()); + return 0; +} diff --git a/src/py21cmfast/src/Stochasticity.cuh b/src/py21cmfast/src/Stochasticity.cuh new file mode 100644 index 000000000..25a1670aa --- /dev/null +++ b/src/py21cmfast/src/Stochasticity.cuh @@ -0,0 +1,17 @@ +#ifndef _STOCHASTICITY_CUH +#define _STOCHASTICITY_CUH + +#define 
HALO_CUDA_THREAD_FACTOR (int) (4) + +#ifdef __cplusplus +extern "C" +{ +#endif + int updateHaloOut(float *halo_masses, float *star_rng, float *sfr_rng, float *xray_rng, int *halo_coords, + unsigned long long int n_halos, float *y_arr, int n_bin_y, double x_min, double x_width, + struct HaloSamplingConstants hs_constants, unsigned long long int n_buffer, HaloField *halofield_out); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/py21cmfast/src/Stochasticity.h b/src/py21cmfast/src/Stochasticity.h index ea6260a6b..63d44de74 100644 --- a/src/py21cmfast/src/Stochasticity.h +++ b/src/py21cmfast/src/Stochasticity.h @@ -39,6 +39,11 @@ struct HaloSamplingConstants { double expected_M; }; +#ifdef __cplusplus +extern "C" { + +#endif + int stochastic_halofield(unsigned long long int seed, float redshift_desc, float redshift, float *dens_field, float *halo_overlap_box, HaloField *halos_desc, HaloField *halos); @@ -59,4 +64,7 @@ void stoc_set_consts_z(struct HaloSamplingConstants *const_struct, double redshi double redshift_desc); void stoc_set_consts_cond(struct HaloSamplingConstants *const_struct, double cond_val); +#ifdef __cplusplus +} #endif +#endif //_STOCHASTICITY_H diff --git a/src/py21cmfast/src/_functionprototypes_wrapper.h b/src/py21cmfast/src/_functionprototypes_wrapper.h index 4732dcfd4..eb552f07e 100644 --- a/src/py21cmfast/src/_functionprototypes_wrapper.h +++ b/src/py21cmfast/src/_functionprototypes_wrapper.h @@ -1,7 +1,3 @@ -/* This file contains the repeated function prototypes which are needed by CFFI - to be included explicitly via ffi.cdef(), These are the only functions which - are visible to the python wrapper */ - /* OutputStruct COMPUTE FUNCTIONS */ int ComputeInitialConditions(unsigned long long random_seed, InitialConditions *boxes); @@ -25,9 +21,8 @@ int ComputeIonizedBox(float redshift, float prev_redshift, PerturbedField *pertu int ComputeBrightnessTemp(float redshift, TsBox *spin_temp, IonizedBox *ionized_box, PerturbedField 
*perturb_field, BrightnessTemp *box); -int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbedField *perturbed_field, - PerturbHaloField *halos, TsBox *previous_spin_temp, - IonizedBox *previous_ionize_box, HaloBox *grids); +int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbHaloField *halos, + TsBox *previous_spin_temp, IonizedBox *previous_ionize_box, HaloBox *grids); int UpdateXraySourceBox(HaloBox *halobox, double R_inner, double R_outer, int R_ct, XraySourceBox *source_box); diff --git a/src/py21cmfast/src/_inputparams_wrapper.h b/src/py21cmfast/src/_inputparams_wrapper.h deleted file mode 100644 index 0927b3c2a..000000000 --- a/src/py21cmfast/src/_inputparams_wrapper.h +++ /dev/null @@ -1,182 +0,0 @@ -/*We need to explicitly define the types used by the warpper using ffi.cdef() - However, that function does not take directives, so we separate the types here -*/ -// WARNING: DO NOT #include THIS FILE IN THE C CODE EXCEPT FOR IN InputParameters.h - -typedef struct CosmoParams { - float SIGMA_8; - float hlittle; - float OMm; - float OMl; - float OMb; - float POWER_INDEX; - - float OMn; - float OMk; - float OMr; - float OMtot; - float Y_He; - float wl; - -} CosmoParams; - -typedef struct SimulationOptions { - // Parameters taken from INIT_PARAMS.H - int HII_DIM; - int DIM; - float BOX_LEN; - float NON_CUBIC_FACTOR; - int N_THREADS; - double Z_HEAT_MAX; - double ZPRIME_STEP_FACTOR; - - // Halo Sampler Options - float SAMPLER_MIN_MASS; - double SAMPLER_BUFFER_FACTOR; - int N_COND_INTERP; - int N_PROB_INTERP; - double MIN_LOGPROB; - double HALOMASS_CORRECTION; - double PARKINSON_G0; - double PARKINSON_y1; - double PARKINSON_y2; - - float INITIAL_REDSHIFT; - double DELTA_R_FACTOR; - double DENSITY_SMOOTH_RADIUS; - - double DEXM_OPTIMIZE_MINMASS; - double DEXM_R_OVERLAP; - - double CORR_STAR; - double CORR_SFR; - double CORR_LX; -} SimulationOptions; - -typedef struct MatterOptions { - bool USE_FFTW_WISDOM; - int HMF; - int 
USE_RELATIVE_VELOCITIES; - int POWER_SPECTRUM; - int USE_INTERPOLATION_TABLES; - bool PERTURB_ON_HIGH_RES; - int PERTURB_ALGORITHM; - bool MINIMIZE_MEMORY; - bool KEEP_3D_VELOCITIES; - bool DEXM_OPTIMIZE; - int FILTER; - int HALO_FILTER; - bool SMOOTH_EVOLVED_DENSITY_FIELD; - - bool USE_HALO_FIELD; - bool HALO_STOCHASTICITY; - bool FIXED_HALO_GRIDS; - int SAMPLE_METHOD; -} MatterOptions; - -typedef struct AstroParams { - float HII_EFF_FACTOR; - - // SHMR - float F_STAR10; - float ALPHA_STAR; - float ALPHA_STAR_MINI; - float SIGMA_STAR; - double UPPER_STELLAR_TURNOVER_MASS; - double UPPER_STELLAR_TURNOVER_INDEX; - float F_STAR7_MINI; - - // SFMS - float t_STAR; - double SIGMA_SFR_INDEX; - double SIGMA_SFR_LIM; - - // L_X/SFR - double L_X; - double L_X_MINI; - double SIGMA_LX; - - // Escape Fraction - float F_ESC10; - float ALPHA_ESC; - float F_ESC7_MINI; - - float T_RE; - - float M_TURN; - float R_BUBBLE_MAX; - float ION_Tvir_MIN; - double F_H2_SHIELD; - float NU_X_THRESH; - float X_RAY_SPEC_INDEX; - float X_RAY_Tvir_MIN; - - double A_LW; - double BETA_LW; - double A_VCB; - double BETA_VCB; - - double FIXED_VAVG; - double POP2_ION; - double POP3_ION; - - double PHOTONCONS_CALIBRATION_END; - double CLUMPING_FACTOR; - double ALPHA_UVB; - - float R_MAX_TS; - int N_STEP_TS; - double DELTA_R_HII_FACTOR; - float R_BUBBLE_MIN; - double MAX_DVDR; - double NU_X_MAX; - double NU_X_BAND_MAX; -} AstroParams; - -typedef struct AstroOptions { - bool USE_MINI_HALOS; - bool USE_CMB_HEATING; // CMB Heating Flag - bool USE_LYA_HEATING; // Lya Heating Flag - bool USE_MASS_DEPENDENT_ZETA; - bool INHOMO_RECO; - bool USE_TS_FLUCT; - bool M_MIN_in_Mass; - bool FIX_VCB_AVG; - bool USE_EXP_FILTER; - bool CELL_RECOMB; - int PHOTON_CONS_TYPE; - bool USE_UPPER_STELLAR_TURNOVER; - bool HALO_SCALING_RELATIONS_MEDIAN; - int HII_FILTER; - int HEAT_FILTER; - bool IONISE_ENTIRE_SPHERE; - bool AVG_BELOW_SAMPLER; - int INTEGRATION_METHOD_ATOMIC; - int INTEGRATION_METHOD_MINI; -} AstroOptions; - 
-typedef struct ConfigSettings { - double HALO_CATALOG_MEM_FACTOR; - - char *external_table_path; - char *wisdoms_path; -} ConfigSettings; - -/* Previously, we had a few structures spread throughout the code e.g simulation_options_ufunc which - were all globally defined and separately broadcast at different times. Several of these were used - across different files and some inside #defines (e.g indexing.h), so for now I've combined - the parameter structures to avoid confusion (we shouldn't have the possibility of two files using - different parameters). - - In future we should have a parameter structure in each .c file containing ONLY parameters - relevant to it (look at HaloBox.c), and force the broadcast at each _compute() step (or even - decorate any library call) However this would require us to be very careful about initialising - the globals when ANY function from that file is called */ -// The structs declared here defined in InputParameters.c -extern SimulationOptions *simulation_options_global; -extern MatterOptions *matter_options_global; -extern CosmoParams *cosmo_params_global; -extern AstroParams *astro_params_global; -extern AstroOptions *astro_options_global; - -extern ConfigSettings config_settings; diff --git a/src/py21cmfast/src/_outputstructs_wrapper.h b/src/py21cmfast/src/_outputstructs_wrapper.h deleted file mode 100644 index c347df68f..000000000 --- a/src/py21cmfast/src/_outputstructs_wrapper.h +++ /dev/null @@ -1,97 +0,0 @@ -/*We need to explicitly define the types used by the warpper using ffi.cdef() - However, that function does not take directives, so we separate the types here -*/ -// WARNING: DO NOT #include THIS FILE IN THE C CODE EXCEPT FOR IN OutputStructs.h - -typedef struct InitialConditions { - float *lowres_density, *lowres_vx, *lowres_vy, *lowres_vz, *lowres_vx_2LPT, *lowres_vy_2LPT, - *lowres_vz_2LPT; - float *hires_density, *hires_vx, *hires_vy, *hires_vz, *hires_vx_2LPT, *hires_vy_2LPT, - *hires_vz_2LPT; // cw addition - 
float *lowres_vcb; -} InitialConditions; - -typedef struct PerturbedField { - float *density, *velocity_x, *velocity_y, *velocity_z; -} PerturbedField; - -typedef struct HaloField { - long long unsigned int n_halos; - long long unsigned int buffer_size; - float *halo_masses; - float *halo_coords; - - // Halo properties for stochastic model - float *star_rng; - float *sfr_rng; - float *xray_rng; -} HaloField; - -typedef struct PerturbHaloField { - long long unsigned int n_halos; - long long unsigned int buffer_size; - float *halo_masses; - float *halo_coords; - - // Halo properties for stochastic model - float *star_rng; - float *sfr_rng; - float *xray_rng; -} PerturbHaloField; - -typedef struct HaloBox { - // Things that aren't used in radiation fields but useful outputs - float *halo_mass; - float *halo_stars; - float *halo_stars_mini; - int *count; - - // For IonisationBox.c and SpinTemperatureBox.c - float *n_ion; // weighted by F_ESC*PopN_ion - float *halo_sfr; // for x-rays and Ts stuff - float *halo_xray; - float *halo_sfr_mini; // for x-rays and Ts stuff - float *whalo_sfr; // SFR weighted by PopN_ion and F_ESC, used for Gamma12 - - // Average volume-weighted log10 Turnover masses are kept in order to compare with the expected - // MF integrals - double log10_Mcrit_ACG_ave; - double log10_Mcrit_MCG_ave; -} HaloBox; - -typedef struct XraySourceBox { - float *filtered_sfr; - float *filtered_xray; - float *filtered_sfr_mini; - - double *mean_log10_Mcrit_LW; - double *mean_sfr; - double *mean_sfr_mini; -} XraySourceBox; - -typedef struct TsBox { - float *spin_temperature; - float *xray_ionised_fraction; - float *kinetic_temp_neutral; - float *J_21_LW; -} TsBox; - -typedef struct IonizedBox { - double mean_f_coll; - double mean_f_coll_MINI; - double log10_Mturnover_ave; - double log10_Mturnover_MINI_ave; - float *neutral_fraction; - float *ionisation_rate_G12; - float *mean_free_path; - float *z_reion; - float *cumulative_recombinations; - float 
*kinetic_temperature; - float *unnormalised_nion; - float *unnormalised_nion_mini; -} IonizedBox; - -typedef struct BrightnessTemp { - float *brightness_temp; - float *tau_21; -} BrightnessTemp; diff --git a/src/py21cmfast/src/_wrapper.cpp b/src/py21cmfast/src/_wrapper.cpp new file mode 100644 index 000000000..b6099fef0 --- /dev/null +++ b/src/py21cmfast/src/_wrapper.cpp @@ -0,0 +1,686 @@ +#include +#include +#include + +namespace nb = nanobind; + +extern "C" { +#include "21cmFAST.h" +} + +NB_MODULE(c_21cmfast, m) { + m.doc() = "This is the docstring for the 21cmFAST Python extension."; + + // Bind input parameters + + // Bind CosmoParams + nb::class_(m, "CosmoParams") + .def(nb::init<>()) + .def_rw("SIGMA_8", &CosmoParams::SIGMA_8) + .def_rw("hlittle", &CosmoParams::hlittle) + .def_rw("OMm", &CosmoParams::OMm) + .def_rw("OMl", &CosmoParams::OMl) + .def_rw("OMb", &CosmoParams::OMb) + .def_rw("OMn", &CosmoParams::OMn) + .def_rw("OMk", &CosmoParams::OMk) + .def_rw("OMr", &CosmoParams::OMr) + .def_rw("OMtot", &CosmoParams::OMtot) + .def_rw("Y_He", &CosmoParams::Y_He) + .def_rw("wl", &CosmoParams::wl) + .def_rw("POWER_INDEX", &CosmoParams::POWER_INDEX); + + // Bind SimulationOptions + nb::class_(m, "SimulationOptions") + .def(nb::init<>()) + .def_rw("HII_DIM", &SimulationOptions::HII_DIM) + .def_rw("DIM", &SimulationOptions::DIM) + .def_rw("BOX_LEN", &SimulationOptions::BOX_LEN) + .def_rw("NON_CUBIC_FACTOR", &SimulationOptions::NON_CUBIC_FACTOR) + .def_rw("N_THREADS", &SimulationOptions::N_THREADS) + .def_rw("Z_HEAT_MAX", &SimulationOptions::Z_HEAT_MAX) + .def_rw("ZPRIME_STEP_FACTOR", &SimulationOptions::ZPRIME_STEP_FACTOR) + .def_rw("SAMPLER_MIN_MASS", &SimulationOptions::SAMPLER_MIN_MASS) + .def_rw("SAMPLER_BUFFER_FACTOR", &SimulationOptions::SAMPLER_BUFFER_FACTOR) + .def_rw("N_COND_INTERP", &SimulationOptions::N_COND_INTERP) + .def_rw("N_PROB_INTERP", &SimulationOptions::N_PROB_INTERP) + .def_rw("MIN_LOGPROB", &SimulationOptions::MIN_LOGPROB) + 
.def_rw("HALOMASS_CORRECTION", &SimulationOptions::HALOMASS_CORRECTION) + .def_rw("PARKINSON_G0", &SimulationOptions::PARKINSON_G0) + .def_rw("PARKINSON_y1", &SimulationOptions::PARKINSON_y1) + .def_rw("PARKINSON_y2", &SimulationOptions::PARKINSON_y2) + .def_rw("INITIAL_REDSHIFT", &SimulationOptions::INITIAL_REDSHIFT) + .def_rw("DELTA_R_FACTOR", &SimulationOptions::DELTA_R_FACTOR) + .def_rw("DENSITY_SMOOTH_RADIUS", &SimulationOptions::DENSITY_SMOOTH_RADIUS) + .def_rw("DEXM_OPTIMIZE_MINMASS", &SimulationOptions::DEXM_OPTIMIZE_MINMASS) + .def_rw("DEXM_R_OVERLAP", &SimulationOptions::DEXM_R_OVERLAP) + .def_rw("CORR_STAR", &SimulationOptions::CORR_STAR) + .def_rw("CORR_SFR", &SimulationOptions::CORR_SFR) + .def_rw("CORR_LX", &SimulationOptions::CORR_LX); + + // Bind MatterOptions + nb::class_(m, "MatterOptions") + .def(nb::init<>()) + .def_rw("USE_FFTW_WISDOM", &MatterOptions::USE_FFTW_WISDOM) + .def_rw("HMF", &MatterOptions::HMF) + .def_rw("USE_RELATIVE_VELOCITIES", &MatterOptions::USE_RELATIVE_VELOCITIES) + .def_rw("POWER_SPECTRUM", &MatterOptions::POWER_SPECTRUM) + .def_rw("USE_INTERPOLATION_TABLES", &MatterOptions::USE_INTERPOLATION_TABLES) + .def_rw("PERTURB_ON_HIGH_RES", &MatterOptions::PERTURB_ON_HIGH_RES) + .def_rw("PERTURB_ALGORITHM", &MatterOptions::PERTURB_ALGORITHM) + .def_rw("MINIMIZE_MEMORY", &MatterOptions::MINIMIZE_MEMORY) + .def_rw("KEEP_3D_VELOCITIES", &MatterOptions::KEEP_3D_VELOCITIES) + .def_rw("DEXM_OPTIMIZE", &MatterOptions::DEXM_OPTIMIZE) + .def_rw("FILTER", &MatterOptions::FILTER) + .def_rw("HALO_FILTER", &MatterOptions::HALO_FILTER) + .def_rw("SMOOTH_EVOLVED_DENSITY_FIELD", &MatterOptions::SMOOTH_EVOLVED_DENSITY_FIELD) + .def_rw("USE_HALO_FIELD", &MatterOptions::USE_HALO_FIELD) + .def_rw("HALO_STOCHASTICITY", &MatterOptions::HALO_STOCHASTICITY) + .def_rw("FIXED_HALO_GRIDS", &MatterOptions::FIXED_HALO_GRIDS) + .def_rw("SAMPLE_METHOD", &MatterOptions::SAMPLE_METHOD); + + // Bind AstroParams + nb::class_(m, "AstroParams") + .def(nb::init<>()) + 
.def_rw("HII_EFF_FACTOR", &AstroParams::HII_EFF_FACTOR) + .def_rw("F_STAR10", &AstroParams::F_STAR10) + .def_rw("ALPHA_STAR", &AstroParams::ALPHA_STAR) + .def_rw("ALPHA_STAR_MINI", &AstroParams::ALPHA_STAR_MINI) + .def_rw("SIGMA_STAR", &AstroParams::SIGMA_STAR) + .def_rw("UPPER_STELLAR_TURNOVER_MASS", &AstroParams::UPPER_STELLAR_TURNOVER_MASS) + .def_rw("UPPER_STELLAR_TURNOVER_INDEX", &AstroParams::UPPER_STELLAR_TURNOVER_INDEX) + .def_rw("F_STAR7_MINI", &AstroParams::F_STAR7_MINI) + .def_rw("t_STAR", &AstroParams::t_STAR) + .def_rw("SIGMA_SFR_INDEX", &AstroParams::SIGMA_SFR_INDEX) + .def_rw("SIGMA_SFR_LIM", &AstroParams::SIGMA_SFR_LIM) + .def_rw("L_X", &AstroParams::L_X) + .def_rw("L_X_MINI", &AstroParams::L_X_MINI) + .def_rw("SIGMA_LX", &AstroParams::SIGMA_LX) + .def_rw("F_ESC10", &AstroParams::F_ESC10) + .def_rw("ALPHA_ESC", &AstroParams::ALPHA_ESC) + .def_rw("F_ESC7_MINI", &AstroParams::F_ESC7_MINI) + .def_rw("T_RE", &AstroParams::T_RE) + .def_rw("M_TURN", &AstroParams::M_TURN) + .def_rw("R_BUBBLE_MAX", &AstroParams::R_BUBBLE_MAX) + .def_rw("ION_Tvir_MIN", &AstroParams::ION_Tvir_MIN) + .def_rw("F_H2_SHIELD", &AstroParams::F_H2_SHIELD) + .def_rw("NU_X_THRESH", &AstroParams::NU_X_THRESH) + .def_rw("X_RAY_SPEC_INDEX", &AstroParams::X_RAY_SPEC_INDEX) + .def_rw("X_RAY_Tvir_MIN", &AstroParams::X_RAY_Tvir_MIN) + .def_rw("A_LW", &AstroParams::A_LW) + .def_rw("BETA_LW", &AstroParams::BETA_LW) + .def_rw("A_VCB", &AstroParams::A_VCB) + .def_rw("BETA_VCB", &AstroParams::BETA_VCB) + .def_rw("FIXED_VAVG", &AstroParams::FIXED_VAVG) + .def_rw("POP2_ION", &AstroParams::POP2_ION) + .def_rw("POP3_ION", &AstroParams::POP3_ION) + .def_rw("PHOTONCONS_CALIBRATION_END", &AstroParams::PHOTONCONS_CALIBRATION_END) + .def_rw("CLUMPING_FACTOR", &AstroParams::CLUMPING_FACTOR) + .def_rw("ALPHA_UVB", &AstroParams::ALPHA_UVB) + .def_rw("R_MAX_TS", &AstroParams::R_MAX_TS) + .def_rw("N_STEP_TS", &AstroParams::N_STEP_TS) + .def_rw("DELTA_R_HII_FACTOR", &AstroParams::DELTA_R_HII_FACTOR) + 
.def_rw("R_BUBBLE_MIN", &AstroParams::R_BUBBLE_MIN) + .def_rw("MAX_DVDR", &AstroParams::MAX_DVDR) + .def_rw("NU_X_MAX", &AstroParams::NU_X_MAX) + .def_rw("NU_X_BAND_MAX", &AstroParams::NU_X_BAND_MAX); + + // Bind AstroOptions + nb::class_(m, "AstroOptions") + .def(nb::init<>()) + .def_rw("USE_MINI_HALOS", &AstroOptions::USE_MINI_HALOS) + .def_rw("USE_CMB_HEATING", &AstroOptions::USE_CMB_HEATING) + .def_rw("USE_LYA_HEATING", &AstroOptions::USE_LYA_HEATING) + .def_rw("USE_MASS_DEPENDENT_ZETA", &AstroOptions::USE_MASS_DEPENDENT_ZETA) + .def_rw("INHOMO_RECO", &AstroOptions::INHOMO_RECO) + .def_rw("USE_TS_FLUCT", &AstroOptions::USE_TS_FLUCT) + .def_rw("M_MIN_in_Mass", &AstroOptions::M_MIN_in_Mass) + .def_rw("FIX_VCB_AVG", &AstroOptions::FIX_VCB_AVG) + .def_rw("USE_EXP_FILTER", &AstroOptions::USE_EXP_FILTER) + .def_rw("CELL_RECOMB", &AstroOptions::CELL_RECOMB) + .def_rw("PHOTON_CONS_TYPE", &AstroOptions::PHOTON_CONS_TYPE) + .def_rw("USE_UPPER_STELLAR_TURNOVER", &AstroOptions::USE_UPPER_STELLAR_TURNOVER) + .def_rw("HALO_SCALING_RELATIONS_MEDIAN", &AstroOptions::HALO_SCALING_RELATIONS_MEDIAN) + .def_rw("HII_FILTER", &AstroOptions::HII_FILTER) + .def_rw("HEAT_FILTER", &AstroOptions::HEAT_FILTER) + .def_rw("IONISE_ENTIRE_SPHERE", &AstroOptions::IONISE_ENTIRE_SPHERE) + .def_rw("AVG_BELOW_SAMPLER", &AstroOptions::AVG_BELOW_SAMPLER) + .def_rw("INTEGRATION_METHOD_ATOMIC", &AstroOptions::INTEGRATION_METHOD_ATOMIC) + .def_rw("INTEGRATION_METHOD_MINI", &AstroOptions::INTEGRATION_METHOD_MINI); + + // Bind ConfigSettings + nb::class_(m, "ConfigSettings") + .def(nb::init<>()) + .def_rw("HALO_CATALOG_MEM_FACTOR", &ConfigSettings::HALO_CATALOG_MEM_FACTOR) + .def("set_external_table_path", + [](ConfigSettings& self, const std::string& path) { + strcpy(self.external_table_path, path.c_str()); + }) + .def("get_external_table_path", + [](ConfigSettings& self) { return std::string(self.external_table_path); }) + .def("set_wisdoms_path", + [](ConfigSettings& self, const std::string& path) { + 
strcpy(self.wisdoms_path, path.c_str()); + }) + .def("get_wisdoms_path", + [](ConfigSettings& self) { return std::string(self.wisdoms_path); }); + + // Output Struct Bindings + // Bind InitialConditions + nb::class_(m, "InitialConditions") + .def(nb::init<>()) + .def("set_lowres_density", + [](InitialConditions& self, nb::ndarray array) { + self.lowres_density = array.data(); + }) + .def("set_lowres_vx", [](InitialConditions& self, + nb::ndarray array) { self.lowres_vx = array.data(); }) + .def("set_lowres_vy", [](InitialConditions& self, + nb::ndarray array) { self.lowres_vy = array.data(); }) + .def("set_lowres_vz", [](InitialConditions& self, + nb::ndarray array) { self.lowres_vz = array.data(); }) + .def("set_lowres_vx_2LPT", + [](InitialConditions& self, nb::ndarray array) { + self.lowres_vx_2LPT = array.data(); + }) + .def("set_lowres_vy_2LPT", + [](InitialConditions& self, nb::ndarray array) { + self.lowres_vy_2LPT = array.data(); + }) + .def("set_lowres_vz_2LPT", + [](InitialConditions& self, nb::ndarray array) { + self.lowres_vz_2LPT = array.data(); + }) + .def("set_hires_density", + [](InitialConditions& self, nb::ndarray array) { + self.hires_density = array.data(); + }) + .def("set_hires_vx", [](InitialConditions& self, + nb::ndarray array) { self.hires_vx = array.data(); }) + .def("set_hires_vy", [](InitialConditions& self, + nb::ndarray array) { self.hires_vy = array.data(); }) + .def("set_hires_vz", [](InitialConditions& self, + nb::ndarray array) { self.hires_vz = array.data(); }) + .def("set_hires_vx_2LPT", + [](InitialConditions& self, nb::ndarray array) { + self.hires_vx_2LPT = array.data(); + }) + .def("set_hires_vy_2LPT", + [](InitialConditions& self, nb::ndarray array) { + self.hires_vy_2LPT = array.data(); + }) + .def("set_hires_vz_2LPT", + [](InitialConditions& self, nb::ndarray array) { + self.hires_vz_2LPT = array.data(); + }) + .def("set_lowres_vcb", [](InitialConditions& self, nb::ndarray array) { + self.lowres_vcb = array.data(); + }); 
+ + // Bind PerturbedField + nb::class_(m, "PerturbedField") + .def(nb::init<>()) + .def("set_density", + [](PerturbedField& self, nb::ndarray array) { self.density = array.data(); }) + .def("set_velocity_x", + [](PerturbedField& self, nb::ndarray array) { self.velocity_x = array.data(); }) + .def("set_velocity_y", + [](PerturbedField& self, nb::ndarray array) { self.velocity_y = array.data(); }) + .def("set_velocity_z", [](PerturbedField& self, nb::ndarray array) { + self.velocity_z = array.data(); + }); + + // Bind HaloField + nb::class_(m, "HaloField") + .def(nb::init<>()) + .def_rw("n_halos", &HaloField::n_halos) + .def_rw("buffer_size", &HaloField::buffer_size) + .def("set_halo_masses", + [](HaloField& self, nb::ndarray array) { self.halo_masses = array.data(); }) + .def("set_halo_coords", + [](HaloField& self, nb::ndarray array) { self.halo_coords = array.data(); }) + .def("set_star_rng", + [](HaloField& self, nb::ndarray array) { self.star_rng = array.data(); }) + .def("set_sfr_rng", + [](HaloField& self, nb::ndarray array) { self.sfr_rng = array.data(); }) + .def("set_xray_rng", + [](HaloField& self, nb::ndarray array) { self.xray_rng = array.data(); }); + + // Bind PerturbHaloField + nb::class_(m, "PerturbHaloField") + .def(nb::init<>()) + .def_rw("n_halos", &PerturbHaloField::n_halos) + .def_rw("buffer_size", &PerturbHaloField::buffer_size) + .def("set_halo_masses", [](PerturbHaloField& self, + nb::ndarray array) { self.halo_masses = array.data(); }) + .def("set_halo_coords", [](PerturbHaloField& self, + nb::ndarray array) { self.halo_coords = array.data(); }) + .def("set_star_rng", + [](PerturbHaloField& self, nb::ndarray array) { self.star_rng = array.data(); }) + .def("set_sfr_rng", + [](PerturbHaloField& self, nb::ndarray array) { self.sfr_rng = array.data(); }) + .def("set_xray_rng", [](PerturbHaloField& self, nb::ndarray array) { + self.xray_rng = array.data(); + }); + + // Bind HaloBox + nb::class_(m, "HaloBox") + .def(nb::init<>()) + 
.def("set_halo_mass", + [](HaloBox& self, nb::ndarray array) { self.halo_mass = array.data(); }) + .def("set_halo_stars", + [](HaloBox& self, nb::ndarray array) { self.halo_stars = array.data(); }) + .def("set_halo_stars_mini", + [](HaloBox& self, nb::ndarray array) { self.halo_stars_mini = array.data(); }) + .def("set_count", [](HaloBox& self, nb::ndarray array) { self.count = array.data(); }) + .def("set_n_ion", + [](HaloBox& self, nb::ndarray array) { self.n_ion = array.data(); }) + .def("set_halo_sfr", + [](HaloBox& self, nb::ndarray array) { self.halo_sfr = array.data(); }) + .def("set_halo_xray", + [](HaloBox& self, nb::ndarray array) { self.halo_xray = array.data(); }) + .def("set_halo_sfr_mini", + [](HaloBox& self, nb::ndarray array) { self.halo_sfr_mini = array.data(); }) + .def("set_whalo_sfr", + [](HaloBox& self, nb::ndarray array) { self.whalo_sfr = array.data(); }) + .def_rw("log10_Mcrit_ACG_ave", &HaloBox::log10_Mcrit_ACG_ave) + .def_rw("log10_Mcrit_MCG_ave", &HaloBox::log10_Mcrit_MCG_ave); + + // Bind XraySourceBox + nb::class_(m, "XraySourceBox") + .def(nb::init<>()) + .def("set_filtered_sfr", [](XraySourceBox& self, + nb::ndarray array) { self.filtered_sfr = array.data(); }) + .def("set_filtered_xray", + [](XraySourceBox& self, nb::ndarray array) { + self.filtered_xray = array.data(); + }) + .def("set_filtered_sfr_mini", + [](XraySourceBox& self, nb::ndarray array) { + self.filtered_sfr_mini = array.data(); + }) + .def("set_mean_log10_Mcrit_LW", + [](XraySourceBox& self, nb::ndarray array) { + self.mean_log10_Mcrit_LW = array.data(); + }) + .def("set_mean_sfr", + [](XraySourceBox& self, nb::ndarray array) { self.mean_sfr = array.data(); }) + .def("set_mean_sfr_mini", [](XraySourceBox& self, nb::ndarray array) { + self.mean_sfr_mini = array.data(); + }); + + // Bind TsBox + nb::class_(m, "TsBox") + .def(nb::init<>()) + .def("set_spin_temperature", + [](TsBox& self, nb::ndarray array) { self.spin_temperature = array.data(); }) + 
.def("set_xray_ionised_fraction", + [](TsBox& self, nb::ndarray array) { + self.xray_ionised_fraction = array.data(); + }) + .def( + "set_kinetic_temp_neutral", + [](TsBox& self, nb::ndarray array) { self.kinetic_temp_neutral = array.data(); }) + .def("set_J_21_LW", + [](TsBox& self, nb::ndarray array) { self.J_21_LW = array.data(); }); + + // Bind IonizedBox + nb::class_(m, "IonizedBox") + .def(nb::init<>()) + .def_rw("mean_f_coll", &IonizedBox::mean_f_coll) + .def_rw("mean_f_coll_MINI", &IonizedBox::mean_f_coll_MINI) + .def_rw("log10_Mturnover_ave", &IonizedBox::log10_Mturnover_ave) + .def_rw("log10_Mturnover_MINI_ave", &IonizedBox::log10_Mturnover_MINI_ave) + .def("set_neutral_fraction", + [](IonizedBox& self, nb::ndarray array) { + self.neutral_fraction = array.data(); + }) + .def("set_ionisation_rate_G12", + [](IonizedBox& self, nb::ndarray array) { + self.ionisation_rate_G12 = array.data(); + }) + .def("set_mean_free_path", + [](IonizedBox& self, nb::ndarray array) { self.mean_free_path = array.data(); }) + .def("set_z_reion", + [](IonizedBox& self, nb::ndarray array) { self.z_reion = array.data(); }) + .def("set_cumulative_recombinations", + [](IonizedBox& self, nb::ndarray array) { + self.cumulative_recombinations = array.data(); + }) + .def("set_kinetic_temperature", + [](IonizedBox& self, nb::ndarray array) { + self.kinetic_temperature = array.data(); + }) + .def("set_unnormalised_nion", + [](IonizedBox& self, nb::ndarray array) { + self.unnormalised_nion = array.data(); + }) + .def("set_unnormalised_nion_mini", [](IonizedBox& self, nb::ndarray array) { + self.unnormalised_nion_mini = array.data(); + }); + + // Bind BrightnessTemp + nb::class_(m, "BrightnessTemp") + .def(nb::init<>()) + .def("set_brightness_temp", + [](BrightnessTemp& self, nb::ndarray array) { + self.brightness_temp = array.data(); + }) + .def("set_tau_21", + [](BrightnessTemp& self, nb::ndarray array) { self.tau_21 = array.data(); }); + + // Function Bindings + // OutputStruct COMPUTE 
FUNCTIONS + m.def("ComputeInitialConditions", &ComputeInitialConditions); + m.def("ComputePerturbField", &ComputePerturbField); + m.def("ComputeHaloField", &ComputeHaloField); + m.def("ComputePerturbHaloField", &ComputePerturbHaloField); + m.def("ComputeTsBox", &ComputeTsBox); + m.def("ComputeIonizedBox", &ComputeIonizedBox); + m.def("ComputeBrightnessTemp", &ComputeBrightnessTemp); + m.def("ComputeHaloBox", &ComputeHaloBox); + m.def("UpdateXraySourceBox", &UpdateXraySourceBox); + + // PHOTON CONSERVATION MODEL FUNCTIONS + m.def("InitialisePhotonCons", &InitialisePhotonCons); + m.def("PhotonCons_Calibration", + [](nb::ndarray z_estimate, nb::ndarray xH_estimate) { + int n_spline = z_estimate.size(); + if (xH_estimate.size() != n_spline) { + throw std::runtime_error("Array sizes do not match the specified NSpline."); + } + int status = PhotonCons_Calibration(z_estimate.data(), xH_estimate.data(), n_spline); + if (status != 0) { + throw std::runtime_error("PhotonCons_Calibration failed with status: " + + std::to_string(status)); + } + }); + m.def("ComputeZstart_PhotonCons", [](nb::ndarray zstart) { + if (zstart.size() != 1) { + throw std::runtime_error("zstart array must have size 1."); + } + int status = ComputeZstart_PhotonCons(zstart.data()); + if (status != 0) { + throw std::runtime_error("ComputeZstart_PhotonCons failed with status: " + + std::to_string(status)); + } + }); + m.def("adjust_redshifts_for_photoncons", + [](double z_step_factor, nb::ndarray redshift, nb::ndarray stored_redshift, + nb::ndarray absolute_delta_z) { + adjust_redshifts_for_photoncons(z_step_factor, redshift.data(), + stored_redshift.data(), absolute_delta_z.data()); + }); + m.def("determine_deltaz_for_photoncons", &determine_deltaz_for_photoncons); + m.def("ObtainPhotonConsData", + [](nb::ndarray z_at_Q_data, nb::ndarray Q_data, + nb::ndarray Ndata_analytic, nb::ndarray z_cal_data, + nb::ndarray nf_cal_data, nb::ndarray Ndata_calibration, + nb::ndarray PhotonCons_NFdata, nb::ndarray 
PhotonCons_deltaz, + nb::ndarray Ndata_PhotonCons) { + if (Ndata_analytic.size() != 1 || Ndata_calibration.size() != 1 || + Ndata_PhotonCons.size() != 1) { + throw std::runtime_error( + "Ndata_analytic, Ndata_calibration, and Ndata_PhotonCons must have size 1."); + } + int status = ObtainPhotonConsData( + z_at_Q_data.data(), Q_data.data(), Ndata_analytic.data(), z_cal_data.data(), + nf_cal_data.data(), Ndata_calibration.data(), PhotonCons_NFdata.data(), + PhotonCons_deltaz.data(), Ndata_PhotonCons.data()); + if (status != 0) { + throw std::runtime_error("ObtainPhotonConsData failed with status: " + + std::to_string(status)); + } + }); + m.def("FreePhotonConsMemory", &FreePhotonConsMemory); + m.def("set_alphacons_params", &set_alphacons_params); + + // Non-OutputStruct data products + m.def("ComputeLF", + [](int component, size_t n_bins_mass, nb::ndarray z_LF, nb::ndarray M_TURNs, + nb::ndarray M_uv_z, nb::ndarray M_h_z, nb::ndarray log10phi) { + size_t n_redshifts = z_LF.shape(0); + if (M_h_z.shape(0) != n_redshifts || M_h_z.shape(1) != n_bins_mass || + M_uv_z.shape(0) != n_redshifts || M_uv_z.shape(1) != n_bins_mass || + log10phi.shape(0) != n_redshifts || log10phi.shape(1) != n_bins_mass || + M_TURNs.shape(0) != n_redshifts) { + throw std::runtime_error( + "Array size mismatch: M_h_z shape: " + std::to_string(M_h_z.shape(0)) + "x" + + std::to_string(M_h_z.shape(1)) + ", M_uv_z shape: " + + std::to_string(M_uv_z.shape(0)) + "x" + std::to_string(M_uv_z.shape(1)) + + ", log10phi shape: " + std::to_string(log10phi.shape(0)) + "x" + + std::to_string(log10phi.shape(1)) + + ", M_TURNs shape: " + std::to_string(M_TURNs.shape(0))); + } + ComputeLF(n_bins_mass, component, n_redshifts, z_LF.data(), M_TURNs.data(), + M_h_z.data(), M_uv_z.data(), log10phi.data()); + }); + m.def("ComputeTau", + [](nb::ndarray redshifts, nb::ndarray global_xHI, float z_re_HeII) { + size_t n_redshifts = redshifts.shape(0); + if (global_xHI.shape(0) != n_redshifts) { + throw 
std::runtime_error("XHI array size" + std::to_string(global_xHI.shape(0)) + + "does not match the number of redshifts." + + std::to_string(n_redshifts)); + } + return ComputeTau(n_redshifts, redshifts.data(), global_xHI.data(), z_re_HeII); + }); + + // Initialisation functions needed in the wrapper + m.def("init_ps", &init_ps); + m.def("init_heat", &init_heat); + m.def("CreateFFTWWisdoms", &CreateFFTWWisdoms); + m.def("Broadcast_struct_global_noastro", &Broadcast_struct_global_noastro); + m.def("Broadcast_struct_global_all", &Broadcast_struct_global_all); + m.def("initialiseSigmaMInterpTable", &initialiseSigmaMInterpTable); + m.def("initialise_GL", &initialise_GL); + + // Integration routines + // TODO: it may be a better choice to rewrite integral_wrappers in C++ directly + m.def("get_sigma", [](nb::ndarray mass_values, nb::ndarray sigma_out, + nb::ndarray dsigmasqdm_out) { + size_t n_masses = mass_values.shape(0); + if (sigma_out.shape(0) != n_masses || dsigmasqdm_out.shape(0) != n_masses) { + throw std::runtime_error("Array sizes do not match the number of masses."); + } + get_sigma(n_masses, mass_values.data(), sigma_out.data(), dsigmasqdm_out.data()); + }); + + m.def("get_condition_integrals", + [](double redshift, double z_prev, nb::ndarray cond_values, + nb::ndarray out_n_exp, nb::ndarray out_m_exp) { + size_t n_conditions = cond_values.shape(0); + if (out_n_exp.shape(0) != n_conditions || out_m_exp.shape(0) != n_conditions) { + throw std::runtime_error("Array sizes do not match the number of conditions."); + } + get_condition_integrals(redshift, z_prev, n_conditions, cond_values.data(), + out_n_exp.data(), out_m_exp.data()); + }); + + m.def("get_halo_chmf_interval", + [](double redshift, double z_prev, nb::ndarray cond_values, + nb::ndarray lnM_lo, nb::ndarray lnM_hi, nb::ndarray out_n) { + size_t n_conditions = cond_values.shape(0); + size_t n_masslim = lnM_lo.shape(0); + if (lnM_hi.shape(0) != n_masslim || out_n.shape(0) != n_conditions || + 
out_n.shape(1) != n_masslim) { + throw std::runtime_error("Array sizes do not match the specified dimensions."); + } + get_halo_chmf_interval(redshift, z_prev, n_conditions, cond_values.data(), n_masslim, + lnM_lo.data(), lnM_hi.data(), out_n.data()); + }); + + m.def("get_halomass_at_probability", + [](double redshift, double z_prev, nb::ndarray cond_values, + nb::ndarray probabilities, nb::ndarray out_mass) { + size_t n_conditions = cond_values.shape(0) * cond_values.shape(1); + if (probabilities.shape(0) * probabilities.shape(1) != n_conditions || + out_mass.shape(0) * out_mass.shape(1) != n_conditions) { + throw std::runtime_error("Array sizes do not match the number of conditions."); + } + get_halomass_at_probability(redshift, z_prev, n_conditions, cond_values.data(), + probabilities.data(), out_mass.data()); + }); + + m.def("get_global_SFRD_z", + [](nb::ndarray redshifts, nb::ndarray log10_turnovers_mcg, + nb::ndarray out_sfrd, nb::ndarray out_sfrd_mini) { + size_t n_redshift = redshifts.size(); + if (log10_turnovers_mcg.size() != n_redshift || out_sfrd.size() != n_redshift || + out_sfrd_mini.size() != n_redshift) { + throw std::runtime_error("Array sizes do not match the number of redshifts."); + } + get_global_SFRD_z(n_redshift, redshifts.data(), log10_turnovers_mcg.data(), + out_sfrd.data(), out_sfrd_mini.data()); + }); + + m.def("get_global_Nion_z", + [](nb::ndarray redshifts, nb::ndarray log10_turnovers_mcg, + nb::ndarray out_nion, nb::ndarray out_nion_mini) { + size_t n_redshift = redshifts.size(); + if (log10_turnovers_mcg.size() != n_redshift || out_nion.size() != n_redshift || + out_nion_mini.size() != n_redshift) { + throw std::runtime_error("Array sizes do not match the number of redshifts."); + } + get_global_Nion_z(n_redshift, redshifts.data(), log10_turnovers_mcg.data(), + out_nion.data(), out_nion_mini.data()); + }); + + m.def("get_conditional_FgtrM", + [](double redshift, double R, nb::ndarray densities, + nb::ndarray out_fcoll, nb::ndarray 
out_dfcoll) { + size_t n_densities = densities.size(); + if (out_fcoll.size() != n_densities || out_dfcoll.size() != n_densities) { + throw std::runtime_error("Array sizes do not match the number of densities."); + } + get_conditional_FgtrM(redshift, R, n_densities, densities.data(), out_fcoll.data(), + out_dfcoll.data()); + }); + + m.def("get_conditional_SFRD", [](double redshift, double R, nb::ndarray densities, + nb::ndarray log10_mturns, nb::ndarray out_sfrd, + nb::ndarray out_sfrd_mini) { + size_t n_densities = densities.size(); + if (log10_mturns.size() != n_densities || out_sfrd.size() != n_densities || + out_sfrd_mini.size() != n_densities) { + throw std::runtime_error("Array sizes do not match the number of densities."); + } + get_conditional_SFRD(redshift, R, n_densities, densities.data(), log10_mturns.data(), + out_sfrd.data(), out_sfrd_mini.data()); + }); + + m.def("get_conditional_Nion", [](double redshift, double R, nb::ndarray densities, + nb::ndarray log10_mturns_acg, + nb::ndarray log10_mturns_mcg, + nb::ndarray out_nion, + nb::ndarray out_nion_mini) { + size_t n_densities = densities.size(); + if (log10_mturns_acg.size() != n_densities || log10_mturns_mcg.size() != n_densities || + out_nion.size() != n_densities || out_nion_mini.size() != n_densities) { + throw std::runtime_error("Array sizes do not match the number of densities."); + } + get_conditional_Nion(redshift, R, n_densities, densities.data(), log10_mturns_acg.data(), + log10_mturns_mcg.data(), out_nion.data(), out_nion_mini.data()); + }); + + m.def("get_conditional_Xray", + [](double redshift, double R, nb::ndarray densities, + nb::ndarray log10_mturns, nb::ndarray out_xray) { + size_t n_densities = densities.size(); + if (log10_mturns.size() != n_densities || out_xray.size() != n_densities) { + throw std::runtime_error("Array sizes do not match the number of densities."); + } + get_conditional_Xray(redshift, R, n_densities, densities.data(), log10_mturns.data(), + out_xray.data()); + 
}); + + // Error framework testing + m.def("SomethingThatCatches", &SomethingThatCatches); + m.def("FunctionThatCatches", [](bool sub_func, bool pass, nb::ndarray answer) { + return FunctionThatCatches(sub_func, pass, answer.data()); + }); + m.def("FunctionThatThrows", &FunctionThatThrows); + + m.def("single_test_sample", + [](unsigned long long int seed, nb::ndarray conditions, + nb::ndarray cond_crd, double z_out, double z_in, nb::ndarray out_n_tot, + nb::ndarray out_n_cell, nb::ndarray out_n_exp, + nb::ndarray out_m_cell, nb::ndarray out_m_exp, + nb::ndarray out_halo_masses, nb::ndarray out_halo_coords) { + size_t n_condition = conditions.shape(0); + if (cond_crd.shape(0) != n_condition || cond_crd.shape(1) != 3) { + throw std::runtime_error("cond_crd must have shape (n_condition, 3)."); + } + if (out_n_cell.shape(0) != n_condition || out_n_exp.shape(0) != n_condition || + out_m_cell.shape(0) != n_condition || out_m_exp.shape(0) != n_condition) { + throw std::runtime_error("Output arrays must match the number of conditions."); + } + int status = single_test_sample(seed, n_condition, conditions.data(), cond_crd.data(), + z_out, z_in, out_n_tot.data(), out_n_cell.data(), + out_n_exp.data(), out_m_cell.data(), out_m_exp.data(), + out_halo_masses.data(), out_halo_coords.data()); + if (status != 0) { + throw std::runtime_error("single_test_sample failed with status: " + + std::to_string(status)); + } + }); + + m.def("test_halo_props", [](double redshift, nb::ndarray vcb_grid, + nb::ndarray J21_LW_grid, nb::ndarray z_re_grid, + nb::ndarray Gamma12_ion_grid, nb::ndarray halo_masses, + nb::ndarray halo_coords, nb::ndarray star_rng, + nb::ndarray sfr_rng, nb::ndarray xray_rng, + nb::ndarray halo_props_out) { + size_t n_halos = halo_masses.shape(0); + if (halo_coords.shape(0) != n_halos || halo_coords.shape(1) != 3 || + star_rng.shape(0) != n_halos || sfr_rng.shape(0) != n_halos || + xray_rng.shape(0) != n_halos || halo_props_out.shape(0) != n_halos || + 
halo_props_out.shape(1) != 12) { + throw std::runtime_error( + "Input/output arrays must have the same shape as the number of halos. halo_coords " + "shape: " + + std::to_string(halo_coords.shape(0)) + "x" + std::to_string(halo_coords.shape(1)) + + ", " + "halo_masses shape: " + std::to_string(halo_masses.shape(0)) + ", " + + "star_rng shape: " + std::to_string(star_rng.shape(0)) + ", " + + "sfr_rng shape: " + std::to_string(sfr_rng.shape(0)) + ", " + + "halo_props_out shape: " + std::to_string(halo_props_out.shape(0)) + "x" + + std::to_string(halo_props_out.shape(1))); + } + int status = test_halo_props(redshift, vcb_grid.data(), J21_LW_grid.data(), + z_re_grid.data(), Gamma12_ion_grid.data(), n_halos, + halo_masses.data(), halo_coords.data(), star_rng.data(), + sfr_rng.data(), xray_rng.data(), halo_props_out.data()); + if (status != 0) { + throw std::runtime_error("test_halo_props failed with status: " + + std::to_string(status)); + } + }); + + m.def("test_filter", [](nb::ndarray input_box, double R, double R_param, int filter_flag, + nb::ndarray result) { + size_t n_elements = input_box.size(); + if (result.size() != n_elements) { + throw std::runtime_error("result array must have the same size as input_box."); + } + int status = test_filter(input_box.data(), R, R_param, filter_flag, result.data()); + if (status != 0) { + throw std::runtime_error("test_filter failed with status: " + std::to_string(status)); + } + }); + + // Functions required to access cosmology & mass functions directly + m.def("dicke", &dicke); + m.def("sigma_z0", &sigma_z0); + m.def("dsigmasqdm_z0", &dsigmasqdm_z0); + m.def("power_in_k", &power_in_k); + m.def("get_delta_crit", &get_delta_crit); + m.def("atomic_cooling_threshold", &atomic_cooling_threshold); + m.def("unconditional_hmf", &unconditional_hmf); + m.def("conditional_hmf", &conditional_hmf); + m.def("expected_nhalo", &expected_nhalo); + + m.def( + "get_config_settings", []() -> ConfigSettings& { return config_settings; }, + 
nb::rv_policy::reference); + + m.attr("photon_cons_allocated") = nb::cast(&photon_cons_allocated); +} diff --git a/src/py21cmfast/src/bubble_helper_progs.h b/src/py21cmfast/src/bubble_helper_progs.h index 8ea6087a8..91eedd24f 100644 --- a/src/py21cmfast/src/bubble_helper_progs.h +++ b/src/py21cmfast/src/bubble_helper_progs.h @@ -2,8 +2,14 @@ #ifndef _BUBBLEHELP_H #define _BUBBLEHELP_H +#ifdef __cplusplus +extern "C" { +#endif // NOTE: This file is only used for the old bubble finding algorithm which updates the whole sphere void update_in_sphere(float* box, int dimensions, int dimensions_ncf, float R, float xf, float yf, float zf); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/cosmology.c b/src/py21cmfast/src/cosmology.c index 6e489bfc5..b1cde18aa 100644 --- a/src/py21cmfast/src/cosmology.c +++ b/src/py21cmfast/src/cosmology.c @@ -126,7 +126,7 @@ double transfer_function_CLASS(double k, int flag_int, int flag_dv) { int gsl_status; FILE *F; - static bool warning_printed; + static bool warning_printed = false; static double eh_ratio_at_kmax; char filename[500]; @@ -137,7 +137,6 @@ double transfer_function_CLASS(double k, int flag_int, int flag_dv) { LOG_ERROR("Unable to open file: %s for reading.", filename); Throw(IOError); } - warning_printed = false; int nscans; for (i = 0; i < CLASS_LENGTH; i++) { @@ -190,9 +189,10 @@ double transfer_function_CLASS(double k, int flag_int, int flag_dv) { if (k > kclass[CLASS_LENGTH - 1]) { // k>kmax if (!warning_printed) { LOG_WARNING( - "Called transfer_function_CLASS with k=%f, larger than kmax! performing linear " + "Called transfer_function_CLASS with k=%f > %f, larger than kmax! 
performing " + "linear " "extrapolation with Eisenstein & Hu", - k); + k, kclass[CLASS_LENGTH - 1]); warning_printed = true; } if (flag_dv == 0) { // output is density diff --git a/src/py21cmfast/src/cosmology.h b/src/py21cmfast/src/cosmology.h index 7da11f2df..871ef1fd2 100644 --- a/src/py21cmfast/src/cosmology.h +++ b/src/py21cmfast/src/cosmology.h @@ -1,6 +1,9 @@ #ifndef _PS_H #define _PS_H +#ifdef __cplusplus +extern "C" { +#endif void init_ps(); double dicke(double z); double sigma_z0(double M); @@ -33,4 +36,7 @@ double hubble(float z); double t_hubble(float z); double M_J_WDM(); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/cuda_hello_world.cu b/src/py21cmfast/src/cuda_hello_world.cu new file mode 100644 index 000000000..f63633844 --- /dev/null +++ b/src/py21cmfast/src/cuda_hello_world.cu @@ -0,0 +1,31 @@ +#include +#include + +#include "cuda_utils.cuh" +#include "cuda_hello_world.cuh" + +__global__ void hello_kernel() { + printf("Hello World from GPU! BlockIdx: %d, ThreadIdx: %d\n", blockIdx.x, threadIdx.x); +} + +int call_cuda() { + hello_kernel<<<3, 3>>>(); + cudaDeviceSynchronize(); + return 0; +} + +// more members of deviceprop can be found in cura_runtime_api documentation +void print_key_device_properties(){ + int device; + CALL_CUDA(cudaGetDevice(&device)); + cudaDeviceProp deviceProp; + CALL_CUDA(cudaGetDeviceProperties(&deviceProp, device)); + printf("Device name: %s\n", deviceProp.name); + printf("Total global memory: %zu bytes \n", deviceProp.totalGlobalMem); + printf("Shared memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock); + printf("Registers per block: %d\n", deviceProp.regsPerBlock); + printf("Warp size: %d \n", deviceProp.warpSize); + printf("Memory pitch: %zu bytes \n", deviceProp.memPitch); + printf("Max threads per block: %d \n", deviceProp.maxThreadsPerBlock); + printf("Total constant memory: %zu bytes \n", deviceProp.totalConstMem); +} diff --git a/src/py21cmfast/src/cuda_hello_world.cuh 
b/src/py21cmfast/src/cuda_hello_world.cuh new file mode 100644 index 000000000..5a34921db --- /dev/null +++ b/src/py21cmfast/src/cuda_hello_world.cuh @@ -0,0 +1,14 @@ +#ifndef _CUDA_HELLO_WORLD_CUH +#define _CUDA_HELLO_WORLD_CUH + +#ifdef __cplusplus +extern "C" +{ +#endif + int call_cuda(); + void print_key_device_properties(); +#ifdef __cplusplus +} +#endif + +#endif // _CUDA_HELLO_WORLD_CUH diff --git a/src/py21cmfast/src/cuda_utils.cuh b/src/py21cmfast/src/cuda_utils.cuh new file mode 100644 index 000000000..f2e992651 --- /dev/null +++ b/src/py21cmfast/src/cuda_utils.cuh @@ -0,0 +1,18 @@ +#ifndef _CUDA_UTILS_CUH +#define _CUDA_UTILS_CUH + +#include +#include + +#define CALL_CUDA(x) \ + do \ + { \ + cudaError_t err = (x); \ + if (err != cudaSuccess) \ + { \ + printf("Error %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#endif diff --git a/src/py21cmfast/src/debugging.c b/src/py21cmfast/src/debugging.c index 71f4f0ea3..afeb5d794 100644 --- a/src/py21cmfast/src/debugging.c +++ b/src/py21cmfast/src/debugging.c @@ -147,8 +147,7 @@ void writeAstroParams(AstroParams *p) { " HII_EFF_FACTOR=%10.3e\n" " ION_Tvir_MIN=%10.3e\n" " X_RAY_Tvir_MIN=%10.3e\n", - p->HII_EFF_FACTOR, p->ION_Tvir_MIN, p->X_RAY_Tvir_MIN, p->R_BUBBLE_MAX, p->L_X, - p->NU_X_THRESH, p->X_RAY_SPEC_INDEX, p->F_STAR10, p->t_STAR); + p->HII_EFF_FACTOR, p->ION_Tvir_MIN, p->X_RAY_Tvir_MIN); } void writeAstroOptions(AstroOptions *p) { diff --git a/src/py21cmfast/src/debugging.h b/src/py21cmfast/src/debugging.h index c0c876257..bce8deb97 100644 --- a/src/py21cmfast/src/debugging.h +++ b/src/py21cmfast/src/debugging.h @@ -7,6 +7,9 @@ #include "InputParameters.h" #include "OutputStructs.h" +#ifdef __cplusplus +extern "C" { +#endif // Input debugging void writeAstroOptions(AstroOptions *p); void writeSimulationOptions(SimulationOptions *p); @@ -26,4 +29,7 @@ int SomethingThatCatches(bool sub_func); int FunctionThatCatches(bool sub_func, bool pass, 
double *result); void FunctionThatThrows(); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/device_rng.cu b/src/py21cmfast/src/device_rng.cu new file mode 100644 index 000000000..eda652e39 --- /dev/null +++ b/src/py21cmfast/src/device_rng.cu @@ -0,0 +1,83 @@ +#include +#include +#include +#include + +#include "cuda_utils.cuh" +#include "device_rng.cuh" + +__device__ curandState *d_randStates = nullptr; +__device__ int d_numStates = 0; + +// initiate random states +// use the same random seed, different sub-sequence, and with offset of 0 +__global__ void initRandStates(unsigned long long int random_seed, int totalStates) +{ + // get thread idx + int ind = blockIdx.x * blockDim.x + threadIdx.x; + + if (ind < totalStates){ + curand_init(random_seed, ind, 0, &d_randStates[ind]); + + // todo: add the following block to debug + if (ind < 2) + { + printf("temp check rng init.\n"); + printf("Thread %d: d = %u, v0 = %u, boxmuller_flag = %d, boxmuller_extra = %f\n", + ind, d_randStates[ind].d, d_randStates[ind].v[0], + d_randStates[ind].boxmuller_flag, d_randStates[ind].boxmuller_extra); + } + } +} + +// Function to initialize RNG states. 
+void init_rand_states(unsigned long long int seed, int numStates) +{ + // ensure previously allocated random states on the device are freed before allocating new ones + free_rand_states(); + + CALL_CUDA(cudaMemcpyToSymbol(d_numStates, &numStates, sizeof(int), 0, cudaMemcpyHostToDevice)); + + // todo: add the following block to debug + curandState *checkPtr0 = nullptr; + CALL_CUDA(cudaMemcpyFromSymbol(&checkPtr0, d_randStates, sizeof(checkPtr0), 0, cudaMemcpyDeviceToHost)); + printf("init device pointer = %p\n", checkPtr0); + + curandState *tmpPtr = nullptr; + CALL_CUDA(cudaMalloc((void **)&tmpPtr, numStates * sizeof(curandState))); + CALL_CUDA(cudaMemcpyToSymbol(d_randStates, &tmpPtr, sizeof(tmpPtr), 0, cudaMemcpyHostToDevice)); + tmpPtr = nullptr; + + // todo: add the following block to debug (verify device pointer has been updated successfully) + curandState *checkPtr = nullptr; + CALL_CUDA(cudaMemcpyFromSymbol(&checkPtr, d_randStates, sizeof(checkPtr), 0, cudaMemcpyDeviceToHost)); + printf("updated device pointer = %p\n", checkPtr); + + // define kernel grids + int threadsPerBlock = 256; + int blocks = (numStates + threadsPerBlock - 1) / threadsPerBlock; + + // launch kernel function + initRandStates<<>>(seed, numStates); + CALL_CUDA(cudaGetLastError()); + cudaDeviceSynchronize(); +} + +void free_rand_states() +{ + // copy device pointer/variable to the host + curandState *h_randStates = nullptr; + int h_numStates = 0; + CALL_CUDA(cudaMemcpyFromSymbol(&h_randStates, d_randStates, sizeof(d_randStates), 0, cudaMemcpyDeviceToHost)); + CALL_CUDA(cudaMemcpyFromSymbol(&h_numStates, d_numStates, sizeof(int), 0, cudaMemcpyDeviceToHost)); + if (h_randStates){ + CALL_CUDA(cudaFree(h_randStates)); + h_randStates = nullptr; + CALL_CUDA(cudaMemcpyToSymbol(d_randStates, &h_randStates, sizeof(h_randStates), 0, cudaMemcpyHostToDevice)); + } + + if (h_numStates){ + h_numStates = 0; + CALL_CUDA(cudaMemcpyToSymbol(d_numStates, &h_numStates, sizeof(int), 0, 
cudaMemcpyHostToDevice)); + } +} diff --git a/src/py21cmfast/src/device_rng.cuh b/src/py21cmfast/src/device_rng.cuh new file mode 100644 index 000000000..4d2b449d8 --- /dev/null +++ b/src/py21cmfast/src/device_rng.cuh @@ -0,0 +1,24 @@ +#ifndef _DEVICE_RNG_CUH +#define _DEVICE_RNG_CUH + +#ifdef __CUDACC__ +#include +// Declare the device variables as extern so that they can be shared across CUDA files. +extern __device__ curandState *d_randStates; +extern __device__ int d_numStates; +#endif + + +#ifdef __cplusplus +extern "C" +{ +#endif + // Function prototypes. + void init_rand_states(unsigned long long int seed, int numStates); + void free_rand_states(); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/py21cmfast/src/dft.h b/src/py21cmfast/src/dft.h index f8e977b68..e8f0b2afd 100644 --- a/src/py21cmfast/src/dft.h +++ b/src/py21cmfast/src/dft.h @@ -8,8 +8,14 @@ #include "InputParameters.h" +#ifdef __cplusplus +extern "C" { +#endif int dft_c2r_cube(bool use_wisdom, int dim, int dim_los, int n_threads, fftwf_complex *box); int dft_r2c_cube(bool use_wisdom, int dim, int dim_los, int n_threads, fftwf_complex *box); int CreateFFTWWisdoms(); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/elec_interp.h b/src/py21cmfast/src/elec_interp.h index 815998b7c..dfed888ad 100644 --- a/src/py21cmfast/src/elec_interp.h +++ b/src/py21cmfast/src/elec_interp.h @@ -5,6 +5,10 @@ #define x_int_NXHII 14 #define x_int_NENERGY 258 +#ifdef __cplusplus +extern "C" { +#endif + void initialize_interp_arrays(); // Primary functions to compute heating fractions and number of Lya photons or ionization produced, @@ -26,4 +30,7 @@ int locate_xHII_index(float xHII_call); // TODO: remove it and make it static in elec_interp.c extern float x_int_XHII[x_int_NXHII]; +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/exceptions.h b/src/py21cmfast/src/exceptions.h index d951d6cae..fcd62dd54 100644 --- a/src/py21cmfast/src/exceptions.h +++ 
b/src/py21cmfast/src/exceptions.h @@ -3,6 +3,9 @@ #include "cexcept.h" +#ifdef __cplusplus +extern "C" { +#endif define_exception_type(int); // NOTE: declaration here, definition in debugging.c @@ -19,11 +22,21 @@ extern struct exception_context the_exception_context[1]; #define InfinityorNaNError 7 #define MassDepZetaError 8 #define MemoryAllocError 9 +#define CUDAError 10 +#define ParallelError 11 #define CATCH_GSL_ERROR(status) \ if (status > 0) { \ LOG_ERROR("GSL Error Encountered (Code = %d): %s", status, gsl_strerror(status)); \ Throw(GSLError); \ } +#define CATCH_CUDA_ERROR(err) \ + if (err != cudaSuccess) { \ + LOG_ERROR("CUDA Error Encountered: %s", cudaGetErrorString(err)); \ + Throw(CUDAError); \ + } +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/filtering.c b/src/py21cmfast/src/filtering.c index 408e94ad4..ae17c9c92 100644 --- a/src/py21cmfast/src/filtering.c +++ b/src/py21cmfast/src/filtering.c @@ -1,3 +1,4 @@ +#include "filtering.h" #include #include @@ -115,7 +116,7 @@ double spherical_shell_filter(double k, double R_outer, double R_inner) { (sin(kR_outer) - cos(kR_outer) * kR_outer - sin(kR_inner) + cos(kR_inner) * kR_inner); } -void filter_box(fftwf_complex *box, int RES, int filter_type, float R, float R_param) { +void filter_box_cpu(fftwf_complex *box, int RES, int filter_type, float R, float R_param) { int dimension, midpoint; // TODO: figure out why defining as ULL breaks this switch (RES) { case 0: @@ -167,28 +168,23 @@ void filter_box(fftwf_complex *box, int RES, int filter_type, float R, float R_p grid_index = RES == 1 ? 
HII_C_INDEX(n_x, n_y, n_z) : C_INDEX(n_x, n_y, n_z); // TODO: it would be nice to combine these into the filter_function call, *but* - // since - // each can take different arguments more thought is needed + // since each can take different arguments more thought is needed if (filter_type == 0) { // real space top-hat kR = sqrt(k_mag_sq) * R; box[grid_index] *= real_tophat_filter(kR); } else if (filter_type == 1) { // k-space top hat - // NOTE: why was this commented???? - // This is actually (kR^2) but since we zero the value and find kR > 1 this - // is more computationally efficient kR = 0.17103765852*( k_x*k_x + k_y*k_y - // + k_z*k_z )*R*R; + // NOTE: Since it's a tophat we could just supply kr^2 for speed kR = sqrt(k_mag_sq) * R; box[grid_index] *= sharp_k_filter(kR); } else if (filter_type == 2) { // gaussian - // This is actually (kR^2) but since we zero the value and find kR > 1 this - // is more computationally efficient + // NOTE: This is actually (kR^2) kR = k_mag_sq * R * R; box[grid_index] *= gaussian_filter(kR); } // The next two filters are not given by the HII_FILTER global, but used for // specific grids - else if (filter_type == - 3) { // exponentially decaying tophat, param == scale of decay (MFP) + // exponentially decaying tophat, param == scale of decay (MFP) + else if (filter_type == 3) { // NOTE: This should be optimized, I havne't looked at it in a while box[grid_index] *= exp_mfp_filter(sqrt(k_mag_sq), R, R_param, R_const); } else if (filter_type == 4) { // spherical shell, R_param == inner radius @@ -206,8 +202,21 @@ void filter_box(fftwf_complex *box, int RES, int filter_type, float R, float R_p return; } +void filter_box(fftwf_complex *box, int RES, int filter_type, float R, float R_param) { + bool use_cuda = false; // pass this as a parameter later + if (use_cuda) { +#if CUDA_FOUND + filter_box_gpu(box, RES, filter_type, R, R_param); +#else + LOG_ERROR("CUDA version of filter_box() called but code was not compiled for CUDA."); +#endif 
+ } else { + filter_box_cpu(box, RES, filter_type, R, R_param); + } +} + // Test function to filter a box without computing a whole output box -int test_filter(float *input_box, double R, double R_param, int filter_flag, double *result) { +int test_filter_cpu(float *input_box, double R, double R_param, int filter_flag, double *result) { int i, j, k; unsigned long long int ii; @@ -232,7 +241,7 @@ int test_filter(float *input_box, double R, double R_param, int filter_flag, dou memcpy(box_filtered, box_unfiltered, sizeof(fftwf_complex) * HII_KSPACE_NUM_PIXELS); - filter_box(box_filtered, 1, filter_flag, R, R_param); + filter_box_cpu(box_filtered, 1, filter_flag, R, R_param); dft_c2r_cube(matter_options_global->USE_FFTW_WISDOM, simulation_options_global->HII_DIM, HII_D_PARA, simulation_options_global->N_THREADS, box_filtered); @@ -247,3 +256,17 @@ int test_filter(float *input_box, double R, double R_param, int filter_flag, dou return 0; } + +int test_filter(float *input_box, double R, double R_param, int filter_flag, double *result) { + bool use_cuda = false; // pass this as a parameter later + if (use_cuda) { +#if CUDA_FOUND + return test_filter_gpu(input_box, R, R_param, filter_flag, result); +#else + LOG_ERROR("CUDA version of test_filter() called but code was not compiled for CUDA."); + return 1; +#endif + } else { + return test_filter_cpu(input_box, R, R_param, filter_flag, result); + } +} diff --git a/src/py21cmfast/src/filtering.cu b/src/py21cmfast/src/filtering.cu new file mode 100644 index 000000000..fb4a3b859 --- /dev/null +++ b/src/py21cmfast/src/filtering.cu @@ -0,0 +1,255 @@ +#include +#include +#include +#include +#include +#include + +// GPU +#include +#include +#include +// #include +// #include + +#include "cexcept.h" +#include "exceptions.h" +#include "logger.h" + +#include "Constants.h" +#include "InputParameters.h" +#include "indexing.h" +#include "dft.h" +#include "filtering.h" + +__device__ inline double real_tophat_filter(double kR) { + // Second 
order taylor expansion around kR==0 + if (kR < 1e-4) + return 1 - kR*kR/10; + return 3.0*pow(kR, -3) * (sin(kR) - cos(kR)*kR); +} + +__device__ inline double sharp_k_filter(double kR) { + if (kR * 0.413566994 > 1) + return 0.; + return 1; +} + +__device__ inline double gaussian_filter(double kR_squared) { + return exp(-0.643 * 0.643 * kR_squared / 2.); +} + +__device__ inline double exp_mfp_filter(double k, double R, double mfp, double exp_term) { + double f; + double kR = k * R; + double ratio = mfp / R; + + // Second order taylor expansion around kR==0 + if (kR < 1e-4) { + double ts_0 = 6 * pow(ratio, 3) - exp_term * (6 * pow(ratio, 3) + 6 * pow(ratio, 2) + 3 * ratio); + return ts_0 + (exp_term * (2 * pow(ratio, 2) + 0.5 * ratio) - 2 * ts_0 * pow(ratio, 2)) * kR * kR; + } + // Davies & Furlanetto MFP-eps(r) window function + f = (kR * kR * pow(ratio, 2) + 2 * ratio + 1) * ratio * cos(kR); + f += (kR * kR * (pow(ratio, 2) - pow(ratio, 3)) + ratio + 1) * sin(kR) / kR; + f *= exp_term; + f -= 2 * pow(ratio, 2); + f *= -3 * ratio/pow(pow(kR * ratio, 2) + 1, 2); + return f; +} + +__device__ inline double spherical_shell_filter(double k, double R_outer, double R_inner) { + double kR_inner = k * R_inner; + double kR_outer = k * R_outer; + + // Second order taylor expansion around kR_outer==0 + if (kR_outer < 1e-4) + return 1. 
- kR_outer*kR_outer / 10 * \ + (pow(R_inner / R_outer, 5) - 1) / \ + (pow(R_inner / R_outer, 3) - 1); + + return 3.0 / (pow(kR_outer, 3) - pow(kR_inner, 3)) \ + * (sin(kR_outer) - cos(kR_outer) * kR_outer \ + - sin(kR_inner) + cos(kR_inner) * kR_inner); +} + +__global__ void filter_box_kernel(cuFloatComplex *box, int num_pixels, int dimension, int midpoint, int midpoint_para, double delta_k, float R, float R_param, double R_const, int filter_type) { + + // Get index of box (flattened k-box) + unsigned long long idx = blockIdx.x * blockDim.x + threadIdx.x; + + // Bound check (in case number of threads != multiple of block size) + if (idx >= num_pixels) { + return; + } + // Compute the 3D indices (n_x, n_y, n_z) for the k-box from the flattened index (idx) + // Based on convenience macros in indexing.h + int n_z = idx % (midpoint_para + 1); + unsigned long long remaining = idx / (midpoint_para + 1); + int n_y = remaining % dimension; + int n_x = remaining / dimension; + + // Compute wave vector components + float k_x = (n_x - dimension * (n_x > midpoint)) * delta_k; // Wrap around midpoint + float k_y = (n_y - dimension * (n_y > midpoint)) * delta_k; + float k_z = n_z * delta_k; + + // TODO: Try alternative vectorised coords & wave vector components? + // int *cell_coords = (int[]) {idx % (midpoint_para + 1), (idx / (midpoint_para + 1)) % dimension, (idx / (midpoint_para + 1)) / dimension)}; // (as above and * delta_k to vector at end) + // int *wave_vector = (float[]) { ... 
} + + // Compute squared magnitude of wave vector + float k_mag_sq = k_x*k_x + k_y*k_y + k_z*k_z; + + float kR; + if (filter_type == 0) { // real space top-hat + kR = sqrt(k_mag_sq) * R; + // box[idx] *= real_tophat_filter(kR); + box[idx] = cuCmulf(box[idx], make_cuFloatComplex((float)real_tophat_filter(kR), 0.f)); + } + else if (filter_type == 1) { // k-space top hat + kR = sqrt(k_mag_sq) * R; + // box[idx] *= sharp_k_filter(kR); + box[idx] = cuCmulf(box[idx], make_cuFloatComplex((float)sharp_k_filter(kR), 0.f)); + } + else if (filter_type == 2) { // gaussian + kR = k_mag_sq * R * R; + // box[idx] *= gaussian_filter(kR); + box[idx] = cuCmulf(box[idx], make_cuFloatComplex((float)gaussian_filter(kR), 0.f)); + } + else if (filter_type == 3) { // exponentially decaying tophat + // box[idx] *= exp_mfp_filter(sqrt(k_mag_sq), R, R_param, R_const); + box[idx] = cuCmulf(box[idx], make_cuFloatComplex((float)exp_mfp_filter(sqrt(k_mag_sq), R, R_param, R_const), 0.f)); + } + else if (filter_type == 4) { //spherical shell + // box[idx] *= spherical_shell_filter(sqrt(k_mag_sq), R, R_param); + box[idx] = cuCmulf(box[idx], make_cuFloatComplex((float)spherical_shell_filter(sqrt(k_mag_sq), R, R_param), 0.f)); + } +} + +void filter_box_gpu(fftwf_complex *box, int RES, int filter_type, float R, float R_param) { + + // Check for valid filter type + if (filter_type < 0 || filter_type > 4) { + LOG_WARNING("Filter type %i is undefined. 
Box is unfiltered.", filter_type); + return; + } + + // Get required values + int dimension, midpoint, midpoint_para, num_pixels; + switch(RES) { + case 0: + dimension = user_params_global->DIM; + midpoint = MIDDLE; // midpoint of x,y = DIM / 2 + midpoint_para = MID_PARA; // midpoint of z = NON_CUBIC_FACTOR * HII_DIM / 2 + num_pixels = KSPACE_NUM_PIXELS; + break; + case 1: + dimension = user_params_global->HII_DIM; + midpoint = HII_MIDDLE; // midpoint of x,y = HII_DIM / 2 + midpoint_para = HII_MID_PARA; // midpoint of z = NON_CUBIC_FACTOR * HII_DIM / 2 + num_pixels = HII_KSPACE_NUM_PIXELS; + break; + default: + LOG_ERROR("Resolution for filter functions must be 0(DIM) or 1(HII_DIM)"); + Throw(ValueError); + break; + } + double delta_k = DELTA_K; + double R_const; + if (filter_type == 3) { + R_const = exp(-R / R_param); + } + + // Get size of flattened array + size_t size = num_pixels * sizeof(fftwf_complex); + + cudaError_t err; + + // Allocate device memory + fftwf_complex* d_box; + err = cudaMalloc(&d_box, size); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Copy array from host to device + err = cudaMemcpy(d_box, box, size, cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Invoke kernel + int threadsPerBlock = 256; + int numBlocks = (num_pixels + threadsPerBlock - 1) / threadsPerBlock; + // d_box must be cast to cuFloatComplex (from fftwf_complex) for CUDA + filter_box_kernel<<>>(reinterpret_cast(d_box), num_pixels, dimension, midpoint, midpoint_para, delta_k, R, R_param, R_const, filter_type); + + // // Only use during development! 
+ err = cudaDeviceSynchronize(); + CATCH_CUDA_ERROR(err); + + err = cudaGetLastError(); + if (err != cudaSuccess) { + LOG_ERROR("Kernel launch error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Copy results from device to host + err = cudaMemcpy(box, d_box, size, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Deallocate device memory + err = cudaFree(d_box); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } +} + +// Test function to filter a box without computing a whole output box +//TODO: set device constants here +int test_filter_gpu(float *input_box, double R, double R_param, int filter_flag, double *result) { + int i,j,k; + unsigned long long int ii; + + //setup the box + fftwf_complex *box_unfiltered = (fftwf_complex *) fftwf_malloc(sizeof(fftwf_complex)*HII_KSPACE_NUM_PIXELS); + fftwf_complex *box_filtered = (fftwf_complex *) fftwf_malloc(sizeof(fftwf_complex)*HII_KSPACE_NUM_PIXELS); + + for (i=0; iHII_DIM; i++) + for (j=0; jHII_DIM; j++) + for (k=0; kUSE_FFTW_WISDOM, user_params->HII_DIM, HII_D_PARA, user_params->N_THREADS, box_unfiltered); + + // Convert to CUDA complex type + cuFloatComplex* box_unfiltered_cu = reinterpret_cast(box_unfiltered); + + for(ii=0;iiUSE_FFTW_WISDOM, user_params->HII_DIM, HII_D_PARA, user_params->N_THREADS, box_filtered); + + for (i=0; iHII_DIM; i++) + for (j=0; jHII_DIM; j++) + for (k=0; k #include -#include "InputParameters.h" +#ifdef __cplusplus +extern "C" { +#endif void filter_box(fftwf_complex *box, int RES, int filter_type, float R, float R_param); +void filter_box_cpu(fftwf_complex *box, int RES, int filter_type, float R, float R_param); +void filter_box_gpu(fftwf_complex *box, int RES, int filter_type, float R, float R_param); int test_filter(float *input_box, double R, double R_param, int filter_flag, double *result); +int test_filter_cpu(float 
*input_box, double R, double R_param, int filter_flag, double *result); +int test_filter_gpu(float *input_box, double R, double R_param, int filter_flag, double *result); double filter_function(double k, int filter_type); double dwdm_filter(double k, double R, int filter_type); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/heating_helper_progs.c b/src/py21cmfast/src/heating_helper_progs.c index 1552c1747..17bff67ea 100644 --- a/src/py21cmfast/src/heating_helper_progs.c +++ b/src/py21cmfast/src/heating_helper_progs.c @@ -857,7 +857,7 @@ typedef struct { double ion_eff; double ion_eff_MINI; double log10_Mturn_MINI; - struct ScalingConstants *scale_consts; + ScalingConstants *scale_consts; } tauX_params; double tauX_integrand_MINI(double zhat, void *params) { @@ -919,8 +919,7 @@ double tauX_integrand(double zhat, void *params) { return drpropdz * n * HI_filling_factor_zhat * sigma_tilde; } double tauX_MINI(double nu, double x_e, double x_e_ave, double zp, double zpp, - double HI_filling_factor_zp, double log10_Mturn_MINI, - struct ScalingConstants *sc) { + double HI_filling_factor_zp, double log10_Mturn_MINI, ScalingConstants *sc) { double result, error; gsl_function F; @@ -963,7 +962,7 @@ double tauX_MINI(double nu, double x_e, double x_e_ave, double zp, double zpp, } double tauX(double nu, double x_e, double x_e_ave, double zp, double zpp, - double HI_filling_factor_zp, struct ScalingConstants *sc) { + double HI_filling_factor_zp, ScalingConstants *sc) { double result, error, fcoll; gsl_function F; double rel_tol = 0.005; //<- relative tolerance @@ -1026,7 +1025,7 @@ typedef struct { double zpp; double HI_filling_factor_zp; double log10_Mturn_MINI; - struct ScalingConstants *scale_consts; + ScalingConstants *scale_consts; } nu_tau_one_params; double nu_tau_one_helper_MINI(double nu, void *params) { nu_tau_one_params *p = (nu_tau_one_params *)params; @@ -1039,7 +1038,7 @@ double nu_tau_one_helper(double nu, void *params) { return tauX(nu, 
p->x_e, p->x_e, p->zp, p->zpp, p->HI_filling_factor_zp, p->scale_consts) - 1; } double nu_tau_one_MINI(double zp, double zpp, double x_e, double HI_filling_factor_zp, - double log10_Mturn_MINI, struct ScalingConstants *sc) { + double log10_Mturn_MINI, ScalingConstants *sc) { int status, iter, max_iter; const gsl_root_fsolver_type *T; gsl_root_fsolver *s; @@ -1107,7 +1106,7 @@ double nu_tau_one_MINI(double zp, double zpp, double x_e, double HI_filling_fact } double nu_tau_one(double zp, double zpp, double x_e, double HI_filling_factor_zp, - struct ScalingConstants *sc) { + ScalingConstants *sc) { int status, iter, max_iter; const gsl_root_fsolver_type *T; gsl_root_fsolver *s; diff --git a/src/py21cmfast/src/heating_helper_progs.h b/src/py21cmfast/src/heating_helper_progs.h index cbd0f1c08..97b806a8e 100644 --- a/src/py21cmfast/src/heating_helper_progs.h +++ b/src/py21cmfast/src/heating_helper_progs.h @@ -1,6 +1,9 @@ #ifndef _HEATHELPER_H #define _HEATHELPER_H +#ifdef __cplusplus +extern "C" { +#endif #include "scaling_relations.h" // * initialization routine * // @@ -45,11 +48,14 @@ double Energy_Lya_heating(double Tk, double Ts, double tau_gp, int flag); // rootfind to get the distance at which GP optical depth tau==1 double nu_tau_one_MINI(double zp, double zpp, double x_e, double HI_filling_factor_zp, - double log10_Mturn_MINI, struct ScalingConstants *sc); + double log10_Mturn_MINI, ScalingConstants *sc); double nu_tau_one(double zp, double zpp, double x_e, double HI_filling_factor_zp, - struct ScalingConstants *sc); + ScalingConstants *sc); // xray heating integrals over frequency double integrate_over_nu(double zp, double local_x_e, double lower_int_limit, int FLAG); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/hmf.c b/src/py21cmfast/src/hmf.c index ccac26017..4ec577e49 100644 --- a/src/py21cmfast/src/hmf.c +++ b/src/py21cmfast/src/hmf.c @@ -842,7 +842,7 @@ double Fcoll_General(double z, double lnM_min, double lnM_max) { } double 
Nion_General(double z, double lnM_Min, double lnM_Max, double MassTurnover, - struct ScalingConstants *sc) { + ScalingConstants *sc) { struct parameters_gsl_MF_integrals params = { .redshift = z, .growthf = dicke(z), @@ -860,7 +860,7 @@ double Nion_General(double z, double lnM_Min, double lnM_Max, double MassTurnove } double Nion_General_MINI(double z, double lnM_Min, double lnM_Max, double MassTurnover, - struct ScalingConstants *sc) { + ScalingConstants *sc) { struct parameters_gsl_MF_integrals params = { .redshift = z, .growthf = dicke(z), @@ -879,7 +879,7 @@ double Nion_General_MINI(double z, double lnM_Min, double lnM_Max, double MassTu } double Xray_General(double z, double lnM_Min, double lnM_Max, double mturn_acg, double mturn_mcg, - struct ScalingConstants *sc) { + ScalingConstants *sc) { // NOTE:in the _General functions, we don't use the scaling relation constants // that are z-dependent so we can evaluate them at multiple redshifts without redoing the // constants @@ -954,7 +954,7 @@ double Mcoll_Conditional(double growthf, double lnM1, double lnM2, double lnM_co double Nion_ConditionalM_MINI(double growthf, double lnM1, double lnM2, double lnM_cond, double sigma2, double delta2, double MassTurnover, - struct ScalingConstants *sc, int method) { + ScalingConstants *sc, int method) { struct parameters_gsl_MF_integrals params = { .growthf = growthf, .Mturn_mcg = MassTurnover, @@ -992,8 +992,7 @@ double Nion_ConditionalM_MINI(double growthf, double lnM1, double lnM2, double l } double Nion_ConditionalM(double growthf, double lnM1, double lnM2, double lnM_cond, double sigma2, - double delta2, double MassTurnover, struct ScalingConstants *sc, - int method) { + double delta2, double MassTurnover, ScalingConstants *sc, int method) { struct parameters_gsl_MF_integrals params = { .growthf = growthf, .Mturn_acg = MassTurnover, @@ -1029,7 +1028,7 @@ double Nion_ConditionalM(double growthf, double lnM1, double lnM2, double lnM_co double Xray_ConditionalM(double 
redshift, double growthf, double lnM1, double lnM2, double lnM_cond, double sigma2, double delta2, double mturn_acg, double mturn_mcg, - struct ScalingConstants *sc, int method) { + ScalingConstants *sc, int method) { // re-using escape fraction for minihalo parameters struct parameters_gsl_MF_integrals params = { .redshift = redshift, diff --git a/src/py21cmfast/src/hmf.cu b/src/py21cmfast/src/hmf.cu new file mode 100644 index 000000000..379a90a62 --- /dev/null +++ b/src/py21cmfast/src/hmf.cu @@ -0,0 +1,21 @@ +#include +#include + +#include "Constants.h" +#include "hmf.cuh" + +__device__ double sheth_delc_fixed(double del, double sig) +{ + return sqrt(JENKINS_a) * del * (1. + JENKINS_b * pow(sig * sig / (JENKINS_a * del * del), JENKINS_c)); +} + +// Get the relevant excursion set barrier density given the user-specified HMF +__device__ double get_delta_crit(int HMF, double sigma, double growthf) +{ + if (HMF == 4) + return DELTAC_DELOS; + if (HMF == 1) + return sheth_delc_fixed(Deltac / growthf, sigma) * growthf; + + return Deltac; +} diff --git a/src/py21cmfast/src/hmf.cuh b/src/py21cmfast/src/hmf.cuh new file mode 100644 index 000000000..84316a5ef --- /dev/null +++ b/src/py21cmfast/src/hmf.cuh @@ -0,0 +1,24 @@ +#include + +#ifndef _HMF_CUH +#define _HMF_CUH + +// define macros +#ifndef JENKINS_a +#define JENKINS_a (0.73) // Jenkins+01, SMT has 0.707 +#endif + +#ifndef JENKINS_b +#define JENKINS_b (0.34) // Jenkins+01 fit from Barkana+01, SMT has 0.5 +#endif + +#ifndef JENKINS_c +#define JENKINS_c (0.81) // Jenkins+01 from from Barkana+01, SMT has 0.6 +#endif + +// #ifdef __CUDA_ARCH__ +__device__ double sheth_delc_fixed(double del, double sig); +__device__ double get_delta_crit(int HMF, double sigma, double growthf); +// #endif + +#endif diff --git a/src/py21cmfast/src/hmf.h b/src/py21cmfast/src/hmf.h index 57796e592..3db969bb4 100644 --- a/src/py21cmfast/src/hmf.h +++ b/src/py21cmfast/src/hmf.h @@ -5,6 +5,10 @@ #include "scaling_relations.h" // integrals 
+#ifdef __cplusplus +extern "C" { +#endif + #define MAX_DELTAC_FRAC (float)0.99 // max delta/deltac for the mass function integrals #define DELTA_MIN -1 // minimum delta for Lagrangian mass function integrals #define M_MIN_INTEGRAL 1e5 @@ -14,11 +18,11 @@ void initialise_GL(double lnM_Min, double lnM_Max); double Nion_General(double z, double lnM_Min, double lnM_Max, double MassTurnover, - struct ScalingConstants *sc); + ScalingConstants *sc); double Nion_General_MINI(double z, double lnM_Min, double lnM_Max, double MassTurnover, - struct ScalingConstants *sc); + ScalingConstants *sc); double Xray_General(double z, double lnM_Min, double lnM_Max, double mturn_acg, double mturn_mcg, - struct ScalingConstants *sc); + ScalingConstants *sc); double Fcoll_General(double z, double lnM_min, double lnM_max); double Nhalo_General(double z, double lnM_min, double lnM_max); @@ -28,13 +32,12 @@ double Mcoll_Conditional(double growthf, double lnM1, double lnM2, double lnM_co double delta, int method); double Nion_ConditionalM_MINI(double growthf, double lnM1, double lnM2, double lnM_cond, double sigma2, double delta2, double MassTurnover, - struct ScalingConstants *sc, int method); + ScalingConstants *sc, int method); double Nion_ConditionalM(double growthf, double lnM1, double lnM2, double lnM_cond, double sigma2, - double delta2, double MassTurnover, struct ScalingConstants *sc, - int method); + double delta2, double MassTurnover, ScalingConstants *sc, int method); double Xray_ConditionalM(double redshift, double growthf, double lnM1, double lnM2, double lnM_cond, double sigma2, double delta2, double mturn_acg, double mturn_mcg, - struct ScalingConstants *sc, int method); + ScalingConstants *sc, int method); double unconditional_hmf(double growthf, double lnM, double z, int HMF); double conditional_hmf(double growthf, double lnM, double delta_cond, double sigma_cond, int HMF); @@ -54,4 +57,7 @@ double sheth_delc_dexm(double del, double sig); float Mass_limit_bisection(float 
Mmin, float Mmax, float PL, float FRAC); double euler_to_lagrangian_delta(double delta); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/integral_wrappers.c b/src/py21cmfast/src/integral_wrappers.c index 4481dc98a..a25f5c759 100644 --- a/src/py21cmfast/src/integral_wrappers.c +++ b/src/py21cmfast/src/integral_wrappers.c @@ -116,7 +116,7 @@ void get_global_SFRD_z(int n_redshift, double *redshifts, double *log10_turnover if (matter_options_global->USE_INTERPOLATION_TABLES > 0) initialiseSigmaMInterpTable(M_min, 1e20); - struct ScalingConstants sc; + ScalingConstants sc; set_scaling_constants(redshifts[0], &sc, false); int i; @@ -146,7 +146,7 @@ void get_global_Nion_z(int n_redshift, double *redshifts, double *log10_turnover if (matter_options_global->USE_INTERPOLATION_TABLES > 0) initialiseSigmaMInterpTable(M_min, 1e20); - struct ScalingConstants sc; + ScalingConstants sc; set_scaling_constants(redshifts[0], &sc, false); int i; @@ -217,7 +217,7 @@ void get_conditional_SFRD(double redshift, double R, int n_densities, double *de astro_options_global->INTEGRATION_METHOD_MINI == 1)) initialise_GL(log(M_min), log(M_cond)); - struct ScalingConstants sc; + ScalingConstants sc; set_scaling_constants(redshift, &sc, false); int i; @@ -260,7 +260,7 @@ void get_conditional_Nion(double redshift, double R, int n_densities, double *de astro_options_global->INTEGRATION_METHOD_MINI == 1)) initialise_GL(log(M_min), log(M_cond)); - struct ScalingConstants sc; + ScalingConstants sc; set_scaling_constants(redshift, &sc, false); int i; @@ -315,7 +315,7 @@ void get_conditional_Xray(double redshift, double R, int n_densities, double *de astro_options_global->INTEGRATION_METHOD_MINI == 1)) initialise_GL(log(M_min), log(M_cond)); - struct ScalingConstants sc; + ScalingConstants sc; set_scaling_constants(redshift, &sc, false); int i; diff --git a/src/py21cmfast/src/interp_tables.c b/src/py21cmfast/src/interp_tables.c index 88452b10e..185f502c5 100644 --- 
a/src/py21cmfast/src/interp_tables.c +++ b/src/py21cmfast/src/interp_tables.c @@ -43,9 +43,8 @@ static RGTable2D SFRD_z_table_MINI = {.allocated = false}; static RGTable2D Nion_z_table_MINI = {.allocated = false}; static RGTable2D Xray_z_table_2D = {.allocated = false}; // TODO: SFRD tables assume no reionisation feedback, this is self-inconsistent, but probably okay -// given -// it's used (mostly) in the SpinTemperature, which deals with neutral regions -// Will overestimate integral component of SFRD lightcones used in observation +// given it's used (mostly) in the SpinTemperature, which deals with neutral regions +// Will overestimate integral component of SFRD lightcones used in observation static RGTable1D_f SFRD_conditional_table = {.allocated = false}; static RGTable1D_f Nion_conditional_table1D = {.allocated = false}; static RGTable2D_f Nion_conditional_table2D = {.allocated = false}; @@ -92,7 +91,7 @@ static RGTable1D_f dSigmasqdm_InterpTable = { // NOTE: this table is initialised for up to N_redshift x N_Mturn, but only called N_filter times to // assign ST_over_PS in Spintemp. 
// It may be better to just do the integrals at each R -void initialise_SFRD_spline(int Nbin, float zmin, float zmax, struct ScalingConstants *sc) { +void initialise_SFRD_spline(int Nbin, float zmin, float zmax, ScalingConstants *sc) { int i, j; double Mmax = M_MAX_INTEGRAL; double lnMmax = log(Mmax); @@ -117,7 +116,7 @@ void initialise_SFRD_spline(int Nbin, float zmin, float zmax, struct ScalingCons #pragma omp parallel private(i, j) num_threads(simulation_options_global -> N_THREADS) { - struct ScalingConstants sc_sfrd; + ScalingConstants sc_sfrd; sc_sfrd = evolve_scaling_constants_sfr(sc); double mturn_mcg; double lnMmin; @@ -159,7 +158,7 @@ void initialise_SFRD_spline(int Nbin, float zmin, float zmax, struct ScalingCons // Unlike the SFRD spline, this one is used more due to the nu_tau_one() rootfind // although still ignores reionisation feedback -void initialise_Nion_Ts_spline(int Nbin, float zmin, float zmax, struct ScalingConstants *sc) { +void initialise_Nion_Ts_spline(int Nbin, float zmin, float zmax, ScalingConstants *sc) { int i, j; double Mmax = M_MAX_INTEGRAL; double lnMmax = log(Mmax); @@ -183,7 +182,7 @@ void initialise_Nion_Ts_spline(int Nbin, float zmin, float zmax, struct ScalingC #pragma omp parallel private(i, j) num_threads(simulation_options_global -> N_THREADS) { - struct ScalingConstants sc_z; + ScalingConstants sc_z; double mturn_mcg; double z_val; double lnMmin; @@ -290,7 +289,7 @@ void initialise_Nion_Conditional_spline(double z, double min_density, double max double Mmin, double Mmax, double Mcond, double log10Mturn_min, double log10Mturn_max, double log10Mturn_min_MINI, double log10Mturn_max_MINI, - struct ScalingConstants *sc, bool prev) { + ScalingConstants *sc, bool prev) { int i, j; double overdense_table[NDELTA]; double mturns[NMTURN], mturns_MINI[NMTURN]; @@ -412,7 +411,7 @@ void initialise_Nion_Conditional_spline(double z, double min_density, double max // This function initialises one table, for table Rx arrays I will call this 
function in a loop void initialise_SFRD_Conditional_table(double z, double min_density, double max_density, double Mmin, double Mmax, double Mcond, - struct ScalingConstants *sc) { + ScalingConstants *sc) { float sigma2; int i, k; @@ -449,7 +448,7 @@ void initialise_SFRD_Conditional_table(double z, double min_density, double max_ SFRD_conditional_table_MINI.y_width = (LOG10_MTURN_MAX - LOG10_MTURN_MIN) / (NMTURN - 1.); } - struct ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); + ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); #pragma omp parallel private(i, k) num_threads(simulation_options_global -> N_THREADS) { @@ -494,7 +493,7 @@ void initialise_SFRD_Conditional_table(double z, double min_density, double max_ // This function initialises one table, for table Rx arrays I will call this function in a loop void initialise_Xray_Conditional_table(double redshift, double min_density, double max_density, double Mmin, double Mmax, double Mcond, - struct ScalingConstants *sc) { + ScalingConstants *sc) { int i, k; LOG_SUPER_DEBUG("Initialising Xray conditional table at mass %.2e from delta %.2e to %.2e", @@ -884,7 +883,7 @@ void free_global_tables() { // JD: moving the interp table evaluations here since some of them are needed in nu_tau_one // NOTE: with !USE_MASS_DEPENDENT_ZETA both EvaluateNionTs and EvaluateSFRD return Fcoll -double EvaluateNionTs(double redshift, struct ScalingConstants *sc) { +double EvaluateNionTs(double redshift, ScalingConstants *sc) { // differences in turnover are handled by table setup if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { if (astro_options_global->USE_MASS_DEPENDENT_ZETA) @@ -898,7 +897,7 @@ double EvaluateNionTs(double redshift, struct ScalingConstants *sc) { double lnMmin = log(minimum_source_mass(redshift, true)); double lnMmax = log(M_MAX_INTEGRAL); - struct ScalingConstants sc_z = evolve_scaling_constants_to_redshift(redshift, sc, false); + ScalingConstants sc_z = 
evolve_scaling_constants_to_redshift(redshift, sc, false); // minihalos uses a different turnover mass if (astro_options_global->USE_MASS_DEPENDENT_ZETA) @@ -907,19 +906,18 @@ double EvaluateNionTs(double redshift, struct ScalingConstants *sc) { return Fcoll_General(redshift, lnMmin, lnMmax); } -double EvaluateNionTs_MINI(double redshift, double log10_Mturn_LW_ave, - struct ScalingConstants *sc) { +double EvaluateNionTs_MINI(double redshift, double log10_Mturn_LW_ave, ScalingConstants *sc) { if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { return EvaluateRGTable2D(redshift, log10_Mturn_LW_ave, &Nion_z_table_MINI); } double lnMmin = log(minimum_source_mass(redshift, true)); double lnMmax = log(M_MAX_INTEGRAL); - struct ScalingConstants sc_z = evolve_scaling_constants_to_redshift(redshift, sc, false); + ScalingConstants sc_z = evolve_scaling_constants_to_redshift(redshift, sc, false); return Nion_General_MINI(redshift, lnMmin, lnMmax, pow(10., log10_Mturn_LW_ave), &sc_z); } -double EvaluateSFRD(double redshift, struct ScalingConstants *sc) { +double EvaluateSFRD(double redshift, ScalingConstants *sc) { if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { if (astro_options_global->USE_MASS_DEPENDENT_ZETA) return EvaluateRGTable1D(redshift, &SFRD_z_table); @@ -934,7 +932,7 @@ double EvaluateSFRD(double redshift, struct ScalingConstants *sc) { // The SFRD calls the same function as N_ion but sets escape fractions to unity // NOTE: since this only occurs on integration, the struct copy shouldn't be a bottleneck - struct ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); + ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); sc_sfrd = evolve_scaling_constants_to_redshift(redshift, &sc_sfrd, false); if (astro_options_global->USE_MASS_DEPENDENT_ZETA) @@ -942,7 +940,7 @@ double EvaluateSFRD(double redshift, struct ScalingConstants *sc) { return Fcoll_General(redshift, lnMmin, lnMmax); } -double EvaluateSFRD_MINI(double redshift, double 
log10_Mturn_LW_ave, struct ScalingConstants *sc) { +double EvaluateSFRD_MINI(double redshift, double log10_Mturn_LW_ave, ScalingConstants *sc) { if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { return EvaluateRGTable2D(redshift, log10_Mturn_LW_ave, &SFRD_z_table_MINI); } @@ -950,19 +948,19 @@ double EvaluateSFRD_MINI(double redshift, double log10_Mturn_LW_ave, struct Scal double lnMmin = log(minimum_source_mass(redshift, true)); double lnMmax = log(M_MAX_INTEGRAL); - struct ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); + ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); sc_sfrd = evolve_scaling_constants_to_redshift(redshift, &sc_sfrd, false); return Nion_General_MINI(redshift, lnMmin, lnMmax, pow(10., log10_Mturn_LW_ave), &sc_sfrd); } double EvaluateSFRD_Conditional(double delta, double growthf, double M_min, double M_max, - double M_cond, double sigma_max, struct ScalingConstants *sc) { + double M_cond, double sigma_max, ScalingConstants *sc) { if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { return exp(EvaluateRGTable1D_f(delta, &SFRD_conditional_table)); } - struct ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); + ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); // SFRD in Ts assumes no (reion) feedback on ACG return Nion_ConditionalM(growthf, log(M_min), log(M_max), log(M_cond), sigma_max, delta, sc_sfrd.mturn_a_nofb, &sc_sfrd, @@ -971,20 +969,20 @@ double EvaluateSFRD_Conditional(double delta, double growthf, double M_min, doub double EvaluateSFRD_Conditional_MINI(double delta, double log10Mturn_m, double growthf, double M_min, double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc) { + ScalingConstants *sc) { if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { return exp(EvaluateRGTable2D_f(delta, log10Mturn_m, &SFRD_conditional_table_MINI)); } - struct ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); + ScalingConstants sc_sfrd = 
evolve_scaling_constants_sfr(sc); return Nion_ConditionalM_MINI(growthf, log(M_min), log(M_max), log(M_cond), sigma_max, delta, pow(10, log10Mturn_m), &sc_sfrd, astro_options_global->INTEGRATION_METHOD_MINI); } double EvaluateNion_Conditional(double delta, double log10Mturn, double growthf, double M_min, - double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc, bool prev) { + double M_max, double M_cond, double sigma_max, ScalingConstants *sc, + bool prev) { RGTable2D_f *table = prev ? &Nion_conditional_table_prev : &Nion_conditional_table2D; if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { if (astro_options_global->USE_MINI_HALOS) @@ -1001,7 +999,7 @@ double EvaluateNion_Conditional(double delta, double log10Mturn, double growthf, double EvaluateNion_Conditional_MINI(double delta, double log10Mturn_m, double growthf, double M_min, double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc, bool prev) { + ScalingConstants *sc, bool prev) { RGTable2D_f *table = prev ? &Nion_conditional_table_MINI_prev : &Nion_conditional_table_MINI; if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { return exp(EvaluateRGTable2D_f(delta, log10Mturn_m, table)); @@ -1014,7 +1012,7 @@ double EvaluateNion_Conditional_MINI(double delta, double log10Mturn_m, double g double EvaluateXray_Conditional(double delta, double log10Mturn_m, double redshift, double growthf, double M_min, double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc) { + ScalingConstants *sc) { if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { if (astro_options_global->USE_MINI_HALOS) return exp(EvaluateRGTable2D_f(delta, log10Mturn_m, &Xray_conditional_table_2D)); @@ -1183,3 +1181,34 @@ double EvaluatedSigmasqdm(double lnM) { } return dsigmasqdm_z0(exp(lnM)); } + +// Accessor function for the GPU SpinTemp kernel to access table. 
+RGTable1D_f *get_SFRD_conditional_table(void) { return &SFRD_conditional_table; } + +// Accessor function for the GPU Ionisation kernel to access table. +RGTable1D_f *get_Nion_conditional_table1D(void) { return &Nion_conditional_table1D; } + +// Accessor function for GPU memory allocation functions to access nbins. +int get_nbins(void) { return NDELTA; } + +// todo: only return when it's been initialized +RGTable1D *GetNhaloTable() { + printf("The number of bins: %d; x_min: %f\n", Nhalo_table.n_bin, Nhalo_table.x_min); + return &Nhalo_table; +} + +RGTable1D *GetMcollTable() { + printf("The number of bins: %d; x_min: %f\n", Mcoll_table.n_bin, Mcoll_table.x_min); + return &Mcoll_table; +} + +RGTable2D *GetNhaloInvTable() { + printf("The number of nx bins: %d; the number of ny bins: %d \n", Nhalo_inv_table.nx_bin, + Nhalo_inv_table.ny_bin); + return &Nhalo_inv_table; +} + +RGTable1D_f *GetSigmaInterpTable() { + printf("The number of bins: %d; x_min: %f\n", Sigma_InterpTable.n_bin, Sigma_InterpTable.x_min); + return &Sigma_InterpTable; +} diff --git a/src/py21cmfast/src/interp_tables.cu b/src/py21cmfast/src/interp_tables.cu new file mode 100644 index 000000000..1f0702af5 --- /dev/null +++ b/src/py21cmfast/src/interp_tables.cu @@ -0,0 +1,157 @@ +#include + +#include +// #include + +// #include "InputParameters.h" +#include "interpolation_types.h" + +#include "cuda_utils.cuh" +#include "interp_tables.cuh" +#include "DeviceConstants.cuh" + +#include "interpolation.cu" + +// define relevant variables stored in constant memory +__constant__ RGTable1D d_Nhalo_table; +__constant__ RGTable1D d_Mcoll_table; +__constant__ RGTable2D d_Nhalo_inv_table; + +// specify a max size of yarr +const int device_n_max = 200; +__constant__ double d_Nhalo_yarr[device_n_max]; +__constant__ double d_Mcoll_yarr[device_n_max]; + + +// copy tables to gpu +void copyTablesToDevice(RGTable1D h_Nhalo_table, RGTable1D h_Mcoll_table, RGTable2D h_Nhalo_inv_table) +{ + // copy Nhalo table and its member 
y_arr + size_t size_Nhalo_yarr = sizeof(double) * h_Nhalo_table.n_bin; + // get a copy of the Nhalo table + RGTable1D h_Nhalo_table_to_device = h_Nhalo_table; + if (h_Nhalo_table.n_bin > device_n_max){ + // double *d_Nhalo_yarr; + // todo: declare device yarr (not using constant) + return; + } + else{ + CALL_CUDA(cudaMemcpyToSymbol(d_Nhalo_yarr, h_Nhalo_table.y_arr, size_Nhalo_yarr, 0, cudaMemcpyHostToDevice)); + // get memory address on the device + double *d_Nhalo_yarr_device; + CALL_CUDA(cudaGetSymbolAddress((void **)&d_Nhalo_yarr_device, d_Nhalo_yarr)); + + h_Nhalo_table_to_device.y_arr = d_Nhalo_yarr_device; + } + CALL_CUDA(cudaMemcpyToSymbol(d_Nhalo_table, &h_Nhalo_table_to_device, sizeof(RGTable1D), 0, cudaMemcpyHostToDevice)); + + // copy Mcoll table and its member y_arr + size_t size_Mcoll_yarr = sizeof(double) * h_Mcoll_table.n_bin; + // get a copy of Mcoll table + RGTable1D h_Mcoll_table_to_device = h_Mcoll_table; + if (h_Mcoll_table.n_bin > device_n_max){ + return; + } + else{ + CALL_CUDA(cudaMemcpyToSymbol(d_Mcoll_yarr, h_Mcoll_table.y_arr, size_Mcoll_yarr, 0, cudaMemcpyHostToDevice)); + // get memory address on the device + double *d_Mcoll_yarr_device; + CALL_CUDA(cudaGetSymbolAddress((void **)&d_Mcoll_yarr_device, d_Mcoll_yarr)); + + h_Mcoll_table_to_device.y_arr = d_Mcoll_yarr_device; + } + CALL_CUDA(cudaMemcpyToSymbol(d_Mcoll_table, &h_Mcoll_table_to_device, sizeof(RGTable1D), 0, cudaMemcpyHostToDevice)); + + // copy Nhalo_inv table and its member flatten_data + size_t size_Nhalo_inv_flatten_data = sizeof(double) * h_Nhalo_inv_table.nx_bin * h_Nhalo_inv_table.ny_bin; + // get a copy of Nhalo_inv_table + RGTable2D h_Nhalo_inv_table_to_device = h_Nhalo_inv_table; + + double *d_Nhalo_flatten_data; + CALL_CUDA(cudaMalloc(&d_Nhalo_flatten_data, size_Nhalo_inv_flatten_data)); + CALL_CUDA(cudaMemcpy(d_Nhalo_flatten_data, h_Nhalo_inv_table.flatten_data, size_Nhalo_inv_flatten_data, cudaMemcpyHostToDevice)); + + double **d_z_arr, **z_arr_to_device; + size_t 
size_z_arr = sizeof(double *) * h_Nhalo_inv_table.nx_bin; + CALL_CUDA(cudaHostAlloc((void **)&z_arr_to_device, size_z_arr, cudaHostAllocDefault)); + // get the address of flatten data on the device + int i; + for (i=0;i= n_bin - 1) + { + return 0.0; // Out-of-bounds handling + } + + double table_val = x_min + x_width * (float)idx; + double interp_point = (x - table_val) / x_width; + + return y_arr[idx] * (1 - interp_point) + y_arr[idx + 1] * (interp_point); +} + +__device__ double extrapolate_dNdM_inverse(double condition, double lnp) +{ + double x_min = d_Nhalo_inv_table.x_min; + double x_width = d_Nhalo_inv_table.x_width; + // printf("condition: %f; lnp: %f \n", condition, lnp); //tmp + int x_idx = (int)floor((condition - x_min) / x_width); + double x_table = x_min + x_idx * x_width; + double interp_point_x = (condition - x_table) / x_width; + + double extrap_point_y = (lnp - d_user_params.MIN_LOGPROB) / d_Nhalo_inv_table.y_width; + + // find the log-mass at the edge of the table for this condition + double xlimit = d_Nhalo_inv_table.z_arr[x_idx][0] * (interp_point_x) + d_Nhalo_inv_table.z_arr[x_idx + 1][0] * (1 - interp_point_x); + double xlimit_m1 = d_Nhalo_inv_table.z_arr[x_idx][1] * (interp_point_x) + d_Nhalo_inv_table.z_arr[x_idx + 1][1] * (1 - interp_point_x); + + double result = xlimit + (xlimit_m1 - xlimit) * (extrap_point_y); + + return result; +} + +__device__ double EvaluateNhaloInv(double condition, double prob) +{ + if (prob == 0.) 
+ return 1.; // q == 1 -> condition mass + double lnp = log(prob); + if (lnp < d_user_params.MIN_LOGPROB) + return extrapolate_dNdM_inverse(condition, lnp); + return EvaluateRGTable2D(condition, lnp, &d_Nhalo_inv_table); +} + +__device__ double EvaluateMcoll(double condition, double growthf, double lnMmin, double lnMmax, double M_cond, double sigma, double delta) +{ + if (d_user_params.USE_INTERPOLATION_TABLES) + return EvaluateRGTable1D(condition, &d_Mcoll_table); + // todo: implement Mcoll_Conditional + return 0; +} + +__device__ double EvaluateNhalo(double condition, double growthf, double lnMmin, double lnMmax, double M_cond, double sigma, double delta) +{ + if (d_user_params.USE_INTERPOLATION_TABLES) + return EvaluateRGTable1D(condition, &d_Nhalo_table); + // todo: implement Nhalo_Conditional + return 0; +} diff --git a/src/py21cmfast/src/interp_tables.cuh b/src/py21cmfast/src/interp_tables.cuh new file mode 100644 index 000000000..8616f691e --- /dev/null +++ b/src/py21cmfast/src/interp_tables.cuh @@ -0,0 +1,23 @@ +#ifndef _INTERP_TABLES_CUH +#define _INTERP_TABLES_CUH + +#include "interpolation_types.h" + +#ifdef __CUDA_ARCH__ +__device__ double EvaluateSigma(float x, double x_min, double x_width, float *y_arr, int n_bin); +__device__ double extrapolate_dNdM_inverse(double condition, double lnp); +__device__ double EvaluateNhaloInv(double condition, double prob); +__device__ double EvaluateMcoll(double condition, double growthf, double lnMmin, double lnMmax, double M_cond, double sigma, double delta); +__device__ double EvaluateNhalo(double condition, double growthf, double lnMmin, double lnMmax, double M_cond, double sigma, double delta); +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + void copyTablesToDevice(RGTable1D h_Nhalo_table, RGTable1D h_Mcoll_table, RGTable2D h_Nhalo_inv_table); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/py21cmfast/src/interp_tables.h b/src/py21cmfast/src/interp_tables.h index 0fe068a49..56d98ea5b 100644 --- 
a/src/py21cmfast/src/interp_tables.h +++ b/src/py21cmfast/src/interp_tables.h @@ -2,19 +2,23 @@ #define _INTERP_TABLES_H #include "InputParameters.h" +#include "interpolation.h" #include "scaling_relations.h" // Functions within interp_tables.c need the parameter structures, but we don't want to pass them // all down the chain, so we broadcast them -// TODO: in future it would be better to use a context struct. See `HaloBox.c` -void initialise_SFRD_spline(int Nbin, float zmin, float zmax, struct ScalingConstants *sc); -double EvaluateSFRD(double redshift, struct ScalingConstants *sc); -double EvaluateSFRD_MINI(double redshift, double log10_Mturn_LW_ave, struct ScalingConstants *sc); +#ifdef __cplusplus +extern "C" { +#endif + +void initialise_SFRD_spline(int Nbin, float zmin, float zmax, ScalingConstants *sc); +double EvaluateSFRD(double redshift, ScalingConstants *sc); +double EvaluateSFRD_MINI(double redshift, double log10_Mturn_LW_ave, ScalingConstants *sc); -void initialise_Nion_Ts_spline(int Nbin, float zmin, float zmax, struct ScalingConstants *sc); -double EvaluateNionTs(double redshift, struct ScalingConstants *sc); -double EvaluateNionTs_MINI(double redshift, double log10_Mturn_LW_ave, struct ScalingConstants *sc); +void initialise_Nion_Ts_spline(int Nbin, float zmin, float zmax, ScalingConstants *sc); +double EvaluateNionTs(double redshift, ScalingConstants *sc); +double EvaluateNionTs_MINI(double redshift, double log10_Mturn_LW_ave, ScalingConstants *sc); void initialise_FgtrM_delta_table(double min_dens, double max_dens, double zpp, double growth_zpp, double smin_zpp, double smax_zpp); @@ -27,27 +31,27 @@ void initialise_Nion_Conditional_spline(double z, double min_density, double max double Mmin, double Mmax, double Mcond, double log10Mturn_min, double log10Mturn_max, double log10Mturn_min_MINI, double log10Mturn_max_MINI, - struct ScalingConstants *sc, bool prev); + ScalingConstants *sc, bool prev); double EvaluateNion_Conditional(double delta, double 
log10Mturn, double growthf, double M_min, - double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc, bool prev); + double M_max, double M_cond, double sigma_max, ScalingConstants *sc, + bool prev); double EvaluateNion_Conditional_MINI(double delta, double log10Mturn_m, double growthf, double M_min, double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc, bool prev); + ScalingConstants *sc, bool prev); void initialise_Xray_Conditional_table(double redshift, double min_density, double max_density, double Mmin, double Mmax, double Mcond, - struct ScalingConstants *sc); + ScalingConstants *sc); double EvaluateXray_Conditional(double delta, double log10Mturn_m, double redshift, double growthf, double M_min, double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc); + ScalingConstants *sc); void initialise_SFRD_Conditional_table(double z, double min_density, double max_density, double Mmin, double Mmax, double Mcond, - struct ScalingConstants *sc); + ScalingConstants *sc); double EvaluateSFRD_Conditional(double delta, double growthf, double M_min, double M_max, - double M_cond, double sigma_max, struct ScalingConstants *sc); + double M_cond, double sigma_max, ScalingConstants *sc); double EvaluateSFRD_Conditional_MINI(double delta, double log10Mturn_m, double growthf, double M_min, double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc); + ScalingConstants *sc); void initialise_dNdM_tables(double xmin, double xmax, double ymin, double ymax, double growth1, double param, bool from_catalog); @@ -75,4 +79,17 @@ void free_conditional_tables(); void free_global_tables(); void free_dNdM_tables(); +RGTable1D_f *get_SFRD_conditional_table(void); +RGTable1D_f *get_Nion_conditional_table1D(void); +int get_nbins(void); + +RGTable1D *GetNhaloTable(); +RGTable1D *GetMcollTable(); +RGTable2D *GetNhaloInvTable(); +RGTable1D_f *GetSigmaInterpTable(); + +#ifdef __cplusplus +} #endif + +#endif 
//_INTERP_TABLES_H diff --git a/src/py21cmfast/src/interpolation.c b/src/py21cmfast/src/interpolation.c index 180ab1836..6f4810d92 100644 --- a/src/py21cmfast/src/interpolation.c +++ b/src/py21cmfast/src/interpolation.c @@ -43,10 +43,12 @@ void allocate_RGTable2D(int n_x, int n_y, RGTable2D *ptr) { ptr->nx_bin = n_x; ptr->ny_bin = n_y; + ptr->flatten_data = (double *)calloc(n_x * n_y, sizeof(double)); ptr->z_arr = calloc(n_x, sizeof(double *)); for (i = 0; i < n_x; i++) { - ptr->z_arr[i] = calloc(n_y, sizeof(double)); + ptr->z_arr[i] = &ptr->flatten_data[i * n_y]; } + ptr->allocated = true; } @@ -74,7 +76,7 @@ void free_RGTable2D_f(RGTable2D_f *ptr) { void free_RGTable2D(RGTable2D *ptr) { int i; if (ptr->allocated) { - for (i = 0; i < ptr->nx_bin; i++) free(ptr->z_arr[i]); + free(ptr->flatten_data); free(ptr->z_arr); ptr->allocated = false; } diff --git a/src/py21cmfast/src/interpolation.cu b/src/py21cmfast/src/interpolation.cu new file mode 100644 index 000000000..761ea6b94 --- /dev/null +++ b/src/py21cmfast/src/interpolation.cu @@ -0,0 +1,42 @@ +#include + +#include "interpolation.cuh" + +__device__ double EvaluateRGTable1D(double x, RGTable1D *table) +{ + double x_min = table->x_min; + double x_width = table->x_width; + int idx = (int)floor((x - x_min) / x_width); + double table_val = x_min + x_width * (double)idx; + double interp_point = (x - table_val) / x_width; + + // a + f(a-b) is one fewer operation but less precise + double result = table->y_arr[idx] * (1 - interp_point) + table->y_arr[idx + 1] * (interp_point); + + return result; +} + +__device__ double EvaluateRGTable2D(double x, double y, RGTable2D *table) +{ + double x_min = table->x_min; + double x_width = table->x_width; + double y_min = table->y_min; + double y_width = table->y_width; + int x_idx = (int)floor((x - x_min) / x_width); + int y_idx = (int)floor((y - y_min) / y_width); + + double x_table = x_min + x_width * (double)x_idx; + double y_table = y_min + y_width * (double)y_idx; + + double 
interp_point_x = (x - x_table) / x_width; + double interp_point_y = (y - y_table) / y_width; + + double left_edge, right_edge, result; + + left_edge = table->z_arr[x_idx][y_idx] * (1 - interp_point_y) + table->z_arr[x_idx][y_idx + 1] * (interp_point_y); + right_edge = table->z_arr[x_idx + 1][y_idx] * (1 - interp_point_y) + table->z_arr[x_idx + 1][y_idx + 1] * (interp_point_y); + + result = left_edge * (1 - interp_point_x) + right_edge * (interp_point_x); + + return result; +} diff --git a/src/py21cmfast/src/interpolation.cuh b/src/py21cmfast/src/interpolation.cuh new file mode 100644 index 000000000..2ea3bffb7 --- /dev/null +++ b/src/py21cmfast/src/interpolation.cuh @@ -0,0 +1,14 @@ +#ifndef _INTERPOLATION_CUH +#define _INTERPOLATION_CUH + +#include +#include "interpolation_types.h" + +#ifdef __CUDA_ARCH__ + +__device__ double EvaluateRGTable1D(double x, RGTable1D *table); +__device__ double EvaluateRGTable2D(double x, double y, RGTable2D *table); + +#endif + +#endif diff --git a/src/py21cmfast/src/interpolation.h b/src/py21cmfast/src/interpolation.h index 02fa6d411..a3251068d 100644 --- a/src/py21cmfast/src/interpolation.h +++ b/src/py21cmfast/src/interpolation.h @@ -3,46 +3,11 @@ #include -typedef struct RGTable1D { - int n_bin; - double x_min; - double x_width; - - double *y_arr; - bool allocated; -} RGTable1D; - -typedef struct RGTable2D { - int nx_bin, ny_bin; - double x_min, y_min; - double x_width, y_width; - - double **z_arr; - - double saved_ll, saved_ul; // for future acceleration - bool allocated; -} RGTable2D; - -typedef struct RGTable1D_f { - int n_bin; - double x_min; - double x_width; - - float *y_arr; - bool allocated; -} RGTable1D_f; - -typedef struct RGTable2D_f { - int nx_bin, ny_bin; - double x_min, y_min; - double x_width, y_width; - - float **z_arr; - - double saved_ll, saved_ul; // for future acceleration - bool allocated; -} RGTable2D_f; +#include "interpolation_types.h" +#ifdef __cplusplus +extern "C" { +#endif void allocate_RGTable1D(int 
n_bin, RGTable1D *ptr); void allocate_RGTable1D_f(int n_bin, RGTable1D_f *ptr); void allocate_RGTable2D(int n_x, int n_y, RGTable2D *ptr); @@ -57,6 +22,9 @@ double EvaluateRGTable1D(double x, RGTable1D *table); double EvaluateRGTable2D(double x, double y, RGTable2D *table); double EvaluateRGTable1D_f(double x, RGTable1D_f *table); double EvaluateRGTable2D_f(double x, double y, RGTable2D_f *table); +#ifdef __cplusplus +} +#endif bool RGTable2D_out_of_bounds(RGTable2D *table, double x_val, double y_val); bool RGTable2Df_out_of_bounds(RGTable2D_f *table, double x_val, double y_val); diff --git a/src/py21cmfast/src/interpolation_types.h b/src/py21cmfast/src/interpolation_types.h new file mode 100644 index 000000000..c869dc248 --- /dev/null +++ b/src/py21cmfast/src/interpolation_types.h @@ -0,0 +1,45 @@ +#ifndef _INTERPOLATION_TYPES_H +#define _INTERPOLATION_TYPES_H + +typedef struct RGTable1D { + int n_bin; + double x_min; + double x_width; + + double *y_arr; + bool allocated; +} RGTable1D; + +typedef struct RGTable2D { + int nx_bin, ny_bin; + double x_min, y_min; + double x_width, y_width; + + double **z_arr; + double *flatten_data; + + double saved_ll, saved_ul; // for future acceleration + bool allocated; +} RGTable2D; + +typedef struct RGTable1D_f { + int n_bin; + double x_min; + double x_width; + + float *y_arr; + bool allocated; +} RGTable1D_f; + +typedef struct RGTable2D_f { + int nx_bin, ny_bin; + double x_min, y_min; + double x_width, y_width; + + float **z_arr; + + double saved_ll, saved_ul; // for future acceleration + bool allocated; +} RGTable2D_f; + +#endif diff --git a/src/py21cmfast/src/logger.h b/src/py21cmfast/src/logger.h index 45872477f..501e21d3c 100644 --- a/src/py21cmfast/src/logger.h +++ b/src/py21cmfast/src/logger.h @@ -49,7 +49,7 @@ #include // === auxiliary functions -static inline char *timenow(); +// static inline char *timenow(); #define _FILE strrchr(__FILE__, '/') ? 
strrchr(__FILE__, '/') + 1 : __FILE__ @@ -130,6 +130,9 @@ static inline char *timenow(); #define LOG_IF_ERROR(condition, message, args...) #endif +#ifdef __cplusplus +extern "C" { +#endif static inline char *timenow() { static char buffer[64]; time_t rawtime; @@ -143,4 +146,8 @@ static inline char *timenow() { return buffer; } +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/py21cmfast/src/map_mass.c b/src/py21cmfast/src/map_mass.c new file mode 100644 index 000000000..ef0d62018 --- /dev/null +++ b/src/py21cmfast/src/map_mass.c @@ -0,0 +1,294 @@ +// Functions in this file map units of mass from Lagrangian (IC) +// coordinates to their real (Eulerian) Locations, these can sum +// masses or galaxy properties from grids or from coordinate catalogues + +#include "map_mass.h" + +#include +#include +#include +#include + +#include "Constants.h" +#include "HaloBox.h" +#include "InputParameters.h" +#include "cosmology.h" +#include "indexing.h" +#include "logger.h" + +#define do_cic_interpolation(arr, ...) 
\ + _Generic((arr), float *: do_cic_interpolation_float, double *: do_cic_interpolation_double)( \ + arr, __VA_ARGS__) + +static inline void do_cic_interpolation_double(double *resampled_box, double pos[3], int box_dim[3], + double curr_dens) { + // get the CIC indices and distances + int ipos[3], iposp1[3]; + double dist[3]; + // NOTE: assumes the cell at idx == 0 is *centred* at (0,0,0) + for (int axis = 0; axis < 3; axis++) { + ipos[axis] = (int)floor(pos[axis]); + iposp1[axis] = ipos[axis] + 1; + dist[axis] = pos[axis] - ipos[axis]; + } + + wrap_coord(ipos, box_dim); + wrap_coord(iposp1, box_dim); + + unsigned long long int cic_indices[8] = { + grid_index_general(ipos[0], ipos[1], ipos[2], box_dim), + grid_index_general(iposp1[0], ipos[1], ipos[2], box_dim), + grid_index_general(ipos[0], iposp1[1], ipos[2], box_dim), + grid_index_general(iposp1[0], iposp1[1], ipos[2], box_dim), + grid_index_general(ipos[0], ipos[1], iposp1[2], box_dim), + grid_index_general(iposp1[0], ipos[1], iposp1[2], box_dim), + grid_index_general(ipos[0], iposp1[1], iposp1[2], box_dim), + grid_index_general(iposp1[0], iposp1[1], iposp1[2], box_dim)}; + + double cic_weights[8] = {(1. - dist[0]) * (1. - dist[1]) * (1. - dist[2]), + dist[0] * (1. - dist[1]) * (1. - dist[2]), + (1. - dist[0]) * dist[1] * (1. - dist[2]), + dist[0] * dist[1] * (1. - dist[2]), + (1. - dist[0]) * (1. - dist[1]) * dist[2], + dist[0] * (1. - dist[1]) * dist[2], + (1. 
- dist[0]) * dist[1] * dist[2], + dist[0] * dist[1] * dist[2]}; + + for (int i = 0; i < 8; i++) { +#pragma omp atomic update + resampled_box[cic_indices[i]] += curr_dens * cic_weights[i]; + } +} + +// Identical code as above, using a single precision output +static inline void do_cic_interpolation_float(float *resampled_box, double pos[3], int box_dim[3], + double curr_dens) { + // get the CIC indices and distances + int ipos[3], iposp1[3]; + double dist[3]; + // NOTE: assumes the cell at idx == 0 is *centred* at (0,0,0) + for (int axis = 0; axis < 3; axis++) { + ipos[axis] = (int)floor(pos[axis]); + iposp1[axis] = ipos[axis] + 1; + dist[axis] = pos[axis] - ipos[axis]; + } + + wrap_coord(ipos, box_dim); + wrap_coord(iposp1, box_dim); + + unsigned long long int cic_indices[8] = { + grid_index_general(ipos[0], ipos[1], ipos[2], box_dim), + grid_index_general(iposp1[0], ipos[1], ipos[2], box_dim), + grid_index_general(ipos[0], iposp1[1], ipos[2], box_dim), + grid_index_general(iposp1[0], iposp1[1], ipos[2], box_dim), + grid_index_general(ipos[0], ipos[1], iposp1[2], box_dim), + grid_index_general(iposp1[0], ipos[1], iposp1[2], box_dim), + grid_index_general(ipos[0], iposp1[1], iposp1[2], box_dim), + grid_index_general(iposp1[0], iposp1[1], iposp1[2], box_dim)}; + + double cic_weights[8] = {(1. - dist[0]) * (1. - dist[1]) * (1. - dist[2]), + dist[0] * (1. - dist[1]) * (1. - dist[2]), + (1. - dist[0]) * dist[1] * (1. - dist[2]), + dist[0] * dist[1] * (1. - dist[2]), + (1. - dist[0]) * (1. - dist[1]) * dist[2], + dist[0] * (1. - dist[1]) * dist[2], + (1. 
- dist[0]) * dist[1] * dist[2], + dist[0] * dist[1] * dist[2]}; + + for (int i = 0; i < 8; i++) { +#pragma omp atomic update + resampled_box[cic_indices[i]] += curr_dens * cic_weights[i]; + } +} + +// Function that maps a IC density grid to the perturbed density grid +void move_grid_masses(double redshift, float *dens_pointer, int dens_dim[3], float *vel_pointers[3], + float *vel_pointers_2LPT[3], int vel_dim[3], double *resampled_box, + int out_dim[3]) { + // grid dimension constants + double boxlen = simulation_options_global->BOX_LEN; + double boxlen_z = boxlen * simulation_options_global->NON_CUBIC_FACTOR; + double box_size[3] = {boxlen, boxlen, boxlen_z}; + double dim_ratio_vel = (double)vel_dim[0] / (double)dens_dim[0]; + double dim_ratio_out = (double)out_dim[0] / (double)dens_dim[0]; + + // Setup IC velocity factors + double growth_factor = dicke(redshift); + double displacement_factor_2LPT = -(3.0 / 7.0) * growth_factor * growth_factor; // 2LPT eq. D8 + + double init_growth_factor = dicke(simulation_options_global->INITIAL_REDSHIFT); + double init_displacement_factor_2LPT = + -(3.0 / 7.0) * init_growth_factor * init_growth_factor; // 2LPT eq. 
D8 + + double velocity_displacement_factor[3] = { + (growth_factor - init_growth_factor) / box_size[0] * dens_dim[0], + (growth_factor - init_growth_factor) / box_size[1] * dens_dim[1], + (growth_factor - init_growth_factor) / box_size[2] * dens_dim[2]}; + double velocity_displacement_factor_2LPT[3] = { + (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[0] * dens_dim[0], + (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[1] * dens_dim[1], + (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[2] * dens_dim[2]}; +#pragma omp parallel num_threads(simulation_options_global->N_THREADS) + { + int i, j, k, axis; + double pos[3], curr_dens; + int ipos[3]; + unsigned long long vel_index, dens_index; +#pragma omp for + for (i = 0; i < dens_dim[0]; i++) { + for (j = 0; j < dens_dim[1]; j++) { + for (k = 0; k < dens_dim[2]; k++) { + // Transform position to units of box size + pos[0] = i; + pos[1] = j; + pos[2] = k; + resample_index((int[3]){i, j, k}, dim_ratio_vel, ipos); + wrap_coord(ipos, vel_dim); + vel_index = grid_index_general(ipos[0], ipos[1], ipos[2], vel_dim); + for (axis = 0; axis < 3; axis++) { + pos[axis] += + vel_pointers[axis][vel_index] * velocity_displacement_factor[axis]; + // add 2LPT second order corrections + if (matter_options_global->PERTURB_ALGORITHM == 2) { + pos[axis] -= vel_pointers_2LPT[axis][vel_index] * + velocity_displacement_factor_2LPT[axis]; + } + pos[axis] *= dim_ratio_out; + } + + // CIC interpolation + dens_index = grid_index_general(i, j, k, dens_dim); + curr_dens = 1.0 + dens_pointer[dens_index] * init_growth_factor; + do_cic_interpolation(resampled_box, pos, out_dim, curr_dens); + } + } + } + } +} + +// Function that maps a IC density grid to the perturbed density grid +// TODO: This shares a lot of code with move_grid_masses and (future) move_cat_galprops. 
+// I should look into combining elements, however since the differences +// are on the innermost loops, any generalisation is likely to slow things down. +void move_grid_galprops(double redshift, float *dens_pointer, int dens_dim[3], + float *vel_pointers[3], float *vel_pointers_2LPT[3], int vel_dim[3], + HaloBox *boxes, int out_dim[3], float *mturn_a_grid, float *mturn_m_grid, + ScalingConstants *consts, IntegralCondition *integral_cond) { + // grid dimension constants + double boxlen = simulation_options_global->BOX_LEN; + double boxlen_z = boxlen * simulation_options_global->NON_CUBIC_FACTOR; + double box_size[3] = {boxlen, boxlen, boxlen_z}; + double dim_ratio_vel = (double)vel_dim[0] / (double)dens_dim[0]; + double dim_ratio_out = (double)out_dim[0] / (double)dens_dim[0]; + + double prefactor_mass = RHOcrit * cosmo_params_global->OMm; + double prefactor_stars = RHOcrit * cosmo_params_global->OMb * consts->fstar_10; + double prefactor_stars_mini = RHOcrit * cosmo_params_global->OMb * consts->fstar_7; + double prefactor_sfr = prefactor_stars / consts->t_star / consts->t_h; + double prefactor_sfr_mini = prefactor_stars_mini / consts->t_star / consts->t_h; + double prefactor_nion = prefactor_stars * consts->fesc_10 * consts->pop2_ion; + double prefactor_nion_mini = prefactor_stars_mini * consts->fesc_7 * consts->pop3_ion; + double prefactor_xray = RHOcrit * cosmo_params_global->OMm; + + // Setup IC velocity factors + double growth_factor = dicke(redshift); + double displacement_factor_2LPT = -(3.0 / 7.0) * growth_factor * growth_factor; // 2LPT eq. D8 + + double init_growth_factor = dicke(simulation_options_global->INITIAL_REDSHIFT); + double init_displacement_factor_2LPT = + -(3.0 / 7.0) * init_growth_factor * init_growth_factor; // 2LPT eq. 
D8 + + double velocity_displacement_factor[3] = { + (growth_factor - init_growth_factor) / box_size[0] * dens_dim[0], + (growth_factor - init_growth_factor) / box_size[1] * dens_dim[1], + (growth_factor - init_growth_factor) / box_size[2] * dens_dim[2]}; + double velocity_displacement_factor_2LPT[3] = { + (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[0] * dens_dim[0], + (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[1] * dens_dim[1], + (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[2] * dens_dim[2]}; +#pragma omp parallel num_threads(simulation_options_global->N_THREADS) + { + int i, j, k, axis; + double pos[3], curr_dens; + int ipos[3]; + unsigned long long vel_index, dens_index; + double l10_mturn_a = log10(consts->mturn_a_nofb); + double l10_mturn_m = log10(consts->mturn_m_nofb); + HaloProperties properties; +#pragma omp for + for (i = 0; i < dens_dim[0]; i++) { + for (j = 0; j < dens_dim[1]; j++) { + for (k = 0; k < dens_dim[2]; k++) { + // Transform position to units of box size + pos[0] = i; + pos[1] = j; + pos[2] = k; + resample_index((int[3]){i, j, k}, dim_ratio_vel, ipos); + wrap_coord(ipos, vel_dim); + vel_index = grid_index_general(ipos[0], ipos[1], ipos[2], vel_dim); + for (axis = 0; axis < 3; axis++) { + pos[axis] += + vel_pointers[axis][vel_index] * velocity_displacement_factor[axis]; + // add 2LPT second order corrections + if (matter_options_global->PERTURB_ALGORITHM == 2) { + pos[axis] -= vel_pointers_2LPT[axis][vel_index] * + velocity_displacement_factor_2LPT[axis]; + } + pos[axis] *= dim_ratio_out; + } + + // CIC interpolation + dens_index = grid_index_general(i, j, k, dens_dim); + curr_dens = dens_pointer[dens_index] * growth_factor; + if (astro_options_global->USE_MINI_HALOS) { + l10_mturn_a = mturn_a_grid[dens_index]; + l10_mturn_m = mturn_m_grid[dens_index]; + } + + get_cell_integrals(curr_dens, l10_mturn_a, l10_mturn_m, consts, integral_cond, + &properties); + + // using 
the properties struct: + // stellar_mass --> no F_esc integral ACG + // stellar_mass_mini --> no F_esc integral MCG + // n_ion --> F_esc integral ACG + // fescweighted_sfr --> F_esc integral MCG + // halo_xray --> Xray integral + do_cic_interpolation(boxes->halo_sfr, pos, out_dim, + properties.stellar_mass * prefactor_sfr); + do_cic_interpolation(boxes->n_ion, pos, out_dim, + properties.n_ion * prefactor_nion + + properties.fescweighted_sfr * prefactor_nion_mini); + + if (astro_options_global->USE_MINI_HALOS) { + do_cic_interpolation(boxes->halo_sfr_mini, pos, out_dim, + properties.stellar_mass_mini * prefactor_sfr_mini); + } + if (astro_options_global->USE_TS_FLUCT) { + do_cic_interpolation(boxes->halo_xray, pos, out_dim, + properties.halo_xray * prefactor_xray); + } + + if (config_settings.EXTRA_HALOBOX_FIELDS) { + do_cic_interpolation(boxes->halo_mass, pos, out_dim, + properties.halo_mass * prefactor_mass); + do_cic_interpolation(boxes->halo_stars, pos, out_dim, + properties.stellar_mass * prefactor_stars); + if (astro_options_global->USE_MINI_HALOS) { + do_cic_interpolation( + boxes->halo_stars_mini, pos, out_dim, + properties.stellar_mass_mini * prefactor_stars_mini); + } + } + } + } + } + } + // Without stochasticity, these grids are the same to a constant + double prefactor_wsfr = 1 / consts->t_h / consts->t_star; + if (astro_options_global->INHOMO_RECO) { + for (int i = 0; i < HII_TOT_NUM_PIXELS; i++) { + boxes->whalo_sfr[i] = boxes->n_ion[i] * prefactor_wsfr; + } + } +} diff --git a/src/py21cmfast/src/map_mass.h b/src/py21cmfast/src/map_mass.h new file mode 100644 index 000000000..75e5d55da --- /dev/null +++ b/src/py21cmfast/src/map_mass.h @@ -0,0 +1,24 @@ + +#include "HaloBox.h" +#include "OutputStructs.h" +#include "scaling_relations.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void move_grid_masses(double redshift, float *dens_pointer, int dens_dim[3], float *vel_pointers[3], + float *vel_pointers_2LPT[3], int vel_dim[3], double *resampled_box, + 
int out_dim[3]); + +void move_grid_galprops(double redshift, float *dens_pointer, int dens_dim[3], + float *vel_pointers[3], float *vel_pointers_2LPT[3], int vel_dim[3], + HaloBox *boxes, int out_dim[3], float *mturn_a_grid, float *mturn_m_grid, + ScalingConstants *consts, IntegralCondition *integral_cond); + +double *MapMass_gpu(InitialConditions *boxes, double *resampled_box, int dimension, + float f_pixel_factor, float init_growth_factor); + +#ifdef __cplusplus +} +#endif diff --git a/src/py21cmfast/src/meson.build b/src/py21cmfast/src/meson.build new file mode 100644 index 000000000..9eb82ed63 --- /dev/null +++ b/src/py21cmfast/src/meson.build @@ -0,0 +1,151 @@ +# Define the C source files +c_source_files = files([ + 'BrightnessTemperatureBox.c', + 'HaloBox.c', + 'HaloField.c', + 'InitialConditions.c', + 'InputParameters.c', + 'IonisationBox.c', + 'LuminosityFunction.c', + 'PerturbField.c', + 'PerturbHaloField.c', + 'SpinTemperatureBox.c', + 'Stochasticity.c', + 'bubble_helper_progs.c', + 'cosmology.c', + 'debugging.c', + 'dft.c', + 'elec_interp.c', + 'filtering.c', + 'heating_helper_progs.c', + 'hmf.c', + 'indexing.c', + 'integral_wrappers.c', + 'interp_tables.c', + 'interpolation.c', + 'photoncons.c', + 'recombinations.c', + 'rng.c', + 'scaling_relations.c', + 'thermochem.c', + 'map_mass.c', +]) + +# Define the CUDA source files +cuda_source_files = files([ + 'HaloField.cu', + 'IonisationBox.cu', + 'SpinTemperatureBox.cu', + 'Stochasticity.cu', + 'filtering.cu', + 'hmf.cu', + 'interp_tables.cu', + 'interpolation.cu', + 'device_rng.cu', + 'MapMass_gpu.cu', + 'test_Stochasticity.cu', + 'cuda_hello_world.cu', +]) + +# C++ wrapper file +cpp_source_files = files(['_wrapper.cpp']) + +# Define the 21cmFast dependencies +omp = dependency('openmp') +gsl = dependency('gsl') +nanobind = dependency('nanobind', static: true) + +# If/when fftw gets added to Meson WrapDB, we'll be able to use this: +# fftw = dependency('fftw3f_threads') +# ... 
but until then, we need to jump through some hoops: +cc = meson.get_compiler ('c') +search_paths = [ '/usr/lib', '/usr/local/lib', '/opt/homebrew/lib' ] +fftw = cc.find_library ('fftw3f', required: true, dirs: search_paths) +fftw_threads = cc.find_library ('fftw3f_threads', required: true, dirs: search_paths) + +# Base dependencies (always needed) +deps = [omp, gsl, nanobind, fftw, fftw_threads] + +# CUDA dependency (optional) +# The root meson.build already checked for USE_CUDA environment variable and nvcc availability +cuda_dep = dependency('cuda', version: '>=10.0', required: false) + +# Check if CUDA language is available (this will be true only if root meson.build successfully added it) +if cuda_dep.found() + message('Using CUDA compilation in source build') + cuda_compiler = meson.get_compiler('cuda', required: false) + add_project_arguments('-DUSE_CUDA=1', language: ['c', 'cpp', 'cuda']) + + # CUDA compiler arguments + # Try to detect GPU architecture, fall back to a reasonable default + detect_arch_cmd = run_command('nvidia-smi', '--query-gpu=compute_cap', '--format=csv,noheader,nounits', check: false) + + if detect_arch_cmd.returncode() == 0 and detect_arch_cmd.stdout().strip() != '' + # Parse the compute capability (e.g., "7.5" -> "sm_75") + compute_cap = detect_arch_cmd.stdout().strip().split('\n')[0] + arch_major = compute_cap.split('.')[0] + arch_minor = compute_cap.split('.')[1] + detected_arch = 'sm_' + arch_major + arch_minor + message('Detected GPU architecture: ' + detected_arch) + cuda_arch = detected_arch + else + # Fall back to a widely compatible architecture + cuda_arch = 'sm_60' # Pascal - widely compatible + message('Could not detect GPU architecture, using default: ' + cuda_arch) + endif + + cuda_args = [ + '-arch=' + cuda_arch, + '--extended-lambda', + '--expt-relaxed-constexpr', + ] + add_project_arguments(cuda_args, language: 'cuda') + + # Add CUDA runtime library + cuda_rt = cuda_compiler.find_library('cudart', required: true) + deps 
= deps + [cuda_dep, cuda_rt] + + # Include CUDA source files + all_source_files = c_source_files + cuda_source_files + cpp_source_files +else + message('Using CPU-only compilation in source build') + add_project_arguments('-DUSE_CUDA=0', language: ['c', 'cpp']) + # Only include C/C++ files, no CUDA files + all_source_files = c_source_files + cpp_source_files +endif + +# Define a mapping of log level strings to integers +log_level_map = { + 'NO_LOG': 0, + 'ERROR': 1, + 'WARNING': 2, + 'INFO': 3, + 'DEBUG': 4, + 'SUPER_DEBUG': 5, + 'ULTRA_DEBUG': 6, +} + +# Check for environment variable first, then fall back to meson option +env_log_level = run_command('printenv', 'LOG_LEVEL', check: false).stdout().strip() +if env_log_level != '' + log_level_str = env_log_level +else + log_level_str = get_option('log_level') +endif + +# Convert the string to an integer using the map, defaulting to 2 (warnings) if the key is invalid +log_level = log_level_map.get(log_level_str, 2) + +# Print the selected log level for debugging purposes +message('Selected log level: ' + log_level.to_string()) + +add_project_arguments('-DLOG_LEVEL=' + log_level.to_string(), language: 'c') + +# Define the Python extension module +py.extension_module( + 'c_21cmfast', + all_source_files, + dependencies: deps, + install: true, + subdir:'py21cmfast', +) diff --git a/src/py21cmfast/src/photoncons.c b/src/py21cmfast/src/photoncons.c index 9f96a352d..9af217f01 100644 --- a/src/py21cmfast/src/photoncons.c +++ b/src/py21cmfast/src/photoncons.c @@ -111,7 +111,7 @@ int InitialisePhotonCons() { z_arr = calloc(Nmax, sizeof(double)); Q_arr = calloc(Nmax, sizeof(double)); - struct ScalingConstants sc_i, sc_0, sc_1; + ScalingConstants sc_i, sc_0, sc_1; set_scaling_constants(a_end, &sc_i, false); // set the minimum source mass diff --git a/src/py21cmfast/src/photoncons.h b/src/py21cmfast/src/photoncons.h index 5f428b6dc..91a5e2de5 100644 --- a/src/py21cmfast/src/photoncons.h +++ b/src/py21cmfast/src/photoncons.h @@ 
-5,6 +5,9 @@ #include "InputParameters.h" +#ifdef __cplusplus +extern "C" { +#endif // This is directly accessed in the wrapper currently // TODO: remove this global declaration and make an internal checking function extern bool photon_cons_allocated; @@ -29,4 +32,7 @@ int ObtainPhotonConsData(double *z_at_Q_data, double *Q_data, int *Ndata_analyti void set_alphacons_params(double norm, double slope); double get_fesc_fit(double redshift); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/recombinations.h b/src/py21cmfast/src/recombinations.h index 788f9e28e..1d0aa9645 100644 --- a/src/py21cmfast/src/recombinations.h +++ b/src/py21cmfast/src/recombinations.h @@ -1,8 +1,14 @@ #ifndef _RECOMB_H #define _RECOMB_H +#ifdef __cplusplus +extern "C" { +#endif double splined_recombination_rate(double z_eff, double gamma12_bg); /*initializes the lookup table for the PDF density integral in MHR00 model at redshift z*/ void init_MHR(); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/scaling_relations.c b/src/py21cmfast/src/scaling_relations.c index 37c452ec6..c567a4a85 100644 --- a/src/py21cmfast/src/scaling_relations.c +++ b/src/py21cmfast/src/scaling_relations.c @@ -18,7 +18,7 @@ #include "photoncons.h" #include "thermochem.h" -void print_sc_consts(struct ScalingConstants *c) { +void print_sc_consts(ScalingConstants *c) { LOG_DEBUG("Printing scaling relation constants z = %.3f....", c->redshift); LOG_DEBUG("SHMR: f10 %.2e a %.2e f7 %.2e a_mini %.2e sigma %.2e", c->fstar_10, c->alpha_star, c->fstar_7, c->alpha_star_mini, c->sigma_star); @@ -33,11 +33,11 @@ void print_sc_consts(struct ScalingConstants *c) { return; } -void set_scaling_constants(double redshift, struct ScalingConstants *consts, bool use_photoncons) { +void set_scaling_constants(double redshift, ScalingConstants *consts, bool use_photoncons) { consts->redshift = redshift; // Set on for the fixed grid case since we are missing halos above the cell mass - consts->fix_mean = 
matter_options_global->FIXED_HALO_GRIDS; + consts->fix_mean = matter_options_global->HMF == 2 || matter_options_global->HMF == 3; // whether to fix *integrated* (not sampled) galaxy properties to the expected mean consts->scaling_median = astro_options_global->HALO_SCALING_RELATIONS_MEDIAN; @@ -103,8 +103,8 @@ void set_scaling_constants(double redshift, struct ScalingConstants *consts, boo } // It's often useful to create a copy of scaling constants without F_ESC -struct ScalingConstants evolve_scaling_constants_sfr(struct ScalingConstants *sc) { - struct ScalingConstants sc_sfrd = *sc; +ScalingConstants evolve_scaling_constants_sfr(ScalingConstants *sc) { + ScalingConstants sc_sfrd = *sc; sc_sfrd.fesc_10 = 1.; sc_sfrd.fesc_7 = 1.; sc_sfrd.alpha_esc = 0.; @@ -115,10 +115,9 @@ struct ScalingConstants evolve_scaling_constants_sfr(struct ScalingConstants *sc } // It's often useful to create a copy of scaling relations at a different z -struct ScalingConstants evolve_scaling_constants_to_redshift(double redshift, - struct ScalingConstants *sc, - bool use_photoncons) { - struct ScalingConstants sc_z = *sc; +ScalingConstants evolve_scaling_constants_to_redshift(double redshift, ScalingConstants *sc, + bool use_photoncons) { + ScalingConstants sc_z = *sc; sc_z.redshift = redshift; sc_z.t_h = t_hubble(redshift); @@ -268,7 +267,7 @@ double get_lx_on_sfr(double sfr, double metallicity, double lx_constant) { } void get_halo_stellarmass(double halo_mass, double mturn_acg, double mturn_mcg, double star_rng, - struct ScalingConstants *consts, double *star_acg, double *star_mcg) { + ScalingConstants *consts, double *star_acg, double *star_mcg) { // low-mass ACG power-law parameters double f_10 = consts->fstar_10; double f_a = consts->alpha_star; @@ -320,7 +319,7 @@ void get_halo_stellarmass(double halo_mass, double mturn_acg, double mturn_mcg, } void get_halo_sfr(double stellar_mass, double stellar_mass_mini, double sfr_rng, - struct ScalingConstants *consts, double *sfr, double 
*sfr_mini) { + ScalingConstants *consts, double *sfr, double *sfr_mini) { double sfr_mean, sfr_mean_mini; double sfr_sample, sfr_sample_mini; @@ -376,7 +375,7 @@ void get_halo_metallicity(double sfr, double stellar, double redshift, double *z } void get_halo_xray(double sfr, double sfr_mini, double metallicity, double xray_rng, - struct ScalingConstants *consts, double *xray_out) { + ScalingConstants *consts, double *xray_out) { double sigma_xray = consts->sigma_xray; // adjustment to the mean for lognormal scatter diff --git a/src/py21cmfast/src/scaling_relations.h b/src/py21cmfast/src/scaling_relations.h index 17163ee72..afee70521 100644 --- a/src/py21cmfast/src/scaling_relations.h +++ b/src/py21cmfast/src/scaling_relations.h @@ -9,7 +9,7 @@ // These are just the values which come from the InputStruct objects and don't change within the // snapshot using this reduces the use of the global parameter structs and allows fewer exp/log // unit changes -struct ScalingConstants { +typedef struct ScalingConstants { double redshift; bool fix_mean; bool scaling_median; @@ -49,28 +49,27 @@ struct ScalingConstants { double Mlim_Fesc; double Mlim_Fstar_mini; double Mlim_Fesc_mini; -}; +} ScalingConstants; -void set_scaling_constants(double redshift, struct ScalingConstants *consts, bool use_photoncons); +void set_scaling_constants(double redshift, ScalingConstants *consts, bool use_photoncons); double get_lx_on_sfr(double sfr, double metallicity, double lx_constant); void get_halo_stellarmass(double halo_mass, double mturn_acg, double mturn_mcg, double star_rng, - struct ScalingConstants *consts, double *star_acg, double *star_mcg); + ScalingConstants *consts, double *star_acg, double *star_mcg); void get_halo_sfr(double stellar_mass, double stellar_mass_mini, double sfr_rng, - struct ScalingConstants *consts, double *sfr, double *sfr_mini); + ScalingConstants *consts, double *sfr, double *sfr_mini); void get_halo_metallicity(double sfr, double stellar, double redshift, 
double *z_out); void get_halo_xray(double sfr, double sfr_mini, double metallicity, double xray_rng, - struct ScalingConstants *consts, double *xray_out); + ScalingConstants *consts, double *xray_out); double scaling_PL_limit(double M, double norm, double alpha, double pivot, double limit); double log_scaling_PL_limit(double lnM, double ln_norm, double alpha, double ln_pivot, double ln_limit); double scaling_double_PL(double M, double alpha_lo, double pivot_ratio, double alpha_hi, double pivot_hi); -struct ScalingConstants evolve_scaling_constants_sfr(struct ScalingConstants *sc); -struct ScalingConstants evolve_scaling_constants_to_redshift(double redshift, - struct ScalingConstants *sc, - bool use_photoncons); -void print_sc_consts(struct ScalingConstants *c); +ScalingConstants evolve_scaling_constants_sfr(ScalingConstants *sc); +ScalingConstants evolve_scaling_constants_to_redshift(double redshift, ScalingConstants *sc, + bool use_photoncons); +void print_sc_consts(ScalingConstants *c); #endif diff --git a/src/py21cmfast/src/test_Stochasticity.cu b/src/py21cmfast/src/test_Stochasticity.cu new file mode 100644 index 000000000..c1bbdec12 --- /dev/null +++ b/src/py21cmfast/src/test_Stochasticity.cu @@ -0,0 +1,46 @@ +#include +#include + +#include "Stochasticity.cu" + +void testCondenseDeviceArray() +{ + // Input data + float h_array[] = {1.0f, 0.0f, 2.0f, 3.0f, 0.0f, 4.0f}; + float mask_value = 0.0f; + int original_size = 6; + + // Expected outputs + float expected_array[] = {1.0f, 2.0f, 3.0f, 4.0f, 0.0f, 0.0f}; + int expected_valid_size = 4; + + // Allocate and copy to device + float *d_array; + cudaMalloc(&d_array, original_size * sizeof(float)); + cudaMemcpy(d_array, h_array, original_size * sizeof(float), cudaMemcpyHostToDevice); + + // Call the function from Stochasticity.cu + int valid_size = condenseDeviceArray(d_array, original_size, mask_value); + + // Copy the results back to the host + float h_result[original_size]; + cudaMemcpy(h_result, d_array, 
original_size * sizeof(float), cudaMemcpyDeviceToHost); + + // Validate the results + assert(valid_size == expected_valid_size); + for (int i = 0; i < original_size; ++i) + { + assert(h_result[i] == expected_array[i]); + } + + std::cout << "Test passed: condenseDeviceArray\n"; + + // Free device memory + cudaFree(d_array); +} + +int main() +{ + testCondenseDeviceArray(); + return 0; +} diff --git a/src/py21cmfast/src/thermochem.h b/src/py21cmfast/src/thermochem.h index 4a18642d2..f2e92d35f 100644 --- a/src/py21cmfast/src/thermochem.h +++ b/src/py21cmfast/src/thermochem.h @@ -3,6 +3,9 @@ #include "InputParameters.h" +#ifdef __cplusplus +extern "C" { +#endif float ComputeTau(int Npoints, float *redshifts, float *global_xHI, float z_re_HeII); double molecular_cooling_threshold(float z); double atomic_cooling_threshold(float z); @@ -21,4 +24,7 @@ double HeII_ion_crosssec(double nu); double HI_ion_crosssec(double nu); double neutral_fraction(double density, double T4, double gamma, int usecaseB); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/wrapper/_utils.py b/src/py21cmfast/wrapper/_utils.py index 24b599c83..9b28e745b 100644 --- a/src/py21cmfast/wrapper/_utils.py +++ b/src/py21cmfast/wrapper/_utils.py @@ -3,12 +3,10 @@ import logging import numpy as np -from cffi import FFI -from ..c_21cmfast import ffi, lib -from .exceptions import _process_exitcode +import py21cmfast.c_21cmfast as lib -_ffi = FFI() +from .exceptions import _process_exitcode logger = logging.getLogger(__name__) @@ -30,33 +28,55 @@ def asarray(ptr, shape): """Get the canonical C type of the elements of ptr as a string.""" - ctype = _ffi.getctype(_ffi.typeof(ptr).item).split("*")[0].strip() + ctype = type(ptr).__name__ # TODO: check if ctype not in ctype2dtype: raise RuntimeError( f"Cannot create an array for element type: {ctype}. Can do {list(ctype2dtype.values())}." 
) - array = np.frombuffer( - _ffi.buffer(ptr, _ffi.sizeof(ctype) * np.prod(shape)), ctype2dtype[ctype] - ) + array = np.frombuffer(ptr, ctype2dtype[ctype]) # TODO: check array.shape = shape return array +def _nb_initialise_return_value(arg_string, out_shape=(1,)): + """Return a zero-initialised object of the correct type given a nanobind signature. + + Currently only works with wrapped structures or numpy arrays. + """ + # If it's a wrapped class, return the class + if "py21cmfast.c_21cmfast" in arg_string: + return getattr(lib, arg_string.split("py21cmfast.c_21cmfast")[-1])() + + if "*" in arg_string or "ndarray" in arg_string: + base_type = arg_string.split("dtype=")[1].split("]")[0] + return np.zeros(out_shape, dtype=getattr(np, base_type)) + + raise ValueError( + f"Cannot create a zero-initialised object of type {arg_string}." + "As it is not a pointer, array or class. Please check the function signature." + ) + + def _call_c_simple(fnc, *args): """Call a simple C function that just returns an object. - Any such function should be defined such that the last argument is an int pointer generating - the status. + Assumes that the last argument is a pointer to an object that will be filled in by the C function. + This argument is initialised here and returned. """ # Parse the function to get the type of the last argument - cdata = str(ffi.addressof(lib, fnc.__name__)) - kind = cdata.split("(")[-1].split(")")[0].split(",")[-1] - result = ffi.new(kind) + cdata = fnc.__nb_signature__[0][0] + # Nanobind signature is 'def fnc.__name__(arg0: type0, arg1: type1, ..., argN: typeN, /) -> returntype' + # We wish to extract the type of the last argument only. 
+ signature_string = ( + cdata.split("(")[-1].split(")")[0].split(",")[-2].replace("arg: ", "").strip() + ) + # NOTE: This uses the default return size == 1 for arrays + result = _nb_initialise_return_value(signature_string) status = fnc(*args, result) _process_exitcode(status, fnc, args) - return result[0] + return result def camel_to_snake(word: str, depublicize: bool = False): diff --git a/src/py21cmfast/wrapper/cfuncs.py b/src/py21cmfast/wrapper/cfuncs.py index 285efdef8..a1e3e7208 100644 --- a/src/py21cmfast/wrapper/cfuncs.py +++ b/src/py21cmfast/wrapper/cfuncs.py @@ -9,8 +9,9 @@ from numpy.typing import NDArray from scipy.interpolate import interp1d +import py21cmfast.c_21cmfast as lib + from .._cfg import config -from ..c_21cmfast import ffi, lib from ._utils import _process_exitcode from .inputs import ( InputParameters, @@ -21,7 +22,16 @@ # Ideally, backend functions that we access here should do all the broadcasting/initialisation themselves # These decorators are for lower functions which are called directly in one or two lines, like delta_crit -# TODO: a lot of these assume input as numpy arrays via use of .shape, explicitly require this +# NOTE: On casting to C pointers: +# ------------------------------- +# Currently our wrapper functions directly take C type pointers, which +# requires us to cast the data to the correct type before passing it to the C. +# This is made annoying by the fact that CAMB (which is indirectly imported somewhere) +# appears to have overwritten the ctypes library pointer types which cause errors. +# We will use the nanobind ndarray casters, which allow us to pass +# numpy arrays directly to C++ functions, with size and type information. +# We will have to translate the `integral_wrapper.c` functions to C++ and (maybe?) define +# some wrapper layer functions in C++ for the output struct functions to parse the array data. 
def broadcast_input_struct(inputs: InputParameters): @@ -191,14 +201,10 @@ def compute_tau( redshifts = np.array(redshifts, dtype="float32") global_xHI = np.array(global_xHI, dtype="float32") - z = ffi.cast("float *", ffi.from_buffer(redshifts)) - xHI = ffi.cast("float *", ffi.from_buffer(global_xHI)) - # Run the C code return lib.ComputeTau( - len(redshifts), - z, - xHI, + redshifts, + global_xHI, z_re_HeII, ) @@ -280,85 +286,61 @@ def compute_luminosity_function( ) component = "acg" - lfunc = np.zeros(len(redshifts) * nbins) - Muvfunc = np.zeros(len(redshifts) * nbins) - Mhfunc = np.zeros(len(redshifts) * nbins) - - lfunc.shape = (len(redshifts), nbins) - Muvfunc.shape = (len(redshifts), nbins) - Mhfunc.shape = (len(redshifts), nbins) - - c_Muvfunc = ffi.cast("double *", ffi.from_buffer(Muvfunc)) - c_Mhfunc = ffi.cast("double *", ffi.from_buffer(Mhfunc)) - c_lfunc = ffi.cast("double *", ffi.from_buffer(lfunc)) + lfunc = np.zeros((len(redshifts), nbins)) + Muvfunc = np.zeros((len(redshifts), nbins)) + Mhfunc = np.zeros((len(redshifts), nbins)) - lfunc_MINI = np.zeros(len(redshifts) * nbins) - Muvfunc_MINI = np.zeros(len(redshifts) * nbins) - Mhfunc_MINI = np.zeros(len(redshifts) * nbins) - - lfunc_MINI.shape = (len(redshifts), nbins) - Muvfunc_MINI.shape = (len(redshifts), nbins) - Mhfunc_MINI.shape = (len(redshifts), nbins) - - c_Muvfunc_MINI = ffi.cast("double *", ffi.from_buffer(Muvfunc_MINI)) - c_Mhfunc_MINI = ffi.cast("double *", ffi.from_buffer(Mhfunc_MINI)) - c_lfunc_MINI = ffi.cast("double *", ffi.from_buffer(lfunc_MINI)) + lfunc_MINI = np.zeros((len(redshifts), nbins)) + Muvfunc_MINI = np.zeros((len(redshifts), nbins)) + Mhfunc_MINI = np.zeros((len(redshifts), nbins)) if component in ("both", "acg"): # Run the C code errcode = lib.ComputeLF( - nbins, 1, - len(redshifts), - ffi.cast("float *", ffi.from_buffer(redshifts)), - ffi.cast("float *", ffi.from_buffer(mturnovers)), - c_Muvfunc, - c_Mhfunc, - c_lfunc, + nbins, + redshifts, + mturnovers, + Muvfunc, + 
Mhfunc, + lfunc, ) _process_exitcode( errcode, lib.ComputeLF, ( - nbins, 1, - len(redshifts), + nbins, ), ) if component in ("both", "mcg"): # Run the C code errcode = lib.ComputeLF( - nbins, 2, - len(redshifts), - ffi.cast("float *", ffi.from_buffer(redshifts)), - ffi.cast("float *", ffi.from_buffer(mturnovers_mini)), - c_Muvfunc_MINI, - c_Mhfunc_MINI, - c_lfunc_MINI, + nbins, + redshifts, + mturnovers_mini, + Muvfunc_MINI, + Mhfunc_MINI, + lfunc_MINI, ) _process_exitcode( errcode, lib.ComputeLF, ( - nbins, 2, - len(redshifts), + nbins, ), ) if component == "both": # redo the Muv range using the faintest (most likely MINI) and the brightest (most likely massive) - lfunc_all = np.zeros(len(redshifts) * nbins) - Muvfunc_all = np.zeros(len(redshifts) * nbins) - Mhfunc_all = np.zeros(len(redshifts) * nbins * 2) - - lfunc_all.shape = (len(redshifts), nbins) - Muvfunc_all.shape = (len(redshifts), nbins) - Mhfunc_all.shape = (len(redshifts), nbins, 2) + lfunc_all = np.zeros((len(redshifts), nbins)) + Muvfunc_all = np.zeros((len(redshifts), nbins)) + Mhfunc_all = np.zeros((len(redshifts), nbins, 2)) for iz in range(len(redshifts)): Muvfunc_all[iz] = np.linspace( @@ -439,7 +421,7 @@ def get_matter_power_values( def evaluate_sigma( *, inputs: InputParameters, - masses: NDArray[float], + masses: NDArray[np.float64], ): """ Evaluate the variance of a mass scale. 
@@ -447,14 +429,13 @@ def evaluate_sigma( Uses the 21cmfast backend """ masses = masses.astype("f8") - sigma = np.zeros_like(masses) - dsigmasq = np.zeros_like(masses) + sigma = np.zeros_like(masses, dtype="f8") + dsigmasq = np.zeros_like(masses, dtype="f8") lib.get_sigma( - masses.size, - ffi.cast("double *", ffi.from_buffer(masses)), - ffi.cast("double *", ffi.from_buffer(sigma)), - ffi.cast("double *", ffi.from_buffer(dsigmasq)), + masses, + sigma, + dsigmasq, ) return sigma, dsigmasq @@ -507,7 +488,7 @@ def get_delta_crit_nu(hmf_int_flag: int, sigma: float, growth: float): @broadcast_params def evaluate_condition_integrals( inputs: InputParameters, - cond_array: NDArray[float], + cond_array: NDArray[np.float64], redshift: float, redshift_prev: float | None = None, ): @@ -524,10 +505,9 @@ def evaluate_condition_integrals( lib.get_condition_integrals( redshift, redshift_prev if redshift_prev is not None else -1, - cond_array.size, - ffi.cast("double *", ffi.from_buffer(cond_array)), - ffi.cast("double *", ffi.from_buffer(n_halo)), - ffi.cast("double *", ffi.from_buffer(m_coll)), + cond_array, + n_halo, + m_coll, ) return n_halo, m_coll @@ -537,9 +517,9 @@ def evaluate_condition_integrals( def integrate_chmf_interval( inputs: InputParameters, redshift: float, - lnm_lower: NDArray[float], - lnm_upper: NDArray[float], - cond_values: NDArray[float], + lnm_lower: NDArray[np.float64], + lnm_upper: NDArray[np.float64], + cond_values: NDArray[np.float64], redshift_prev: float | None = None, ): """Evaluate conditional mass function integrals at a range of mass intervals.""" @@ -555,12 +535,10 @@ def integrate_chmf_interval( lib.get_halo_chmf_interval( redshift, redshift_prev if redshift_prev is not None else -1, - len(cond_values), - ffi.cast("double *", ffi.from_buffer(cond_values)), - len(lnm_lower), - ffi.cast("double *", ffi.from_buffer(lnm_lower)), - ffi.cast("double *", ffi.from_buffer(lnm_upper)), - ffi.cast("double *", ffi.from_buffer(out_prob)), + cond_values, + 
lnm_lower, + lnm_upper, + out_prob, ) return out_prob @@ -569,8 +547,8 @@ def integrate_chmf_interval( @broadcast_params def evaluate_inverse_table( inputs: InputParameters, - cond_array: NDArray[float], - probabilities: NDArray[float], + cond_array: NDArray[np.float64], + probabilities: NDArray[np.float64], redshift: float, redshift_prev: float | None = None, ): @@ -591,10 +569,9 @@ def evaluate_inverse_table( lib.get_halomass_at_probability( redshift, redshift_prev, - cond_array.size, - ffi.cast("double *", ffi.from_buffer(cond_array)), - ffi.cast("double *", ffi.from_buffer(probabilities)), - ffi.cast("double *", ffi.from_buffer(masses)), + cond_array, + probabilities, + masses, ) return masses @@ -603,7 +580,7 @@ def evaluate_inverse_table( @broadcast_params def evaluate_FgtrM_cond( inputs: InputParameters, - densities: NDArray[float], + densities: NDArray[np.float64], redshift: float, R: float, ): @@ -615,10 +592,9 @@ def evaluate_FgtrM_cond( lib.get_conditional_FgtrM( redshift, R, - densities.size, - ffi.cast("double *", ffi.from_buffer(densities)), - ffi.cast("double *", ffi.from_buffer(fcoll)), - ffi.cast("double *", ffi.from_buffer(dfcoll)), + densities, + fcoll, + dfcoll, ) return fcoll, dfcoll @@ -627,8 +603,8 @@ def evaluate_FgtrM_cond( def evaluate_SFRD_z( *, inputs: InputParameters, - redshifts: NDArray[float], - log10mturns: NDArray[float], + redshifts: NDArray[np.float64], + log10mturns: NDArray[np.float64], ): """Evaluate the global star formation rate density expected at a range of redshifts.""" if redshifts.shape != log10mturns.shape: @@ -643,11 +619,10 @@ def evaluate_SFRD_z( sfrd_mini = np.zeros_like(redshifts) lib.get_global_SFRD_z( - redshifts.size, - ffi.cast("double *", ffi.from_buffer(redshifts)), - ffi.cast("double *", ffi.from_buffer(log10mturns)), - ffi.cast("double *", ffi.from_buffer(sfrd)), - ffi.cast("double *", ffi.from_buffer(sfrd_mini)), + redshifts, + log10mturns, + sfrd, + sfrd_mini, ) return sfrd, sfrd_mini @@ -657,8 +632,8 @@ 
def evaluate_SFRD_z( def evaluate_Nion_z( *, inputs: InputParameters, - redshifts: NDArray[float], - log10mturns: NDArray[float], + redshifts: NDArray[np.float64], + log10mturns: NDArray[np.float64], ): """Evaluate the global ionising emissivity expected at a range of redshifts.""" if redshifts.shape != log10mturns.shape: @@ -673,11 +648,10 @@ def evaluate_Nion_z( nion_mini = np.zeros_like(redshifts) lib.get_global_Nion_z( - redshifts.size, - ffi.cast("double *", ffi.from_buffer(redshifts)), - ffi.cast("double *", ffi.from_buffer(log10mturns)), - ffi.cast("double *", ffi.from_buffer(nion)), - ffi.cast("double *", ffi.from_buffer(nion_mini)), + redshifts, + log10mturns, + nion, + nion_mini, ) return nion, nion_mini @@ -689,8 +663,8 @@ def evaluate_SFRD_cond( inputs: InputParameters, redshift: float, radius: float, - densities: NDArray[float], - log10mturns: NDArray[float], + densities: NDArray[np.float64], + log10mturns: NDArray[np.float64], ): """Evaluate the conditional star formation rate density expected at a range of densities.""" if densities.shape != log10mturns.shape: @@ -706,11 +680,10 @@ def evaluate_SFRD_cond( lib.get_conditional_SFRD( redshift, radius, - densities.size, - ffi.cast("double *", ffi.from_buffer(densities)), - ffi.cast("double *", ffi.from_buffer(log10mturns)), - ffi.cast("double *", ffi.from_buffer(sfrd)), - ffi.cast("double *", ffi.from_buffer(sfrd_mini)), + densities, + log10mturns, + sfrd, + sfrd_mini, ) return sfrd, sfrd_mini @@ -722,9 +695,9 @@ def evaluate_Nion_cond( inputs: InputParameters, redshift: float, radius: float, - densities: NDArray[float], - l10mturns_acg: NDArray[float], - l10mturns_mcg: NDArray[float], + densities: NDArray[np.float64], + l10mturns_acg: NDArray[np.float64], + l10mturns_mcg: NDArray[np.float64], ): """Evaluate the conditional ionising emissivity expected at a range of densities.""" if not (densities.shape == l10mturns_mcg.shape == l10mturns_acg.shape): @@ -741,12 +714,11 @@ def evaluate_Nion_cond( 
lib.get_conditional_Nion( redshift, radius, - densities.size, - ffi.cast("double *", ffi.from_buffer(densities)), - ffi.cast("double *", ffi.from_buffer(l10mturns_acg)), - ffi.cast("double *", ffi.from_buffer(l10mturns_mcg)), - ffi.cast("double *", ffi.from_buffer(nion)), - ffi.cast("double *", ffi.from_buffer(nion_mini)), + densities, + l10mturns_acg, + l10mturns_mcg, + nion, + nion_mini, ) return nion, nion_mini @@ -758,8 +730,8 @@ def evaluate_Xray_cond( inputs: InputParameters, redshift: float, radius: float, - densities: NDArray[float], - log10mturns: NDArray[float], + densities: NDArray[np.float64], + log10mturns: NDArray[np.float64], ): """Evaluate the conditional star formation rate density expected at a range of densities.""" if densities.shape != log10mturns.shape: @@ -775,10 +747,9 @@ def evaluate_Xray_cond( lib.get_conditional_Xray( redshift, radius, - densities.size, - ffi.cast("double *", ffi.from_buffer(densities)), - ffi.cast("double *", ffi.from_buffer(log10mturns)), - ffi.cast("double *", ffi.from_buffer(xray)), + densities, + log10mturns, + xray, ) return xray @@ -800,8 +771,7 @@ def sample_halos_from_conditions( n_cond = cond_array.size # all coordinates zero - crd_in = np.zeros(3 * n_cond).astype("f4") - + crd_in = np.zeros((n_cond, 3)).astype("f4") cond_array = cond_array.astype("f4") nhalo_out = np.zeros(1).astype("i4") N_out = np.zeros(n_cond).astype("i4") @@ -809,22 +779,21 @@ def sample_halos_from_conditions( exp_M = np.zeros(n_cond).astype("f8") exp_N = np.zeros(n_cond).astype("f8") halomass_out = np.zeros(buffer_size).astype("f4") - halocrd_out = np.zeros(int(3 * buffer_size)).astype("i4") + halocrd_out = np.zeros((buffer_size, 3)).astype("i4") lib.single_test_sample( inputs.random_seed, - n_cond, - ffi.cast("float *", cond_array.ctypes.data), - ffi.cast("float *", crd_in.ctypes.data), + cond_array, + crd_in, redshift, z_prev, - ffi.cast("int *", nhalo_out.ctypes.data), - ffi.cast("int *", N_out.ctypes.data), - ffi.cast("double *", 
exp_N.ctypes.data), - ffi.cast("double *", M_out.ctypes.data), - ffi.cast("double *", exp_M.ctypes.data), - ffi.cast("float *", halomass_out.ctypes.data), - ffi.cast("float *", halocrd_out.ctypes.data), + nhalo_out, + N_out, + exp_N, + M_out, + exp_M, + halomass_out, + halocrd_out, ) return { @@ -842,15 +811,15 @@ def convert_halo_properties( *, redshift: float, inputs: InputParameters, - halo_masses: NDArray[float], - star_rng: NDArray[float], - sfr_rng: NDArray[float], - xray_rng: NDArray[float], - halo_coords: NDArray[float] | None = None, - vcb_grid: NDArray[float] | None = None, - J_21_LW_grid: NDArray[float] | None = None, - z_re_grid: NDArray[float] | None = None, - Gamma12_grid: NDArray[float] | None = None, + halo_masses: NDArray[np.float64], + star_rng: NDArray[np.float64], + sfr_rng: NDArray[np.float64], + xray_rng: NDArray[np.float64], + halo_coords: NDArray[np.float64] | None = None, + vcb_grid: NDArray[np.float64] | None = None, + J_21_LW_grid: NDArray[np.float64] | None = None, + z_re_grid: NDArray[np.float64] | None = None, + Gamma12_grid: NDArray[np.float64] | None = None, ): """ Convert a halo catalogue's mass and RNG fields to halo properties. 
@@ -876,11 +845,12 @@ def convert_halo_properties( raise ValueError("Halo masses and rng shapes must be identical.") n_halos = halo_masses.size + orig_shape = halo_masses.shape out_buffer = np.zeros((n_halos, 12), dtype="f4") lo_dim = (inputs.simulation_options.HII_DIM,) * 3 if halo_coords is None: - halo_coords = np.zeros(3 * n_halos) + halo_coords = np.zeros((n_halos, 3)) if vcb_grid is None: vcb_grid = np.zeros(lo_dim) if J_21_LW_grid is None: @@ -895,42 +865,41 @@ def convert_halo_properties( z_re_grid = z_re_grid.astype("f4") Gamma12_grid = Gamma12_grid.astype("f4") - halo_masses = halo_masses.astype("f4") - halo_coords = halo_coords.astype("f4") - star_rng = star_rng.astype("f4") - sfr_rng = sfr_rng.astype("f4") - xray_rng = xray_rng.astype("f4") + halo_masses = halo_masses.reshape(n_halos).astype("f4") + halo_coords = halo_coords.reshape(n_halos, 3).astype("f4") + star_rng = star_rng.reshape(n_halos).astype("f4") + sfr_rng = sfr_rng.reshape(n_halos).astype("f4") + xray_rng = xray_rng.reshape(n_halos).astype("f4") lib.test_halo_props( redshift, - ffi.cast("float *", vcb_grid.ctypes.data), - ffi.cast("float *", J_21_LW_grid.ctypes.data), - ffi.cast("float *", z_re_grid.ctypes.data), - ffi.cast("float *", Gamma12_grid.ctypes.data), - n_halos, - ffi.cast("float *", halo_masses.ctypes.data), - ffi.cast("float *", halo_coords.ctypes.data), - ffi.cast("float *", star_rng.ctypes.data), - ffi.cast("float *", sfr_rng.ctypes.data), - ffi.cast("float *", xray_rng.ctypes.data), - ffi.cast("float *", out_buffer.ctypes.data), + vcb_grid, + J_21_LW_grid, + z_re_grid, + Gamma12_grid, + halo_masses, + halo_coords, + star_rng, + sfr_rng, + xray_rng, + out_buffer, ) out_buffer = out_buffer.reshape(n_halos, 12) return { - "halo_mass": out_buffer[:, 0].reshape(halo_masses.shape), - "halo_stars": out_buffer[:, 1].reshape(halo_masses.shape), - "halo_sfr": out_buffer[:, 2].reshape(halo_masses.shape), - "halo_xray": out_buffer[:, 3].reshape(halo_masses.shape), - "n_ion": 
out_buffer[:, 4].reshape(halo_masses.shape), - "halo_wsfr": out_buffer[:, 5].reshape(halo_masses.shape), - "halo_stars_mini": out_buffer[:, 6].reshape(halo_masses.shape), - "halo_sfr_mini": out_buffer[:, 7].reshape(halo_masses.shape), - "mturn_a": out_buffer[:, 8].reshape(halo_masses.shape), - "mturn_m": out_buffer[:, 9].reshape(halo_masses.shape), - "mturn_r": out_buffer[:, 10].reshape(halo_masses.shape), - "metallicity": out_buffer[:, 11].reshape(halo_masses.shape), + "halo_mass": out_buffer[:, 0].reshape(orig_shape), + "halo_stars": out_buffer[:, 1].reshape(orig_shape), + "halo_sfr": out_buffer[:, 2].reshape(orig_shape), + "halo_xray": out_buffer[:, 3].reshape(orig_shape), + "n_ion": out_buffer[:, 4].reshape(orig_shape), + "halo_wsfr": out_buffer[:, 5].reshape(orig_shape), + "halo_stars_mini": out_buffer[:, 6].reshape(orig_shape), + "halo_sfr_mini": out_buffer[:, 7].reshape(orig_shape), + "mturn_a": out_buffer[:, 8].reshape(orig_shape), + "mturn_m": out_buffer[:, 9].reshape(orig_shape), + "mturn_r": out_buffer[:, 10].reshape(orig_shape), + "metallicity": out_buffer[:, 11].reshape(orig_shape), } diff --git a/src/py21cmfast/wrapper/exceptions.py b/src/py21cmfast/wrapper/exceptions.py index 6bfed3749..e64982551 100644 --- a/src/py21cmfast/wrapper/exceptions.py +++ b/src/py21cmfast/wrapper/exceptions.py @@ -81,6 +81,12 @@ class MemoryAllocError(FatalCError): default_message = """An error has occured while attempting to allocate memory! (check the LOG for more info)""" +class CUDAError(FatalCError): + """An exception when an error occurs with CUDA.""" + + default_message = """A CUDA error has occured! 
(check the LOG for more info)""" + + SUCCESS = 0 IOERROR = 1 GSLERROR = 2 @@ -91,12 +97,15 @@ class MemoryAllocError(FatalCError): INFINITYORNANERROR = 7 MASSDEPZETAERROR = 8 MEMORYALLOCERROR = 9 +CUDAERROR = 10 def _process_exitcode(exitcode, fnc, args): """Determine what happens for different values of the (integer) exit code from a C function.""" if exitcode != SUCCESS: - logger.error(f"In function: {fnc.__name__}. Arguments: {args}") + logger.error( + f"Error code {exitcode} in function: {fnc.__name__}. Arguments: {args}" + ) if exitcode: try: @@ -110,6 +119,7 @@ def _process_exitcode(exitcode, fnc, args): INFINITYORNANERROR: InfinityorNaNError, MASSDEPZETAERROR: MassDepZetaError, MEMORYALLOCERROR: MemoryAllocError, + CUDAERROR: CUDAError, }[exitcode] except KeyError as e: # pragma: no cover raise FatalCError( diff --git a/src/py21cmfast/wrapper/inputs.py b/src/py21cmfast/wrapper/inputs.py index a007a119e..690b47c5b 100644 --- a/src/py21cmfast/wrapper/inputs.py +++ b/src/py21cmfast/wrapper/inputs.py @@ -119,11 +119,6 @@ class InputStruct: .. warning:: This class will *not* deal well with parameters of the struct which are pointers. All parameters should be primitive types, except for strings, which are dealt with specially. - - Parameters - ---------- - ffi : cffi object - The ffi object from any cffi-wrapped library. """ _subclasses: ClassVar = {} @@ -188,11 +183,6 @@ def cstruct(self) -> StructWrapper: cdict = self.cdict for k in self.struct.fieldnames: val = cdict[k] - - if isinstance(val, str): - # If it is a string, need to convert it to C string ourselves. 
- val = self.ffi.new("char[]", val.encode()) - setattr(self.struct.cstruct, k, val) return self.struct.cstruct @@ -349,7 +339,7 @@ def OMl(self): """Omega lambda, dark energy density.""" return 1 - self.OMm - @property + @cached_property def cosmo(self): """An astropy cosmology object for this cosmology.""" return self._base_cosmo.clone( @@ -1385,7 +1375,7 @@ def _astro_options_validator(self, att, val): ) elif ( val.INTEGRATION_METHOD_ATOMIC == "GAMMA-APPROX" - and self.matter_options.HMF != 0 + and self.matter_options.HMF != "PS" ): warnings.warn( "The 'GAMMA-APPROX' integration method uses the EPS conditional mass function" diff --git a/src/py21cmfast/wrapper/outputs.py b/src/py21cmfast/wrapper/outputs.py index 96556e937..3ed46cdfa 100644 --- a/src/py21cmfast/wrapper/outputs.py +++ b/src/py21cmfast/wrapper/outputs.py @@ -28,7 +28,9 @@ from astropy.cosmology import z_at_value from bidict import bidict -from ..c_21cmfast import lib +import py21cmfast.c_21cmfast as lib + +from .._cfg import config from .arrays import Array from .exceptions import _process_exitcode from .inputs import ( @@ -889,10 +891,10 @@ class HaloBox(OutputStructZ): _meta = False _c_compute_function = lib.ComputeHaloBox - halo_mass = _arrayfield() - halo_stars = _arrayfield() + count = _arrayfield(optional=True) + halo_mass = _arrayfield(optional=True) + halo_stars = _arrayfield(optional=True) halo_stars_mini = _arrayfield(optional=True) - count = _arrayfield() halo_sfr = _arrayfield() halo_sfr_mini = _arrayfield(optional=True) halo_xray = _arrayfield(optional=True) @@ -922,15 +924,11 @@ def new(cls, inputs: InputParameters, redshift: float, **kw) -> Self: shape = (dim, dim, int(inputs.simulation_options.NON_CUBIC_FACTOR * dim)) out = { - "halo_mass": Array(shape, dtype=np.float32), - "halo_stars": Array(shape, dtype=np.float32), - "count": Array(shape, dtype=np.int32), "halo_sfr": Array(shape, dtype=np.float32), "n_ion": Array(shape, dtype=np.float32), } if inputs.astro_options.USE_MINI_HALOS: 
- out["halo_stars_mini"] = Array(shape, dtype=np.float32) out["halo_sfr_mini"] = Array(shape, dtype=np.float32) if inputs.astro_options.INHOMO_RECO: @@ -939,6 +937,13 @@ def new(cls, inputs: InputParameters, redshift: float, **kw) -> Self: if inputs.astro_options.USE_TS_FLUCT: out["halo_xray"] = Array(shape, dtype=np.float32) + if config["EXTRA_HALOBOX_FIELDS"]: + out["count"] = Array(shape, dtype=np.int32) + out["halo_mass"] = Array(shape, dtype=np.float32) + out["halo_stars"] = Array(shape, dtype=np.float32) + if inputs.astro_options.USE_MINI_HALOS: + out["halo_stars_mini"] = Array(shape, dtype=np.float32) + return cls( inputs=inputs, redshift=redshift, @@ -958,20 +963,24 @@ def get_required_input_arrays(self, input_box: OutputStruct) -> list[str]: "sfr_rng", "xray_rng", ] - elif isinstance(input_box, PerturbedField): - if self.matter_options.FIXED_HALO_GRIDS: - required += ["density"] elif isinstance(input_box, TsBox): if self.astro_options.USE_MINI_HALOS: required += ["J_21_LW"] elif isinstance(input_box, IonizedBox): required += ["ionisation_rate_G12", "z_reion"] elif isinstance(input_box, InitialConditions): - if ( - self.matter_options.HALO_STOCHASTICITY - and self.astro_options.AVG_BELOW_SAMPLER - ): - required += ["lowres_density"] + required += [ + "lowres_density", + "lowres_vx", + "lowres_vy", + "lowres_vz", + ] + if self.matter_options.PERTURB_ALGORITHM == "2LPT": + required += [ + "lowres_vx_2LPT", + "lowres_vy_2LPT", + "lowres_vz_2LPT", + ] if self.matter_options.USE_RELATIVE_VELOCITIES: required += ["lowres_vcb"] else: @@ -984,7 +993,6 @@ def compute( *, initial_conditions: InitialConditions, pt_halos: PerturbHaloField, - perturbed_field: PerturbedField, previous_spin_temp: TsBox, previous_ionize_box: IonizedBox, allow_already_computed: bool = False, @@ -994,7 +1002,6 @@ def compute( allow_already_computed, self.redshift, initial_conditions, - perturbed_field, pt_halos, previous_spin_temp, previous_ionize_box, diff --git 
a/src/py21cmfast/wrapper/photoncons.py b/src/py21cmfast/wrapper/photoncons.py index 9970b305c..19febdba5 100644 --- a/src/py21cmfast/wrapper/photoncons.py +++ b/src/py21cmfast/wrapper/photoncons.py @@ -55,7 +55,8 @@ import numpy as np from scipy.optimize import curve_fit -from ..c_21cmfast import ffi, lib +import py21cmfast.c_21cmfast as lib + from ._utils import _process_exitcode from .cfuncs import broadcast_params from .inputs import InputParameters @@ -79,7 +80,7 @@ def c_memory_allocated(self) -> bool: @c_memory_allocated.setter def c_memory_allocated(self, val): - lib.photon_cons_allocated = ffi.cast("bool", val) + lib.photon_cons_allocated = val _photoncons_state = _PhotonConservationState() @@ -93,20 +94,18 @@ def _init_photon_conservation_correction(*, inputs): return lib.InitialisePhotonCons() -def _calibrate_photon_conservation_correction( - *, redshifts_estimate, nf_estimate, NSpline -): +def _calibrate_photon_conservation_correction(*, redshifts_estimate, nf_estimate): # This function passes the calibration simulation results to C, # Storing a clipped version in global arrays nf_vals and z_vals, # and constructing the GSL interpolator z_NFHistory_spline redshifts_estimate = np.array(redshifts_estimate, dtype="float64") nf_estimate = np.array(nf_estimate, dtype="float64") - z = ffi.cast("double *", ffi.from_buffer(redshifts_estimate)) - xHI = ffi.cast("double *", ffi.from_buffer(nf_estimate)) + z = redshifts_estimate + xHI = nf_estimate logger.debug(f"PhotonCons nf estimates: {nf_estimate}") - return lib.PhotonCons_Calibration(z, xHI, NSpline) + return lib.PhotonCons_Calibration(z, xHI) def _calc_zstart_photon_cons(): @@ -114,7 +113,7 @@ def _calc_zstart_photon_cons(): # Set by neutral fraction astro_params.PHOTONCONS_ZSTART from ._utils import _call_c_simple - return _call_c_simple(lib.ComputeZstart_PhotonCons) + return _call_c_simple(lib.ComputeZstart_PhotonCons)[0] def _get_photon_nonconservation_data() -> dict: @@ -149,16 +148,16 @@ def 
_get_photon_nonconservation_data() -> dict: IntVal2 = np.array(np.zeros(1), dtype="int32") IntVal3 = np.array(np.zeros(1), dtype="int32") - c_z_at_Q = ffi.cast("double *", ffi.from_buffer(data[0])) - c_Qval = ffi.cast("double *", ffi.from_buffer(data[1])) - c_z_cal = ffi.cast("double *", ffi.from_buffer(data[2])) - c_nf_cal = ffi.cast("double *", ffi.from_buffer(data[3])) - c_PC_nf = ffi.cast("double *", ffi.from_buffer(data[4])) - c_PC_deltaz = ffi.cast("double *", ffi.from_buffer(data[5])) + c_z_at_Q = data[0] + c_Qval = data[1] + c_z_cal = data[2] + c_nf_cal = data[3] + c_PC_nf = data[4] + c_PC_deltaz = data[5] - c_int_NQ = ffi.cast("int *", ffi.from_buffer(IntVal1)) - c_int_NC = ffi.cast("int *", ffi.from_buffer(IntVal2)) - c_int_NP = ffi.cast("int *", ffi.from_buffer(IntVal3)) + c_int_NQ = IntVal1 + c_int_NC = IntVal2 + c_int_NP = IntVal3 # Run the C code errcode = lib.ObtainPhotonConsData( @@ -299,7 +298,8 @@ def calibrate_photon_cons( prev_perturb = None # Arrays for redshift and neutral fraction for the calibration curve - neutral_fraction_photon_cons = [] + # TODO: double check, this was empty before, was that a bug? 
+ neutral_fraction_photon_cons = [1.0] # Initialise the analytic expression for the reionisation history logger.info("About to start photon conservation correction") @@ -361,7 +361,6 @@ def calibrate_photon_cons( _calibrate_photon_conservation_correction( redshifts_estimate=fast_node_redshifts, nf_estimate=neutral_fraction_photon_cons, - NSpline=len(fast_node_redshifts), ) @@ -374,9 +373,9 @@ def get_photoncons_dz(inputs, redshift): redshift_pc_in = np.array([redshift]).astype("f4") stored_redshift_pc_in = np.array([redshift]).astype("f4") lib.adjust_redshifts_for_photoncons( - ffi.cast("float *", redshift_pc_in.ctypes.data), - ffi.cast("float *", stored_redshift_pc_in.ctypes.data), - ffi.cast("float *", deltaz.ctypes.data), + redshift_pc_in, + stored_redshift_pc_in, + deltaz, ) return redshift_pc_in[0], stored_redshift_pc_in[0], deltaz[0] @@ -452,7 +451,8 @@ def photoncons_alpha(inputs): # ratio of given alpha with calibration ratio_ref = (1 - ref_pc_data["nf_calibration"]) / ref_interp - ratio_diff = ratio_test - 1 / ratio_ref[None, :] # find N(alpha)/ref == ref/cal + # find N(alpha)/ref == ref/cal + ratio_diff = ratio_test - 1 / ratio_ref[None, :] diff_test = ( (test_pc_data) + (1 - ref_pc_data["nf_calibration"])[None, ...] 
diff --git a/src/py21cmfast/wrapper/structs.py b/src/py21cmfast/wrapper/structs.py index e8a603701..729ee4cf4 100644 --- a/src/py21cmfast/wrapper/structs.py +++ b/src/py21cmfast/wrapper/structs.py @@ -2,14 +2,13 @@ from __future__ import annotations -import contextlib import logging from typing import Any import attrs -from bidict import bidict -from ..c_21cmfast import ffi +import py21cmfast.c_21cmfast as lib + from .arrays import Array logger = logging.getLogger(__name__) @@ -29,9 +28,8 @@ class StructWrapper: _name: str = attrs.field(converter=str) cstruct = attrs.field(default=None) - _ffi = attrs.field(default=ffi) - _TYPEMAP = bidict({"float32": "float *", "float64": "double *", "int32": "int *"}) + primitive_types = (bool, str, int, float) @_name.default def _name_default(self): @@ -43,16 +41,21 @@ def __init__(self, *args): This instantiates the memory associated with the C struct, attached to this inst. """ self.__attrs_init__(*args) - self.cstruct = self._new() + self._cobj = getattr(lib, self._name) # The wrapped class + self.cstruct = self._new() # The instance of the wrapped class def _new(self): """Return a new empty C structure corresponding to this class.""" - return self._ffi.new(f"struct {self._name}*") + return self._cobj() @property def fields(self) -> dict[str, Any]: """A list of fields of the underlying C struct (a list of tuples of "name, type").""" - return dict(self._ffi.typeof(self.cstruct[0]).fields) + result = {} + for attr in dir(self.cstruct): + if not attr.startswith("__"): + result[attr] = type(getattr(self.cstruct, attr)) + return result @property def fieldnames(self) -> list[str]: @@ -62,19 +65,17 @@ def fieldnames(self) -> list[str]: @property def pointer_fields(self) -> list[str]: """A list of names of fields which have pointer type in the C struct.""" - return [f for f, t in self.fields.items() if t.type.kind == "pointer"] + return [f.split("set_")[1] for f in self.fields if f.startswith("set_")] @property def 
primitive_fields(self) -> list[str]: """The list of names of fields which have primitive type in the C struct.""" - return [f for f, t in self.fields.items() if t.type.kind == "primitive"] + return [f for f, t in self.fields.items() if t in self.primitive_types] def __getstate__(self): """Return the current state of the class without pointers.""" return { - k: v - for k, v in self.__dict__.items() - if k not in ["_strings", "cstruct", "_ffi"] + k: v for k, v in self.__dict__.items() if k not in ["_strings", "cstruct"] } def expose_to_c(self, array: Array, name: str): @@ -82,15 +83,13 @@ def expose_to_c(self, array: Array, name: str): if not array.state.initialized: raise ValueError("Array must be initialized before exposing to C") - def _ary2buf(ary): - return self._ffi.cast( - self._TYPEMAP[ary.dtype.name], self._ffi.from_buffer(ary) - ) - try: - setattr(self.cstruct, name, _ary2buf(array.value)) - except TypeError as e: - raise TypeError(f"Error setting {name}") from e + setter = getattr(self.cstruct, "set_" + name) + setter(array.value) + except AttributeError as e: + raise TypeError( + f"Error setting {name} on {self.__class__.__name__}, no setter found" + ) from e class StructInstanceWrapper: @@ -101,35 +100,63 @@ class StructInstanceWrapper: Parameters ---------- wrapped : - The reference to the C object to wrap (contained in the ``cffi.lib`` object). - ffi : - The ``cffi.ffi`` object. + The reference to the C object to wrap. """ - def __init__(self, wrapped, ffi): + # NOTE: currently assumes that the C object is not internally changed + # We get all the values from C on initialization, and pass changes back to C + # The StructInstanceWrapper holds the attributes as they appear in python, + # whereas ._cobj holds primitives and getters/setters for pointers. 
+ # TODO: we should ditch the object attributes and just use the C object + # with a custom __getattr__ + def __init__(self, wrapped): self._cobj = wrapped - self._ffi = ffi - - for nm, _tp in self._ffi.typeof(self._cobj).fields: - setattr(self, nm, getattr(self._cobj, nm)) + # nanobind does not supply a list of fields like CFFI does, so we do + # this instead to return a list of members + for attr in dir(self._cobj): + # ignore dunders + if not attr.startswith("__"): + if attr.startswith("get_"): + # If the attribute is a getter, we need to set the value in python + # to the value of the C++ attribute without the "get_" prefix + setattr(self, attr[4:], getattr(self._cobj, attr)()) + elif not callable(getattr(self._cobj, attr)): + # Otherwise, we just set the attribute to the value + setattr(self, attr, getattr(self._cobj, attr)) # Get the name of the structure - self._ctype = self._ffi.typeof(self._cobj).cname.split()[-1] + self._ctype = type(self._cobj).__name__ def __setattr__(self, name, value): """Set an attribute of the instance, attempting to change it in the C struct as well.""" - with contextlib.suppress(AttributeError): - setattr(self._cobj, name, value) + # use the non-overridden __setattr__ to set the attribute in Python object.__setattr__(self, name, value) + # Set the attribute in the C struct + if not name.startswith("_"): + if "set_" + name in dir(self._cobj): + getattr(self._cobj, "set_" + name)(value) + elif name in dir(self._cobj): + setattr(self._cobj, name, value) + else: + raise AttributeError( + f"Attribute {name} not found in {self.__class__.__name__}" + ) + def items(self): """Yield (name, value) pairs for each element of the struct.""" - for nm, _tp in self._ffi.typeof(self._cobj).fields: - yield nm, getattr(self, nm) + # nanobind does not supply a list of fields like CFFI does, so we do + # this instead to return a list of members + for attr in dir(self._cobj): + if not attr.startswith("__"): + if attr.startswith("get_"): + yield 
attr[4:], getattr(self._cobj, attr)() + elif not attr.startswith("set_"): + yield attr, getattr(self._cobj, attr) def keys(self): """Return a list of names of elements in the struct.""" - return [nm for nm, tp in self.items()] + return [nm for nm, _ in self.items()] def __iter__(self): """Iterate over the object like a dict.""" diff --git a/subprojects/.gitignore b/subprojects/.gitignore new file mode 100644 index 000000000..9a1d1e46e --- /dev/null +++ b/subprojects/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!*.wrap diff --git a/subprojects/nanobind.wrap b/subprojects/nanobind.wrap new file mode 100644 index 000000000..78e2e7c5d --- /dev/null +++ b/subprojects/nanobind.wrap @@ -0,0 +1,13 @@ +[wrap-file] +directory = nanobind-2.4.0 +source_url = https://github.com/wjakob/nanobind/archive/refs/tags/v2.4.0.tar.gz +source_filename = nanobind-2.4.0.tar.gz +source_hash = bb35deaed7efac5029ed1e33880a415638352f757d49207a8e6013fefb6c49a7 +patch_filename = nanobind_2.4.0-2_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/nanobind_2.4.0-2/get_patch +patch_hash = cf493bda0b11ea4e8d9dd42229c3bbdd52af88cc4aedac75a1eccb102b86dd4a +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/nanobind_2.4.0-2/nanobind-2.4.0.tar.gz +wrapdb_version = 2.4.0-2 + +[provide] +nanobind = nanobind_dep diff --git a/subprojects/robin-map.wrap b/subprojects/robin-map.wrap new file mode 100644 index 000000000..3da2993bb --- /dev/null +++ b/subprojects/robin-map.wrap @@ -0,0 +1,13 @@ +[wrap-file] +directory = robin-map-1.3.0 +source_url = https://github.com/Tessil/robin-map/archive/refs/tags/v1.3.0.tar.gz +source_filename = robin-map-1.3.0.tar.gz +source_hash = a8424ad3b0affd4c57ed26f0f3d8a29604f0e1f2ef2089f497f614b1c94c7236 +patch_filename = robin-map_1.3.0-1_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/robin-map_1.3.0-1/get_patch +patch_hash = 6d090f988541ffb053512607e0942cbd0dbc2a4fa0563e44ff6a37e810b8c739 +source_fallback_url = 
https://github.com/mesonbuild/wrapdb/releases/download/robin-map_1.3.0-1/robin-map-1.3.0.tar.gz +wrapdb_version = 1.3.0-1 + +[provide] +robin-map = robin_map_dep diff --git a/tests/test_data/power_spectra_dexm.h5 b/tests/test_data/power_spectra_dexm.h5 index 1c830080b..b87c7e6db 100644 Binary files a/tests/test_data/power_spectra_dexm.h5 and b/tests/test_data/power_spectra_dexm.h5 differ diff --git a/tests/test_data/power_spectra_fixed_halogrids.h5 b/tests/test_data/power_spectra_fixed_halogrids.h5 index 913dfb5e5..9dff2b705 100644 Binary files a/tests/test_data/power_spectra_fixed_halogrids.h5 and b/tests/test_data/power_spectra_fixed_halogrids.h5 differ diff --git a/tests/test_data/power_spectra_sampler.h5 b/tests/test_data/power_spectra_sampler.h5 index 328320c5f..a1ea2c15e 100644 Binary files a/tests/test_data/power_spectra_sampler.h5 and b/tests/test_data/power_spectra_sampler.h5 differ diff --git a/tests/test_data/power_spectra_sampler_ir.h5 b/tests/test_data/power_spectra_sampler_ir.h5 index 3c6952e7e..a70e356f7 100644 Binary files a/tests/test_data/power_spectra_sampler_ir.h5 and b/tests/test_data/power_spectra_sampler_ir.h5 differ diff --git a/tests/test_data/power_spectra_sampler_mini.h5 b/tests/test_data/power_spectra_sampler_mini.h5 index 88cd31c8e..645a0d0b1 100644 Binary files a/tests/test_data/power_spectra_sampler_mini.h5 and b/tests/test_data/power_spectra_sampler_mini.h5 differ diff --git a/tests/test_data/power_spectra_sampler_noncubic.h5 b/tests/test_data/power_spectra_sampler_noncubic.h5 index 1a9920dad..3606f45eb 100644 Binary files a/tests/test_data/power_spectra_sampler_noncubic.h5 and b/tests/test_data/power_spectra_sampler_noncubic.h5 differ diff --git a/tests/test_data/power_spectra_sampler_ts.h5 b/tests/test_data/power_spectra_sampler_ts.h5 index d01c17845..9db1f2e96 100644 Binary files a/tests/test_data/power_spectra_sampler_ts.h5 and b/tests/test_data/power_spectra_sampler_ts.h5 differ diff --git 
a/tests/test_data/power_spectra_sampler_ts_ir.h5 b/tests/test_data/power_spectra_sampler_ts_ir.h5 index e0c67b4bf..c7418c7c1 100644 Binary files a/tests/test_data/power_spectra_sampler_ts_ir.h5 and b/tests/test_data/power_spectra_sampler_ts_ir.h5 differ diff --git a/tests/test_data/power_spectra_sampler_ts_ir_onethread.h5 b/tests/test_data/power_spectra_sampler_ts_ir_onethread.h5 index 5722abbd9..b8b00a7b3 100644 Binary files a/tests/test_data/power_spectra_sampler_ts_ir_onethread.h5 and b/tests/test_data/power_spectra_sampler_ts_ir_onethread.h5 differ diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py index d0d39fb57..1b5a968c0 100644 --- a/tests/test_exceptions.py +++ b/tests/test_exceptions.py @@ -3,10 +3,10 @@ import numpy as np import pytest -from py21cmfast.c_21cmfast import ffi, lib +import py21cmfast.c_21cmfast as lib from py21cmfast.wrapper.exceptions import ( PHOTONCONSERROR, - ParameterError, + PhotonConsError, _process_exitcode, ) @@ -21,19 +21,16 @@ def test_basic(subfunc): def test_simple(subfunc): answer = np.array([0], dtype="f8") - status = lib.FunctionThatCatches( - subfunc, False, ffi.cast("double *", ffi.from_buffer(answer)) - ) - - with pytest.raises(ParameterError): + status = lib.FunctionThatCatches(subfunc, False, answer) + with pytest.raises(PhotonConsError): _process_exitcode( status, lib.FunctionThatCatches, - (False, ffi.cast("double *", ffi.from_buffer(answer))), + (subfunc, False, answer), ) def test_pass(): answer = np.array([0], dtype="f8") - lib.FunctionThatCatches(True, True, ffi.cast("double *", ffi.from_buffer(answer))) + lib.FunctionThatCatches(True, True, answer) assert answer == 5.0 diff --git a/tests/test_filtering.py b/tests/test_filtering.py index adbd9a00e..9e92224e2 100644 --- a/tests/test_filtering.py +++ b/tests/test_filtering.py @@ -6,7 +6,7 @@ from matplotlib.colors import Normalize from scipy.stats import binned_statistic as binstat -from py21cmfast.c_21cmfast import ffi, lib +import 
py21cmfast.c_21cmfast as lib from py21cmfast.wrapper.cfuncs import broadcast_input_struct from . import produce_integration_test_data as prd @@ -98,19 +98,19 @@ def test_filters(filter_flag, R, plt): output_box_centre = np.zeros((up.HII_DIM,) * 3, dtype="f8") # use MFP=20 for the exp filter, use a 4 cell shell for the annular filter if filter_flag == 3: - R_param = 20 + R_param = 20.0 elif filter_flag == 4: - R_param = max(R - 4 * (up.BOX_LEN / up.HII_DIM), 0) + R_param = max(R - 4 * (up.BOX_LEN / up.HII_DIM), 0.0) else: - R_param = 0 + R_param = 0.0 broadcast_input_struct(inputs) lib.test_filter( - ffi.cast("float *", input_box_centre.ctypes.data), + input_box_centre, R, R_param, filter_flag, - ffi.cast("double *", output_box_centre.ctypes.data), + output_box_centre, ) # expected outputs given in cell units diff --git a/tests/test_halo_sampler.py b/tests/test_halo_sampler.py index 961417b19..54c9c611d 100644 --- a/tests/test_halo_sampler.py +++ b/tests/test_halo_sampler.py @@ -4,11 +4,6 @@ import numpy as np import pytest -from py21cmfast import ( - compute_halo_grid, - compute_initial_conditions, - perturb_field, -) from py21cmfast.wrapper import cfuncs as cf from . import test_c_interpolation_tables as cint @@ -231,112 +226,6 @@ def test_halo_prop_sampling(default_input_struct_ts, plt): np.testing.assert_allclose(exp_LX, sim_LX, rtol=1e-4) -# testing that the integrals in HaloBox.c are done correctly by -# using the fixed grids -# TODO: extend test to minihalos w/o feedback -# TODO: maybe let this run with the default ics and perturbed field, -# even though they have different flag options? 
-def test_fixed_grids(default_input_struct_ts, plt): - inputs = default_input_struct_ts.evolve_input_structs( - USE_HALO_FIELD=True, - FIXED_HALO_GRIDS=True, - USE_UPPER_STELLAR_TURNOVER=False, - ) - - ic = compute_initial_conditions( - inputs=inputs, - ) - perturbed_field = perturb_field(initial_conditions=ic, redshift=10.0, inputs=inputs) - dens = perturbed_field.get("density") - - hbox = compute_halo_grid( - initial_conditions=ic, - inputs=inputs, - perturbed_field=perturbed_field, - ) - - cell_radius = 0.620350491 * ( - inputs.simulation_options.BOX_LEN / inputs.simulation_options.HII_DIM - ) - mt_grid = np.full_like(dens, inputs.astro_params.M_TURN) - - integral_sfrd, _ = cf.evaluate_SFRD_cond( - inputs=inputs, - redshift=perturbed_field.redshift, - radius=cell_radius, - densities=dens, - log10mturns=mt_grid, - ) - integral_sfrd *= 1 + dens - - integral_nion, _ = cf.evaluate_Nion_cond( - inputs=inputs, - redshift=perturbed_field.redshift, - radius=cell_radius, - densities=dens, - l10mturns_acg=mt_grid, - l10mturns_mcg=mt_grid, - ) - integral_nion *= 1 + dens - - integral_xray = cf.evaluate_Xray_cond( - inputs=inputs, - redshift=perturbed_field.redshift, - radius=cell_radius, - densities=perturbed_field.density.value, - log10mturns=mt_grid, - ) - integral_xray *= 1 + dens - - # mean-fixing and prefactor numerics results in 1-to-1 comparisons being more difficult - # for now we just test the relative values - integral_sfrd *= hbox.get("halo_sfr").mean() / integral_sfrd.mean() - integral_nion *= hbox.get("n_ion").mean() / integral_nion.mean() - integral_xray *= hbox.get("halo_xray").mean() / integral_xray.mean() - - if plt == mpl.pyplot: - plot_scatter_comparison( - [integral_sfrd, integral_nion, integral_xray], - [hbox.get("halo_sfr"), hbox.get("n_ion"), hbox.get("halo_xray")], - [dens, dens, dens], - ["SFRD", "Nion", "LX"], - plt=plt, - ) - - # TODO: a 5% tolerance isn't fantastic here since they should be the same to a constant factor. 
- # this happens near the GL integration transition (<1%) and delta_crit (~4%), examine plots - rtol = 5e-2 - print(f"{hbox.get('halo_sfr').shape} {integral_sfrd.shape}", flush=True) - print_failure_stats( - hbox.get("halo_sfr"), - integral_sfrd, - [dens], - 0.0, - rtol, - "sfr", - ) - print_failure_stats( - hbox.get("n_ion"), - integral_nion, - [dens], - 0.0, - rtol, - "nion", - ) - print_failure_stats( - hbox.get("halo_xray"), - integral_xray, - [dens], - 0.0, - rtol, - "LX", - ) - - np.testing.assert_allclose(hbox.get("halo_sfr"), integral_sfrd, rtol=rtol) - np.testing.assert_allclose(hbox.get("n_ion"), integral_nion, rtol=rtol) - np.testing.assert_allclose(hbox.get("halo_xray"), integral_xray, rtol=rtol) - - # very basic scatter comparison def plot_scatter_comparison( truths, tests, inputs, names, log_vals=True, log_inp=False, plt=None diff --git a/tests/test_integration_features.py b/tests/test_integration_features.py index 880d927da..b873b9247 100644 --- a/tests/test_integration_features.py +++ b/tests/test_integration_features.py @@ -76,10 +76,9 @@ def test_power_spectra_coeval(name, module_direc, plt): [test_k], abs_tol=0, rel_tol=1e-4, - name=key, + name=f"{name} - {key}", ) - any_failed = True # TODO:remove this testing line if plt == mpl.pyplot and any_failed: make_coeval_comparison_plot(true_k, test_k, true_powers, test_powers, plt) diff --git a/tests/test_output_structs.py b/tests/test_output_structs.py index 812db02ad..bdd6726cb 100644 --- a/tests/test_output_structs.py +++ b/tests/test_output_structs.py @@ -9,8 +9,10 @@ from py21cmfast import ( InitialConditions, # An example of an output struct InputParameters, + config, ) from py21cmfast.wrapper import outputs as ox +from py21cmfast.wrapper.arrays import Array @pytest.fixture @@ -76,3 +78,157 @@ def test_all_fields_exist(struct: ox.OutputStruct): for name in cstruct.primitive_fields: assert name in this + + +# NOTE: These do not test every field, but does test every conditional in the +# 
OutputStruct constructors, a better approach would probably be to have a +# comprehensive list of {"field_name": {"flag": value}} conditions for the fields +# in the output module which is checked in the constructors +def test_optional_field_ic(default_input_struct_lc: InputParameters): + """Ensure that the correct InitialConditions fields are set based on the parameters.""" + ic = ox.InitialConditions.new(inputs=default_input_struct_lc) + assert isinstance(ic.lowres_vx, Array) + assert isinstance(ic.lowres_vx_2LPT, Array) + assert ic.hires_vx is None + assert isinstance(ic.hires_vx_2LPT, Array) # Python requires it, check the C + assert ic.lowres_vcb is None + + ic = ox.InitialConditions.new( + inputs=default_input_struct_lc.evolve_input_structs( + PERTURB_ALGORITHM="ZELDOVICH" + ) + ) + assert isinstance(ic.lowres_vy, Array) + assert ic.lowres_vy_2LPT is None + assert ic.hires_vy is None + assert ic.hires_vy_2LPT is None + + ic = ox.InitialConditions.new( + inputs=default_input_struct_lc.evolve_input_structs(PERTURB_ON_HIGH_RES=True) + ) + assert ic.lowres_vz is None + assert ic.lowres_vz_2LPT is None + assert isinstance(ic.hires_vz, Array) + assert isinstance(ic.hires_vz_2LPT, Array) + + ic = ox.InitialConditions.new( + inputs=default_input_struct_lc.evolve_input_structs( + USE_RELATIVE_VELOCITIES=True, + POWER_SPECTRUM="CLASS", + ) + ) + assert isinstance(ic.lowres_vx, Array) + assert isinstance(ic.lowres_vx_2LPT, Array) + assert ic.hires_vx is None + assert isinstance(ic.hires_vx_2LPT, Array) + assert isinstance(ic.lowres_vcb, Array) + + +def test_optional_field_perturb(default_input_struct_lc: InputParameters): + """Ensure that the correct PerturbedField fields are set based on the parameters.""" + pt = ox.PerturbedField.new(redshift=0.0, inputs=default_input_struct_lc) + assert isinstance(pt.density, Array) + assert isinstance(pt.velocity_z, Array) + assert isinstance(pt.velocity_x, Array) + assert isinstance(pt.velocity_y, Array) + + pt = 
ox.PerturbedField.new( + redshift=0.0, + inputs=default_input_struct_lc.evolve_input_structs(KEEP_3D_VELOCITIES=False), + ) + assert isinstance(pt.density, Array) + assert isinstance(pt.velocity_z, Array) + assert pt.velocity_x is None + assert pt.velocity_y is None + + +def test_optional_field_halobox(default_input_struct_lc: InputParameters): + """Ensure that the correct HaloBox fields are set based on the parameters.""" + hb = ox.HaloBox.new(redshift=0.0, inputs=default_input_struct_lc) + assert hb.halo_mass is None + assert isinstance(hb.halo_sfr, Array) + assert isinstance(hb.n_ion, Array) + assert hb.halo_sfr_mini is None + assert hb.halo_xray is None + assert hb.whalo_sfr is None + + with config.use(EXTRA_HALOBOX_FIELDS=True): + hb = ox.HaloBox.new(redshift=0.0, inputs=default_input_struct_lc) + assert isinstance(hb.halo_mass, Array) + + inputs = default_input_struct_lc.evolve_input_structs(INHOMO_RECO=True) + hb = ox.HaloBox.new(redshift=0.0, inputs=inputs) + assert isinstance(hb.whalo_sfr, Array) + + inputs = inputs.evolve_input_structs(USE_TS_FLUCT=True) + hb = ox.HaloBox.new(redshift=0.0, inputs=inputs) + assert isinstance(hb.halo_xray, Array) + + inputs = inputs.evolve_input_structs(USE_MINI_HALOS=True) + hb = ox.HaloBox.new(redshift=0.0, inputs=inputs) + assert isinstance(hb.halo_sfr_mini, Array) + + +def test_optional_field_xrs(default_input_struct_lc: InputParameters): + """Ensure that the correct XraySourceBox fields are set based on the parameters.""" + xr = ox.XraySourceBox.new(redshift=0.0, inputs=default_input_struct_lc) + assert isinstance(xr.filtered_sfr, Array) + assert isinstance(xr.filtered_xray, Array) + assert xr.filtered_sfr_mini is None + + inputs = default_input_struct_lc.evolve_input_structs( + USE_TS_FLUCT=True, + USE_MINI_HALOS=True, + INHOMO_RECO=True, + ) + xr = ox.XraySourceBox.new(redshift=0.0, inputs=inputs) + assert isinstance(xr.filtered_sfr_mini, Array) + + +def test_optional_field_ts(default_input_struct_lc: 
InputParameters): + """Ensure that the correct TsBox fields are set based on the parameters.""" + ts = ox.TsBox.new(redshift=0.0, inputs=default_input_struct_lc) + assert isinstance(ts.spin_temperature, Array) + assert isinstance(ts.xray_ionised_fraction, Array) + assert isinstance(ts.kinetic_temp_neutral, Array) + assert ts.J_21_LW is None + + inputs = default_input_struct_lc.evolve_input_structs( + USE_TS_FLUCT=True, + INHOMO_RECO=True, + USE_MINI_HALOS=True, + ) + ts = ox.TsBox.new(redshift=0.0, inputs=inputs) + assert isinstance(ts.J_21_LW, Array) + + +def test_optional_field_ion(default_input_struct_lc: InputParameters): + """Ensure that the correct IonizedBox fields are set based on the parameters.""" + ion = ox.IonizedBox.new(redshift=0.0, inputs=default_input_struct_lc) + assert isinstance(ion.neutral_fraction, Array) + assert ion.unnormalised_nion_mini is None + assert ion.cumulative_recombinations is None + + inputs = default_input_struct_lc.evolve_input_structs( + INHOMO_RECO=True, + ) + ion = ox.IonizedBox.new(redshift=0.0, inputs=inputs) + assert isinstance(ion.cumulative_recombinations, Array) + + inputs = inputs.evolve_input_structs( + USE_TS_FLUCT=True, + USE_MINI_HALOS=True, + ) + ion = ox.IonizedBox.new(redshift=0.0, inputs=inputs) + assert isinstance(ion.unnormalised_nion_mini, Array) + + +def test_optional_field_bt(default_input_struct_lc: InputParameters): + """Ensure that the correct BrightnessTemp fields are set based on the parameters.""" + bt = ox.BrightnessTemp.new(redshift=0.0, inputs=default_input_struct_lc) + assert isinstance(bt.brightness_temp, Array) + assert bt.tau_21 is None + + inputs = default_input_struct_lc.evolve_input_structs(USE_TS_FLUCT=True) + bt = ox.BrightnessTemp.new(redshift=0.0, inputs=inputs) + assert isinstance(bt.tau_21, Array) diff --git a/tests/test_perturb.py b/tests/test_perturb.py new file mode 100644 index 000000000..3871ce3ac --- /dev/null +++ b/tests/test_perturb.py @@ -0,0 +1,229 @@ +"""Contains the tests 
for the Perturbation algorithm (Linear, Zel'dovich, 2LPT). + +Including perturbation of galaxy properties +""" + +import numpy as np +import pytest + +from py21cmfast import ( + InitialConditions, + compute_halo_grid, + perturb_field, +) +from py21cmfast.wrapper import cfuncs as cf + + +class TestPerturb: + """Tests regarding the perturbation algorithms.""" + + @pytest.fixture(scope="class") + def test_pt_z(self): + """Set redshift at which to test the 2LPT.""" + return 8.0 + + @pytest.fixture(scope="class") + def inputs_low(self, default_input_struct_ts): + """Parameters for 2LPT tests.""" + # using 3-1 ratio for testing + return default_input_struct_ts.evolve_input_structs( + DIM=12, + HII_DIM=4, + BOX_LEN=8, + USE_HALO_FIELD=True, + FIXED_HALO_GRIDS=True, + PERTURB_ON_HIGH_RES=False, + R_BUBBLE_MAX=1.0, + ) + + @pytest.fixture(scope="class") + def inputs_zel(self, inputs_low): + """Parameters for Zel'dovich test.""" + return inputs_low.evolve_input_structs( + PERTURB_ALGORITHM="ZELDOVICH", + ) + + @pytest.fixture(scope="class") + def inputs_linear(self, inputs_low): + """Parameters for Linear test.""" + return inputs_low.evolve_input_structs( + PERTURB_ALGORITHM="LINEAR", + ) + + def get_fake_ics(self, inputs, test_pt_z): + """Make an IC instance for the testing. + + These are inconsistent and strange values for real ICS but + very trackable. 
+ """ + ics = InitialConditions.new(inputs=inputs) + d_z = cf.get_growth_factor(inputs=inputs, redshift=test_pt_z) + d_z_i = cf.get_growth_factor( + inputs=inputs, redshift=inputs.simulation_options.INITIAL_REDSHIFT + ) + + res_fac = int(inputs.simulation_options.HIRES_TO_LOWRES_FACTOR) + lo_dim = inputs.simulation_options.HII_DIM + hi_dim = inputs.simulation_options.DIM + fac_1lpt = inputs.simulation_options.cell_size / (d_z - d_z_i) + fac_2lpt = inputs.simulation_options.cell_size / ( + (-3.0 / 7.0) * (d_z**2 - d_z_i**2) + ) + for name, array in ics.arrays.items(): + setattr(ics, name, array.initialize().computed()) + + # setup the velocities + # NOTE: IC velocities are in Mpc + if not inputs.matter_options.PERTURB_ON_HIGH_RES: + fake_v = np.ones_like(ics.get("lowres_vx")) + ics.set("lowres_vx", 0 * fake_v) + ics.set("lowres_vy", fac_1lpt * fake_v) + ics.set("lowres_vz", 0 * fake_v) + if inputs.matter_options.PERTURB_ALGORITHM == "2LPT": + ics.set("lowres_vx_2LPT", 0 * fake_v) + ics.set("lowres_vy_2LPT", 0 * fake_v) + ics.set("lowres_vz_2LPT", fac_2lpt * fake_v) + else: + fake_v = np.ones_like(ics.get("hires_vx")) + ics.set("hires_vx", 0 * fake_v) + ics.set("hires_vy", -fac_1lpt * fake_v) + ics.set("hires_vz", 0 * fake_v) + if inputs.matter_options.PERTURB_ALGORITHM == "2LPT": + ics.set("hires_vx_2LPT", 0 * fake_v) + ics.set("hires_vy_2LPT", 0 * fake_v) + ics.set("hires_vz_2LPT", -fac_2lpt * fake_v) + + # set some densities that can be easily tracked + d_lo = np.zeros_like(ics.get("lowres_density")) + d_lo[0, 0, 0] = 1 + d_lo[lo_dim // 2, lo_dim // 2, lo_dim // 2] = -1 + ics.set("lowres_density", d_lo) + # make similar hires densities + d_hi = np.zeros_like(ics.get("hires_density")) + d_hi[0, 0, 0] = res_fac**3 + d_hi[hi_dim // 2, hi_dim // 2, hi_dim // 2] = -(res_fac**3) + ics.set("hires_density", d_hi) + + return ics + + @pytest.mark.parametrize("inputs", ["inputs_low", "inputs_zel"]) + def test_lowres_perturb(self, inputs, test_pt_z, request): + """Tests 
low-resolution perturbation.""" + inputs = request.getfixturevalue(inputs) + ics = self.get_fake_ics(inputs, test_pt_z) + z_d = ( + test_pt_z + if inputs.matter_options.PERTURB_ALGORITHM == "LINEAR" + else inputs.simulation_options.INITIAL_REDSHIFT + ) + roll_var = { + "LINEAR": (0, 0, 0), + "ZELDOVICH": (0, 1, 0), + "2LPT": (0, 1, -1), + }[inputs.matter_options.PERTURB_ALGORITHM] + d_z = cf.get_growth_factor(inputs=inputs, redshift=z_d) + + expected_dens = np.roll(ics.get("lowres_density"), roll_var, (0, 1, 2)) + expected_dens *= d_z + pt = perturb_field( + initial_conditions=ics, + redshift=test_pt_z, + regenerate=True, + write=False, + ) + np.testing.assert_allclose(pt.get("density"), expected_dens, atol=1e-3) + + @pytest.mark.skip( + reason="aliasing in downsampling makes hires 2lpt unit tests difficult" + ) + def test_hires_perturb(self, inputs_hi, test_pt_z): + """Tests the high resolution perturbation.""" + ics = self.get_fake_ics(inputs_hi, test_pt_z) + expected_dens = np.roll(ics.get("lowres_density"), (0, -1, 1), (0, 1, 2)) + d_z_i = cf.get_growth_factor(inputs=inputs_hi, redshift=test_pt_z) + expected_dens *= d_z_i + pt = perturb_field( + initial_conditions=ics, + redshift=test_pt_z, + regenerate=True, + write=False, + ) + np.testing.assert_allclose(pt.get("density"), expected_dens, atol=1e-3) + + # TODO: include minihalo properties + # TODO: include linear (for some reason) + @pytest.mark.parametrize("inputs", ["inputs_low", "inputs_zel"]) + def test_hb_perturb(self, inputs, test_pt_z, request): + """Tests the halo property perturbation.""" + inputs = request.getfixturevalue(inputs) + ics = self.get_fake_ics(inputs, test_pt_z) + hbox = compute_halo_grid( + redshift=test_pt_z, + initial_conditions=ics, + inputs=inputs, + ) + cell_radius = 0.620350491 * ( + inputs.simulation_options.BOX_LEN / inputs.simulation_options.HII_DIM + ) + d_z = cf.get_growth_factor( + inputs=inputs, + redshift=test_pt_z, + ) + roll_var = { + "LINEAR": (0, 0, 0), + "ZELDOVICH": 
(0, 1, 0), + "2LPT": (0, 1, -1), + }[inputs.matter_options.PERTURB_ALGORITHM] + dens = np.roll(ics.get("lowres_density"), roll_var, (0, 1, 2)) * d_z + mt_grid = np.full_like(dens, inputs.astro_params.M_TURN) + + prefac_sfr = ( + inputs.cosmo_params.cosmo.critical_density(0).to("Msun Mpc-3").value + * inputs.astro_params.cdict["F_STAR10"] + * inputs.cosmo_params.OMb + * inputs.cosmo_params.cosmo.H(test_pt_z).to("s-1").value + / inputs.astro_params.t_STAR + ) + prefac_nion = ( + inputs.cosmo_params.cosmo.critical_density(0).to("Msun Mpc-3").value + * inputs.astro_params.cdict["F_STAR10"] + * inputs.cosmo_params.OMb + * inputs.astro_params.cdict["F_ESC10"] + * inputs.astro_params.cdict["POP2_ION"] + ) + prefac_xray = ( + inputs.cosmo_params.cosmo.critical_density(0).to("Msun Mpc-3").value + * inputs.cosmo_params.OMm + ) + integral_sfrd, _ = cf.evaluate_SFRD_cond( + inputs=inputs, + redshift=test_pt_z, + radius=cell_radius, + densities=dens, + log10mturns=mt_grid, + ) + integral_sfrd *= prefac_sfr + + integral_nion, _ = cf.evaluate_Nion_cond( + inputs=inputs, + redshift=test_pt_z, + radius=cell_radius, + densities=dens, + l10mturns_acg=mt_grid, + l10mturns_mcg=mt_grid, + ) + integral_nion *= prefac_nion + + integral_xray = cf.evaluate_Xray_cond( + inputs=inputs, + redshift=test_pt_z, + radius=cell_radius, + densities=dens, + log10mturns=mt_grid, + ) + integral_xray *= prefac_xray + + rtol = 1e-2 + np.testing.assert_allclose(hbox.get("halo_sfr"), integral_sfrd, rtol=rtol) + np.testing.assert_allclose(hbox.get("n_ion"), integral_nion, rtol=rtol) + np.testing.assert_allclose(hbox.get("halo_xray"), integral_xray, rtol=rtol) diff --git a/tests/test_tables.py b/tests/test_tables.py index 6fc9349f8..8d3a3ad94 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -1,6 +1,6 @@ """Test initializing tables in C.""" -from py21cmfast.c_21cmfast import lib +import py21cmfast.c_21cmfast as lib from py21cmfast.wrapper.cfuncs import broadcast_input_struct