diff --git a/.bumpversion.cfg b/.bumpversion.cfg deleted file mode 100644 index 97e8d915f..000000000 --- a/.bumpversion.cfg +++ /dev/null @@ -1,6 +0,0 @@ -[bumpversion] -current_version = 3.4.0 -commit = False -tag = False - -[bumpversion:file:VERSION] diff --git a/.gitignore b/.gitignore index ae1ec4e16..746a9461e 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,8 @@ src/21cmFAST.egg-info/ pip-wheel-metadata/ src/py21cmfast\.egg-info/ +.python-version + *.so build/ *.*~ diff --git a/VERSION b/VERSION deleted file mode 100644 index 18091983f..000000000 --- a/VERSION +++ /dev/null @@ -1 +0,0 @@ -3.4.0 diff --git a/build_cffi.py b/build_cffi.py deleted file mode 100755 index 2b132e78c..000000000 --- a/build_cffi.py +++ /dev/null @@ -1,132 +0,0 @@ -"""Build the C code with CFFI.""" - -import os -import sys -import sysconfig -from pathlib import Path - -from cffi import FFI - -# Get the compiler. We support gcc and clang. -# The compiler is determnined from the environment and uses sysconfig as a fallback. -source = "environment variable 'CC'" if "CC" in os.environ else "sysconfig" -_compiler = os.environ.get("CC", sysconfig.get_config_var("CC")) -print(f"Using compiler from {source}: {_compiler}") - -if "gcc" in _compiler: - compiler = "gcc" -elif "clang" in _compiler: - compiler = "clang" -else: - raise ValueError(f"Compiler {_compiler} not supported for 21cmFAST") - -ffi = FFI() - -LOCATION = Path(__file__).resolve().parent -CLOC = LOCATION / "src" / "py21cmfast" / "src" -include_dirs = [str(CLOC)] -c_files = [str(fl.relative_to(LOCATION)) for fl in sorted(CLOC.glob("*.c"))] - -# Set the C-code logging level. -# If DEBUG is set, we default to the highest level, but if not, -# we set it to the level just above no logging at all. 
-log_level = os.environ.get("LOG_LEVEL", 4 if "DEBUG" in os.environ else 1) -available_levels = [ - "NONE", - "ERROR", - "WARNING", - "INFO", - "DEBUG", - "SUPER_DEBUG", - "ULTRA_DEBUG", -] - - -if isinstance(log_level, str) and log_level.upper() in available_levels: - log_level = available_levels.index(log_level.upper()) - -try: - log_level = int(log_level) -except ValueError as e: - # note: for py35 support, can't use f strings. - raise ValueError( - "LOG_LEVEL must be specified as a positive integer, or one " - f"of {available_levels}" - ) from e - -# ================================================== -# Set compilation arguments dependent on environment -# ================================================== - -extra_compile_args = ["-Wall", "--verbose", f"-DLOG_LEVEL={log_level:d}"] - -if "DEBUG" in os.environ: - extra_compile_args += ["-g", "-O0"] -else: - extra_compile_args += ["-Ofast"] - -if sys.platform == "darwin": - extra_compile_args += ["-Xpreprocessor"] - -extra_compile_args += ["-fopenmp"] - -libraries = ["m", "gsl", "gslcblas", "fftw3f_omp", "fftw3f"] - -# stuff for gperftools -if "PROFILE" in os.environ: - libraries += ["profiler", "tcmalloc"] - # we need this even if DEBUG is off - extra_compile_args += ["-g"] - -if compiler == "clang": - libraries += ["omp"] - -library_dirs = [] -for k, v in os.environ.items(): - if "inc" in k.lower(): - include_dirs += [v] - elif "lib" in k.lower(): - library_dirs += [v] - -# ================================================================= -# NOTES FOR DEVELOPERS: -# The CFFI implementation works as follows: -# - All function prototypes, global variables and type definitions *directly* used -# in the python wrapper must be declared via ffi.cdef("""C CODE"""). -# There must be no compiler directives in this code (#include, #define, etc) -# - All implementations of global variables and types present in the cdef() calls -# must also be present in the second argument of set_source. -# This is passed to the compiler. 
-# - The `sources` kwarg then contains all the .c files in the library which are to be compiled - -# This is the overall C code. -ffi.set_source( - "py21cmfast.c_21cmfast", # Name/Location of shared library module - """ - #include "21cmFAST.h" - """, - sources=c_files, - include_dirs=include_dirs, - library_dirs=library_dirs, - libraries=libraries, - extra_compile_args=extra_compile_args, -) - -# Header files containing types, globals and function prototypes -with (CLOC / "_inputparams_wrapper.h").open() as f: - ffi.cdef(f.read()) -with (CLOC / "_outputstructs_wrapper.h").open() as f: - ffi.cdef(f.read()) -with (CLOC / "_functionprototypes_wrapper.h").open() as f: - ffi.cdef(f.read()) - -# CFFI needs to be able to access a free function to make the __del__ method for OutputStruct fields -# This will expose the standard free() function to the wrapper so it can be used -ffi.cdef( - """ - void free(void *ptr); - """ -) - -if __name__ == "__main__": - ffi.compile() diff --git a/bump b/bump deleted file mode 100755 index 1d8fa939d..000000000 --- a/bump +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/bash -set -e - -PART=$1 - -OLDVERSION=$(cat VERSION) -NEWVERSION=$(bump2version --dry-run --list ${PART} | grep new_version | sed -r s,"^.*=",,) -echo "New Version: ${NEWVERSION}" - -# Actually Run The Update -bump2version $PART - -# Now add in stuff to the changelog -python changethelog.py ${NEWVERSION} - -# Now commit -git add . 
-git commit -m "Bump Version: ${OLDVERSION} -> ${NEWVERSION}" diff --git a/environment_dev.yml b/environment_dev.yml index 8a73359c3..3e3b1bb88 100644 --- a/environment_dev.yml +++ b/environment_dev.yml @@ -8,7 +8,6 @@ dependencies: - zlib - pip - libxml2 - - libffi - zipp - click - scipy @@ -47,7 +46,6 @@ dependencies: - qt - packaging - ipython_genutils - - cffi - pytest-remotedata - nomkl - chardet @@ -117,4 +115,4 @@ dependencies: - pytest-plt - questionary - pip: - - pre-commit + - pre-commit diff --git a/install_custom.py b/install_custom.py new file mode 100755 index 000000000..a2a7624d4 --- /dev/null +++ b/install_custom.py @@ -0,0 +1,72 @@ +#!/bin/python +""" +install_custom.py provides a custom installation process for the 21cmFAST package. + +Allows users to specify various build and configuration options via command-line arguments. + +Features: +- Allows setting the log level for the installation process. +- Provides an option to enable debug symbols for the build. +- Enables customization of the optimization level for the build process. + +Command-line Arguments: +- --log-level: Specifies the log level for the build process. Options include: + NO_LOG, ERROR, WARNING, INFO, DEBUG, SUPER_DEBUG, ULTRA_DEBUG. Defaults to WARNING. +- --debug: Enables debug symbols for the build, which can be useful for debugging. +- -o, --optimization: Sets the optimization level for the build (e.g., -O0, -O1, -O2, -O3). + Defaults to 3. + +Usage: +Run the script from the command line to install 21cmFAST with the desired options: + python install_custom.py [options] + +Example: + python install_custom.py --log-level DEBUG --debug -o 2 + +Dependencies: +- Python 3.x +- pip (Python package installer) + +Note: +This script uses the `subprocess` module to invoke the `pip install` command with +custom configuration settings. 
+ +""" + +import argparse +import subprocess + +# Define the command-line arguments +parser = argparse.ArgumentParser(description="Install 21cmFAST with custom options.") +parser.add_argument( + "--log-level", + type=str, + default="WARNING", + help="Set the log level (NO_LOG, ERROR, WARNING, INFO, DEBUG, SUPER_DEBUG, ULTRA_DEBUG)", +) +parser.add_argument("--debug", action="store_true", help="Enable debug symbols") +parser.add_argument( + "-o", + "--optimization", + help="optimisation level (i,e -O0, -O1, -O2, -O3)", + default="3", +) + +args = parser.parse_args() +if args.optimization not in ["0", "g", "1", "2", "3", "s"]: + raise ValueError("Invalid optimization level. Choose from 0, g, 1, 2, 3 or s") + +# Get the LOG_LEVEL environment variable (default to 'WARNING' if not set) +log_level_str = args.log_level +setup_args = [ + f"--config-setting=setup-args=-Dlog_level={log_level_str}", +] + +debug = str(args.debug).lower() + +setup_args += [f"--config-setting=setup-args=-Ddebug={args.debug}"] +setup_args += [f"--config-setting=setup-args=-Doptimization={args.optimization}"] + + +# Run pip install with the specified options +subprocess.run(["pip", "install", ".", *setup_args]) diff --git a/meson.build b/meson.build new file mode 100644 index 000000000..6bc0cfbed --- /dev/null +++ b/meson.build @@ -0,0 +1,36 @@ +#To find the version we need to find python and run setuptools-scm, while keeping the project() call first +project('21cmFAST', ['c', 'cpp'], + version : run_command(find_program('python'), '-c', 'from setuptools_scm import get_version; print(get_version())', check: true).stdout().strip(), + default_options : ['cpp_std=c++17'], +) + +# Check if CUDA should be enabled after project declaration +env_cuda_str = run_command('printenv', 'USE_CUDA', check: false).stdout().strip() +if env_cuda_str == '' + env_cuda_flag = get_option('use_cuda') +else + if env_cuda_str == 'TRUE' + env_cuda_flag = true + else + env_cuda_flag = false + endif +endif + +nvcc_prog = 
find_program('nvcc', required: false) + +# Add CUDA language if both the environment variable is set AND nvcc is available +if env_cuda_flag and nvcc_prog.found() + add_languages('cuda') + # For CUDA, we need to pass the C++ standard as a compiler argument since we're adding the language dynamically + add_project_arguments('--std=c++17', language: 'cuda') + message('CUDA enabled: nvcc found at ' + nvcc_prog.full_path()) +else + if env_cuda_flag and not nvcc_prog.found() + warning('USE_CUDA=TRUE but nvcc not found, building CPU-only version') + endif + message('Building CPU-only version') +endif + +py = import('python').find_installation(pure: false) + +subdir('src') diff --git a/meson.options b/meson.options new file mode 100644 index 000000000..b6db5bbb9 --- /dev/null +++ b/meson.options @@ -0,0 +1,3 @@ +# Define the log level option +option('log_level', type: 'string', value: 'WARNING', description: 'Set the log level (e.g., NO_LOG, ERROR, WARNING, INFO, DEBUG, SUPER_DEBUG, ULTRA_DEBUG)') +option('use_cuda',type: 'boolean', value: false, description: 'Attempt to find and use CUDA in the compilation if set to TRUE') diff --git a/pyproject.toml b/pyproject.toml index 3a17beeaf..d8f8647b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,128 @@ +[project] +name="21cmFAST" +dynamic = ["version"] +license="MIT" +license-files = ["LICENSE"] +description="A semi-numerical cosmological simulation code for the 21cm signal" +# long_description="%s\n%s" +# % ( +# re.compile("^.. start-badges.*^.. 
end-badges", re.M | re.S).sub( +# "", _read("README.rst") +# ), +# re.sub(":[a-z]+:`~?(.*?)`", r"``\1``", _read("CHANGELOG.rst")), +# ) +# long_description_content_type="text/x-rst" +authors=[ {name = "The 21cmFAST coredev team" ,email = "21cmfast.coredev@gmail.com"}] +readme= {file = "README.rst", content-type = "text/x-rst" } +include_package_data=true +requires-python=">=3.10" +classifiers=[ + # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Operating System :: Unix", + "Operating System :: POSIX", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", +] +keywords=["Epoch of Reionization", "Cosmology"] +dependencies=[ + "click", + "numpy>=2.0", + "pyyaml", + "cffi>=1.0", + "scipy", + "astropy>=2.0", + "h5py>=2.8.0", + "matplotlib", + "bidict", + "cosmotile>=0.2.5", + "attrs", + "tqdm", + "classy", + "cyclopts", + "tomlkit", +] + +# [tool.setuptools.packages.find] +# where = ["src/py21cmfast"] + +[project.optional-dependencies] +tests = [ + "clang-format", + "clang-tidy", + "hmf", + "pre-commit", + "pytest>=5.0", + "pytest-cov", + "tox", + "pytest-remotedata>=0.3.2", + "powerbox", + "pytest-plt", + "pytest-benchmark", + "tyro", + "rich", + "pytest-xdist", + "pytest-mock", +] +docs = [ + "nbsphinx", + "numpydoc", + "sphinx >= 1.3", + "furo" +] +# When the min python version supports PEP 735, this can be simplified +# as dev = test_req + doc_req again (as it was implemented in setup.py) +dev = [ + "clang-format", + "clang-tidy", + "hmf", + "pre-commit", + "pytest>=5.0", + "pytest-cov", + "tox", + "pytest-remotedata>=0.3.2", + "powerbox", + "pytest-plt", + "pytest-benchmark", + "tyro", + "rich", + "pytest-xdist", + "pytest-mock", 
+ "nbsphinx", + "numpydoc", + "sphinx>=1.3", + "sphinx-rtd-theme", +] + +# UPDATE THESE +[project.urls] +Homepage="https://github.com/21cmFAST/21cmFAST" +Documentation="https://github.com/21cmFAST/21cmFAST" +Repository="https://github.com/21cmFAST/21cmFAST" +Issues="https://github.com/21cmFAST/21cmFAST" +Changelog="https://github.com/21cmFAST/21cmFAST" + +[project.scripts] +21cmfast = "py21cmfast.cli:main" + [build-system] -requires = ["setuptools>=78.1.0", "setuptools_scm>=8", "cython"] -build-backend = "setuptools.build_meta" +build-backend = 'mesonpy' +requires = ['meson-python', 'nanobind>=2.4.0', 'setuptools-scm>=8'] + +[tool.meson-python.args] +setup = ["-Dbuildtype=release"] [tool.setuptools_scm] +version_file = "src/py21cmfast/_version.py" +fallback_version = "4.0.0b1" + +[tool.pytest.ini_options] +testpaths = ["tests"] [tool.ruff] line-length = 88 @@ -82,10 +202,6 @@ ignore = [ "D103", # ignore missing docstrings "T", # print statements ] -"build_cffi.py" = [ - "T", # print statements -] - [tool.ruff.lint.pydocstyle] convention = 'numpy' diff --git a/setup.py b/setup.py deleted file mode 100644 index 85cf6f497..000000000 --- a/setup.py +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env python -"""Setup the package.""" - -import os -import re -from pathlib import Path - -from setuptools import find_packages, setup - -THISDIR = Path(__file__).parent.resolve() - - -def _read(name: str): - with (THISDIR / name).open(encoding="utf8") as fl: - return fl.read() - - -# Enable code coverage for C code: we can't use CFLAGS=-coverage in tox.ini, since that -# may mess with compiling dependencies (e.g. numpy). Therefore we set SETUPPY_ -# CFLAGS=-coverage in tox.ini and copy it to CFLAGS here (after deps have been safely installed). 
-if "TOXENV" in os.environ and "SETUPPY_CFLAGS" in os.environ: - os.environ["CFLAGS"] = os.environ["SETUPPY_CFLAGS"] - -test_req = [ - "clang-format", - "clang-tidy", - "hmf", - "pre-commit", - "pytest>=5.0", - "pytest-cov", - "tox", - "pytest-remotedata>=0.3.2", - "powerbox", - "pytest-plt", - "pytest-benchmark", - "tyro", - "rich", - "pytest-xdist", - "pytest-mock", -] - -doc_req = ["nbsphinx", "numpydoc", "sphinx >= 1.3", "furo"] - -setup( - name="21cmFAST", - license="MIT license", - description="A semi-numerical cosmological simulation code for the 21cm signal", - long_description="{}\n{}".format( - re.compile("^.. start-badges.*^.. end-badges", re.M | re.S).sub( - "", _read("README.rst") - ), - re.sub(":[a-z]+:`~?(.*?)`", r"``\1``", _read("CHANGELOG.rst")), - ), - long_description_content_type="text/x-rst", - author="The 21cmFAST coredev team", - author_email="21cmfast.coredev@gmail.com", - url="https://github.com/21cmFAST/21cmFAST", - packages=find_packages("src"), - package_dir={"": "src"}, - include_package_data=True, - python_requires=">=3.11", - zip_safe=False, - classifiers=[ - # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", - "Operating System :: Unix", - "Operating System :: POSIX", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: Implementation :: CPython", - ], - keywords=["Epoch of Reionization", "Cosmology"], - install_requires=[ - "click", - "numpy>=2.0", - "pyyaml", - "cffi>=1.0", - "scipy", - "astropy>=2.0", - "h5py>=2.8.0", - "matplotlib", - "bidict", - "cosmotile>=0.2.5", - "attrs", - "tqdm", - "classy", - "cyclopts", - "tomlkit", - ], - extras_require={"tests": test_req, "docs": doc_req, 
"dev": test_req + doc_req}, - setup_requires=["cffi>=1.0", "setuptools_scm"], - entry_points={"console_scripts": ["21cmfast = py21cmfast.cli:app"]}, - cffi_modules=[f"{THISDIR}/build_cffi.py:ffi"], - use_scm_version={ - "write_to": "src/py21cmfast/_version.py", - "parentdir_prefix_version": "21cmFAST-", - "fallback_version": "0.0.0", - }, -) diff --git a/src/meson.build b/src/meson.build new file mode 100644 index 000000000..9b87673d7 --- /dev/null +++ b/src/meson.build @@ -0,0 +1 @@ +subdir('py21cmfast') diff --git a/src/py21cmfast/_cfg.py b/src/py21cmfast/_cfg.py index 69cd925ef..b94393680 100644 --- a/src/py21cmfast/_cfg.py +++ b/src/py21cmfast/_cfg.py @@ -6,9 +6,10 @@ from pathlib import Path from typing import ClassVar +import py21cmfast.c_21cmfast as lib + from . import yaml from ._data import DATA_PATH -from .c_21cmfast import ffi, lib from .wrapper.structs import StructInstanceWrapper @@ -25,6 +26,7 @@ class Config(dict): "ignore_R_BUBBLE_MAX_error": False, "external_table_path": DATA_PATH, "HALO_CATALOG_MEM_FACTOR": 1.2, + "EXTRA_HALOBOX_FIELDS": False, "safe_read": True, } _defaults["wisdoms_path"] = Path(_defaults["direc"]) / "wisdoms" @@ -32,7 +34,7 @@ class Config(dict): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # keep the config settings from the C library here - self._c_config_settings = StructInstanceWrapper(lib.config_settings, ffi) + self._c_config_settings = StructInstanceWrapper(lib.get_config_settings()) for k, v in self._defaults.items(): if k not in self: @@ -59,13 +61,9 @@ def __setitem__(self, key, value): def _pass_to_backend(self, key, value): """Set the value in the backend.""" - # we should possibly do a typemap for the ffi - if isinstance(value, Path | str): - setattr( - self._c_config_settings, key, ffi.new("char[]", str(value).encode()) - ) - else: - setattr(self._c_config_settings, key, value) + if isinstance(value, Path): + value = str(value) + setattr(self._c_config_settings, key, value) 
@contextlib.contextmanager def use(self, **kwargs): diff --git a/src/py21cmfast/drivers/coeval.py b/src/py21cmfast/drivers/coeval.py index eb850e7a9..d357caace 100644 --- a/src/py21cmfast/drivers/coeval.py +++ b/src/py21cmfast/drivers/coeval.py @@ -13,8 +13,9 @@ from rich.console import Console from rich.progress import Progress +import py21cmfast.c_21cmfast as lib + from .. import __version__ -from ..c_21cmfast import lib from ..io import h5 from ..io.caching import CacheConfig, OutputCache, RunCache from ..rsds import apply_rsds, include_dvdr_in_tau21 @@ -30,7 +31,11 @@ PerturbHaloField, TsBox, ) -from ..wrapper.photoncons import _get_photon_nonconservation_data, setup_photon_cons +from ..wrapper.photoncons import ( + _get_photon_nonconservation_data, + _photoncons_state, + setup_photon_cons, +) from . import single_field as sf from ._param_config import high_level_func @@ -625,7 +630,7 @@ def generate_coeval( ): yield coeval, coeval.redshift in out_redshifts - if lib.photon_cons_allocated: + if _photoncons_state.c_memory_allocated: lib.FreePhotonConsMemory() @@ -741,7 +746,7 @@ def _redshift_loop_generator( this_halobox = sf.compute_halo_grid( inputs=inputs, perturbed_halo_list=this_pthalo, - perturbed_field=this_perturbed_field, + redshift=z, previous_ionize_box=getattr(prev_coeval, "ionized_box", None), previous_spin_temp=getattr(prev_coeval, "ts_box", None), write=write.halobox, diff --git a/src/py21cmfast/drivers/lightcone.py b/src/py21cmfast/drivers/lightcone.py index 72cef3774..590cbdefd 100644 --- a/src/py21cmfast/drivers/lightcone.py +++ b/src/py21cmfast/drivers/lightcone.py @@ -14,8 +14,9 @@ from astropy import units from astropy.cosmology import z_at_value +import py21cmfast.c_21cmfast as lib + from .. 
import __version__ -from ..c_21cmfast import lib from ..io import h5 from ..io.caching import CacheConfig, OutputCache from ..lightconers import Lightconer, RectilinearLightconer @@ -31,6 +32,7 @@ PerturbHaloField, TsBox, ) +from ..wrapper.photoncons import _photoncons_state from ._param_config import high_level_func from .coeval import ( _obtain_starting_point_for_scrolling, @@ -513,7 +515,7 @@ def _run_lightcone_from_perturbed_fields( # last redshift things if iz == len(scrollz) - 1: - if lib.photon_cons_allocated: + if _photoncons_state.c_memory_allocated: lib.FreePhotonConsMemory() if include_dvdr_in_tau21: diff --git a/src/py21cmfast/drivers/single_field.py b/src/py21cmfast/drivers/single_field.py index 83248dee8..d2c4c0df7 100644 --- a/src/py21cmfast/drivers/single_field.py +++ b/src/py21cmfast/drivers/single_field.py @@ -203,10 +203,10 @@ def perturb_halo_list( @single_field_func def compute_halo_grid( *, + redshift: float, initial_conditions: InitialConditions, inputs: InputParameters | None = None, perturbed_halo_list: PerturbHaloField | None = None, - perturbed_field: PerturbedField | None = None, previous_spin_temp: TsBox | None = None, previous_ionize_box: IonizedBox | None = None, ) -> HaloBox: @@ -243,29 +243,9 @@ def compute_halo_grid( regenerate, write, cache: See docs of :func:`initial_conditions` for more information. """ - if perturbed_halo_list: - redshift = perturbed_halo_list.redshift - elif perturbed_field: - redshift = perturbed_field.redshift - else: - raise ValueError( - "Either perturbed_field or perturbed_halo_list are required (or both)." 
- ) - box = HaloBox.new(redshift=redshift, inputs=inputs) - if perturbed_field is None: - if ( - inputs.matter_options.FIXED_HALO_GRIDS - or inputs.astro_options.AVG_BELOW_SAMPLER - ): - raise ValueError( - "You must provide the perturbed field if FIXED_HALO_GRIDS is True or AVG_BELOW_SAMPLER is True" - ) - else: - perturbed_field = PerturbedField.dummy() - - elif perturbed_halo_list is None: + if perturbed_halo_list is None: if not inputs.matter_options.FIXED_HALO_GRIDS: raise ValueError( "You must provide the perturbed halo list if FIXED_HALO_GRIDS is False" @@ -302,7 +282,6 @@ def compute_halo_grid( return box.compute( initial_conditions=initial_conditions, pt_halos=perturbed_halo_list, - perturbed_field=perturbed_field, previous_ionize_box=previous_ionize_box, previous_spin_temp=previous_spin_temp, ) diff --git a/src/py21cmfast/lightconers.py b/src/py21cmfast/lightconers.py index ffafd9ee3..675d06d21 100644 --- a/src/py21cmfast/lightconers.py +++ b/src/py21cmfast/lightconers.py @@ -125,7 +125,6 @@ def between_redshifts( d_at_redshift = cosmo.comoving_distance(min_redshift).to_value(Mpc) dmax = cosmo.comoving_distance(max_redshift).to_value(Mpc) res = resolution.to_value(Mpc) - lc_distances = np.arange(d_at_redshift, dmax + res, res) return cls(lc_distances=lc_distances * Mpc, cosmo=cosmo, **kw) diff --git a/src/py21cmfast/meson.build b/src/py21cmfast/meson.build new file mode 100644 index 000000000..c91874ade --- /dev/null +++ b/src/py21cmfast/meson.build @@ -0,0 +1,35 @@ +source_files = [ + '__init__.py', + '_cfg.py', + '_logging.py', + 'cli.py', + 'input_serialization.py', + 'lightconers.py', + 'plotting.py', + 'rsds.py', + '_templates.py', + 'utils.py', + 'yaml.py', +] + +py.install_sources( + source_files, + subdir: 'py21cmfast' +) + +pure_subdirs = [ + 'drivers', + 'io', + 'templates', + 'wrapper', + '_data', +] + +# Install the Python code +install_root = py.get_install_dir() +foreach subdir: pure_subdirs + install_subdir(subdir, install_dir: install_root 
/ 'py21cmfast') +endforeach + +# Build C-extension +subdir('src') diff --git a/src/py21cmfast/plotting.py b/src/py21cmfast/plotting.py index 62dafd912..458a2e3c2 100644 --- a/src/py21cmfast/plotting.py +++ b/src/py21cmfast/plotting.py @@ -170,7 +170,7 @@ def coeval_sliceplot( """ if kind is None: if isinstance(struct, outputs.OutputStruct): - kind = struct.struct.fieldnames[0] + kind = next(iter(struct.arrays.keys())) elif isinstance(struct, Coeval): kind = "brightness_temp" diff --git a/src/py21cmfast/src/BrightnessTemperatureBox.h b/src/py21cmfast/src/BrightnessTemperatureBox.h index c4cf5b8a4..dbd9e3b25 100644 --- a/src/py21cmfast/src/BrightnessTemperatureBox.h +++ b/src/py21cmfast/src/BrightnessTemperatureBox.h @@ -5,7 +5,13 @@ #include "InputParameters.h" #include "OutputStructs.h" +#ifdef __cplusplus +extern "C" { +#endif int ComputeBrightnessTemp(float redshift, TsBox *spin_temp, IonizedBox *ionized_box, PerturbedField *perturb_field, BrightnessTemp *box); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/DeviceConstants.cuh b/src/py21cmfast/src/DeviceConstants.cuh new file mode 100644 index 000000000..759a731f7 --- /dev/null +++ b/src/py21cmfast/src/DeviceConstants.cuh @@ -0,0 +1,15 @@ +#ifndef _DEVICECONSTANTS_CUH +#define _DEVICECONSTANTS_CUH + +#include "InputParameters.h" + +#ifndef _HALOFIELD_CU // double check whether this is necessary + +extern __constant__ UserParams d_user_params; +extern __constant__ CosmoParams d_cosmo_params; +extern __constant__ AstroParams d_astro_params; +extern __constant__ double d_test_params; + +#endif + +#endif diff --git a/src/py21cmfast/src/HaloBox.c b/src/py21cmfast/src/HaloBox.c index 9fb2d49df..6ef8c7c0b 100644 --- a/src/py21cmfast/src/HaloBox.c +++ b/src/py21cmfast/src/HaloBox.c @@ -21,26 +21,24 @@ #include "indexing.h" #include "interp_tables.h" #include "logger.h" +#include "map_mass.h" #include "scaling_relations.h" #include "thermochem.h" -// struct holding each halo property we currently 
need. -// This is only used for both averages over the box/catalogues -// as well as an individual halo's properties -struct HaloProperties { - double halo_mass; - double stellar_mass; - double halo_sfr; - double stellar_mass_mini; - double sfr_mini; - double fescweighted_sfr; - double n_ion; - double halo_xray; - double metallicity; - double m_turn_acg; - double m_turn_mcg; - double m_turn_reion; -}; +// TODO: this should probably be somewhere else +void set_integral_constants(IntegralCondition *consts, double redshift, double M_min, double M_max, + double M_cell) { + consts->redshift = redshift; + consts->growth_factor = dicke(redshift); + consts->M_min = M_min; + consts->M_max = M_max; + consts->lnM_min = log(M_min); + consts->lnM_max = log(M_max); + consts->M_cell = M_cell; + consts->lnM_cell = log(M_cell); + // no table since this should be called once + consts->sigma_cell = sigma_z0(M_cell); +} // calculates halo properties from astro parameters plus the correlated rng // The inputs include all properties with a separate RNG @@ -56,8 +54,7 @@ struct HaloProperties { // representing a smooth transition in halo mass from one set of SFR/emmissivity parameters to the // other. 
void set_halo_properties(double halo_mass, double M_turn_a, double M_turn_m, - struct ScalingConstants *consts, double *input_rng, - struct HaloProperties *output) { + ScalingConstants *consts, double *input_rng, HaloProperties *output) { double n_ion_sample, wsfr_sample; double fesc; double fesc_mini = 0.; @@ -99,8 +96,8 @@ void set_halo_properties(double halo_mass, double M_turn_a, double M_turn_m, // Expected global averages for box quantities for mean adjustment // WARNING: THESE AVERAGE BOXES ARE WRONG, CHECK THEM -int get_box_averages(double M_min, double M_max, double M_turn_a, double M_turn_m, - struct ScalingConstants *consts, struct HaloProperties *averages_out) { +int get_uhmf_averages(double M_min, double M_max, double M_turn_a, double M_turn_m, + ScalingConstants *consts, HaloProperties *averages_out) { LOG_SUPER_DEBUG("Getting Box averages z=%.2f M [%.2e %.2e] Mt [%.2e %.2e]", consts->redshift, M_min, M_max, M_turn_a, M_turn_m); double t_h = consts->t_h; @@ -124,7 +121,7 @@ int get_box_averages(double M_min, double M_max, double M_turn_a, double M_turn_ // NOTE: we use the atomic method for all halo mass/count here mass_intgrl = Fcoll_General(consts->redshift, lnMmin, lnMmax); - struct ScalingConstants consts_sfrd = evolve_scaling_constants_sfr(consts); + ScalingConstants consts_sfrd = evolve_scaling_constants_sfr(consts); intgrl_fesc_weighted = Nion_General(consts->redshift, lnMmin, lnMmax, M_turn_a, consts); intgrl_stars_only = Nion_General(consts->redshift, lnMmin, lnMmax, M_turn_a, &consts_sfrd); @@ -154,70 +151,142 @@ int get_box_averages(double M_min, double M_max, double M_turn_a, double M_turn_ return 0; } +HaloProperties get_halobox_averages(HaloBox *grids) { + int mean_count = 0; + double mean_mass = 0., mean_stars = 0., mean_stars_mini = 0., mean_sfr = 0., mean_sfr_mini = 0.; + double mean_n_ion = 0., mean_xray = 0., mean_wsfr = 0.; + +#pragma omp parallel for reduction(+ : mean_count, mean_mass, mean_stars, mean_stars_mini, \ + mean_sfr, 
mean_sfr_mini) + for (int i = 0; i < HII_TOT_NUM_PIXELS; i++) { + mean_sfr += grids->halo_sfr[i]; + mean_n_ion += grids->n_ion[i]; + if (astro_options_global->USE_TS_FLUCT) { + mean_xray += grids->halo_xray[i]; + } + if (astro_options_global->USE_MINI_HALOS) { + mean_sfr_mini += grids->halo_sfr_mini[i]; + } + if (astro_options_global->INHOMO_RECO) mean_wsfr += grids->whalo_sfr[i]; + + if (config_settings.EXTRA_HALOBOX_FIELDS) { + mean_count += grids->count[i]; + mean_mass += grids->halo_mass[i]; + mean_stars += grids->halo_stars[i]; + if (astro_options_global->USE_MINI_HALOS) mean_stars_mini += grids->halo_stars_mini[i]; + } + } + + HaloProperties averages = { + .count = (double)mean_count / HII_TOT_NUM_PIXELS, + .halo_mass = mean_mass / HII_TOT_NUM_PIXELS, + .stellar_mass = mean_stars / HII_TOT_NUM_PIXELS, + .stellar_mass_mini = mean_stars_mini / HII_TOT_NUM_PIXELS, + .halo_sfr = mean_sfr / HII_TOT_NUM_PIXELS, + .sfr_mini = mean_sfr_mini / HII_TOT_NUM_PIXELS, + .n_ion = mean_n_ion / HII_TOT_NUM_PIXELS, + .halo_xray = mean_xray / HII_TOT_NUM_PIXELS, + .fescweighted_sfr = mean_wsfr / HII_TOT_NUM_PIXELS, + }; + return averages; +} // This takes a HaloBox struct and fixes it's mean to exactly what we expect from the UMF integrals. 
// Generally should only be done for the fixed portion of the grids, since // it will otherwise make the box inconsistent with the input catalogue -void mean_fix_grids(double M_min, double M_max, HaloBox *grids, struct HaloProperties *averages_box, - struct ScalingConstants *consts) { - struct HaloProperties averages_global; - double M_turn_a_global = averages_box->m_turn_acg; - double M_turn_m_global = averages_box->m_turn_mcg; - get_box_averages(M_min, M_max, M_turn_a_global, M_turn_m_global, consts, &averages_global); +void mean_fix_grids(double M_min, double M_max, HaloBox *grids, ScalingConstants *consts) { + HaloProperties averages_global; + // NOTE: requires the mean mcrits to be set on the grids + double M_turn_a_global = pow(10, grids->log10_Mcrit_ACG_ave); + double M_turn_m_global = pow(10, grids->log10_Mcrit_MCG_ave); + get_uhmf_averages(M_min, M_max, M_turn_a_global, M_turn_m_global, consts, &averages_global); + HaloProperties averages_hbox; + averages_hbox = get_halobox_averages(grids); unsigned long long int idx; #pragma omp parallel for num_threads(simulation_options_global->N_THREADS) private(idx) for (idx = 0; idx < HII_TOT_NUM_PIXELS; idx++) { - grids->halo_mass[idx] *= averages_global.halo_mass / averages_box->halo_mass; - grids->halo_stars[idx] *= averages_global.stellar_mass / averages_box->stellar_mass; - grids->halo_sfr[idx] *= averages_global.halo_sfr / averages_box->halo_sfr; - grids->n_ion[idx] *= averages_global.n_ion / averages_box->n_ion; + grids->halo_sfr[idx] *= averages_global.halo_sfr / averages_hbox.halo_sfr; + grids->n_ion[idx] *= averages_global.n_ion / averages_hbox.n_ion; if (astro_options_global->USE_MINI_HALOS) { - grids->halo_stars_mini[idx] *= - averages_global.stellar_mass_mini / averages_box->stellar_mass_mini; - grids->halo_sfr_mini[idx] *= averages_global.sfr_mini / averages_box->sfr_mini; + grids->halo_sfr_mini[idx] *= averages_global.sfr_mini / averages_hbox.sfr_mini; } if (astro_options_global->USE_TS_FLUCT) { - 
grids->halo_xray[idx] *= averages_global.halo_xray / averages_box->halo_xray; + grids->halo_xray[idx] *= averages_global.halo_xray / averages_hbox.halo_xray; } if (astro_options_global->INHOMO_RECO) { grids->whalo_sfr[idx] *= - averages_global.fescweighted_sfr / averages_box->fescweighted_sfr; + averages_global.fescweighted_sfr / averages_hbox.fescweighted_sfr; + } + + if (config_settings.EXTRA_HALOBOX_FIELDS) { + grids->halo_mass[idx] *= averages_global.halo_mass / averages_hbox.halo_mass; + grids->halo_stars[idx] *= averages_global.stellar_mass / averages_hbox.stellar_mass; + if (astro_options_global->USE_MINI_HALOS) { + grids->halo_stars_mini[idx] *= + averages_global.stellar_mass_mini / averages_hbox.stellar_mass_mini; + } } } } +// Evaluate Mass function integrals given information from the cell +void get_cell_integrals(double dens, double l10_mturn_a, double l10_mturn_m, + ScalingConstants *consts, IntegralCondition *int_consts, + HaloProperties *properties) { + double M_min = int_consts->M_min; + double M_max = int_consts->M_max; + double growth_z = int_consts->growth_factor; + double M_cell = int_consts->M_cell; + double sigma_cell = int_consts->sigma_cell; + + // set all fields to zero + memset(properties, 0, sizeof(HaloProperties)); + + // using the properties struct: + // stellar_mass --> no F_esc integral ACG + // stellar_mass_mini --> no F_esc integral MCG + // n_ion --> F_esc integral ACG + // fescweighted_sfr --> F_esc integral MCG + // halo_xray --> Xray integral + // halo_mass --> total mass + properties->n_ion = EvaluateNion_Conditional(dens, l10_mturn_a, growth_z, M_min, M_max, M_cell, + sigma_cell, consts, false); + properties->stellar_mass = + EvaluateSFRD_Conditional(dens, growth_z, M_min, M_max, M_cell, sigma_cell, consts); + // TODO: SFRD tables still assume no reion feedback, this should be fixed + // although it doesn't affect the histories (only used in Ts) it makes outputs wrong + // for post-processing + if 
(astro_options_global->USE_MINI_HALOS) { + properties->stellar_mass_mini = EvaluateSFRD_Conditional_MINI( + dens, l10_mturn_m, growth_z, M_min, M_max, M_cell, sigma_cell, consts); + // re-using field + properties->fescweighted_sfr = EvaluateNion_Conditional_MINI( + dens, l10_mturn_m, growth_z, M_min, M_max, M_cell, sigma_cell, consts, false); + } + + if (astro_options_global->USE_TS_FLUCT) { + properties->halo_xray = + EvaluateXray_Conditional(dens, l10_mturn_m, consts->redshift, growth_z, M_min, M_max, + M_cell, sigma_cell, consts); + } + + if (config_settings.EXTRA_HALOBOX_FIELDS) { + properties->halo_mass = + EvaluateMcoll(dens, growth_z, log(M_min), log(M_max), M_cell, sigma_cell, dens); + } +} + // Fixed halo grids, where each property is set as the integral of the CMF on the EULERIAN cell // scale As per default 21cmfast (strange pretending that the lagrangian density is eulerian and // then *(1+delta)) This outputs the UN-NORMALISED grids (before mean-adjustment) -int set_fixed_grids(double M_min, double M_max, InitialConditions *ini_boxes, - PerturbedField *perturbed_field, TsBox *previous_spin_temp, - IonizedBox *previous_ionize_box, struct ScalingConstants *consts, - HaloBox *grids, struct HaloProperties *averages, const bool eulerian) { +int set_fixed_grids(double M_min, double M_max, InitialConditions *ini_boxes, float *mturn_a_grid, + float *mturn_m_grid, ScalingConstants *consts, HaloBox *grids) { double M_cell = RHOcrit * cosmo_params_global->OMm * VOLUME / HII_TOT_NUM_PIXELS; // mass in cell of mean dens - double growth_z = dicke(consts->redshift); - - double lnMmin = log(M_min); - double lnMcell = log(M_cell); - double lnMmax = log(M_max); - - double sigma_cell = EvaluateSigma(lnMcell); - - double prefactor_mass = RHOcrit * cosmo_params_global->OMm; - double prefactor_stars = RHOcrit * cosmo_params_global->OMb * consts->fstar_10; - double prefactor_stars_mini = RHOcrit * cosmo_params_global->OMb * consts->fstar_7; - double prefactor_sfr = 
prefactor_stars / consts->t_star / consts->t_h; - double prefactor_sfr_mini = prefactor_stars_mini / consts->t_star / consts->t_h; - double prefactor_nion = prefactor_stars * consts->fesc_10 * consts->pop2_ion; - double prefactor_nion_mini = prefactor_stars_mini * consts->fesc_7 * consts->pop3_ion; - double prefactor_wsfr = prefactor_sfr * consts->fesc_10 * consts->pop2_ion; - double prefactor_wsfr_mini = prefactor_sfr_mini * consts->fesc_7 * consts->pop3_ion; - double prefactor_xray = RHOcrit * cosmo_params_global->OMm; - - double hm_sum = 0, nion_sum = 0, wsfr_sum = 0, xray_sum = 0; - double sm_sum = 0, sm_sum_mini = 0, sfr_sum = 0, sfr_sum_mini = 0; - double l10_mlim_m_sum = 0., l10_mlim_a_sum = 0., l10_mlim_r_sum = 0.; + IntegralCondition integral_cond; + set_integral_constants(&integral_cond, consts->redshift, M_min, M_max, M_cell); + double growthf = dicke(consts->redshift); // find grid limits for tables double min_density = 0.; @@ -226,56 +295,27 @@ int set_fixed_grids(double M_min, double M_max, InitialConditions *ini_boxes, double min_log10_mturn_m = log10(M_MAX_INTEGRAL); double max_log10_mturn_a = log10(astro_params_global->M_TURN); double max_log10_mturn_m = log10(astro_params_global->M_TURN); - float *mturn_a_grid = calloc(HII_TOT_NUM_PIXELS, sizeof(float)); - float *mturn_m_grid = calloc(HII_TOT_NUM_PIXELS, sizeof(float)); #pragma omp parallel num_threads(simulation_options_global->N_THREADS) { unsigned long long int i; double dens; - double J21_val, Gamma12_val, zre_val; - double M_turn_r = 0.; double M_turn_m = consts->mturn_m_nofb; double M_turn_a = consts->mturn_a_nofb; - double curr_vcb = consts->vcb_norel; #pragma omp for reduction(min : min_density, min_log10_mturn_a, min_log10_mturn_m) \ - reduction(max : max_density, max_log10_mturn_a, max_log10_mturn_m) \ - reduction(+ : l10_mlim_m_sum, l10_mlim_a_sum, l10_mlim_r_sum) + reduction(max : max_density, max_log10_mturn_a, max_log10_mturn_m) for (i = 0; i < HII_TOT_NUM_PIXELS; i++) { - if 
(eulerian) - dens = perturbed_field->density[i]; - else - dens = ini_boxes->lowres_density[i] * growth_z; + dens = ini_boxes->lowres_density[i] * growthf; if (dens > max_density) max_density = dens; if (dens < min_density) min_density = dens; if (astro_options_global->USE_MINI_HALOS) { - if (!astro_options_global->FIX_VCB_AVG && - matter_options_global->USE_RELATIVE_VELOCITIES) { - curr_vcb = ini_boxes->lowres_vcb[i]; - } - J21_val = Gamma12_val = zre_val = 0.; - if (consts->redshift < simulation_options_global->Z_HEAT_MAX) { - J21_val = previous_spin_temp->J_21_LW[i]; - Gamma12_val = previous_ionize_box->ionisation_rate_G12[i]; - zre_val = previous_ionize_box->z_reion[i]; - } - M_turn_a = consts->mturn_a_nofb; - M_turn_m = lyman_werner_threshold(consts->redshift, J21_val, curr_vcb); - M_turn_r = reionization_feedback(consts->redshift, Gamma12_val, zre_val); - M_turn_a = fmax(M_turn_a, fmax(M_turn_r, astro_params_global->M_TURN)); - M_turn_m = fmax(M_turn_m, fmax(M_turn_r, astro_params_global->M_TURN)); + M_turn_a = mturn_a_grid[i]; + M_turn_m = mturn_m_grid[i]; + if (min_log10_mturn_a > M_turn_a) min_log10_mturn_a = M_turn_a; + if (min_log10_mturn_m > M_turn_m) min_log10_mturn_m = M_turn_m; + if (max_log10_mturn_a < M_turn_a) max_log10_mturn_a = M_turn_a; + if (max_log10_mturn_m < M_turn_m) max_log10_mturn_m = M_turn_m; } - mturn_a_grid[i] = log10(M_turn_a); - mturn_m_grid[i] = log10(M_turn_m); - - if (min_log10_mturn_a > mturn_a_grid[i]) min_log10_mturn_a = mturn_a_grid[i]; - if (min_log10_mturn_m > mturn_m_grid[i]) min_log10_mturn_m = mturn_m_grid[i]; - if (max_log10_mturn_a < mturn_a_grid[i]) max_log10_mturn_a = mturn_a_grid[i]; - if (max_log10_mturn_m < mturn_m_grid[i]) max_log10_mturn_m = mturn_m_grid[i]; - - l10_mlim_a_sum += mturn_a_grid[i]; - l10_mlim_m_sum += mturn_m_grid[i]; - l10_mlim_r_sum += log10(M_turn_r); } } // buffers for table ranges @@ -286,17 +326,14 @@ int set_fixed_grids(double M_min, double M_max, InitialConditions *ini_boxes, 
max_log10_mturn_a = max_log10_mturn_a * 1.001; max_log10_mturn_m = max_log10_mturn_m * 1.001; - LOG_DEBUG("Mean halo boxes || M = [%.2e %.2e] | Mcell = %.2e (s=%.2e) | z = %.2e | D = %.2e", - M_min, M_max, M_cell, sigma_cell, consts->redshift, growth_z); - + LOG_DEBUG("Mean halo boxes || M = [%.2e %.2e] | Mcell = %.2e", M_min, M_max, M_cell); // These tables are coarser than needed, an initial loop for Mturn to find limits may help if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { if (astro_options_global->INTEGRATION_METHOD_ATOMIC == 1 || (astro_options_global->USE_MINI_HALOS && astro_options_global->INTEGRATION_METHOD_MINI == 1)) { - initialise_GL(lnMmin, lnMmax); + initialise_GL(integral_cond.lnM_min, integral_cond.lnM_max); } - // This table assumes no reionisation feedback initialise_SFRD_Conditional_table(consts->redshift, min_density, max_density, M_min, M_max, M_cell, consts); @@ -307,95 +344,26 @@ int set_fixed_grids(double M_min, double M_max, InitialConditions *ini_boxes, M_cell, min_log10_mturn_a, max_log10_mturn_a, min_log10_mturn_m, max_log10_mturn_m, consts, false); - initialise_dNdM_tables(min_density, max_density, lnMmin, lnMmax, growth_z, lnMcell, false); + initialise_dNdM_tables(min_density, max_density, integral_cond.lnM_min, + integral_cond.lnM_max, integral_cond.growth_factor, + integral_cond.lnM_cell, false); if (astro_options_global->USE_TS_FLUCT) { initialise_Xray_Conditional_table(consts->redshift, min_density, max_density, M_min, M_max, M_cell, consts); } } -#pragma omp parallel num_threads(simulation_options_global->N_THREADS) - { - unsigned long long int i; - double dens; - double l10_mturn_a, l10_mturn_m; - double mass_intgrl, h_count; - double intgrl_fesc_weighted, intgrl_stars_only; - double intgrl_fesc_weighted_mini = 0., intgrl_stars_only_mini = 0., integral_xray = 0; - double dens_fac; - -#pragma omp for reduction(+ : hm_sum, sm_sum, sm_sum_mini, sfr_sum, sfr_sum_mini, xray_sum, \ - nion_sum, wsfr_sum) - for (i = 0; i < 
HII_TOT_NUM_PIXELS; i++) { - if (eulerian) { - dens = perturbed_field->density[i]; - dens_fac = (1. + dens); - } else { - dens = ini_boxes->lowres_density[i] * growth_z; - dens_fac = 1.; - } - l10_mturn_a = mturn_a_grid[i]; - l10_mturn_m = mturn_m_grid[i]; - - h_count = EvaluateNhalo(dens, growth_z, lnMmin, lnMmax, M_cell, sigma_cell, dens); - mass_intgrl = EvaluateMcoll(dens, growth_z, lnMmin, lnMmax, M_cell, sigma_cell, dens); - intgrl_fesc_weighted = EvaluateNion_Conditional( - dens, l10_mturn_a, growth_z, M_min, M_max, M_cell, sigma_cell, consts, false); - intgrl_stars_only = - EvaluateSFRD_Conditional(dens, growth_z, M_min, M_max, M_cell, sigma_cell, consts); - // TODO: SFRD tables still assume no reion feedback, this should be fixed - // although it doesn't affect the histories (only used in Ts) it makes outputs wrong - // for post-processing - if (astro_options_global->USE_MINI_HALOS) { - intgrl_stars_only_mini = EvaluateSFRD_Conditional_MINI( - dens, l10_mturn_m, growth_z, M_min, M_max, M_cell, sigma_cell, consts); - intgrl_fesc_weighted_mini = EvaluateNion_Conditional_MINI( - dens, l10_mturn_m, growth_z, M_min, M_max, M_cell, sigma_cell, consts, false); - } - - if (astro_options_global->USE_TS_FLUCT) { - integral_xray = - EvaluateXray_Conditional(dens, l10_mturn_m, consts->redshift, growth_z, M_min, - M_max, M_cell, sigma_cell, consts); - } - - grids->count[i] = (int)(h_count * M_cell * dens_fac); // NOTE: truncated - grids->halo_mass[i] = mass_intgrl * prefactor_mass * dens_fac; - grids->halo_sfr[i] = (intgrl_stars_only * prefactor_sfr) * dens_fac; - grids->n_ion[i] = (intgrl_fesc_weighted * prefactor_nion + - intgrl_fesc_weighted_mini * prefactor_nion_mini) * - dens_fac; - grids->halo_stars[i] = intgrl_stars_only * prefactor_stars * dens_fac; - - hm_sum += grids->halo_mass[i]; - nion_sum += grids->n_ion[i]; - sfr_sum += grids->halo_sfr[i]; - sm_sum += grids->halo_stars[i]; - - if (astro_options_global->USE_TS_FLUCT) { - grids->halo_xray[i] = 
prefactor_xray * integral_xray * dens_fac; - xray_sum += grids->halo_xray[i]; - } - if (astro_options_global->INHOMO_RECO) { - grids->whalo_sfr[i] = (intgrl_fesc_weighted * prefactor_wsfr + - intgrl_fesc_weighted_mini * prefactor_wsfr_mini) * - dens_fac; - wsfr_sum += grids->whalo_sfr[i]; - } - if (astro_options_global->USE_MINI_HALOS) { - grids->halo_stars_mini[i] = - intgrl_stars_only_mini * prefactor_stars_mini * dens_fac; - grids->halo_sfr_mini[i] = intgrl_stars_only_mini * prefactor_sfr_mini * dens_fac; - sm_sum_mini += grids->halo_stars_mini[i]; - sfr_sum_mini += grids->halo_sfr_mini[i]; - } - } - } - - LOG_ULTRA_DEBUG("Cell 0 Totals: HM: %.2e SM: %.2e SF: %.2e, NI: %.2e ct : %d", - grids->halo_mass[HII_R_INDEX(0, 0, 0)], grids->halo_stars[HII_R_INDEX(0, 0, 0)], - grids->halo_sfr[HII_R_INDEX(0, 0, 0)], grids->n_ion[HII_R_INDEX(0, 0, 0)], - grids->count[HII_R_INDEX(0, 0, 0)]); + int grid_dim[3] = {simulation_options_global->HII_DIM, simulation_options_global->HII_DIM, + HII_D_PARA}; + float *vel_pointers[3] = {ini_boxes->lowres_vx, ini_boxes->lowres_vy, ini_boxes->lowres_vz}; + float *vel_pointers_2LPT[3] = {ini_boxes->lowres_vx_2LPT, ini_boxes->lowres_vy_2LPT, + ini_boxes->lowres_vz_2LPT}; + move_grid_galprops(consts->redshift, ini_boxes->lowres_density, grid_dim, vel_pointers, + vel_pointers_2LPT, grid_dim, grids, grid_dim, mturn_a_grid, mturn_m_grid, + consts, &integral_cond); + + LOG_ULTRA_DEBUG("Cell 0 Totals: SF: %.2e, NI: %.2e", grids->halo_sfr[HII_R_INDEX(0, 0, 0)], + grids->n_ion[HII_R_INDEX(0, 0, 0)]); if (astro_options_global->INHOMO_RECO) { LOG_ULTRA_DEBUG("FESC * SF %.2e", grids->whalo_sfr[HII_R_INDEX(0, 0, 0)]); } @@ -405,57 +373,26 @@ int set_fixed_grids(double M_min, double M_max, InitialConditions *ini_boxes, if (astro_options_global->USE_MINI_HALOS) { LOG_ULTRA_DEBUG("MINI SM %.2e SF %.2e", grids->halo_stars_mini[HII_R_INDEX(0, 0, 0)], grids->halo_sfr_mini[HII_R_INDEX(0, 0, 0)]); + LOG_ULTRA_DEBUG("Mturn_a %.2e Mturn_m %.2e", 
mturn_a_grid[HII_R_INDEX(0, 0, 0)], + mturn_m_grid[HII_R_INDEX(0, 0, 0)]); } - LOG_ULTRA_DEBUG("Mturn_a %.2e Mturn_m %.2e", mturn_a_grid[HII_R_INDEX(0, 0, 0)], - mturn_m_grid[HII_R_INDEX(0, 0, 0)]); - - free(mturn_a_grid); - free(mturn_m_grid); free_conditional_tables(); - averages->halo_mass = hm_sum / HII_TOT_NUM_PIXELS; - averages->stellar_mass = sm_sum / HII_TOT_NUM_PIXELS; - averages->stellar_mass_mini = sm_sum_mini / HII_TOT_NUM_PIXELS; - averages->halo_sfr = sfr_sum / HII_TOT_NUM_PIXELS; - averages->sfr_mini = sfr_sum_mini / HII_TOT_NUM_PIXELS; - averages->n_ion = nion_sum / HII_TOT_NUM_PIXELS; - averages->halo_xray = xray_sum / HII_TOT_NUM_PIXELS; - averages->fescweighted_sfr = wsfr_sum / HII_TOT_NUM_PIXELS; - averages->m_turn_acg = pow(10, l10_mlim_a_sum / HII_TOT_NUM_PIXELS); - averages->m_turn_mcg = pow(10, l10_mlim_m_sum / HII_TOT_NUM_PIXELS); - averages->m_turn_reion = pow(10, l10_mlim_r_sum / HII_TOT_NUM_PIXELS); - - // mean-fix the grids - // TODO: put this behind a flag - if (consts->fix_mean) mean_fix_grids(M_min, M_max, grids, averages, consts); - - // assign the log10 average Mturn for the Ts global tables - grids->log10_Mcrit_MCG_ave = l10_mlim_m_sum / HII_TOT_NUM_PIXELS; - grids->log10_Mcrit_ACG_ave = l10_mlim_a_sum / HII_TOT_NUM_PIXELS; + if (consts->fix_mean) mean_fix_grids(M_min, M_max, grids, consts); return 0; } -void halobox_debug_print_avg(struct HaloProperties *averages_box, - struct HaloProperties *averages_subsampler, - struct ScalingConstants *consts, double M_min, double M_max) { +void halobox_debug_print_avg(HaloBox *halobox, ScalingConstants *consts, double M_min, + double M_max) { if (LOG_LEVEL < DEBUG_LEVEL) return; - struct HaloProperties averages_sub_expected, averages_global; + HaloProperties averages_box; + averages_box = get_halobox_averages(halobox); + HaloProperties averages_global; LOG_DEBUG("HALO BOXES REDSHIFT %.2f [%.2e %.2e]", consts->redshift, M_min, M_max); - if (matter_options_global->FIXED_HALO_GRIDS) { - 
get_box_averages(M_min, M_max, averages_box->m_turn_acg, averages_box->m_turn_mcg, consts, - &averages_global); - } else { - get_box_averages(simulation_options_global->SAMPLER_MIN_MASS, M_max, - averages_box->m_turn_acg, averages_box->m_turn_mcg, consts, - &averages_global); - if (astro_options_global->AVG_BELOW_SAMPLER && - M_min < simulation_options_global->SAMPLER_MIN_MASS) { - get_box_averages(M_min, simulation_options_global->SAMPLER_MIN_MASS, - averages_box->m_turn_acg, averages_box->m_turn_mcg, consts, - &averages_sub_expected); - } - } + double mturn_a_avg = pow(10, halobox->log10_Mcrit_ACG_ave); + double mturn_m_avg = pow(10, halobox->log10_Mcrit_MCG_ave); + get_uhmf_averages(M_min, M_max, mturn_a_avg, mturn_m_avg, consts, &averages_global); LOG_DEBUG( "Exp. averages: (HM %11.3e, SM %11.3e SM_MINI %11.3e SFR %11.3e, SFR_MINI %11.3e, XRAY " @@ -466,43 +403,22 @@ void halobox_debug_print_avg(struct HaloProperties *averages_box, LOG_DEBUG( "Box. averages: (HM %11.3e, SM %11.3e SM_MINI %11.3e SFR %11.3e, SFR_MINI %11.3e, XRAY " "%11.3e, NION %11.3e)", - averages_box->halo_mass, averages_box->stellar_mass, averages_box->stellar_mass_mini, - averages_box->halo_sfr, averages_box->sfr_mini, averages_box->halo_xray, - averages_box->n_ion); - - if (!matter_options_global->FIXED_HALO_GRIDS && astro_options_global->AVG_BELOW_SAMPLER && - M_min < simulation_options_global->SAMPLER_MIN_MASS) { - LOG_DEBUG("SUB-SAMPLER"); - LOG_DEBUG( - "Exp. averages: (HM %11.3e, SM %11.3e SM_MINI %11.3e SFR %11.3e, SFR_MINI %11.3e, XRAY " - "%11.3e, NION %11.3e)", - averages_sub_expected.halo_mass, averages_sub_expected.stellar_mass, - averages_sub_expected.stellar_mass_mini, averages_sub_expected.halo_sfr, - averages_sub_expected.sfr_mini, averages_sub_expected.halo_xray, - averages_sub_expected.n_ion); - LOG_DEBUG( - "Box. 
averages: (HM %11.3e, SM %11.3e SM_MINI %11.3e SFR %11.3e, SFR_MINI %11.3e, XRAY " - "%11.3e, NION %11.3e)", - averages_subsampler->halo_mass, averages_subsampler->stellar_mass, - averages_subsampler->stellar_mass_mini, averages_subsampler->halo_sfr, - averages_subsampler->sfr_mini, averages_subsampler->halo_xray, - averages_subsampler->n_ion); - } + averages_box.halo_mass, averages_box.stellar_mass, averages_box.stellar_mass_mini, + averages_box.halo_sfr, averages_box.sfr_mini, averages_box.halo_xray, averages_box.n_ion); } // We need the mean log10 turnover masses for comparison with expected global Nion and SFRD. // Sometimes we don't calculate these on the grid (if we use halos and no sub-sampler) // So this function simply returns the volume-weighted average log10 turnover mass -void get_mean_log10_turnovers(InitialConditions *ini_boxes, TsBox *previous_spin_temp, - IonizedBox *previous_ionize_box, PerturbedField *perturbed_field, - struct ScalingConstants *consts, double turnovers[3]) { +void get_log10_turnovers(InitialConditions *ini_boxes, TsBox *previous_spin_temp, + IonizedBox *previous_ionize_box, float *mturn_a_grid, float *mturn_m_grid, + ScalingConstants *consts, double averages[2]) { + averages[0] = log10(consts->mturn_a_nofb); + averages[1] = log10(consts->mturn_m_nofb); if (!astro_options_global->USE_MINI_HALOS) { - turnovers[0] = log10(consts->mturn_a_nofb); // ACG - turnovers[1] = log10(consts->mturn_m_nofb); // MCG - turnovers[2] = 0.; // reion (log10 so effectively 1 solar mass) return; } - double l10_mturn_a_avg = 0., l10_mturn_m_avg = 0., l10_mturn_r_avg = 0.; + double log10_mturn_m_avg = 0., log10_mturn_a_avg = 0.; #pragma omp parallel num_threads(simulation_options_global->N_THREADS) { @@ -513,7 +429,7 @@ void get_mean_log10_turnovers(InitialConditions *ini_boxes, TsBox *previous_spin double M_turn_a = consts->mturn_a_nofb; double M_turn_r; -#pragma omp for reduction(+ : l10_mturn_m_avg, l10_mturn_a_avg, l10_mturn_r_avg) +#pragma omp for 
reduction(+ : log10_mturn_m_avg, log10_mturn_a_avg) for (i = 0; i < HII_TOT_NUM_PIXELS; i++) { if (!astro_options_global->FIX_VCB_AVG && matter_options_global->USE_RELATIVE_VELOCITIES) { @@ -529,31 +445,27 @@ void get_mean_log10_turnovers(InitialConditions *ini_boxes, TsBox *previous_spin M_turn_r = reionization_feedback(consts->redshift, Gamma12_val, zre_val); M_turn_a = fmax(M_turn_a, fmax(M_turn_r, astro_params_global->M_TURN)); M_turn_m = fmax(M_turn_m, fmax(M_turn_r, astro_params_global->M_TURN)); - l10_mturn_a_avg += log10(M_turn_a); - l10_mturn_m_avg += log10(M_turn_m); - l10_mturn_r_avg += log10(M_turn_r); - } - l10_mturn_a_avg /= HII_TOT_NUM_PIXELS; - l10_mturn_m_avg /= HII_TOT_NUM_PIXELS; - l10_mturn_r_avg /= HII_TOT_NUM_PIXELS; - turnovers[0] = l10_mturn_a_avg; - turnovers[1] = l10_mturn_m_avg; - turnovers[2] = l10_mturn_r_avg; + mturn_a_grid[i] = log10(M_turn_a); + log10_mturn_a_avg += log10(M_turn_a); + mturn_m_grid[i] = log10(M_turn_m); + log10_mturn_m_avg += log10(M_turn_m); + } } + + // NOTE: This average log10 Mturn will be passed onto the spin temperature calculations where + // It is used to perform the frequency integrals (over tau, dependent on ), and possibly + // for mean fixing. 
It is the volume-weighted mean of LOG10 Mturn, although we could do another + // weighting or use Mturn directly None of these are a perfect representation due to the + // nonlinear way turnover mass affects N_ion + log10_mturn_a_avg /= HII_TOT_NUM_PIXELS; + log10_mturn_m_avg /= HII_TOT_NUM_PIXELS; + averages[0] = log10_mturn_a_avg; + averages[1] = log10_mturn_m_avg; } -void sum_halos_onto_grid(InitialConditions *ini_boxes, TsBox *previous_spin_temp, - IonizedBox *previous_ionize_box, PerturbHaloField *halos, - struct ScalingConstants *consts, HaloBox *grids, - struct HaloProperties *averages) { - double redshift = consts->redshift; - // averages - double hm_avg = 0., sm_avg = 0., sfr_avg = 0.; - double sm_avg_mini = 0., sfr_avg_mini = 0.; - double M_turn_a_avg = 0., M_turn_m_avg = 0., M_turn_r_avg = 0.; - double n_ion_avg = 0., wsfr_avg = 0., xray_avg = 0.; - // counts +void sum_halos_onto_grid(InitialConditions *ini_boxes, PerturbHaloField *halos, float *mturn_a_grid, + float *mturn_m_grid, ScalingConstants *consts, HaloBox *grids) { unsigned long long int total_n_halos, n_halos_cut = 0.; double cell_volume = VOLUME / HII_TOT_NUM_PIXELS; @@ -568,20 +480,15 @@ void sum_halos_onto_grid(InitialConditions *ini_boxes, TsBox *previous_spin_temp double halo_pos[3]; int halo_idx[3]; unsigned long long int i_halo, i_cell; - double hmass, nion, sfr, wsfr, sfr_mini, stars_mini, stars, xray; - double J21_val, Gamma12_val, zre_val; + double hmass; - double curr_vcb = consts->vcb_norel; double M_turn_m = consts->mturn_m_nofb; double M_turn_a = consts->mturn_a_nofb; - double M_turn_r = 0.; double in_props[3]; - struct HaloProperties out_props; + HaloProperties out_props; -#pragma omp for reduction(+ : hm_avg, sm_avg, sm_avg_mini, sfr_avg, sfr_avg_mini, n_ion_avg, \ - xray_avg, wsfr_avg, M_turn_a_avg, M_turn_m_avg, M_turn_r_avg, \ - n_halos_cut) +#pragma omp for reduction(+ : n_halos_cut) for (i_halo = 0; i_halo < halos->n_halos; i_halo++) { hmass = halos->halo_masses[i_halo]; 
// It is sometimes useful to make cuts to the halo catalogues before gridding. @@ -604,22 +511,8 @@ void sum_halos_onto_grid(InitialConditions *ini_boxes, TsBox *previous_spin_temp // NOTE: I could easily apply reionization feedback without minihalos but this was not // done previously if (astro_options_global->USE_MINI_HALOS) { - if (!astro_options_global->FIX_VCB_AVG && - matter_options_global->USE_RELATIVE_VELOCITIES) - curr_vcb = ini_boxes->lowres_vcb[i_cell]; - - J21_val = Gamma12_val = zre_val = 0.; - if (consts->redshift < simulation_options_global->Z_HEAT_MAX) { - J21_val = previous_spin_temp->J_21_LW[i_cell]; - Gamma12_val = previous_ionize_box->ionisation_rate_G12[i_cell]; - zre_val = previous_ionize_box->z_reion[i_cell]; - } - - M_turn_a = consts->mturn_a_nofb; - M_turn_m = lyman_werner_threshold(redshift, J21_val, curr_vcb); - M_turn_r = reionization_feedback(redshift, Gamma12_val, zre_val); - M_turn_a = fmax(M_turn_a, fmax(M_turn_r, astro_params_global->M_TURN)); - M_turn_m = fmax(M_turn_m, fmax(M_turn_r, astro_params_global->M_TURN)); + M_turn_a = mturn_a_grid[i_cell]; + M_turn_m = mturn_m_grid[i_cell]; } // these are the halo property RNG sequences @@ -629,22 +522,15 @@ void sum_halos_onto_grid(InitialConditions *ini_boxes, TsBox *previous_spin_temp set_halo_properties(hmass, M_turn_a, M_turn_m, consts, in_props, &out_props); - sfr = out_props.halo_sfr; - sfr_mini = out_props.sfr_mini; - nion = out_props.n_ion; - wsfr = out_props.fescweighted_sfr; - stars = out_props.stellar_mass; - stars_mini = out_props.stellar_mass_mini; - xray = out_props.halo_xray; - #if LOG_LEVEL >= ULTRA_DEBUG_LEVEL if (i_cell == 0) { // LOG_ULTRA_DEBUG("(%d %d %d) i_cell %llu i_halo %llu",x,y,z,i_cell, i_halo); LOG_ULTRA_DEBUG( "Cell 0 Halo: HM: %.2e SM: %.2e (%.2e) SF: %.2e (%.2e) X: %.2e NI: %.2e WS: " "%.2e Z : %.2e ct : %llu", - hmass, stars, stars_mini, sfr, sfr_mini, xray, nion, wsfr, - out_props.metallicity, i_halo); + hmass, out_props.stellar_mass, 
out_props.stellar_mass_mini, out_props.halo_sfr, + out_props.sfr_mini, out_props.halo_xray, out_props.n_ion, + out_props.fescweighted_sfr, out_props.metallicity, i_halo); // LOG_ULTRA_DEBUG("Cell 0 Sums: HM: %.2e SM: %.2e (%.2e) SF: %.2e (%.2e) X: %.2e // NI: %.2e WS: %.2e ct : %d", @@ -660,52 +546,43 @@ void sum_halos_onto_grid(InitialConditions *ini_boxes, TsBox *previous_spin_temp // update the grids #pragma omp atomic update - grids->halo_mass[i_cell] += hmass; -#pragma omp atomic update - grids->halo_stars[i_cell] += stars; -#pragma omp atomic update - grids->n_ion[i_cell] += nion; -#pragma omp atomic update - grids->halo_sfr[i_cell] += sfr; + grids->n_ion[i_cell] += out_props.n_ion; #pragma omp atomic update - grids->count[i_cell] += 1; + grids->halo_sfr[i_cell] += out_props.halo_sfr; if (astro_options_global->USE_MINI_HALOS) { #pragma omp atomic update - grids->halo_stars_mini[i_cell] += stars_mini; -#pragma omp atomic update - grids->halo_sfr_mini[i_cell] += sfr_mini; + grids->halo_sfr_mini[i_cell] += out_props.sfr_mini; } if (astro_options_global->INHOMO_RECO) { #pragma omp atomic update - grids->whalo_sfr[i_cell] += wsfr; + grids->whalo_sfr[i_cell] += out_props.fescweighted_sfr; } if (astro_options_global->USE_TS_FLUCT) { #pragma omp atomic update - grids->halo_xray[i_cell] += xray; + grids->halo_xray[i_cell] += out_props.halo_xray; } - hm_avg += hmass; - sfr_avg += sfr; - sfr_avg_mini += sfr_mini; - sm_avg += stars; - sm_avg_mini += stars_mini; - xray_avg += xray; - n_ion_avg += nion; - wsfr_avg += wsfr; - M_turn_a_avg += M_turn_a; - M_turn_r_avg += M_turn_r; - M_turn_m_avg += M_turn_m; + if (config_settings.EXTRA_HALOBOX_FIELDS) { +#pragma omp atomic update + grids->halo_mass[i_cell] += hmass; +#pragma omp atomic update + grids->halo_stars[i_cell] += out_props.stellar_mass; +#pragma omp atomic update + grids->count[i_cell] += 1; + if (astro_options_global->USE_MINI_HALOS) { +#pragma omp atomic update + grids->halo_stars_mini[i_cell] += 
out_props.stellar_mass_mini; + } + } } #pragma omp for for (i_cell = 0; i_cell < HII_TOT_NUM_PIXELS; i_cell++) { - grids->halo_mass[i_cell] /= cell_volume; - grids->halo_sfr[i_cell] /= cell_volume; - grids->halo_stars[i_cell] /= cell_volume; grids->n_ion[i_cell] /= cell_volume; + grids->halo_sfr[i_cell] /= cell_volume; if (astro_options_global->USE_TS_FLUCT) { grids->halo_xray[i_cell] /= cell_volume; } @@ -714,15 +591,19 @@ void sum_halos_onto_grid(InitialConditions *ini_boxes, TsBox *previous_spin_temp } if (astro_options_global->USE_MINI_HALOS) { grids->halo_sfr_mini[i_cell] /= cell_volume; - grids->halo_stars_mini[i_cell] /= cell_volume; + } + if (config_settings.EXTRA_HALOBOX_FIELDS) { + grids->halo_mass[i_cell] /= cell_volume; + grids->halo_stars[i_cell] /= cell_volume; + if (astro_options_global->USE_MINI_HALOS) { + grids->halo_stars_mini[i_cell] /= cell_volume; + } } } } total_n_halos = halos->n_halos - n_halos_cut; - LOG_SUPER_DEBUG("Cell 0 Totals: HM: %.2e SM: %.2e SF: %.2e NI: %.2e ct : %d", - grids->halo_mass[HII_R_INDEX(0, 0, 0)], grids->halo_stars[HII_R_INDEX(0, 0, 0)], - grids->halo_sfr[HII_R_INDEX(0, 0, 0)], grids->halo_xray[HII_R_INDEX(0, 0, 0)], - grids->n_ion[HII_R_INDEX(0, 0, 0)], grids->count[HII_R_INDEX(0, 0, 0)]); + LOG_SUPER_DEBUG("Cell 0 Totals: SF: %.2e NI: %.2e", grids->halo_sfr[HII_R_INDEX(0, 0, 0)], + grids->n_ion[HII_R_INDEX(0, 0, 0)]); if (astro_options_global->INHOMO_RECO) { LOG_SUPER_DEBUG("FESC * SF %.2e", grids->whalo_sfr[HII_R_INDEX(0, 0, 0)]); } @@ -733,46 +614,11 @@ void sum_halos_onto_grid(InitialConditions *ini_boxes, TsBox *previous_spin_temp LOG_SUPER_DEBUG("MINI SM %.2e SF %.2e", grids->halo_stars_mini[HII_R_INDEX(0, 0, 0)], grids->halo_sfr_mini[HII_R_INDEX(0, 0, 0)]); } - - // NOTE: There is an inconsistency here, the sampled grids use a halo-averaged turnover mass - // whereas the fixed grids / default 21cmfast uses the volume averaged LOG10(turnover mass). 
- // Neither of these are a perfect representation due to the nonlinear way turnover mass - // affects N_ion - if (total_n_halos > 0) { - M_turn_r_avg /= total_n_halos; - M_turn_a_avg /= total_n_halos; - M_turn_m_avg /= total_n_halos; - } else { - // If we have no halos, assume the turnover has no reion feedback & no LW - M_turn_m_avg = consts->mturn_m_nofb; - M_turn_a_avg = consts->mturn_a_nofb; - M_turn_r_avg = 0.; - } - - hm_avg /= VOLUME; - sm_avg /= VOLUME; - sm_avg_mini /= VOLUME; - sfr_avg /= VOLUME; - sfr_avg_mini /= VOLUME; - n_ion_avg /= VOLUME; - xray_avg /= VOLUME; - - averages->halo_mass = hm_avg; - averages->stellar_mass = sm_avg; - averages->halo_sfr = sfr_avg; - averages->stellar_mass_mini = sm_avg_mini; - averages->sfr_mini = sfr_avg_mini; - averages->halo_xray = xray_avg; - averages->n_ion = n_ion_avg; - averages->m_turn_acg = M_turn_a_avg; - averages->m_turn_mcg = M_turn_m_avg; - averages->m_turn_reion = M_turn_r_avg; } // We grid a PERTURBED halofield into the necessary quantities for calculating radiative backgrounds -int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbedField *perturbed_field, - PerturbHaloField *halos, TsBox *previous_spin_temp, - IonizedBox *previous_ionize_box, HaloBox *grids) { +int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbHaloField *halos, + TsBox *previous_spin_temp, IonizedBox *previous_ionize_box, HaloBox *grids) { int status; Try { // get parameters @@ -787,54 +633,58 @@ int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbedField unsigned long long int idx; #pragma omp parallel for num_threads(simulation_options_global->N_THREADS) private(idx) for (idx = 0; idx < HII_TOT_NUM_PIXELS; idx++) { - grids->halo_mass[idx] = 0.0; grids->n_ion[idx] = 0.0; grids->halo_sfr[idx] = 0.0; - grids->halo_stars[idx] = 0.0; - grids->count[idx] = 0; if (astro_options_global->USE_TS_FLUCT) { grids->halo_xray[idx] = 0.0; } if (astro_options_global->USE_MINI_HALOS) { - 
grids->halo_stars_mini[idx] = 0.0; grids->halo_sfr_mini[idx] = 0.0; } if (astro_options_global->INHOMO_RECO) { grids->whalo_sfr[idx] = 0.0; } + if (config_settings.EXTRA_HALOBOX_FIELDS) { + grids->halo_mass[idx] = 0.0; + grids->halo_stars[idx] = 0.0; + grids->count[idx] = 0; + if (astro_options_global->USE_MINI_HALOS) { + grids->halo_stars_mini[idx] = 0.0; + } + } } - struct ScalingConstants hbox_consts; - + ScalingConstants hbox_consts; set_scaling_constants(redshift, &hbox_consts, true); LOG_DEBUG("Gridding %llu halos...", halos->n_halos); double M_min = minimum_source_mass(redshift, false); double M_max_integral; - double cell_volume = VOLUME / HII_TOT_NUM_PIXELS; - - double turnovers[3]; - - struct HaloProperties averages_box, averages_subsampler; init_ps(); if (matter_options_global->USE_INTERPOLATION_TABLES > 0) { - initialiseSigmaMInterpTable( - M_min / 2, - M_MAX_INTEGRAL); // this needs to be initialised above MMax because of Nion_General + // this needs to be initialised above MMax because of Nion_General + initialiseSigmaMInterpTable(M_min / 2, M_MAX_INTEGRAL); } - // do the mean HMF box - // The default 21cmFAST has a strange behaviour where the nonlinear density is used as - // linear, the condition mass is at mean density, but the total cell mass is multiplied by - // delta This part mimics that behaviour Since we need the average turnover masses before we - // can calculate the global means, we do the CMF integrals first Then we calculate the - // expected UMF integrals before doing the adjustment + + float *mturn_a_grid = NULL; + float *mturn_m_grid = NULL; + if (astro_options_global->USE_MINI_HALOS) { + mturn_a_grid = calloc(HII_TOT_NUM_PIXELS, sizeof(float)); + mturn_m_grid = calloc(HII_TOT_NUM_PIXELS, sizeof(float)); + } + double mturn_averages[2]; + get_log10_turnovers(ini_boxes, previous_spin_temp, previous_ionize_box, mturn_a_grid, + mturn_m_grid, &hbox_consts, mturn_averages); + grids->log10_Mcrit_ACG_ave = mturn_averages[0]; + 
grids->log10_Mcrit_MCG_ave = mturn_averages[1]; if (matter_options_global->FIXED_HALO_GRIDS) { M_max_integral = M_MAX_INTEGRAL; - set_fixed_grids(M_min, M_max_integral, ini_boxes, perturbed_field, previous_spin_temp, - previous_ionize_box, &hbox_consts, grids, &averages_box, true); + set_fixed_grids(M_min, M_max_integral, ini_boxes, mturn_a_grid, mturn_m_grid, + &hbox_consts, grids); } else { + sum_halos_onto_grid(ini_boxes, halos, mturn_a_grid, mturn_m_grid, &hbox_consts, grids); // set below-resolution properties if (astro_options_global->AVG_BELOW_SAMPLER) { if (matter_options_global->HALO_STOCHASTICITY) { @@ -844,52 +694,22 @@ int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbedField simulation_options_global->DIM); } if (M_min < M_max_integral) { - set_fixed_grids(M_min, M_max_integral, ini_boxes, perturbed_field, - previous_spin_temp, previous_ionize_box, &hbox_consts, grids, - &averages_subsampler, false); -// This is pretty redundant, but since the fixed grids have density units (X Mpc-3) I have to -// re-multiply before adding the halos. -// I should instead have a flag to output the summed values in cell. 
(2*N_pixel > N_halo so -// generally i don't want to do it in the halo loop) -#pragma omp parallel for num_threads(simulation_options_global->N_THREADS) private(idx) - for (idx = 0; idx < HII_TOT_NUM_PIXELS; idx++) { - grids->halo_mass[idx] *= cell_volume; - grids->halo_stars[idx] *= cell_volume; - grids->n_ion[idx] *= cell_volume; - grids->halo_sfr[idx] *= cell_volume; - if (astro_options_global->USE_TS_FLUCT) { - grids->halo_xray[idx] *= cell_volume; - } - if (astro_options_global->INHOMO_RECO) { - grids->whalo_sfr[idx] *= cell_volume; - } - if (astro_options_global->USE_MINI_HALOS) { - grids->halo_stars_mini[idx] *= cell_volume; - grids->halo_sfr_mini[idx] *= cell_volume; - } - } + set_fixed_grids(M_min, M_max_integral, ini_boxes, mturn_a_grid, mturn_m_grid, + &hbox_consts, grids); LOG_DEBUG("finished subsampler M[%.2e %.2e]", M_min, M_max_integral); } - } else { - // we still need the average turnovers for global values in spintemp, so get them - // here - get_mean_log10_turnovers(ini_boxes, previous_spin_temp, previous_ionize_box, - perturbed_field, &hbox_consts, turnovers); - grids->log10_Mcrit_ACG_ave = turnovers[0]; - grids->log10_Mcrit_MCG_ave = turnovers[1]; } - sum_halos_onto_grid(ini_boxes, previous_spin_temp, previous_ionize_box, halos, - &hbox_consts, grids, &averages_box); } - halobox_debug_print_avg(&averages_box, &averages_subsampler, &hbox_consts, M_min, - M_MAX_INTEGRAL); + halobox_debug_print_avg(grids, &hbox_consts, M_min, M_MAX_INTEGRAL); + if (astro_options_global->USE_MINI_HALOS) { + free(mturn_a_grid); + free(mturn_m_grid); + } // NOTE: the density-grid based calculations (!USE_HALO_FIELD) // use the cell-weighted average of the log10(Mturn) (see issue #369) - LOG_SUPER_DEBUG("log10 Mutrn ACG: log10 cell-weighted %.6e Halo-weighted %.6e", - pow(10, grids->log10_Mcrit_ACG_ave), averages_box.m_turn_acg); - LOG_SUPER_DEBUG("log10 Mutrn MCG: log10 cell-weighted %.6e Halo-weighted %.6e", - pow(10, grids->log10_Mcrit_MCG_ave), 
averages_box.m_turn_mcg); + LOG_SUPER_DEBUG("log10 Mutrn ACG: %.6e", pow(10, grids->log10_Mcrit_ACG_ave)); + LOG_SUPER_DEBUG("log10 Mutrn MCG: %.6e", pow(10, grids->log10_Mcrit_MCG_ave)); if (matter_options_global->USE_INTERPOLATION_TABLES > 0) { freeSigmaMInterpTable(); @@ -903,13 +723,14 @@ int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbedField // test function for getting halo properties from the wrapper, can use a lot of memory for large // catalogs int test_halo_props(double redshift, float *vcb_grid, float *J21_LW_grid, float *z_re_grid, - float *Gamma12_ion_grid, int n_halos, float *halo_masses, float *halo_coords, - float *star_rng, float *sfr_rng, float *xray_rng, float *halo_props_out) { + float *Gamma12_ion_grid, unsigned long long int n_halos, float *halo_masses, + float *halo_coords, float *star_rng, float *sfr_rng, float *xray_rng, + float *halo_props_out) { int status; Try { // get parameters - struct ScalingConstants hbox_consts; + ScalingConstants hbox_consts; set_scaling_constants(redshift, &hbox_consts, true); print_sc_consts(&hbox_consts); @@ -931,7 +752,7 @@ int test_halo_props(double redshift, float *vcb_grid, float *J21_LW_grid, float double M_turn_r = 0.; double in_props[3], halo_pos[3]; - struct HaloProperties out_props; + HaloProperties out_props; #pragma omp for for (i_halo = 0; i_halo < n_halos; i_halo++) { diff --git a/src/py21cmfast/src/HaloBox.h b/src/py21cmfast/src/HaloBox.h index a81f77e25..5ca2e005f 100644 --- a/src/py21cmfast/src/HaloBox.h +++ b/src/py21cmfast/src/HaloBox.h @@ -8,10 +8,57 @@ #include "OutputStructs.h" #include "PerturbHaloField.h" #include "SpinTemperatureBox.h" +#include "scaling_relations.h" -// Compute the HaloBox Object -int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbedField *perturbed_field, - PerturbHaloField *halos, TsBox *previous_spin_temp, - IonizedBox *previous_ionize_box, HaloBox *grids); +#ifdef __cplusplus +extern "C" { +#endif + +// struct holding 
each halo property we currently need. +// This is only used for both averages over the box/catalogues +// as well as an individual halo's properties +typedef struct HaloProperties { + double count; // from integral + double halo_mass; + double stellar_mass; + double halo_sfr; + double stellar_mass_mini; + double sfr_mini; + double fescweighted_sfr; + double n_ion; + double halo_xray; + double metallicity; + double m_turn_acg; + double m_turn_mcg; + double m_turn_reion; +} HaloProperties; + +// TODO: apply this constant struct to the EvaluateX functions in interp_tables.c, +// the integral_wrappers.c functions, and other places where the tables are called +// (probably not hmf.c) +typedef struct IntegralCondition { + double redshift; + double growth_factor; + double M_min; + double lnM_min; + double M_max; + double lnM_max; + double M_cell; + double lnM_cell; + double sigma_cell; +} IntegralCondition; + +void set_integral_constants(IntegralCondition *consts, double redshift, double M_min, double M_max, + double M_cell); +int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbHaloField *halos, + TsBox *previous_spin_temp, IonizedBox *previous_ionize_box, HaloBox *grids); + +void get_cell_integrals(double dens, double l10_mturn_a, double l10_mturn_m, + ScalingConstants *consts, IntegralCondition *int_consts, + HaloProperties *properties); + +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/HaloField.cu b/src/py21cmfast/src/HaloField.cu new file mode 100644 index 000000000..0390316c2 --- /dev/null +++ b/src/py21cmfast/src/HaloField.cu @@ -0,0 +1,22 @@ +#ifndef _HALOFIELD_CU +#define _HALOFIELD_CU + +#include + +#include "DeviceConstants.cuh" +#include "HaloField.cuh" + +// define relevant variables stored in constant memory +__constant__ MatterOptions d_matter_options; +__constant__ SimulationOptions d_simulation_options; +__constant__ CosmoParams d_cosmo_params; +__constant__ AstroParams d_astro_params; + +void 
updateGlobalParams(SimulationOptions *h_simulation_options, MatterOptions * h_matter_options, CosmoParams *h_cosmo_params, AstroParams *h_astro_params){ + cudaMemcpyToSymbol(d_simulation_options, h_simulation_options, sizeof(SimulationOptions), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_matter_options, h_matter_options, sizeof(MatterOptions), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_cosmo_params, h_cosmo_params, sizeof(CosmoParams), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_astro_params, h_astro_params, sizeof(AstroParams), 0, cudaMemcpyHostToDevice); +} + +#endif diff --git a/src/py21cmfast/src/HaloField.cuh b/src/py21cmfast/src/HaloField.cuh new file mode 100644 index 000000000..128f340e4 --- /dev/null +++ b/src/py21cmfast/src/HaloField.cuh @@ -0,0 +1,15 @@ +#ifndef _HALOFIELD_CUH +#define _HALOFIELD_CUH +#include "InputParameters.h" +#include "interpolation_types.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + void updateGlobalParams(SimulationOptions *h_simulation_options, CosmoParams *h_cosmo_params, AstroParams *h_astro_params); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/py21cmfast/src/HaloField.h b/src/py21cmfast/src/HaloField.h index 1eb35b79b..7c78fd6e4 100644 --- a/src/py21cmfast/src/HaloField.h +++ b/src/py21cmfast/src/HaloField.h @@ -5,7 +5,13 @@ #include "InputParameters.h" #include "OutputStructs.h" +#ifdef __cplusplus +extern "C" { +#endif int ComputeHaloField(float redshift_desc, float redshift, InitialConditions *boxes, unsigned long long int random_seed, HaloField *halos_desc, HaloField *halos); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/InitialConditions.c b/src/py21cmfast/src/InitialConditions.c index f75678f86..3a8fff291 100644 --- a/src/py21cmfast/src/InitialConditions.c +++ b/src/py21cmfast/src/InitialConditions.c @@ -94,6 +94,22 @@ int ComputeInitialConditions(unsigned long long random_seed, InitialConditions * int status; + bool use_cuda = false; + if (use_cuda) { + 
printf("Check GPU device ...\n\n"); +#if CUDA_FOUND + // print key device properties + print_key_device_properties(); + + // tmp: ensure hello_world works on GPU + call_cuda(); +#else + LOG_ERROR( + "CUDA function print_key_device_properties() and call_cuda() called but code was not " + "compiled for CUDA."); +#endif + } + Try { // This Try wraps the entire function so we don't indent. // Makes the parameter structs visible to a variety of functions/macros diff --git a/src/py21cmfast/src/InitialConditions.h b/src/py21cmfast/src/InitialConditions.h index 835245926..beb950d22 100644 --- a/src/py21cmfast/src/InitialConditions.h +++ b/src/py21cmfast/src/InitialConditions.h @@ -4,9 +4,17 @@ #include -#include "InputParameters.h" +#ifdef __cplusplus +extern "C" { +#endif #include "OutputStructs.h" int ComputeInitialConditions(unsigned long long random_seed, InitialConditions *boxes); +void seed_rng_threads(gsl_rng *rng_arr[], unsigned long long int seed); +void free_rng_threads(gsl_rng *rng_arr[]); + +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/InputParameters.c b/src/py21cmfast/src/InputParameters.c index 574f6ca72..6abd76fb5 100644 --- a/src/py21cmfast/src/InputParameters.c +++ b/src/py21cmfast/src/InputParameters.c @@ -1,5 +1,8 @@ #include "InputParameters.h" +#include +#include + void Broadcast_struct_global_all(SimulationOptions *simulation_options, MatterOptions *matter_options, CosmoParams *cosmo_params, AstroParams *astro_params, AstroOptions *astro_options) { diff --git a/src/py21cmfast/src/InputParameters.h b/src/py21cmfast/src/InputParameters.h index ce30c5ae5..8ace82b70 100644 --- a/src/py21cmfast/src/InputParameters.h +++ b/src/py21cmfast/src/InputParameters.h @@ -2,9 +2,169 @@ #define _PARAMSTRUCTURES_H #include -// since ffi.cdef() cannot include directives, we store the types and globals in another file -// Since it is unguarded, make sure to ONLY include this file from here -#include "_inputparams_wrapper.h" + +#ifdef __cplusplus 
+extern "C" { +#endif + +typedef struct CosmoParams { + float SIGMA_8; + float hlittle; + float OMm; + float OMl; + float OMb; + float POWER_INDEX; + + float OMn; + float OMk; + float OMr; + float OMtot; + float Y_He; + float wl; + +} CosmoParams; + +typedef struct SimulationOptions { + // Parameters taken from INIT_PARAMS.H + int HII_DIM; + int DIM; + float BOX_LEN; + float NON_CUBIC_FACTOR; + int N_THREADS; + double Z_HEAT_MAX; + double ZPRIME_STEP_FACTOR; + + // Halo Sampler Options + float SAMPLER_MIN_MASS; + double SAMPLER_BUFFER_FACTOR; + int N_COND_INTERP; + int N_PROB_INTERP; + double MIN_LOGPROB; + double HALOMASS_CORRECTION; + double PARKINSON_G0; + double PARKINSON_y1; + double PARKINSON_y2; + + float INITIAL_REDSHIFT; + double DELTA_R_FACTOR; + double DENSITY_SMOOTH_RADIUS; + + double DEXM_OPTIMIZE_MINMASS; + double DEXM_R_OVERLAP; + + double CORR_STAR; + double CORR_SFR; + double CORR_LX; +} SimulationOptions; + +typedef struct MatterOptions { + bool USE_FFTW_WISDOM; + int HMF; + int USE_RELATIVE_VELOCITIES; + int POWER_SPECTRUM; + int USE_INTERPOLATION_TABLES; + bool PERTURB_ON_HIGH_RES; + int PERTURB_ALGORITHM; + bool MINIMIZE_MEMORY; + bool KEEP_3D_VELOCITIES; + bool DEXM_OPTIMIZE; + int FILTER; + int HALO_FILTER; + bool SMOOTH_EVOLVED_DENSITY_FIELD; + + bool USE_HALO_FIELD; + bool HALO_STOCHASTICITY; + bool FIXED_HALO_GRIDS; + int SAMPLE_METHOD; +} MatterOptions; + +typedef struct AstroParams { + float HII_EFF_FACTOR; + + // SHMR + float F_STAR10; + float ALPHA_STAR; + float ALPHA_STAR_MINI; + float SIGMA_STAR; + double UPPER_STELLAR_TURNOVER_MASS; + double UPPER_STELLAR_TURNOVER_INDEX; + float F_STAR7_MINI; + + // SFMS + float t_STAR; + double SIGMA_SFR_INDEX; + double SIGMA_SFR_LIM; + + // L_X/SFR + double L_X; + double L_X_MINI; + double SIGMA_LX; + + // Escape Fraction + float F_ESC10; + float ALPHA_ESC; + float F_ESC7_MINI; + + float T_RE; + + float M_TURN; + float R_BUBBLE_MAX; + float ION_Tvir_MIN; + double F_H2_SHIELD; + float NU_X_THRESH; 
+ float X_RAY_SPEC_INDEX; + float X_RAY_Tvir_MIN; + + double A_LW; + double BETA_LW; + double A_VCB; + double BETA_VCB; + + double FIXED_VAVG; + double POP2_ION; + double POP3_ION; + + double PHOTONCONS_CALIBRATION_END; + double CLUMPING_FACTOR; + double ALPHA_UVB; + + float R_MAX_TS; + int N_STEP_TS; + double DELTA_R_HII_FACTOR; + float R_BUBBLE_MIN; + double MAX_DVDR; + double NU_X_MAX; + double NU_X_BAND_MAX; +} AstroParams; + +typedef struct AstroOptions { + bool USE_MINI_HALOS; + bool USE_CMB_HEATING; // CMB Heating Flag + bool USE_LYA_HEATING; // Lya Heating Flag + bool USE_MASS_DEPENDENT_ZETA; + bool INHOMO_RECO; + bool USE_TS_FLUCT; + bool M_MIN_in_Mass; + bool FIX_VCB_AVG; + bool USE_EXP_FILTER; + bool CELL_RECOMB; + int PHOTON_CONS_TYPE; + bool USE_UPPER_STELLAR_TURNOVER; + bool HALO_SCALING_RELATIONS_MEDIAN; + int HII_FILTER; + int HEAT_FILTER; + bool IONISE_ENTIRE_SPHERE; + bool AVG_BELOW_SAMPLER; + int INTEGRATION_METHOD_ATOMIC; + int INTEGRATION_METHOD_MINI; +} AstroOptions; + +typedef struct ConfigSettings { + double HALO_CATALOG_MEM_FACTOR; + bool EXTRA_HALOBOX_FIELDS; + char external_table_path[200]; + char wisdoms_path[200]; +} ConfigSettings; void Broadcast_struct_global_all(SimulationOptions *simulation_options, MatterOptions *matter_options, CosmoParams *cosmo_params, @@ -12,4 +172,31 @@ void Broadcast_struct_global_all(SimulationOptions *simulation_options, void Broadcast_struct_global_noastro(SimulationOptions *simulation_options, MatterOptions *matter_options, CosmoParams *cosmo_params); +void set_external_table_path(ConfigSettings *params, const char *value); +char *get_external_table_path(ConfigSettings *params); +void set_wisdoms_path(ConfigSettings *params, const char *value); +char *get_wisdoms_path(ConfigSettings *params); + +/* Previously, we had a few structures spread throughout the code e.g simulation_options_ufunc which + were all globally defined and separately broadcast at different times. 
Several of these were used + across different files and some inside #defines (e.g indexing.h), so for now I've combined + the parameter structures to avoid confusion (we shouldn't have the possibility of two files using + different parameters). + + In future we should have a parameter structure in each .c file containing ONLY parameters + relevant to it (look at HaloBox.c), and force the broadcast at each _compute() step (or even + decorate any library call) However this would require us to be very careful about initialising + the globals when ANY function from that file is called */ +// The structs declared here defined in InputParameters.c +extern SimulationOptions *simulation_options_global; +extern MatterOptions *matter_options_global; +extern CosmoParams *cosmo_params_global; +extern AstroParams *astro_params_global; +extern AstroOptions *astro_options_global; + +extern ConfigSettings config_settings; + +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/IonisationBox.c b/src/py21cmfast/src/IonisationBox.c index e55f884a9..2526286b1 100644 --- a/src/py21cmfast/src/IonisationBox.c +++ b/src/py21cmfast/src/IonisationBox.c @@ -54,7 +54,7 @@ struct IonBoxConstants { int hii_filter; // astro parameters - struct ScalingConstants scale_consts; + ScalingConstants scale_consts; double T_re; // astro calculated values @@ -135,7 +135,7 @@ void set_ionbox_constants(double redshift, double prev_redshift, struct IonBoxCo else consts->dz = prev_redshift - redshift; - struct ScalingConstants sc; + ScalingConstants sc; set_scaling_constants(redshift, &sc, true); consts->scale_consts = sc; @@ -206,9 +206,7 @@ void set_ionbox_constants(double redshift, double prev_redshift, struct IonBoxCo pow(1 + redshift, 2) * CMperMPC * SIGMA_HI * astro_params_global->ALPHA_UVB / (astro_params_global->ALPHA_UVB + 2.75) * N_b0 * consts->ion_eff_factor / 1.0e-12; if (matter_options_global->USE_HALO_FIELD) - consts->gamma_prefactor /= - RHOcrit * cosmo_params_global->OMb; // 
TODO: double-check these unit differences, - // HaloBox.halo_wsfr vs Nion_General units + consts->gamma_prefactor /= RHOcrit * cosmo_params_global->OMb; else consts->gamma_prefactor = consts->gamma_prefactor / (sc.t_h * sc.t_star); @@ -447,7 +445,7 @@ void calculate_mcrit_boxes(IonizedBox *prev_ionbox, TsBox *spin_temp, InitialCon void set_mean_fcoll(struct IonBoxConstants *c, IonizedBox *prev_box, IonizedBox *curr_box, double mturn_acg, double mturn_mcg, double *f_limit_acg, double *f_limit_mcg) { double f_coll_curr = 0., f_coll_prev = 0., f_coll_curr_mini = 0., f_coll_prev_mini = 0.; - struct ScalingConstants *sc_ptr = &(c->scale_consts); + ScalingConstants *sc_ptr = &(c->scale_consts); if (astro_options_global->USE_MASS_DEPENDENT_ZETA) { f_coll_curr = Nion_General(c->redshift, c->lnMmin, c->lnMmax_gl, mturn_acg, sc_ptr); *f_limit_acg = Nion_General(simulation_options_global->Z_HEAT_MAX, c->lnMmin, c->lnMmax_gl, @@ -669,7 +667,7 @@ void setup_integration_tables(struct FilteredGrids *fg_struct, struct IonBoxCons double min_density, max_density, prev_min_density = 0., prev_max_density = 0.; double log10Mturn_min = 0., log10Mturn_max = 0., log10Mturn_min_MINI = 0., log10Mturn_max_MINI = 0.; - struct ScalingConstants *sc_ptr = &(consts->scale_consts); + ScalingConstants *sc_ptr = &(consts->scale_consts); // TODO: instead of putting a random upper limit, put a proper flag for switching of one/both // sides of the clipping @@ -741,7 +739,7 @@ void calculate_fcoll_grid(IonizedBox *box, IonizedBox *previous_ionize_box, double f_coll_total = 0., f_coll_MINI_total = 0.; // TODO: make proper error tracking through the parallel region bool error_flag; - struct ScalingConstants *sc_ptr = &(consts->scale_consts); + ScalingConstants *sc_ptr = &(consts->scale_consts); int fc_r_idx; fc_r_idx = (astro_options_global->USE_MINI_HALOS && !matter_options_global->USE_HALO_FIELD) @@ -1379,6 +1377,28 @@ int ComputeIonizedBox(float redshift, float prev_redshift, PerturbedField *pertu int 
n_radii; n_radii = setup_radii(&radii_spec, &ionbox_constants); + fftwf_complex *d_deltax_filtered = NULL; + fftwf_complex *d_xe_filtered = NULL; + float *d_y_arr = NULL; + float *d_Fcoll = NULL; //_outputstructs_wrapper.h + + unsigned int threadsPerBlock; + unsigned int numBlocks; + + // If GPU & flags call init_ionbox_gpu_data() + bool use_cuda = false; // pass this as a parameter later + if (use_cuda && astro_options_global->USE_MASS_DEPENDENT_ZETA && + !astro_options_global->USE_MINI_HALOS && !matter_options_global->USE_HALO_FIELD) { + unsigned int Nion_nbins = get_nbins(); +#if CUDA_FOUND + init_ionbox_gpu_data(&d_deltax_filtered, &d_xe_filtered, &d_y_arr, &d_Fcoll, Nion_nbins, + HII_TOT_NUM_PIXELS, HII_KSPACE_NUM_PIXELS, &threadsPerBlock, + &numBlocks); +#else + LOG_ERROR( + "CUDA function init_ionbox_gpu_data() called but code was not compiled for CUDA."); +#endif + } // CONSTRUCT GRIDS OUTSIDE R LOOP HERE // if we don't have a previous ionised box, make a fake one here if (prev_redshift < 1) @@ -1528,8 +1548,27 @@ int ComputeIonizedBox(float redshift, float prev_redshift, PerturbedField *pertu need_prev_ion); } - calculate_fcoll_grid(box, previous_ionize_box, grid_struct, &ionbox_constants, - &curr_radius); + // If GPU & flags, call gpu version of calculate_fcoll_grid() + bool use_cuda = false; // pass this as a parameter later + if (use_cuda && astro_options_global->USE_MASS_DEPENDENT_ZETA && + !astro_options_global->USE_MINI_HALOS && + !matter_options_global->USE_HALO_FIELD) { +#if CUDA_FOUND + calculate_fcoll_grid_gpu(box, grid_struct->deltax_filtered, + grid_struct->xe_filtered, + &curr_radius.f_coll_grid_mean, d_deltax_filtered, + d_xe_filtered, d_Fcoll, d_y_arr, HII_TOT_NUM_PIXELS, + HII_KSPACE_NUM_PIXELS, &threadsPerBlock, &numBlocks); +#else + LOG_ERROR( + "CUDA function calculate_fcoll_grid_gpu() called but code was not compiled " + "for CUDA."); +#endif + } else { + calculate_fcoll_grid(box, previous_ionize_box, grid_struct, &ionbox_constants, + 
&curr_radius); + } + // To avoid ST_over_PS becoming nan when f_coll = 0, I set f_coll = FRACT_FLOAT_ERR. // TODO: This was the previous behaviour, but is this right? // setting the *total* to the minimum for the adjustment factor, @@ -1554,6 +1593,17 @@ int ComputeIonizedBox(float redshift, float prev_redshift, PerturbedField *pertu LOG_ULTRA_DEBUG("z_reion after R=%f: ", curr_radius.R); debugSummarizeBox(box->z_reion, simulation_options_global->HII_DIM, simulation_options_global->HII_DIM, HII_D_PARA, " "); +#endif + } + // If GPU & flags, call free_ionbox_gpu_data() + if (use_cuda && astro_options_global->USE_MASS_DEPENDENT_ZETA && + !astro_options_global->USE_MINI_HALOS && !matter_options_global->USE_HALO_FIELD) { +#if USE_CUDA + free_ionbox_gpu_data(&d_deltax_filtered, &d_xe_filtered, &d_y_arr, &d_Fcoll); +#else + LOG_ERROR( + "CUDA function free_ionbox_gpu_data() called but code was not compiled for " + "CUDA."); #endif } set_ionized_temperatures(box, perturbed_field, spin_temp, &ionbox_constants); diff --git a/src/py21cmfast/src/IonisationBox.cu b/src/py21cmfast/src/IonisationBox.cu new file mode 100644 index 000000000..f0d473825 --- /dev/null +++ b/src/py21cmfast/src/IonisationBox.cu @@ -0,0 +1,240 @@ +#include "cexcept.h" +#include "exceptions.h" +#include "logger.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// GPU +#include +#include +#include +// We use thrust for reduction +#include +#include // thrust::plus +#include + +#include "Constants.h" +#include "InitialConditions.h" +#include "InputParameters.h" +#include "OutputStructs.h" +#include "bubble_helper_progs.h" +#include "cosmology.h" +#include "debugging.h" +#include "dft.h" +#include "filtering.h" +#include "heating_helper_progs.h" +#include "hmf.h" +#include "indexing.h" +#include "interp_tables.h" +#include "photoncons.h" +#include "recombinations.h" +#include "thermochem.h" + +#include "IonisationBox_gpu.h" +#include "cuda_utils.cuh" + 
+__device__ inline double EvaluateRGTable1D_f_gpu(double x, double x_min, + double x_width, float *y_arr) { + + int idx = (int)floor((x - x_min) / x_width); + + double table_val = x_min + x_width * (float)idx; + double interp_point = (x - table_val) / x_width; + + return y_arr[idx] * (1 - interp_point) + y_arr[idx + 1] * (interp_point); +} + +// template +__global__ void +compute_Fcoll(cuFloatComplex *deltax_filtered, // fg_struct + cuFloatComplex *xe_filtered, // fg_struct + float *y_arr, // Nion_conditional_table1D + double x_min, // Nion_conditional_table1D + double x_width, // Nion_conditional_table1D + double fract_float_err, // FRACT_FLOAT_ERR + bool use_ts_fluct, // flag_options_global->USE_TS_FLUCT + unsigned long long hii_tot_num_pixels, // HII_TOT_NUM_PIXELS + long long hii_d, // HII_D + long long hii_d_para, // HII_D_PARA + long long hii_mid_para, // HII_MID_PARA + float *Fcoll // box +) { + // Get index of grids + unsigned long long idx = blockIdx.x * blockDim.x + threadIdx.x; + + // Bound check + if (idx >= hii_tot_num_pixels) { + return; + } + + // Get x, y, z from idx using HII_R_INDEX macro formula + int z = idx % hii_d_para; + unsigned long long remaining = idx / hii_d_para; + int y = remaining % hii_d; + int x = remaining / hii_d; + + // Get FFT index using HII_R_FFT_INDEX macro formula + unsigned long long fft_idx = z + 2 * (hii_mid_para + 1) * (y + hii_d * x); + + // These clippings could be made in the calling function, using thrust, rather + // than here... + + // Clip the filtered grids to physical values + // delta cannot be less than -1 + *((float *)deltax_filtered + fft_idx) = + fmaxf(*((float *)deltax_filtered + fft_idx), -1. 
+ fract_float_err); + // cannot be less than zero + // x_e has to be between zero and unity + if (use_ts_fluct) { + *((float *)xe_filtered + fft_idx) = + fmaxf(*((float *)xe_filtered + fft_idx), 0.0); + *((float *)xe_filtered + fft_idx) = + fminf(*((float *)xe_filtered + fft_idx), 0.999); + } + + // Compute collapse fraction + Fcoll[idx] = exp(EvaluateRGTable1D_f_gpu( + *((float *)deltax_filtered + fft_idx), x_min, x_width, y_arr)); +} + +void init_ionbox_gpu_data( + fftwf_complex **d_deltax_filtered, // copies of pointers to pointers + fftwf_complex **d_xe_filtered, float **d_y_arr, float **d_Fcoll, + unsigned int nbins, // nbins for Nion_conditional_table1D->y + unsigned long long hii_tot_num_pixels, // HII_TOT_NUM_PIXELS + unsigned long long hii_kspace_num_pixels, // HII_KSPACE_NUM_PIXELS + unsigned int *threadsPerBlock, unsigned int *numBlocks) { + CALL_CUDA(cudaMalloc( + (void **)d_deltax_filtered, + sizeof(fftwf_complex) * + hii_kspace_num_pixels)); // already pointers to pointers (no & needed) + CALL_CUDA(cudaMemset( + *d_deltax_filtered, 0, + sizeof(fftwf_complex) * + hii_kspace_num_pixels)); // dereference the pointer to a pointer (*) + + if (astro_options_global->USE_TS_FLUCT) { + CALL_CUDA(cudaMalloc((void **)d_xe_filtered, + sizeof(fftwf_complex) * hii_kspace_num_pixels)); + CALL_CUDA(cudaMemset(*d_xe_filtered, 0, + sizeof(fftwf_complex) * hii_kspace_num_pixels)); + } + + CALL_CUDA(cudaMalloc((void **)d_y_arr, sizeof(float) * nbins)); + CALL_CUDA(cudaMemset(*d_y_arr, 0, sizeof(float) * nbins)); + + CALL_CUDA(cudaMalloc((void **)d_Fcoll, sizeof(float) * hii_tot_num_pixels)); + CALL_CUDA(cudaMemset(*d_Fcoll, 0, sizeof(float) * hii_tot_num_pixels)); + + LOG_INFO("Ionisation grids allocated on device."); + LOG_INFO("Ionisation grids initialised on device."); + + // Get max threads/block for device + int maxThreadsPerBlock; + CALL_CUDA(cudaDeviceGetAttribute(&maxThreadsPerBlock, + cudaDevAttrMaxThreadsPerBlock, 0)); + + // Set threads/block based on device 
max + if (maxThreadsPerBlock >= 512) { + *threadsPerBlock = 512; + } else if (maxThreadsPerBlock >= 256) { + *threadsPerBlock = 256; + } else if (maxThreadsPerBlock >= 128) { + *threadsPerBlock = 128; + } else if (maxThreadsPerBlock >= 64) { + *threadsPerBlock = 64; + } else if (maxThreadsPerBlock >= 32) { + *threadsPerBlock = 32; + } else { + *threadsPerBlock = 16; + } + + *numBlocks = (hii_tot_num_pixels + *threadsPerBlock - 1) / *threadsPerBlock; +} + +void calculate_fcoll_grid_gpu( + IonizedBox *box, // for box->Fcoll + fftwf_complex *h_deltax_filtered, // members of fg_struct + fftwf_complex *h_xe_filtered, + double *f_coll_grid_mean, // member of rspec + fftwf_complex *d_deltax_filtered, // device pointers + fftwf_complex *d_xe_filtered, float *d_Fcoll, float *d_y_arr, + unsigned long long hii_tot_num_pixels, // HII_TOT_NUM_PIXELS + unsigned long long hii_kspace_num_pixels, // HII_KSPACE_NUM_PIXELS + unsigned int *threadsPerBlock, unsigned int *numBlocks) { + RGTable1D_f *Nion_conditional_table1D = get_Nion_conditional_table1D(); + + // Copy grids from host to device + CALL_CUDA(cudaMemcpy(d_deltax_filtered, h_deltax_filtered, + sizeof(fftwf_complex) * hii_kspace_num_pixels, + cudaMemcpyHostToDevice)); + if (astro_options_global->USE_TS_FLUCT) { + CALL_CUDA(cudaMemcpy(d_xe_filtered, h_xe_filtered, + sizeof(fftwf_complex) * hii_kspace_num_pixels, + cudaMemcpyHostToDevice)); + } + CALL_CUDA(cudaMemcpy(d_y_arr, Nion_conditional_table1D->y_arr, + sizeof(float) * Nion_conditional_table1D->n_bin, + cudaMemcpyHostToDevice)); + LOG_INFO("Ionisation grids copied to device."); + + // TODO: Can I pass these straight to kernel? 
(or access in kernel w/ Tiger's + // method) + double fract_float_err = FRACT_FLOAT_ERR; + bool use_ts_fluct = astro_options_global->USE_TS_FLUCT; + long long hii_d = HII_D; + long long hii_d_para = HII_D_PARA; + long long hii_mid_para = HII_MID_PARA; + + // Invoke kernel + compute_Fcoll<<<*numBlocks, *threadsPerBlock>>>( + reinterpret_cast(d_deltax_filtered), + reinterpret_cast(d_xe_filtered), d_y_arr, + Nion_conditional_table1D->x_min, Nion_conditional_table1D->x_width, + fract_float_err, use_ts_fluct, hii_tot_num_pixels, hii_d, hii_d_para, + hii_mid_para, d_Fcoll); + CALL_CUDA(cudaDeviceSynchronize()); + LOG_INFO("IonisationBox compute_Fcoll kernel called."); + + // Use thrust to reduce computed sums to one value. + // Wrap device pointer in a thrust::device_ptr + thrust::device_ptr d_Fcoll_ptr(d_Fcoll); + // Reduce final buffer sums to one value + double f_coll_grid_total = thrust::reduce( + d_Fcoll_ptr, d_Fcoll_ptr + hii_tot_num_pixels, 0., thrust::plus()); + *f_coll_grid_mean = f_coll_grid_total / (double)hii_tot_num_pixels; + LOG_INFO("Fcoll sum reduced to single value by thrust::reduce operation."); + + // Copy results from device to host + CALL_CUDA(cudaMemcpy(box->Fcoll, d_Fcoll, sizeof(float) * hii_tot_num_pixels, + cudaMemcpyDeviceToHost)); + CALL_CUDA(cudaMemcpy(h_deltax_filtered, d_deltax_filtered, + sizeof(fftwf_complex) * hii_kspace_num_pixels, + cudaMemcpyDeviceToHost)); + if (astro_options_global->USE_TS_FLUCT) { + CALL_CUDA(cudaMemcpy(h_xe_filtered, d_xe_filtered, + sizeof(fftwf_complex) * hii_kspace_num_pixels, + cudaMemcpyDeviceToHost)); + } + LOG_INFO("Grids copied to host."); +} + +void free_ionbox_gpu_data( + fftwf_complex **d_deltax_filtered, // copies of pointers to pointers + fftwf_complex **d_xe_filtered, float **d_y_arr, float **d_Fcoll) { + CALL_CUDA(cudaFree( + *d_deltax_filtered)); // Need to dereference the pointers to pointers (*) + if (astro_options_global->USE_TS_FLUCT) { + CALL_CUDA(cudaFree(*d_xe_filtered)); + } + 
 CALL_CUDA(cudaFree(*d_y_arr)); + CALL_CUDA(cudaFree(*d_Fcoll)); + LOG_INFO("Device memory freed."); +} diff --git a/src/py21cmfast/src/IonisationBox.h b/src/py21cmfast/src/IonisationBox.h index e3a12e0b7..a221a2064 100644 --- a/src/py21cmfast/src/IonisationBox.h +++ b/src/py21cmfast/src/IonisationBox.h @@ -1,12 +1,21 @@ #ifndef _IONBOX_H #define _IONBOX_H +#include +#include + #include "InputParameters.h" #include "OutputStructs.h" +#ifdef __cplusplus +extern "C" { +#endif int ComputeIonizedBox(float redshift, float prev_redshift, PerturbedField *perturbed_field, PerturbedField *previous_perturbed_field, IonizedBox *previous_ionize_box, TsBox *spin_temp, HaloBox *halos, InitialConditions *ini_boxes, IonizedBox *box); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/IonisationBox_gpu.h b/src/py21cmfast/src/IonisationBox_gpu.h new file mode 100644 index 000000000..62a59fcc3 --- /dev/null +++ b/src/py21cmfast/src/IonisationBox_gpu.h @@ -0,0 +1,35 @@ +#ifndef _IONBOX_GPU_H +#define _IONBOX_GPU_H + +#include +#include +// #include + +#include "InputParameters.h" +#include "OutputStructs.h" + +#ifdef __cplusplus +extern "C" { +#endif +void init_ionbox_gpu_data(fftwf_complex **d_deltax_filtered, // copies of pointers to pointers + fftwf_complex **d_xe_filtered, float **d_y_arr, float **d_Fcoll, + unsigned int nbins, // nbins for Nion_conditional_table1D->y + unsigned long long hii_tot_num_pixels, // HII_TOT_NUM_PIXELS + unsigned long long hii_kspace_num_pixels, // HII_KSPACE_NUM_PIXELS + unsigned int *threadsPerBlock, unsigned int *numBlocks); +void calculate_fcoll_grid_gpu(IonizedBox *box, // for box->Fcoll + fftwf_complex *h_deltax_filtered, // members of fg_struct + fftwf_complex *h_xe_filtered, + double *f_coll_grid_mean, // member of rspec + fftwf_complex *d_deltax_filtered, // device pointers + fftwf_complex *d_xe_filtered, float *d_Fcoll, float *d_y_arr, + unsigned long long hii_tot_num_pixels, // HII_TOT_NUM_PIXELS + unsigned long long 
hii_kspace_num_pixels, // HII_KSPACE_NUM_PIXELS + unsigned int *threadsPerBlock, unsigned int *numBlocks); +void free_ionbox_gpu_data(fftwf_complex **d_deltax_filtered, // copies of pointers to pointers + fftwf_complex **d_xe_filtered, float **d_y_arr, float **d_Fcoll); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/src/py21cmfast/src/LuminosityFunction.h b/src/py21cmfast/src/LuminosityFunction.h index 5c8b3049e..1863d756d 100644 --- a/src/py21cmfast/src/LuminosityFunction.h +++ b/src/py21cmfast/src/LuminosityFunction.h @@ -3,7 +3,13 @@ #include "InputParameters.h" +#ifdef __cplusplus +extern "C" { +#endif int ComputeLF(int nbins, int component, int NUM_OF_REDSHIFT_FOR_LF, float *z_LF, float *M_TURNs, double *M_uv_z, double *M_h_z, double *log10phi); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/MapMass_gpu.cu b/src/py21cmfast/src/MapMass_gpu.cu new file mode 100644 index 000000000..d2034ee37 --- /dev/null +++ b/src/py21cmfast/src/MapMass_gpu.cu @@ -0,0 +1,386 @@ +// Re-write of perturb_field.c for being accessible within the MCMC + +#include +#include +#include +#include +#include +#include + +// GPU +#include +#include + +#include "cexcept.h" +#include "exceptions.h" +#include "logger.h" +#include "Constants.h" +#include "indexing.h" +#include "InputParameters.h" +#include "OutputStructs.h" +#include "cosmology.h" +#include "dft.h" +#include "debugging.h" +#include "filtering.h" + +#include "PerturbField.h" + +// #define R_INDEX(x,y,z)((unsigned long long)((z)+D_PARA*((y)+D*(x)))) +__device__ inline unsigned long long compute_R_INDEX(int i, int j, int k, int dim, long long d_para) { + return k + d_para * (j + dim * i); +} + +// #define HII_R_INDEX(x,y,z)((unsigned long long)((z)+HII_D_PARA*((y)+HII_D*(x)))) +__device__ inline unsigned long long compute_HII_R_INDEX(int i, int j, int k, int hii_d, long long hii_d_para) { + return k + hii_d_para * (j + hii_d * i); +} + +// Is const needed as well as __restrict__? 
+__global__ void perturb_density_field_kernel( + double *resampled_box, + // const float* __restrict__ hires_density, + // const float* __restrict__ hires_vx, + // const float* __restrict__ hires_vy, + // const float* __restrict__ hires_vz, + // const float* __restrict__ lowres_vx, + // const float* __restrict__ lowres_vy, + // const float* __restrict__ lowres_vz, + // const float* __restrict__ hires_vx_2LPT, + // const float* __restrict__ hires_vy_2LPT, + // const float* __restrict__ hires_vz_2LPT, + // const float* __restrict__ lowres_vx_2LPT, + // const float* __restrict__ lowres_vy_2LPT, + // const float* __restrict__ lowres_vz_2LPT, + float* hires_density, + float* hires_vx, + float* hires_vy, + float* hires_vz, + float* lowres_vx, + float* lowres_vy, + float* lowres_vz, + float* hires_vx_2LPT, + float* hires_vy_2LPT, + float* hires_vz_2LPT, + float* lowres_vx_2LPT, + float* lowres_vy_2LPT, + float* lowres_vz_2LPT, + int dimension, int DIM, + long long d_para, long long hii_d, long long hii_d_para, + int non_cubic_factor, + float f_pixel_factor, float init_growth_factor, + bool perturb_on_high_res, bool use_2lpt + ) { + + unsigned long long idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < DIM * DIM * d_para) { + + // Get index of density cell + int i = idx / (d_para * DIM); + int j = (idx / d_para) % DIM; + int k = idx % d_para; + + unsigned long long r_index = compute_R_INDEX(i, j, k, DIM, d_para); + + // Map index to location in units of box size + double xf = (i + 0.5) / (DIM + 0.0); + double yf = (j + 0.5) / (DIM + 0.0); + double zf = (k + 0.5) / (d_para + 0.0); + + // Update locations + unsigned long long HII_index; + + if (perturb_on_high_res) { + // xf += __ldg(&hires_vx[r_index]); + // yf += __ldg(&hires_vy[r_index]); + // zf += __ldg(&hires_vz[r_index]); + xf += hires_vx[r_index]; + yf += hires_vy[r_index]; + zf += hires_vz[r_index]; + } + else { + unsigned long long HII_i = (unsigned long long)(i / f_pixel_factor); + unsigned long long 
HII_j = (unsigned long long)(j / f_pixel_factor); + unsigned long long HII_k = (unsigned long long)(k / f_pixel_factor); + HII_index = compute_HII_R_INDEX(HII_i, HII_j, HII_k, hii_d, hii_d_para); + // xf += __ldg(&lowres_vx[HII_index]); + // yf += __ldg(&lowres_vy[HII_index]); + // zf += __ldg(&lowres_vz[HII_index]); + xf += lowres_vx[HII_index]; + yf += lowres_vy[HII_index]; + zf += lowres_vz[HII_index]; + } + + // 2LPT (add second order corrections) + if (use_2lpt) { + if (perturb_on_high_res) { + // xf -= __ldg(&hires_vx_2LPT[r_index]); + // yf -= __ldg(&hires_vy_2LPT[r_index]); + // zf -= __ldg(&hires_vz_2LPT[r_index]); + xf -= hires_vx_2LPT[r_index]; + yf -= hires_vy_2LPT[r_index]; + zf -= hires_vz_2LPT[r_index]; + } + else { + // xf -= __ldg(&lowres_vx_2LPT[HII_index]); + // yf -= __ldg(&lowres_vy_2LPT[HII_index]); + // zf -= __ldg(&lowres_vz_2LPT[HII_index]); + xf -= lowres_vx_2LPT[HII_index]; + yf -= lowres_vy_2LPT[HII_index]; + zf -= lowres_vz_2LPT[HII_index]; + } + } + + // TODO: shared between threads? + // Convert once to reduce overhead of multiple casts + double dimension_double = (double)(dimension); + double dimension_factored_double = dimension_double * (double)(non_cubic_factor); + int dimension_factored = dimension * non_cubic_factor; + + // Scale coordinates back to grid size + xf *= dimension_double; + yf *= dimension_double; + zf *= dimension_factored_double; + + // Wrap coordinates to keep them within valid boundaries + xf = fmod(fmod(xf, dimension_double) + dimension_double, dimension_double); + yf = fmod(fmod(yf, dimension_double) + dimension_double, dimension_double); + zf = fmod(fmod(zf, dimension_factored_double) + dimension_factored_double, dimension_factored_double); + + // FROM NVIDIA DOCS: + // __device__ doublenearbyint(double x) // Round the input argument to the nearest integer. + // There are SO many double-to-int conversion intrinsics. How to know if should use any? 
 + + // Get integer values for indices from double precision values + int xi = xf; + int yi = yf; + int zi = zf; + + // Wrap index coordinates to ensure no out-of-bounds array access will be attempted + xi = ((xi % dimension) + dimension) % dimension; + yi = ((yi % dimension) + dimension) % dimension; + zi = ((zi % dimension_factored) + dimension_factored) % dimension_factored; + + // Determine the fraction of the perturbed cell which overlaps with the 8 nearest grid cells, + // based on the grid cell which contains the centre of the perturbed cell + float d_x = fabs(xf - (double)(xi + 0.5)); // Absolute distances from grid cell centre to perturbed cell centre + float d_y = fabs(yf - (double)(yi + 0.5)); // (also) The fractions of mass which will be moved to neighbouring cells + float d_z = fabs(zf - (double)(zi + 0.5)); + + // 8 neighbour cells-of-interest will be shifted left/down/behind if perturbed midpoint is in left/bottom/back corner of cell. + if (xf < (double)(xi + 0.5)) { + // If perturbed cell centre is less than the mid-point then update fraction + // of mass in the cell and determine the cell centre of neighbour to be the + // lowest grid point index + d_x = 1. - d_x; + xi -= 1; + xi = (xi + dimension) % dimension; // Only this criterion is possible as we iterate back by one (we cannot exceed DIM) + } + if(yf < (double)(yi + 0.5)) { + d_y = 1. - d_y; + yi -= 1; + yi = (yi + dimension) % dimension; + } + if(zf < (double)(zi + 0.5)) { + d_z = 1. - d_z; + zi -= 1; + zi = (zi + (unsigned long long)(non_cubic_factor * dimension)) % (unsigned long long)(non_cubic_factor * dimension); + } + // The fractions of mass which will remain with perturbed cell + float t_x = 1. - d_x; + float t_y = 1. - d_y; + float t_z = 1. - d_z; + + // Determine the grid coordinates of the 8 neighbouring cells. 
+ // Neighbours will be in positive direction; front/right/above cells (-> 2x2 cube, with perturbed cell bottom/left/back) + // Takes into account the offset based on cell centre determined above + int xp1 = (xi + 1) % dimension; + int yp1 = (yi + 1) % dimension; + int zp1 = (zi + 1) % (unsigned long long)(non_cubic_factor * dimension); + + // double scaled_density = 1 + init_growth_factor * __ldg(&hires_density[r_index]); + double scaled_density = 1.0 + init_growth_factor * hires_density[r_index]; + + if (perturb_on_high_res) { + // Redistribute the mass over the 8 neighbouring cells according to cloud in cell + // Cell mass = (1 + init_growth_factor * orig_density) * (proportion of mass to distribute) + atomicAdd(&resampled_box[compute_R_INDEX(xi, yi, zi, DIM, d_para)], scaled_density * t_x * t_y * t_z); + atomicAdd(&resampled_box[compute_R_INDEX(xp1, yi, zi, DIM, d_para)], scaled_density * d_x * t_y * t_z); + atomicAdd(&resampled_box[compute_R_INDEX(xi, yp1, zi, DIM, d_para)], scaled_density * t_x * d_y * t_z); + atomicAdd(&resampled_box[compute_R_INDEX(xp1, yp1, zi, DIM, d_para)], scaled_density * d_x * d_y * t_z); + atomicAdd(&resampled_box[compute_R_INDEX(xi, yi, zp1, DIM, d_para)], scaled_density * t_x * t_y * d_z); + atomicAdd(&resampled_box[compute_R_INDEX(xp1, yi, zp1, DIM, d_para)], scaled_density * d_x * t_y * d_z); + atomicAdd(&resampled_box[compute_R_INDEX(xi, yp1, zp1, DIM, d_para)], scaled_density * t_x * d_y * d_z); + atomicAdd(&resampled_box[compute_R_INDEX(xp1, yp1, zp1, DIM, d_para)], scaled_density * d_x * d_y * d_z); + } + else { + atomicAdd(&resampled_box[compute_HII_R_INDEX(xi, yi, zi, hii_d, hii_d_para)], scaled_density * t_x * t_y * t_z); + atomicAdd(&resampled_box[compute_HII_R_INDEX(xp1, yi, zi, hii_d, hii_d_para)], scaled_density * d_x * t_y * t_z); + atomicAdd(&resampled_box[compute_HII_R_INDEX(xi, yp1, zi, hii_d, hii_d_para)], scaled_density * t_x * d_y * t_z); + atomicAdd(&resampled_box[compute_HII_R_INDEX(xp1, yp1, zi, hii_d, 
hii_d_para)], scaled_density * d_x * d_y * t_z); + atomicAdd(&resampled_box[compute_HII_R_INDEX(xi, yi, zp1, hii_d, hii_d_para)], scaled_density * t_x * t_y * d_z); + atomicAdd(&resampled_box[compute_HII_R_INDEX(xp1, yi, zp1, hii_d, hii_d_para)], scaled_density * d_x * t_y * d_z); + atomicAdd(&resampled_box[compute_HII_R_INDEX(xi, yp1, zp1, hii_d, hii_d_para)], scaled_density * t_x * d_y * d_z); + atomicAdd(&resampled_box[compute_HII_R_INDEX(xp1, yp1, zp1, hii_d, hii_d_para)], scaled_density * d_x * d_y * d_z); + } + } +} + +double* MapMass_gpu( + InitialConditions *boxes, double *resampled_box, + int dimension, float f_pixel_factor, float init_growth_factor +) { + // Box shapes from outputs.py and convenience macros + size_t size_double, size_float; + if(matter_options_global->PERTURB_ON_HIGH_RES) { + size_double = TOT_NUM_PIXELS * sizeof(double); + size_float = TOT_NUM_PIXELS * sizeof(float); + } + else { + size_double = HII_TOT_NUM_PIXELS * sizeof(double); + size_float = HII_TOT_NUM_PIXELS * sizeof(float); + } + + // Allocate device memory for output box and set to 0. 
+ double* d_resampled_box; + cudaMalloc((void**)&d_resampled_box, size_double); + cudaMemset(d_resampled_box, 0, size_double); // fills size_double bytes with byte=0 + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Allocate device memory for density field + float* hires_density; + cudaMalloc(&hires_density, (TOT_NUM_PIXELS * sizeof(float))); // from 21cmFAST.h, outputs.py & indexing.h + cudaMemcpy(hires_density, boxes->hires_density, (TOT_NUM_PIXELS * sizeof(float)), cudaMemcpyHostToDevice); + + err = cudaGetLastError(); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Allocate device memory and copy arrays to device as per user_params + float* hires_vx; // floats as per 21cmFAST.h + float* hires_vy; + float* hires_vz; + float* lowres_vx; + float* lowres_vy; + float* lowres_vz; + float* hires_vx_2LPT; + float* hires_vy_2LPT; + float* hires_vz_2LPT; + float* lowres_vx_2LPT; + float* lowres_vy_2LPT; + float* lowres_vz_2LPT; + + if (matter_options_global->PERTURB_ON_HIGH_RES) { + cudaMalloc(&hires_vx, size_float); + cudaMalloc(&hires_vy, size_float); + cudaMalloc(&hires_vz, size_float); + cudaMemcpy(hires_vx, boxes->hires_vx, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(hires_vy, boxes->hires_vy, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(hires_vz, boxes->hires_vz, size_float, cudaMemcpyHostToDevice); + } + else { + cudaMalloc(&lowres_vx, size_float); + cudaMalloc(&lowres_vy, size_float); + cudaMalloc(&lowres_vz, size_float); + cudaMemcpy(lowres_vx, boxes->lowres_vx, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(lowres_vy, boxes->lowres_vy, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(lowres_vz, boxes->lowres_vz, size_float, cudaMemcpyHostToDevice); + } + if (matter_options_global->USE_2LPT) { + if (matter_options_global->PERTURB_ON_HIGH_RES) { + cudaMalloc(&hires_vx_2LPT, 
size_float); + cudaMalloc(&hires_vy_2LPT, size_float); + cudaMalloc(&hires_vz_2LPT, size_float); + cudaMemcpy(hires_vx_2LPT, boxes->hires_vx_2LPT, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(hires_vy_2LPT, boxes->hires_vy_2LPT, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(hires_vz_2LPT, boxes->hires_vz_2LPT, size_float, cudaMemcpyHostToDevice); + } + else { + cudaMalloc(&lowres_vx_2LPT, size_float); + cudaMalloc(&lowres_vy_2LPT, size_float); + cudaMalloc(&lowres_vz_2LPT, size_float); + cudaMemcpy(lowres_vx_2LPT, boxes->lowres_vx_2LPT, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(lowres_vy_2LPT, boxes->lowres_vy_2LPT, size_float, cudaMemcpyHostToDevice); + cudaMemcpy(lowres_vz_2LPT, boxes->lowres_vz_2LPT, size_float, cudaMemcpyHostToDevice); + } + } + + err = cudaGetLastError(); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Can't pass macro straight to kernel + long long d_para = D_PARA; + long long hii_d = HII_D; + long long hii_d_para = HII_D_PARA; + + // Invoke kernel + int threadsPerBlock = 256; + int numBlocks = (TOT_NUM_PIXELS + threadsPerBlock - 1) / threadsPerBlock; + perturb_density_field_kernel<<>>( + d_resampled_box, hires_density, hires_vx, hires_vy, hires_vz, lowres_vx, lowres_vy, lowres_vz, + hires_vx_2LPT, hires_vy_2LPT, hires_vz_2LPT, lowres_vx_2LPT, lowres_vy_2LPT, lowres_vz_2LPT, + dimension, simulation_options_global->DIM, d_para, hii_d, hii_d_para, simulation_options_global->NON_CUBIC_FACTOR, + f_pixel_factor, init_growth_factor, matter_options_global->PERTURB_ON_HIGH_RES, matter_options_global->USE_2LPT); + + // // Only use during development! 
+ // err = cudaDeviceSynchronize(); + // CATCH_CUDA_ERROR(err); + + err = cudaGetLastError(); + if (err != cudaSuccess) { + LOG_ERROR("Kernel launch error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Copy results from device to host + err = cudaMemcpy(resampled_box, d_resampled_box, size_double, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Deallocate device memory + cudaFree(d_resampled_box); + cudaFree(hires_density); + + if (matter_options_global->PERTURB_ON_HIGH_RES) { + cudaFree(hires_vx); + cudaFree(hires_vy); + cudaFree(hires_vz); + } + else { + cudaFree(lowres_vx); + cudaFree(lowres_vy); + cudaFree(lowres_vz); + } + if (matter_options_global->USE_2LPT) { + if (matter_options_global->PERTURB_ON_HIGH_RES) { + cudaFree(hires_vx_2LPT); + cudaFree(hires_vy_2LPT); + cudaFree(hires_vz_2LPT); + } + else { + cudaFree(lowres_vx_2LPT); + cudaFree(lowres_vy_2LPT); + cudaFree(lowres_vz_2LPT); + } + } + + err = cudaGetLastError(); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + return resampled_box; +} diff --git a/src/py21cmfast/src/OutputStructs.h b/src/py21cmfast/src/OutputStructs.h index 9ebf63167..3f9f183cb 100644 --- a/src/py21cmfast/src/OutputStructs.h +++ b/src/py21cmfast/src/OutputStructs.h @@ -6,8 +6,97 @@ #include "InputParameters.h" -// since ffi.cdef() cannot include directives, we store the types and globals in another file -// Since it is unguarded, make sure to ONLY include this file from here -#include "_outputstructs_wrapper.h" +typedef struct InitialConditions { + float *lowres_density, *lowres_vx, *lowres_vy, *lowres_vz; + float *lowres_vx_2LPT, *lowres_vy_2LPT, *lowres_vz_2LPT; + float *hires_density, *hires_vx, *hires_vy, *hires_vz; + float *hires_vx_2LPT, *hires_vy_2LPT, *hires_vz_2LPT; // cw addition + float *lowres_vcb; +} InitialConditions; + +typedef struct PerturbedField 
{ + float *density, *velocity_x, *velocity_y, *velocity_z; +} PerturbedField; + +typedef struct HaloField { + long long unsigned int n_halos; + long long unsigned int buffer_size; + float *halo_masses; + float *halo_coords; + + // Halo properties for stochastic model + float *star_rng; + float *sfr_rng; + float *xray_rng; +} HaloField; + +typedef struct PerturbHaloField { + long long unsigned int n_halos; + long long unsigned int buffer_size; + float *halo_masses; + float *halo_coords; + + // Halo properties for stochastic model + float *star_rng; + float *sfr_rng; + float *xray_rng; +} PerturbHaloField; + +typedef struct HaloBox { + // Things that aren't used in radiation fields but useful outputs + float *halo_mass; + float *halo_stars; + float *halo_stars_mini; + int *count; + + // For IonisationBox.c and SpinTemperatureBox.c + float *n_ion; // weighted by F_ESC*PopN_ion + float *halo_sfr; // for x-rays and Ts stuff + float *halo_xray; + float *halo_sfr_mini; // for x-rays and Ts stuff + float *whalo_sfr; // SFR weighted by PopN_ion and F_ESC, used for Gamma12 + + // Average volume-weighted log10 Turnover masses are kept in order to compare with the expected + // MF integrals + double log10_Mcrit_ACG_ave; + double log10_Mcrit_MCG_ave; +} HaloBox; + +typedef struct XraySourceBox { + float *filtered_sfr; + float *filtered_xray; + float *filtered_sfr_mini; + + double *mean_log10_Mcrit_LW; + double *mean_sfr; + double *mean_sfr_mini; +} XraySourceBox; + +typedef struct TsBox { + float *spin_temperature; + float *xray_ionised_fraction; + float *kinetic_temp_neutral; + float *J_21_LW; +} TsBox; + +typedef struct IonizedBox { + double mean_f_coll; + double mean_f_coll_MINI; + double log10_Mturnover_ave; + double log10_Mturnover_MINI_ave; + float *neutral_fraction; + float *ionisation_rate_G12; + float *mean_free_path; + float *z_reion; + float *cumulative_recombinations; + float *kinetic_temperature; + float *unnormalised_nion; + float *unnormalised_nion_mini; +} 
IonizedBox; + +typedef struct BrightnessTemp { + float *brightness_temp; + float *tau_21; +} BrightnessTemp; #endif diff --git a/src/py21cmfast/src/PerturbField.c b/src/py21cmfast/src/PerturbField.c index c3b2baf16..715ea93f2 100644 --- a/src/py21cmfast/src/PerturbField.c +++ b/src/py21cmfast/src/PerturbField.c @@ -19,113 +19,7 @@ #include "filtering.h" #include "indexing.h" #include "logger.h" - -static inline void do_cic_interpolation(double *resampled_box, double pos[3], int box_dim[3], - double curr_dens) { - // get the CIC indices and distances - int ipos[3], iposp1[3]; - double dist[3]; - // NOTE: assumes the cell at idx == 0 is *centred* at (0,0,0) - for (int axis = 0; axis < 3; axis++) { - ipos[axis] = (int)floor(pos[axis]); - iposp1[axis] = ipos[axis] + 1; - dist[axis] = pos[axis] - ipos[axis]; - } - - wrap_coord(ipos, box_dim); - wrap_coord(iposp1, box_dim); - - unsigned long long int cic_indices[8] = { - grid_index_general(ipos[0], ipos[1], ipos[2], box_dim), - grid_index_general(iposp1[0], ipos[1], ipos[2], box_dim), - grid_index_general(ipos[0], iposp1[1], ipos[2], box_dim), - grid_index_general(iposp1[0], iposp1[1], ipos[2], box_dim), - grid_index_general(ipos[0], ipos[1], iposp1[2], box_dim), - grid_index_general(iposp1[0], ipos[1], iposp1[2], box_dim), - grid_index_general(ipos[0], iposp1[1], iposp1[2], box_dim), - grid_index_general(iposp1[0], iposp1[1], iposp1[2], box_dim)}; - - double cic_weights[8] = {(1. - dist[0]) * (1. - dist[1]) * (1. - dist[2]), - dist[0] * (1. - dist[1]) * (1. - dist[2]), - (1. - dist[0]) * dist[1] * (1. - dist[2]), - dist[0] * dist[1] * (1. - dist[2]), - (1. - dist[0]) * (1. - dist[1]) * dist[2], - dist[0] * (1. - dist[1]) * dist[2], - (1. 
- dist[0]) * dist[1] * dist[2], - dist[0] * dist[1] * dist[2]}; - - for (int i = 0; i < 8; i++) { -#pragma omp atomic update - resampled_box[cic_indices[i]] += curr_dens * cic_weights[i]; - } -} - -// Function that maps a IC density grid to the perturbed density grid -void move_grid_masses(double redshift, float *dens_pointer, int dens_dim[3], float *vel_pointers[3], - float *vel_pointers_2LPT[3], int vel_dim[3], double *resampled_box, - int out_dim[3]) { - // grid dimension constants - double boxlen = simulation_options_global->BOX_LEN; - double boxlen_z = boxlen * simulation_options_global->NON_CUBIC_FACTOR; - double box_size[3] = {boxlen, boxlen, boxlen_z}; - double dim_ratio_vel = (double)vel_dim[0] / (double)dens_dim[0]; - double dim_ratio_out = (double)out_dim[0] / (double)dens_dim[0]; - - // Setup IC velocity factors - double growth_factor = dicke(redshift); - double displacement_factor_2LPT = -(3.0 / 7.0) * growth_factor * growth_factor; // 2LPT eq. D8 - - double init_growth_factor = dicke(simulation_options_global->INITIAL_REDSHIFT); - double init_displacement_factor_2LPT = - -(3.0 / 7.0) * init_growth_factor * init_growth_factor; // 2LPT eq. 
D8 - - double velocity_displacement_factor[3] = { - (growth_factor - init_growth_factor) / box_size[0] * simulation_options_global->DIM, - (growth_factor - init_growth_factor) / box_size[1] * simulation_options_global->DIM, - (growth_factor - init_growth_factor) / box_size[2] * D_PARA}; - double velocity_displacement_factor_2LPT[3] = { - (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[0] * - simulation_options_global->DIM, - (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[1] * - simulation_options_global->DIM, - (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[2] * D_PARA}; -#pragma omp parallel num_threads(simulation_options_global->N_THREADS) - { - int i, j, k, axis; - double pos[3], curr_dens; - int ipos[3]; - unsigned long long vel_index, dens_index; -#pragma omp for - for (i = 0; i < dens_dim[0]; i++) { - for (j = 0; j < dens_dim[1]; j++) { - for (k = 0; k < dens_dim[2]; k++) { - // Transform position to units of box size - pos[0] = i; - pos[1] = j; - pos[2] = k; - resample_index((int[3]){i, j, k}, dim_ratio_vel, ipos); - wrap_coord(ipos, vel_dim); - vel_index = grid_index_general(ipos[0], ipos[1], ipos[2], vel_dim); - for (axis = 0; axis < 3; axis++) { - pos[axis] += - vel_pointers[axis][vel_index] * velocity_displacement_factor[axis]; - // add 2LPT second order corrections - if (matter_options_global->PERTURB_ALGORITHM == 2) { - pos[axis] -= vel_pointers_2LPT[axis][vel_index] * - velocity_displacement_factor_2LPT[axis]; - } - pos[axis] *= dim_ratio_out; - } - - // CIC interpolation - dens_index = grid_index_general(i, j, k, dens_dim); - curr_dens = 1.0 + dens_pointer[dens_index] * init_growth_factor; - do_cic_interpolation(resampled_box, pos, out_dim, curr_dens); - } - } - } - } -} +#include "map_mass.h" void make_density_grid(float redshift, fftwf_complex *fft_density_grid, InitialConditions *boxes) { int i, j, k; @@ -210,8 +104,18 @@ void make_density_grid(float redshift, fftwf_complex 
*fft_density_grid, InitialC resampled_box = (double *)calloc(HII_TOT_NUM_PIXELS, sizeof(double)); } int hi_dim[3] = {simulation_options_global->DIM, simulation_options_global->DIM, D_PARA}; - move_grid_masses(redshift, boxes->hires_density, hi_dim, vel_pointers, vel_pointers_2LPT, - box_dim, resampled_box, box_dim); + bool use_cuda = false; // pass this as a parameter later + if (use_cuda) { +#if CUDA_FOUND + resampled_box = + MapMass_gpu(boxes, resampled_box, dimension, f_pixel_factor, init_growth_factor); +#else + LOG_ERROR("CUDA version of MapMass() called but code was not compiled for CUDA."); +#endif + } else { + move_grid_masses(redshift, boxes->hires_density, hi_dim, vel_pointers, + vel_pointers_2LPT, box_dim, resampled_box, box_dim); + } LOG_SUPER_DEBUG("resampled_box: "); debugSummarizeBoxDouble(resampled_box, box_dim[0], box_dim[1], box_dim[2], " "); diff --git a/src/py21cmfast/src/PerturbField.h b/src/py21cmfast/src/PerturbField.h index a99e42ac3..6f2fba56d 100644 --- a/src/py21cmfast/src/PerturbField.h +++ b/src/py21cmfast/src/PerturbField.h @@ -1,9 +1,17 @@ #ifndef _PERTURBFIELD_H #define _PERTURBFIELD_H +// #include + #include "InputParameters.h" #include "OutputStructs.h" +#ifdef __cplusplus +extern "C" { +#endif int ComputePerturbField(float redshift, InitialConditions *boxes, PerturbedField *perturbed_field); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/PerturbHaloField.c b/src/py21cmfast/src/PerturbHaloField.c index ca0747f21..0db4a7aed 100644 --- a/src/py21cmfast/src/PerturbHaloField.c +++ b/src/py21cmfast/src/PerturbHaloField.c @@ -94,7 +94,7 @@ int ComputePerturbHaloField(float redshift, InitialConditions *boxes, HaloField halos_perturbed->n_halos = halos->n_halos; // ****************** END INITIALIZATION ******************************** // - int n_exact_dim = 0; + unsigned long long int n_exact_dim = 0; bool error_in_parallel = false; #pragma omp parallel private(i_halo) num_threads(simulation_options_global -> 
N_THREADS) \ reduction(+ : n_exact_dim) @@ -136,7 +136,7 @@ int ComputePerturbHaloField(float redshift, InitialConditions *boxes, HaloField } } // Divide out multiplicative factor to return to pristine state - LOG_SUPER_DEBUG("Number of halos exactly on the box edge = %d of %d", n_exact_dim, + LOG_SUPER_DEBUG("Number of halos exactly on the box edge = %llu of %llu", n_exact_dim, halos->n_halos); if (error_in_parallel) { LOG_ERROR("Error in parallel processing, some halos were out of bounds."); diff --git a/src/py21cmfast/src/PerturbHaloField.h b/src/py21cmfast/src/PerturbHaloField.h index 1868c4b39..3854adf36 100644 --- a/src/py21cmfast/src/PerturbHaloField.h +++ b/src/py21cmfast/src/PerturbHaloField.h @@ -4,7 +4,13 @@ #include "InputParameters.h" #include "OutputStructs.h" +#ifdef __cplusplus +extern "C" { +#endif int ComputePerturbHaloField(float redshift, InitialConditions *boxes, HaloField *halos, PerturbHaloField *halos_perturbed); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/SpinTemperatureBox.c b/src/py21cmfast/src/SpinTemperatureBox.c index ea1e0ce6b..162b99111 100644 --- a/src/py21cmfast/src/SpinTemperatureBox.c +++ b/src/py21cmfast/src/SpinTemperatureBox.c @@ -745,7 +745,7 @@ int UpdateXraySourceBox(HaloBox *halobox, double R_inner, double R_outer, int R_ // NOTE: Frequency integrals are based on PREVIOUS x_e_ave // The x_e tables are not regular, hence the precomputation of indices/interp points void fill_freqint_tables(double zp, double x_e_ave, double filling_factor_of_HI_zp, - double *log10_Mcrit_LW_ave, int R_mm, struct ScalingConstants *sc) { + double *log10_Mcrit_LW_ave, int R_mm, ScalingConstants *sc) { double lower_int_limit; int x_e_ct, R_ct; int R_start, R_end; @@ -867,7 +867,7 @@ int global_reion_properties(double zp, double x_e_ave, double *log10_Mcrit_LW_av double determine_zpp_max, determine_zpp_min; // at z', we need a differenc constant struct - struct ScalingConstants sc; + ScalingConstants sc; 
set_scaling_constants(zp, &sc, false); if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { @@ -930,7 +930,8 @@ int global_reion_properties(double zp, double x_e_ave, double *log10_Mcrit_LW_av void calculate_sfrd_from_grid(int R_ct, float *dens_R_grid, float *Mcrit_R_grid, float *sfrd_grid, float *sfrd_grid_mini, double *ave_sfrd, double *ave_sfrd_mini, - struct ScalingConstants *sc) { + unsigned int threadsPerBlock, float *d_y_arr, float *d_dens_R_grid, + float *d_sfrd_grid, double *d_ave_sfrd_buf, ScalingConstants *sc) { double ave_sfrd_buf = 0; double ave_sfrd_buf_mini = 0; if (astro_options_global->INTEGRATION_METHOD_ATOMIC == 1 || @@ -951,43 +952,61 @@ void calculate_sfrd_from_grid(int R_ct, float *dens_R_grid, float *Mcrit_R_grid, } } + // -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + // If GPU is to be used and flags are ideal, call GPU version of reduction + bool use_cuda = false; // pass this as a parameter later + if (use_cuda && astro_options_global->USE_MASS_DEPENDENT_ZETA && + matter_options_global->USE_INTERPOLATION_TABLES && !astro_options_global->USE_MINI_HALOS) { +#if CUDA_FOUND + RGTable1D_f *SFRD_conditional_table = get_SFRD_conditional_table(); + ave_sfrd_buf = + calculate_sfrd_from_grid_gpu(SFRD_conditional_table, dens_R_grid, zpp_growth, R_ct, + sfrd_grid, HII_TOT_NUM_PIXELS, threadsPerBlock, + // d_data + d_y_arr, d_dens_R_grid, d_sfrd_grid, d_ave_sfrd_buf); +#else + LOG_ERROR("calculate_sfrd_from_grid_gpu() called but code was not compiled for CUDA."); +#endif + } else { #pragma omp parallel num_threads(simulation_options_global->N_THREADS) - { - unsigned long long int box_ct; - double curr_dens; - double curr_mcrit = 0.; - double fcoll, dfcoll; - double fcoll_MINI = 0; + { + unsigned long long int box_ct; + double curr_dens; + double curr_mcrit = 0.; + double fcoll, dfcoll; + double fcoll_MINI = 0; 
#pragma omp for reduction(+ : ave_sfrd_buf, ave_sfrd_buf_mini) - for (box_ct = 0; box_ct < HII_TOT_NUM_PIXELS; box_ct++) { - curr_dens = dens_R_grid[box_ct] * zpp_growth[R_ct]; - if (astro_options_global->USE_MINI_HALOS) curr_mcrit = Mcrit_R_grid[box_ct]; + for (box_ct = 0; box_ct < HII_TOT_NUM_PIXELS; box_ct++) { + curr_dens = dens_R_grid[box_ct] * zpp_growth[R_ct]; + if (astro_options_global->USE_MINI_HALOS) curr_mcrit = Mcrit_R_grid[box_ct]; - if (astro_options_global->USE_MASS_DEPENDENT_ZETA) { - fcoll = EvaluateSFRD_Conditional(curr_dens, zpp_growth[R_ct], M_min_R[R_ct], + if (astro_options_global->USE_MASS_DEPENDENT_ZETA) { + fcoll = + EvaluateSFRD_Conditional(curr_dens, zpp_growth[R_ct], M_min_R[R_ct], M_max_R[R_ct], M_max_R[R_ct], sigma_max[R_ct], sc); - sfrd_grid[box_ct] = (1. + curr_dens) * fcoll; + sfrd_grid[box_ct] = (1. + curr_dens) * fcoll; - if (astro_options_global->USE_MINI_HALOS) { - fcoll_MINI = EvaluateSFRD_Conditional_MINI( - curr_dens, curr_mcrit, zpp_growth[R_ct], M_min_R[R_ct], M_max_R[R_ct], - M_max_R[R_ct], sigma_max[R_ct], sc); - sfrd_grid_mini[box_ct] = (1. + curr_dens) * fcoll_MINI; + if (astro_options_global->USE_MINI_HALOS) { + fcoll_MINI = EvaluateSFRD_Conditional_MINI( + curr_dens, curr_mcrit, zpp_growth[R_ct], M_min_R[R_ct], M_max_R[R_ct], + M_max_R[R_ct], sigma_max[R_ct], sc); + sfrd_grid_mini[box_ct] = (1. + curr_dens) * fcoll_MINI; + } + } else { + fcoll = EvaluateFcoll_delta(curr_dens, zpp_growth[R_ct], sigma_min[R_ct], + sigma_max[R_ct]); + dfcoll = EvaluatedFcolldz(curr_dens, zpp_for_evolve_list[R_ct], sigma_min[R_ct], + sigma_max[R_ct]); + sfrd_grid[box_ct] = (1. + curr_dens) * dfcoll; } - } else { - fcoll = EvaluateFcoll_delta(curr_dens, zpp_growth[R_ct], sigma_min[R_ct], - sigma_max[R_ct]); - dfcoll = EvaluatedFcolldz(curr_dens, zpp_for_evolve_list[R_ct], sigma_min[R_ct], - sigma_max[R_ct]); - sfrd_grid[box_ct] = (1. 
+ curr_dens) * dfcoll; + ave_sfrd_buf += fcoll; + ave_sfrd_buf_mini += fcoll_MINI; } - ave_sfrd_buf += fcoll; - ave_sfrd_buf_mini += fcoll_MINI; } + *ave_sfrd = ave_sfrd_buf / HII_TOT_NUM_PIXELS; + *ave_sfrd_mini = ave_sfrd_buf_mini / HII_TOT_NUM_PIXELS; } - *ave_sfrd = ave_sfrd_buf / HII_TOT_NUM_PIXELS; - *ave_sfrd_mini = ave_sfrd_buf_mini / HII_TOT_NUM_PIXELS; // These functions check for allocation free_conditional_tables(); @@ -1458,10 +1477,49 @@ void ts_main(float redshift, float prev_redshift, float perturbed_field_redshift int R_index; float *delta_box_input; float *Mcrit_box_input = NULL; // may be unused - struct ScalingConstants sc, sc_sfrd; + ScalingConstants sc, sc_sfrd; // if we have stars, fill in the heating term boxes if (!NO_LIGHT) { + // Device pointers that reference GPU memory and need to persist across loop iterations + // ------------------------------------------------------------------------- + float *d_y_arr = NULL; + float *d_dens_R_grid = NULL; + float *d_sfrd_grid = NULL; + double *d_ave_sfrd_buf = NULL; + + // initialise pointer to struct of pointers + // ---------------------------------------------------------------------------------------------------------------------- + // sfrd_gpu_data *device_data; + // sfrd_gpu_data *device_data = (sfrd_gpu_data *)malloc(sizeof(sfrd_gpu_data)); + unsigned int threadsPerBlock = 0; + unsigned int sfrd_nbins = get_nbins(); + + // GPU=True + // if (true) { + // // unsigned int init_sfrd_gpu_data(float *dens_R_grid, float *sfrd_grid, unsigned + // long long num_pixels, + // // unsigned int nbins, sfrd_gpu_data *d_data); + // threadsPerBlock = init_sfrd_gpu_data(delta_box_input, del_fcoll_Rct, + // HII_TOT_NUM_PIXELS, sfrd_nbins, &device_data); + // } + // struct + // --------------------------------------------------------------------------------------------------------------------------------------------------------- + // threadsPerBlock = init_sfrd_gpu_data(delta_box_input, del_fcoll_Rct, 
HII_TOT_NUM_PIXELS, + // sfrd_nbins, &device_data); pointers + // ------------------------------------------------------------------------------------------------------------------------------------------------------- + bool use_cuda = false; // pass this as a parameter later + if (use_cuda) { +#if CUDA_FOUND + threadsPerBlock = + init_sfrd_gpu_data(delta_box_input, del_fcoll_Rct, HII_TOT_NUM_PIXELS, sfrd_nbins, + &d_y_arr, &d_dens_R_grid, &d_sfrd_grid, &d_ave_sfrd_buf); +#else + LOG_ERROR( + "CUDA function init_sfrd_gpu_data() called but code was not compiled for CUDA."); +#endif + } + // --------------------------------------------------------------------------------------------------------------------------------------------------------------- for (R_ct = astro_params_global->N_STEP_TS; R_ct--;) { dzpp_for_evolve = dzpp_list[R_ct]; zpp = zpp_for_evolve_list[R_ct]; @@ -1509,7 +1567,9 @@ void ts_main(float redshift, float prev_redshift, float perturbed_field_redshift Mcrit_box_input = log10_Mcrit_LW[R_index]; } calculate_sfrd_from_grid(R_ct, delta_box_input, Mcrit_box_input, del_fcoll_Rct, - del_fcoll_Rct_MINI, &ave_fcoll, &ave_fcoll_MINI, &sc); + del_fcoll_Rct_MINI, &ave_fcoll, &ave_fcoll_MINI, + threadsPerBlock, d_y_arr, d_dens_R_grid, d_sfrd_grid, + d_ave_sfrd_buf, &sc); avg_fix_term = mean_sfr_zpp[R_ct] / ave_fcoll; if (astro_options_global->USE_MINI_HALOS) avg_fix_term_MINI = mean_sfr_zpp_mini[R_ct] / ave_fcoll_MINI; @@ -1665,6 +1725,21 @@ void ts_main(float redshift, float prev_redshift, float perturbed_field_redshift } } } + // struct + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------ + // free_sfrd_gpu_data(device_data); + // free(device_data); + // pointers + // ---------------------------------------------------------------------------------------------------------------------------------------------------------------- + if (use_cuda) { 
+#if CUDA_FOUND + free_sfrd_gpu_data(&d_y_arr, &d_dens_R_grid, &d_sfrd_grid, &d_ave_sfrd_buf); +#else + LOG_ERROR( + "CUDA function free_sfrd_gpu_data() called but code was not compiled for CUDA."); +#endif + } + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- } // we definitely don't need these tables anymore diff --git a/src/py21cmfast/src/SpinTemperatureBox.cu b/src/py21cmfast/src/SpinTemperatureBox.cu new file mode 100644 index 000000000..c0b9dd49f --- /dev/null +++ b/src/py21cmfast/src/SpinTemperatureBox.cu @@ -0,0 +1,260 @@ +// Most of the following includes likely can be removed. +#include +#include +#include +#include +#include +#include + +// GPU +#include +#include +// We use thrust for reduction +#include +#include +#include // thrust::plus + +#include "cexcept.h" +#include "exceptions.h" +#include "logger.h" +#include "Constants.h" +#include "indexing.h" +#include "InputParameters.h" +#include "OutputStructs.h" +#include "heating_helper_progs.h" +#include "elec_interp.h" +#include "interp_tables.h" +#include "debugging.h" +#include "cosmology.h" +#include "hmf.h" +#include "dft.h" +#include "filtering.h" +#include "thermochem.h" +#include "interpolation.h" + +#include "cuda_utils.cuh" +#include "SpinTemperatureBox.h" + + +__device__ inline double EvaluateRGTable1D_f_gpu(double x, double x_min, double x_width, float *y_arr) { + + int idx = (int)floor((x - x_min) / x_width); + + double table_val = x_min + x_width * (float)idx; + double interp_point = (x - table_val) / x_width; + + return y_arr[idx] * (1 - interp_point) + y_arr[idx + 1] * (interp_point); +} + +template +__device__ void warp_reduce(volatile double *sdata, unsigned int tid) { + // Reduce by half + // No syncing required with threads < 32 + if (threadsPerBlock >= 64) { sdata[tid] += sdata[tid + 32]; } + if (threadsPerBlock >= 32) { sdata[tid] += sdata[tid + 16]; } + if 
(threadsPerBlock >= 16) { sdata[tid] += sdata[tid + 8]; } + if (threadsPerBlock >= 8) { sdata[tid] += sdata[tid + 4]; } + if (threadsPerBlock >= 4) { sdata[tid] += sdata[tid + 2]; } + if (threadsPerBlock >= 2) { sdata[tid] += sdata[tid + 1]; } +} + +// As seen in talk by Mark Harris, NVIDIA. +// https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf +// https://www.youtube.com/watch?v=NrWhZMHrP4w +template +__global__ void compute_and_reduce( + double x_min, // reference + double x_width, // reference + float *y_arr, // reference + float *dens_R_grid, // reference + double zpp_growth_R_ct, // value + float *sfrd_grid, // star formation rate density grid to be updated + double *ave_sfrd_buf, // output buffer of length ceil(n / (threadsPerBlock * 2)) + unsigned long long num_pixels // length of input data +) { + + // An array to store intermediate summations + // Shared between all threads in block + extern __shared__ double sdata[]; + + unsigned int tid = threadIdx.x; // thread within current block + unsigned int i = blockIdx.x * (threadsPerBlock * 2) + tid; // index of input data + unsigned int gridSize = threadsPerBlock * 2 * gridDim.x; + + sdata[tid] = 0; + + // In bounds of gridSize, sum pairs of collapse fraction data together + // And update the star formation rate density grid. + double curr_dens_i; + double curr_dens_j; + double fcoll_i; + double fcoll_j; + + while (i < num_pixels) { + // Compute current density from density grid value * redshift-scaled growth factor + curr_dens_i = dens_R_grid[i] * zpp_growth_R_ct; + + // Compute fraction of mass that has collapsed to form stars/other structures + fcoll_i = exp(EvaluateRGTable1D_f_gpu(curr_dens_i, x_min, x_width, y_arr)); + + // Update the shared buffer with the collapse fractions + sdata[tid] += fcoll_i; + + // Update the relevant cells in the star formation rate density grid + sfrd_grid[i] = (1. 
+ curr_dens_i) * fcoll_i; + + // Repeat for i + threadsPerBlock + if ((i + threadsPerBlock) < num_pixels) { + curr_dens_j = dens_R_grid[i + threadsPerBlock] * zpp_growth_R_ct; + fcoll_j = exp(EvaluateRGTable1D_f_gpu(curr_dens_j, x_min, x_width, y_arr)); + sdata[tid] += fcoll_j; + sfrd_grid[i + threadsPerBlock] = (1. + curr_dens_j) * fcoll_j; + } + + i += gridSize; + } + __syncthreads(); + + // Reduce by half and sync (and repeat) + if (threadsPerBlock >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); } + if (threadsPerBlock >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); } + if (threadsPerBlock >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); } + + // Final reduction by separate kernel + if (tid < 32) { warp_reduce(sdata, tid); } + + // The first thread of each block updates the block totals + if (tid == 0) { ave_sfrd_buf[blockIdx.x] = sdata[0]; } +} + +unsigned int init_sfrd_gpu_data( + float *dens_R_grid, // input data + float *sfrd_grid, // star formation rate density grid to be updated + unsigned long long num_pixels, // length of input data + unsigned int nbins, // nbins for sfrd_grid->y + float **d_y_arr, // copies of pointers to pointers + float **d_dens_R_grid, + float **d_sfrd_grid, + double **d_ave_sfrd_buf +) { + // Allocate device memory + CALL_CUDA(cudaMalloc(d_y_arr, sizeof(float) * nbins)); // already pointers to pointers (no & needed) + CALL_CUDA(cudaMalloc(d_dens_R_grid, sizeof(float) * num_pixels)); + CALL_CUDA(cudaMalloc(d_sfrd_grid, sizeof(float) * num_pixels)); + LOG_INFO("SFRD_conditional_table.y_arr and density and sfrd grids allocated on device."); + + // Initialise sfrd_grid to 0 (fill with byte=0) + CALL_CUDA(cudaMemset(*d_sfrd_grid, 0, sizeof(float) * num_pixels)); // dereference the pointer to a pointer (*) + LOG_INFO("sfrd grid initialised to 0."); + + // Get max threads/block for device + int maxThreadsPerBlock; + 
CALL_CUDA(cudaDeviceGetAttribute(&maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, 0)); + + // Set threads/block based on device max + unsigned int threadsPerBlock; + if (maxThreadsPerBlock >= 512) { + threadsPerBlock = 512; + } else if (maxThreadsPerBlock >= 256) { + threadsPerBlock = 256; + } else if (maxThreadsPerBlock >= 128) { + threadsPerBlock = 128; + } else if (maxThreadsPerBlock >= 64) { + threadsPerBlock = 64; + } else if (maxThreadsPerBlock >= 32) { + threadsPerBlock = 32; + } else { + threadsPerBlock = 16; + } + + // Allocate memory for SFRD sum buffer and initialise to 0 only for initial filter step; + // reuse memory for remaining filter steps. + unsigned int numBlocks = ceil(num_pixels / (threadsPerBlock * 2)); + CALL_CUDA(cudaMalloc(d_ave_sfrd_buf, sizeof(double) * numBlocks)); // already pointer to a pointer (no & needed) + LOG_INFO("SFRD sum reduction buffer allocated on device."); + + // Initialise buffer to 0 (fill with byte=0) + CALL_CUDA(cudaMemset(*d_ave_sfrd_buf, 0, sizeof(double) * numBlocks)); // dereference the pointer to a pointer (*) + LOG_INFO("SFRD sum reduction buffer initialised to 0."); + + return threadsPerBlock; +} + +double calculate_sfrd_from_grid_gpu( + RGTable1D_f *SFRD_conditional_table, // input data + float *dens_R_grid, // input data + double *zpp_growth, // input data + int R_ct, // filter step/loop iteration/spherical annuli (out of 40 by default) + float *sfrd_grid, // star formation rate density grid to be updated + unsigned long long num_pixels, // length of input data + unsigned int threadsPerBlock, // computed in init function + float *d_y_arr, + float *d_dens_R_grid, + float *d_sfrd_grid, + double *d_ave_sfrd_buf +) { + // Get growth factor for current filter step + double zpp_growth_R_ct = zpp_growth[R_ct]; + + // Copy data from host to device + CALL_CUDA(cudaMemcpy(d_y_arr, SFRD_conditional_table->y_arr, sizeof(float) * SFRD_conditional_table->n_bin, cudaMemcpyHostToDevice)); + 
CALL_CUDA(cudaMemcpy(d_dens_R_grid, dens_R_grid, sizeof(float) * num_pixels, cudaMemcpyHostToDevice)); + LOG_INFO("SFRD_conditional_table.y_arr and density grid copied to device."); + + unsigned int numBlocks = ceil(num_pixels / (threadsPerBlock * 2)); + unsigned int smemSize = threadsPerBlock * sizeof(double); // shared memory + + // Invoke kernel + switch (threadsPerBlock) { + case 512: + compute_and_reduce<512><<< numBlocks, threadsPerBlock, smemSize >>>(SFRD_conditional_table->x_min, SFRD_conditional_table->x_width, d_y_arr, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels); + break; + case 256: + compute_and_reduce<256><<< numBlocks, threadsPerBlock, smemSize >>>(SFRD_conditional_table->x_min, SFRD_conditional_table->x_width, d_y_arr, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels); + break; + case 128: + compute_and_reduce<128><<< numBlocks, threadsPerBlock, smemSize >>>(SFRD_conditional_table->x_min, SFRD_conditional_table->x_width, d_y_arr, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels); + break; + case 64: + compute_and_reduce<64><<< numBlocks, threadsPerBlock, smemSize >>>(SFRD_conditional_table->x_min, SFRD_conditional_table->x_width, d_y_arr, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels); + break; + case 32: + compute_and_reduce<32><<< numBlocks, threadsPerBlock, smemSize >>>(SFRD_conditional_table->x_min, SFRD_conditional_table->x_width, d_y_arr, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels); + break; + default: + LOG_WARNING("Thread size invalid; defaulting to 256."); + compute_and_reduce<256><<< numBlocks, 256, 256 * sizeof(double) >>>(SFRD_conditional_table->x_min, SFRD_conditional_table->x_width, d_y_arr, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels); + } + CALL_CUDA(cudaGetLastError()); + // CALL_CUDA(cudaDeviceSynchronize()); // Only use during development + LOG_INFO("SpinTemperatureBox 
compute-and-reduce kernel called."); + + // Use thrust to reduce computed sums to one value. + // Wrap device pointer in a thrust::device_ptr + thrust::device_ptr d_ave_sfrd_buf_ptr(d_ave_sfrd_buf); + // Reduce final buffer sums to one value + double ave_sfrd_buf = thrust::reduce(d_ave_sfrd_buf_ptr, d_ave_sfrd_buf_ptr + numBlocks, 0., thrust::plus()); + CALL_CUDA(cudaGetLastError()); + // CALL_CUDA(cudaDeviceSynchronize()); // Only use during development + LOG_INFO("SFRD sum reduced to single value by thrust::reduce operation."); + + // Copy results from device to host + CALL_CUDA(cudaMemcpy(sfrd_grid, d_sfrd_grid, sizeof(float) * num_pixels, cudaMemcpyDeviceToHost)); + LOG_INFO("SFRD sum copied to host."); + + return ave_sfrd_buf; +} + +void free_sfrd_gpu_data( + float **d_y_arr, // copies of pointers to pointers + float **d_dens_R_grid, + float **d_sfrd_grid, + double **d_ave_sfrd_buf +) { + // Need to dereference the pointers to pointers (*) + CALL_CUDA(cudaFree(*d_y_arr)); + CALL_CUDA(cudaFree(*d_dens_R_grid)); + CALL_CUDA(cudaFree(*d_sfrd_grid)); + CALL_CUDA(cudaFree(*d_ave_sfrd_buf)); + LOG_INFO("Device memory freed."); +} diff --git a/src/py21cmfast/src/SpinTemperatureBox.h b/src/py21cmfast/src/SpinTemperatureBox.h index 2c607ff38..033ea2e99 100644 --- a/src/py21cmfast/src/SpinTemperatureBox.h +++ b/src/py21cmfast/src/SpinTemperatureBox.h @@ -1,8 +1,22 @@ #ifndef _SPINTEMP_H #define _SPINTEMP_H +// #include + #include "InputParameters.h" #include "OutputStructs.h" +#include "interpolation.h" +#include "scaling_relations.h" + +#ifdef __cplusplus +extern "C" { +#endif +// typedef struct sfrd_gpu_data { +// float *d_y_arr; +// float *d_dens_R_grid; +// float *d_sfrd_grid; +// double *d_ave_sfrd_buf; +// } sfrd_gpu_data; int ComputeTsBox(float redshift, float prev_redshift, float perturbed_field_redshift, short cleanup, PerturbedField *perturbed_field, XraySourceBox *source_box, @@ -11,4 +25,44 @@ int ComputeTsBox(float redshift, float prev_redshift, float 
perturbed_field_reds int UpdateXraySourceBox(HaloBox *halobox, double R_inner, double R_outer, int R_ct, XraySourceBox *source_box); +// pointers +// -------------------------------------------------------------------------------------------------------- +void calculate_sfrd_from_grid(int R_ct, float *dens_R_grid, float *Mcrit_R_grid, float *sfrd_grid, + float *sfrd_grid_mini, double *ave_sfrd, double *ave_sfrd_mini, + unsigned int threadsPerBlock, float *d_y_arr, float *d_dens_R_grid, + float *d_sfrd_grid, double *d_ave_sfrd_buf, + struct ScalingConstants *sc); + +unsigned int init_sfrd_gpu_data(float *dens_R_grid, float *sfrd_grid, unsigned long long num_pixels, + unsigned int nbins, float **d_y_arr, float **d_dens_R_grid, + float **d_sfrd_grid, double **d_ave_sfrd_buf); + +double calculate_sfrd_from_grid_gpu(RGTable1D_f *SFRD_conditional_table, float *dens_R_grid, + double *zpp_growth, int R_ct, float *sfrd_grid, + unsigned long long num_pixels, unsigned int threadsPerBlock, + float *d_y_arr, float *d_dens_R_grid, float *d_sfrd_grid, + double *d_ave_sfrd_buf, struct ScalingConstants *sc); + +void free_sfrd_gpu_data(float **d_y_arr, float **d_dens_R_grid, float **d_sfrd_grid, + double **d_ave_sfrd_buf); + +// wrap pointers in struct +// ------------------------------------------------------------------------------------------ void +// calculate_sfrd_from_grid(int R_ct, float *dens_R_grid, float *Mcrit_R_grid, float *sfrd_grid, +// float *sfrd_grid_mini, double *ave_sfrd, double *ave_sfrd_mini, +// unsigned int threadsPerBlock, const sfrd_gpu_data *d_data); + +// unsigned int init_sfrd_gpu_data(float *dens_R_grid, float *sfrd_grid, unsigned long long +// num_pixels, +// unsigned int nbins, sfrd_gpu_data *d_data); + +// double calculate_sfrd_from_grid_gpu(RGTable1D_f *SFRD_conditional_table, float *dens_R_grid, +// double *zpp_growth, int R_ct, float *sfrd_grid, unsigned long long num_pixels, +// unsigned int threadsPerBlock, const sfrd_gpu_data *d_data); + +// 
void free_sfrd_gpu_data(sfrd_gpu_data *d_data); + +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/Stochasticity.c b/src/py21cmfast/src/Stochasticity.c index a64e481c6..5610b4557 100644 --- a/src/py21cmfast/src/Stochasticity.c +++ b/src/py21cmfast/src/Stochasticity.c @@ -3,22 +3,29 @@ * other halo relations.*/ #include "Stochasticity.h" +#include #include #include #include #include #include +#include #include "Constants.h" +#include "HaloField.cuh" #include "InitialConditions.h" #include "InputParameters.h" #include "OutputStructs.h" +#include "Stochasticity.cuh" #include "cexcept.h" #include "cosmology.h" +#include "device_rng.cuh" #include "exceptions.h" #include "hmf.h" #include "indexing.h" +#include "interp_tables.cuh" #include "interp_tables.h" +#include "interpolation.h" #include "logger.h" #include "rng.h" // buffer size (per cell of arbitrary size) in the sampling function @@ -161,6 +168,19 @@ void stoc_set_consts_z(struct HaloSamplingConstants *const_struct, double redshi return; } +double get_max_nhalo(struct HaloSamplingConstants *const_struct, float *halo_masses, int size) { + int idx_max = cblas_isamax(size, halo_masses, 1); + float mass_max = halo_masses[idx_max]; + double ln_mm = log(mass_max); + double sigma_cond = EvaluateSigma(ln_mm); + double delta = get_delta_crit(matter_options_global->HMF, sigma_cond, const_struct->growth_in) / + const_struct->growth_in * const_struct->growth_out; + int n_exp = EvaluateNhalo(ln_mm, const_struct->growth_out, const_struct->lnM_min, + const_struct->lnM_max_tb, mass_max, sigma_cond, delta); + double expected_N = n_exp * mass_max; + return expected_N; +} + // set the constants which are calculated once per condition void stoc_set_consts_cond(struct HaloSamplingConstants *const_struct, double cond_val) { double m_exp, n_exp; @@ -860,7 +880,6 @@ int sample_halo_grids(gsl_rng **rng_arr, double redshift, float *dens_field, // sometimes halos are subtracted from the sample (set to zero) // we do 
not want to save these if (hm_buf[i] < simulation_options_global->SAMPLER_MIN_MASS) continue; - if (count >= arraysize_local) { out_of_buffer = true; continue; @@ -906,14 +925,14 @@ int sample_halo_grids(gsl_rng **rng_arr, double redshift, float *dens_field, nhalo_threads[threadnum] = count; } if (out_of_buffer) { - LOG_ERROR("Halo buffer overflow (allocated %d halos per thread)", arraysize_local); + LOG_ERROR("Halo buffer overflow (allocated %llu halos per thread)", arraysize_local); for (int n_t = 0; n_t < simulation_options_global->N_THREADS; n_t++) { - LOG_ERROR("Thread %d: %d halos", n_t, nhalo_threads[n_t]); + LOG_ERROR("Thread %d: %llu halos", n_t, nhalo_threads[n_t]); } LOG_ERROR( "If you expected to have an above average halo number try raising " "config['HALO_CATALOG_MEM_FACTOR']"); - Throw(ValueError); + Throw(ParallelError); } LOG_SUPER_DEBUG("Total dexm volume %.6e Total volume excluded %.6e (In units of HII_DIM cells)", @@ -951,126 +970,140 @@ int sample_halo_progenitors(gsl_rng **rng_arr, double z_in, double z_out, HaloFi double boxlen[3] = {simulation_options_global->BOX_LEN, simulation_options_global->BOX_LEN, BOXLEN_PARA}; - bool out_of_buffer = false; + // use cuda function if use_cuda is true + bool use_cuda = false; // pass this as a parameter later + if (use_cuda) { +#if CUDA_FOUND + // get parameters needed for sigma calculation + + RGTable1D_f *sigma_table = GetSigmaInterpTable(); + double x_min = sigma_table->x_min; + double x_width = sigma_table->x_width; + int sigma_bin = sigma_table->n_bin; + float *sigma_y_arr = sigma_table->y_arr; + // Create a copy of hs_constants for passing to cuda + struct HaloSamplingConstants d_hs_constants; + d_hs_constants = *hs_constants; + // get in halo data + float *halo_m = halofield_in->halo_masses; + float *halo_star_rng = halofield_in->star_rng; + float *halo_sfr_rng = halofield_in->sfr_rng; + float *halo_xray_rng = halofield_in->xray_rng; + int *halo_c = halofield_in->halo_coords; + + printf("Start 
cuda calculation for progenitors. "); + updateHaloOut(halo_m, halo_star_rng, halo_sfr_rng, halo_xray_rng, halo_c, nhalo_in, + sigma_y_arr, sigma_bin, x_min, x_width, d_hs_constants, arraysize_total, + halofield_out); + printf("End cuda calculation for progenitors. "); + +#else + LOG_ERROR("CUDA function updateHaloOut() called but code was not compiled for CUDA."); + Throw(ValueError); +#endif + } else { // CPU fallback + bool parallel_error = false; #pragma omp parallel num_threads(simulation_options_global->N_THREADS) - { - float prog_buf[MAX_HALO_CELL]; - int n_prog; - double M_prog; + { + float prog_buf[MAX_HALO_CELL]; + int n_prog; - double propbuf_in[3]; - double propbuf_out[3]; + double propbuf_in[3]; + double propbuf_out[3]; - int threadnum = omp_get_thread_num(); - double M2, R2, R1; - int jj; - unsigned long long int ii; - unsigned long long int count = 0; - unsigned long long int istart = threadnum * arraysize_local; - double pos_prog[3], pos_desc[3]; + int threadnum = omp_get_thread_num(); + double M2, R2, R1; + int jj; + unsigned long long int ii; + unsigned long long int count = 0; + unsigned long long int istart = threadnum * arraysize_local; + double pos_prog[3], pos_desc[3]; - // we need a private version - // also the naming convention should be better between structs/struct pointers - struct HaloSamplingConstants hs_constants_priv; - hs_constants_priv = *hs_constants; + // we need a private version + // also the naming convention should be better between structs/struct pointers + struct HaloSamplingConstants hs_constants_priv; + hs_constants_priv = *hs_constants; #pragma omp for - for (ii = 0; ii < nhalo_in; ii++) { - if (out_of_buffer) continue; - M2 = halofield_in->halo_masses[ii]; - R2 = MtoR(M2); - if (M2 < Mmin || M2 > Mmax_tb) { - LOG_ERROR( - "Input Mass = %.2e at %llu of %llu, something went wrong in the input " - "catalogue", - M2, ii, nhalo_in); - Throw(ValueError); - } - // set condition-dependent variables for sampling - 
stoc_set_consts_cond(&hs_constants_priv, M2); - - // Sample the CMF set by the descendant - stoc_sample(&hs_constants_priv, rng_arr[threadnum], &n_prog, prog_buf); - - propbuf_in[0] = halofield_in->star_rng[ii]; - propbuf_in[1] = halofield_in->sfr_rng[ii]; - propbuf_in[2] = halofield_in->xray_rng[ii]; - pos_desc[0] = halofield_in->halo_coords[3 * ii + 0]; - pos_desc[1] = halofield_in->halo_coords[3 * ii + 1]; - pos_desc[2] = halofield_in->halo_coords[3 * ii + 2]; - - // place progenitors in local list - M_prog = 0; - for (jj = 0; jj < n_prog; jj++) { - // sometimes halos are subtracted from the sample (set to zero) - // we do not want to save these - if (prog_buf[jj] < simulation_options_global->SAMPLER_MIN_MASS) continue; - - if (count >= arraysize_local) { - out_of_buffer = true; - continue; + for (ii = 0; ii < nhalo_in; ii++) { + if (parallel_error) continue; + M2 = halofield_in->halo_masses[ii]; + R2 = MtoR(M2); + if (M2 < Mmin || M2 > Mmax_tb) { + LOG_ERROR( + "Input Mass = %.2e at %llu of %llu, something went wrong in the input " + "catalogue", + M2, ii, nhalo_in); + parallel_error = true; + } + // set condition-dependent variables for sampling + stoc_set_consts_cond(&hs_constants_priv, M2); + + // Sample the CMF set by the descendant + stoc_sample(&hs_constants_priv, rng_arr[threadnum], &n_prog, prog_buf); + + propbuf_in[0] = halofield_in->star_rng[ii]; + propbuf_in[1] = halofield_in->sfr_rng[ii]; + propbuf_in[2] = halofield_in->xray_rng[ii]; + pos_desc[0] = halofield_in->halo_coords[3 * ii + 0]; + pos_desc[1] = halofield_in->halo_coords[3 * ii + 1]; + pos_desc[2] = halofield_in->halo_coords[3 * ii + 2]; + + // place progenitors in local list + for (jj = 0; jj < n_prog; jj++) { + // sometimes halos are subtracted from the sample (set to zero) + // we do not want to save these + if (prog_buf[jj] < simulation_options_global->SAMPLER_MIN_MASS) continue; + + if (parallel_error || count >= arraysize_local) { + parallel_error = true; + continue; + } + 
halofield_out->halo_masses[istart + count] = prog_buf[jj]; + + // Place the progenitor in a random position within the condition + // Such that a sphere of the progenitor's Lagrangian radius is placed + // entirely within the descendant's Lagrangian radius, + R1 = MtoR(prog_buf[jj]); + random_point_in_sphere(pos_desc, R2 - R1, rng_arr[threadnum], pos_prog); + wrap_position(pos_prog, boxlen); + + set_prop_rng(rng_arr[threadnum], true, corr_arr, propbuf_in, propbuf_out); + halofield_out->halo_coords[3 * (istart + count) + 0] = pos_prog[0]; + halofield_out->halo_coords[3 * (istart + count) + 1] = pos_prog[1]; + halofield_out->halo_coords[3 * (istart + count) + 2] = pos_prog[2]; + halofield_out->star_rng[istart + count] = propbuf_out[0]; + halofield_out->sfr_rng[istart + count] = propbuf_out[1]; + halofield_out->xray_rng[istart + count] = propbuf_out[2]; + count++; + if (ii == 0) { + LOG_ULTRA_DEBUG( + "Halo %d Prog %d: Mass %.2e Stellar %.2e SFR %.2e XRAY %.2e", ii, jj, + prog_buf[jj], propbuf_out[0], propbuf_out[1], propbuf_out[2]); + } } - - set_prop_rng(rng_arr[threadnum], true, corr_arr, propbuf_in, propbuf_out); - - halofield_out->halo_masses[istart + count] = prog_buf[jj]; - - // Place the progenitor in a random position within the condition - // Such that a sphere of the progenitor's Lagrangian radius is placed - // entirely within the descendant's Lagrangian radius, - R1 = MtoR(prog_buf[jj]); - random_point_in_sphere(pos_desc, R2 - R1, rng_arr[threadnum], pos_prog); - wrap_position(pos_prog, boxlen); - - halofield_out->halo_coords[3 * (istart + count) + 0] = pos_prog[0]; - halofield_out->halo_coords[3 * (istart + count) + 1] = pos_prog[1]; - halofield_out->halo_coords[3 * (istart + count) + 2] = pos_prog[2]; - halofield_out->star_rng[istart + count] = propbuf_out[0]; - halofield_out->sfr_rng[istart + count] = propbuf_out[1]; - halofield_out->xray_rng[istart + count] = propbuf_out[2]; - count++; - if (ii == 0) { - M_prog += prog_buf[jj]; - - LOG_ULTRA_DEBUG( - 
"First Halo Prog %d: Mass %.2e Stellar %.2e SFR %.2e XRAY %.2e e_d %.3f", - jj, prog_buf[jj], propbuf_out[0], propbuf_out[1], propbuf_out[2], - Deltac * hs_constants->growth_out / hs_constants->growth_in); + print_hs_consts(&hs_constants_priv); } } - if (ii == 0) { - LOG_ULTRA_DEBUG( - " HMF %d delta %.3f delta_coll %.3f delta_desc %.3f adjusted %.3f", - matter_options_global->HMF, hs_constants_priv.delta, - get_delta_crit(matter_options_global->HMF, hs_constants_priv.sigma_cond, - hs_constants->growth_out), - get_delta_crit(matter_options_global->HMF, hs_constants_priv.sigma_cond, - hs_constants->growth_in), - get_delta_crit(matter_options_global->HMF, hs_constants_priv.sigma_cond, - hs_constants->growth_in) * - hs_constants->growth_out / hs_constants->growth_in); - print_hs_consts(&hs_constants_priv); - LOG_SUPER_DEBUG( - "First Halo: Mass %.2f | N %d (exp. %.2e) | Total M %.2e (exp. %.2e)", M2, - n_prog, hs_constants_priv.expected_N, M_prog, hs_constants_priv.expected_M); + istart_threads[threadnum] = istart; + nhalo_threads[threadnum] = count; + } + if (parallel_error) { + LOG_ERROR("More than %llu halos (expected %.1e) with buffer size factor %.1f", + arraysize_local, arraysize_local / config_settings.HALO_CATALOG_MEM_FACTOR, + config_settings.HALO_CATALOG_MEM_FACTOR); + for (int n_t = 0; n_t < simulation_options_global->N_THREADS; n_t++) { + LOG_ERROR("Thread %d: %llu halos", n_t, nhalo_threads[n_t]); } + LOG_ERROR( + "If you expected to have an above average halo number try raising " + "config_settings.HALO_CATALOG_MEM_FACTOR"); + Throw(ParallelError); } - istart_threads[threadnum] = istart; - nhalo_threads[threadnum] = count; + condense_sparse_halolist(halofield_out, istart_threads, nhalo_threads); } - if (out_of_buffer) { - LOG_ERROR("Halo buffer overflow (allocated %d halos per thread)", arraysize_local); - for (int n_t = 0; n_t < simulation_options_global->N_THREADS; n_t++) { - LOG_ERROR("Thread %d: %d halos", n_t, nhalo_threads[n_t]); - } - LOG_ERROR( - 
"If you expected to have an above average halo number try raising " - "config['HALO_CATALOG_MEM_FACTOR']"); - Throw(ValueError); - } - condense_sparse_halolist(halofield_out, istart_threads, nhalo_threads); return 0; } @@ -1092,19 +1125,56 @@ int stochastic_halofield(unsigned long long int seed, float redshift_desc, float struct HaloSamplingConstants hs_constants; stoc_set_consts_z(&hs_constants, redshift, redshift_desc); + bool use_cuda = false; + if (use_cuda) { +#if CUDA_FOUND + // get interp tables needed for sampling progenitors + RGTable1D *nhalo_table = GetNhaloTable(); + RGTable1D *mcoll_table = GetMcollTable(); + RGTable2D *nhalo_inv_table = GetNhaloInvTable(); + // copy the tables to the device + copyTablesToDevice(*nhalo_table, *mcoll_table, *nhalo_inv_table); + + // copy global variables to the device + // todo: move the following operation to InitialConditions.c + updateGlobalParams(simulation_options_global, cosmo_params_global, astro_params_global); +#else + LOG_ERROR("CUDA function copyTablesToDevice called but code was not compiled for CUDA."); +#endif + } + // Fill them // NOTE:Halos prev in the first box corresponds to the large DexM halos if (redshift_desc <= 0.) { LOG_DEBUG("building first halo field at z=%.1f", redshift); sample_halo_grids(rng_stoc, redshift, dens_field, halo_overlap_box, halos_desc, halos, &hs_constants); + + if (use_cuda) { + // initiate rand states on the device +#if CUDA_FOUND + unsigned long long int nhalo_first = halos->n_halos; + int buffer_scale = HALO_CUDA_THREAD_FACTOR + 1; + unsigned long long int n_rstates = nhalo_first * buffer_scale; + printf("initializing %llu random states on the device... 
\n", n_rstates); + init_rand_states(seed, n_rstates); + + printf("finish initializing \n"); + + // todo: add a signal to free rand states once all iterations are done +#else + LOG_ERROR( + "CUDA function init_rand_states() called but code was not compiled for CUDA."); + Throw(ValueError); +#endif + } + } else { LOG_DEBUG("Calculating halo progenitors from z=%.1f to z=%.1f | %llu", redshift_desc, redshift, halos_desc->n_halos); sample_halo_progenitors(rng_stoc, redshift_desc, redshift, halos_desc, halos, &hs_constants); } - LOG_DEBUG("Found %llu Halos", halos->n_halos); if (halos->n_halos >= 3) { diff --git a/src/py21cmfast/src/Stochasticity.cu b/src/py21cmfast/src/Stochasticity.cu new file mode 100644 index 000000000..33947691a --- /dev/null +++ b/src/py21cmfast/src/Stochasticity.cu @@ -0,0 +1,1031 @@ +#include +#include + +#include +#include // host-side header file +#include // device-side header file + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Constants.h" +#include "interpolation_types.h" +#include "Stochasticity.h" + +#include "cuda_utils.cuh" +#include "Stochasticity.cuh" +#include "DeviceConstants.cuh" +#include "device_rng.cuh" +#include "hmf.cu" +#include "interp_tables.cu" + + + + +#ifndef MAX_DELTAC_FRAC +#define MAX_DELTAC_FRAC (float)0.99 // max delta/deltac for the mass function integrals +#endif + +#ifndef DELTA_MIN +#define DELTA_MIN -1 // minimum delta for Lagrangian mass function integrals +#endif + +#ifndef MAX_HALO_CELL +#define MAX_HALO_CELL (int)1e5 +#endif + +void validate_thrust() +{ + // Create a host vector with some values + thrust::host_vector h_vec(5); + h_vec[0] = 1; + h_vec[1] = 2; + h_vec[2] = 3; + h_vec[3] = 4; + h_vec[4] = 5; + + // Transfer data from host to device + thrust::device_vector d_vec = h_vec; + + // Calculate the sum of all elements in the device vector + int sum = thrust::reduce(d_vec.begin(), d_vec.end(), 0, thrust::plus()); + + // Print the result 
+ std::cout << "Sum is: " << sum << std::endl; // Should print "Sum is: 15" +} + +void condense_device_vector() +{ + // Step 1: Create a device vector with some elements, including -1 + thrust::device_vector d_vec(10); + d_vec[0] = 1; + d_vec[1] = -1; + d_vec[2] = 3; + d_vec[3] = -1; + d_vec[4] = 5; + d_vec[5] = 6; + d_vec[6] = -1; + d_vec[7] = 7; + d_vec[8] = -1; + d_vec[9] = 9; + + // Step 2: Use thrust::remove_if to remove all occurrences of -1 + thrust::device_vector::iterator new_end = thrust::remove(d_vec.begin(), d_vec.end(), -1); + + // Step 3: Resize the vector to remove the trailing elements after the "new_end" iterator + d_vec.erase(new_end, d_vec.end()); + + // Step 4: Copy the result to the host to check + thrust::host_vector h_vec = d_vec; + + // Step 5: Print the result + std::cout << "Condensed Vector: "; + for (size_t i = 0; i < h_vec.size(); i++) + { + std::cout << h_vec[i] << " "; + } + std::cout << std::endl; +} + +// int condenseDeviceArray(float *d_array, int original_size, float mask_value) +// { +// // Wrap the raw device pointer into a thrust device pointer +// thrust::device_ptr d_array_ptr(d_array); + +// // Remove elements with mask value +// // i.e.move elements not equal to mask value to the beginning of the array without changing order +// auto new_end = thrust::remove(d_array_ptr, d_array_ptr + original_size, mask_value); + +// // Calculate the number of valid elements +// int valid_size = new_end - d_array_ptr; + +// // Fill the remaining space with mask value +// thrust::fill(new_end, d_array_ptr + original_size, mask_value); + +// // Print results (on host side) +// // std::cout << "Valid elements count: " << valid_size << "\n"; +// return valid_size; +// } + +template +int condenseDeviceArray(T *d_array, int original_size, T mask_value) +{ + // Wrap the raw device pointer into a thrust device pointer + thrust::device_ptr d_array_ptr(d_array); + + // Remove elements with mask value + auto new_end = thrust::remove(d_array_ptr, 
d_array_ptr + original_size, mask_value); + + // Calculate the number of valid elements + int valid_size = new_end - d_array_ptr; + + // Fill the remaining space with mask value + thrust::fill(new_end, d_array_ptr + original_size, mask_value); + + return valid_size; +} + +// todo: maybe add python wrapper for test functions +void testCondenseDeviceArray() +{ + // Input data + float h_array[] = {1.0f, 0.0f, 2.0f, 3.0f, 0.0f, 4.0f}; + float mask_value = 0.0f; + int original_size = 6; + + // Expected outputs + float expected_array[] = {1.0f, 2.0f, 3.0f, 4.0f, 0.0f, 0.0f}; + int expected_valid_size = 4; + + // Allocate and copy to device + float *d_array; + cudaMalloc(&d_array, original_size * sizeof(float)); + cudaMemcpy(d_array, h_array, original_size * sizeof(float), cudaMemcpyHostToDevice); + + // Call the function from Stochasticity.cu + int valid_size = condenseDeviceArray(d_array, original_size, mask_value); + + // Copy the results back to the host + float h_result[original_size]; + cudaMemcpy(h_result, d_array, original_size * sizeof(float), cudaMemcpyDeviceToHost); + + // Validate the results + assert(valid_size == expected_valid_size); + for (int i = 0; i < original_size; ++i) + { + assert(h_result[i] == expected_array[i]); + } + + std::cout << "Test passed: condenseDeviceArray\n"; + + // Free device memory + cudaFree(d_array); +} + +// todo: add more tests to check with large number of input; fix the type mismatch (int, ull) +int filterWithMask(float *d_data, int *d_mask, int original_size) +{ + // Wrap the raw pointers into thrust device pointers + thrust::device_ptr d_data_ptr(d_data); + thrust::device_ptr d_mask_ptr(d_mask); + + // Use the mask to select only elements that correspond to a value of 1 in the mask + auto end = thrust::copy_if(d_data_ptr, d_data_ptr + original_size, d_mask_ptr, d_data_ptr, thrust::identity()); + + // Calculate the new valid size after filtering + int valid_size = end - d_data_ptr; + + // Optionally, print the number of valid 
elements + // std::cout << "Valid elements count: " << valid_size << "\n"; + + return valid_size; +} + +void testFilterWithMask() +{ + // Input arrays + float h_data[] = {1.1f, 2.2f, 3.3f, 4.4f, 5.5f}; // Input data + int h_mask[] = {1, 0, 1, 0, 1}; // Mask array + int original_size = 5; + + // Expected outputs + float expected_data[] = {1.1f, 3.3f, 5.5f}; // Expected filtered data + int expected_size = 3; // Number of valid elements + + // Allocate device memory + float *d_data; + int *d_mask; + cudaMalloc(&d_data, original_size * sizeof(float)); + cudaMalloc(&d_mask, original_size * sizeof(int)); + + // Copy data to device + cudaMemcpy(d_data, h_data, original_size * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_mask, h_mask, original_size * sizeof(int), cudaMemcpyHostToDevice); + + // Call the function + int valid_size = filterWithMask(d_data, d_mask, original_size); + + // Copy the filtered data back to host + float h_result[original_size]; + cudaMemcpy(h_result, d_data, original_size * sizeof(float), cudaMemcpyDeviceToHost); + + // Validate the size of the filtered array + assert(valid_size == expected_size); + + // Validate the filtered elements + for (int i = 0; i < valid_size; ++i) + { + assert(h_result[i] == expected_data[i]); + } + + // Print success message + std::cout << "Test passed: filterWithMask\n"; + + // Free device memory + cudaFree(d_data); + cudaFree(d_mask); +} + +void countElements(const int *array, int size, const std::vector &values_to_count) +{ + // Initialize a frequency array to count occurrences + int count[values_to_count.size()] = {0}; + + // Iterate through the input array + for (int i = 0; i < size; ++i) + { + // Find the index of the value in values_to_count + for (size_t j = 0; j < values_to_count.size(); ++j) + { + if (array[i] == values_to_count[j]) + { + count[j]++; + break; + } + } + } + + // Print the results + for (size_t i = 0; i < values_to_count.size(); ++i) + { + std::cout << "Value " << values_to_count[i] << ": 
" << count[i] << " occurrences\n"; + } +} + +// decide the number of sparsity +int getSparsity(int n_buffer, int n_halo){ + if (n_halo > 0){ + int power = floor(log2(n_buffer / n_halo)); + int sparsity = 1 << power; + return sparsity; + } + else{ + return -1; + } + +} + +// initialize device array with given value +void initializeArray(int *d_array, int n_elements, int value){ + thrust::device_ptr d_array_ptr(d_array); + thrust::fill(d_array_ptr, d_array_ptr + n_elements, value); +} + +// void getKernelAttr(){ +// cudaFuncAttributes attr; +// cudaFuncGetAttributes(&attr, myKernel); +// printf("Kernel Shared Memory per Block: %zu bytes\n", attr.sharedSizeBytes); +// printf("Kernel Registers per Thread: %d\n", attr.numRegs); +// printf("Kernel Max Threads per Block: %d\n", attr.maxThreadsPerBlock); +// } + +struct GridLayout{ + int n_threads; + int n_blocks; +}; +// calculate workload +// todo: add more checks on sparsity +GridLayout getWorkload(int sparsity, unsigned long long int n_halos){ + GridLayout res; + int n_threads, n_blocks; + if (sparsity != 0 && 256 % sparsity == 0){ + n_threads = 256; + } + else { + n_threads = std::min(sparsity,512); + } + res.n_threads = n_threads; + n_blocks = (n_halos * sparsity + n_threads -1)/n_threads; + res.n_blocks = n_blocks; + return res; +} + +// 11-30: the following implementation works (before using any global params on gpu) +__device__ void stoc_set_consts_cond(struct HaloSamplingConstants *const_struct, float cond_val, int HMF, double x_min, double x_width, float *d_y_arr, int n_bin, double *expected_mass) +{ + double m_exp, n_exp; + // Here the condition is a mass, volume is the Lagrangian volume and delta_l is set by the + // redshift difference which represents the difference in delta_crit across redshifts + if (const_struct->from_catalog){ + const_struct->M_cond = cond_val; + const_struct->lnM_cond = log(cond_val); + const_struct->sigma_cond = EvaluateSigma(const_struct->lnM_cond, x_min, x_width, d_y_arr, n_bin); 
//todo: update this function using global tables in constant memory + // mean stellar mass of this halo mass, used for stellar z correlations + const_struct->cond_val = const_struct->lnM_cond; + // condition delta is the previous delta crit + const_struct->delta = get_delta_crit(HMF, const_struct->sigma_cond, const_struct->growth_in) / const_struct->growth_in * const_struct->growth_out; //todo: update this function using global variables in constant memory + } + // Here the condition is a cell of a given density, the volume/mass is given by the grid parameters + else + { + // since the condition mass/sigma is already set all we need is delta + const_struct->delta = cond_val; + const_struct->cond_val = cond_val; + } + // Get expected N and M from interptables + // the splines don't work well for cells above Deltac, but there CAN be cells above deltac, since this calculation happens + // before the overlap, and since the smallest dexm mass is M_cell*(1.01^3) there *could* be a cell above Deltac not in a halo + // NOTE: all this does is prevent integration errors below since these cases are also dealt with in stoc_sample + if (const_struct->delta > MAX_DELTAC_FRAC * get_delta_crit(d_matter_options.HMF, const_struct->sigma_cond, const_struct->growth_out)){ + const_struct->expected_M = const_struct->M_cond; + const_struct->expected_N = 1; + } + else if (const_struct->delta <= DELTA_MIN){ + const_struct->expected_M = 0; + const_struct->expected_N = 0; + } + else + { + n_exp = EvaluateNhalo(const_struct->cond_val, const_struct->growth_out, const_struct->lnM_min, + const_struct->lnM_max_tb, const_struct->M_cond, const_struct->sigma_cond, const_struct->delta); + m_exp = EvaluateMcoll(const_struct->cond_val, const_struct->growth_out, const_struct->lnM_min, + const_struct->lnM_max_tb, const_struct->M_cond, const_struct->sigma_cond, const_struct->delta); + const_struct->expected_N = n_exp * const_struct->M_cond; + const_struct->expected_M = m_exp * const_struct->M_cond; + } + 
*expected_mass = const_struct->expected_M; + return; +} + +__device__ double sample_dndM_inverse(double condition, struct HaloSamplingConstants *hs_constants, curandState *state) +{ + double p_in, result; + p_in = curand_uniform_double(state); + // printf("curand uniform random number: %f\n", p_in); + result = EvaluateNhaloInv(condition, p_in); + result = fmin(1.0, fmax(0.0, result)); // clip in case of extrapolation + result = result * hs_constants->M_cond; + return result; +} + +__device__ double remove_random_halo(curandState *state, int n_halo, int *idx, float *M_prog, float *M_out){ + double last_M_del; + int random_idx; + do { + random_idx = (int)(curand_uniform(state) * n_halo); + } while (M_out[random_idx] == 0.0f); + last_M_del = M_out[random_idx]; + *M_prog -= last_M_del; + M_out[random_idx] = 0.0f; // -1 mass halos are skipped and not counted + + *idx = random_idx; + return last_M_del; +} + +__device__ void fix_mass_sample(curandState *state, double exp_M, float *M_prog, float *M_out, int write_limit, int *n_prog){ + // Keep the last halo if it brings us closer to the expected mass + // This is done by addition or subtraction over the limit to balance + // the bias of the last halo being larger + int random_idx; + double last_M_del; + int sel = curand(state) % 2; + // int sel = 1; //tmp: implement the first case + if (sel) + { + if (fabs(*M_prog - M_out[write_limit] - exp_M) < fabs(*M_prog - exp_M)) + { + // *M_tot_pt -= M_out[*n_halo_pt - 1]; + // here we remove by setting the counter one lower so it isn't read + M_out[write_limit] = 0.0f; + (*n_prog)--; + } + } + else + { + do { + // here we remove by setting halo mass to zero, skipping it during the consolidation + last_M_del = remove_random_halo(state, write_limit+1, &random_idx, M_prog, M_out); + } while (*M_prog > exp_M); + + // if the sample with the last subtracted halo is closer to the expected mass, keep it + // LOG_ULTRA_DEBUG("Deciding to keep last halo M %.3e tot %.3e exp 
%.3e",last_M_del,*M_tot_pt,exp_M); + if (fabs(*M_prog + last_M_del - exp_M) < fabs(*M_prog - exp_M)) + { + M_out[random_idx] = last_M_del; + *M_prog += last_M_del; + } + + } +} + +__device__ int stoc_mass_sample(struct HaloSamplingConstants *hs_constants, curandState *state, float *M_out){ + double exp_M = hs_constants->expected_M; + + // The mass-limited sampling as-is has a slight bias to producing too many halos, + // which is independent of density or halo mass, + // this factor reduces the total expected mass to bring it into line with the CMF + // exp_M *= user_params_global->HALOMASS_CORRECTION; + exp_M *= d_matter_options.HALOMASS_CORRECTION; + + // int n_halo_sampled = 0; + // double M_prog = 0; + // double M_sample; + + double tbl_arg = hs_constants->cond_val; + + // tmp (start) + double M_sample = sample_dndM_inverse(tbl_arg, hs_constants, state); + + // M_prog += M_sample; + // tmp (end) + + // while (M_prog < exp_M){ + // M_sample = sample_dndM_inverse(tbl_arg, hs_constants, state); + + // M_prog += M_sample; + // M_out[n_halo_sampled++] = M_sample; + // } + // todo: enable fix_mass_sample + // The above sample is above the expected mass, by up to 100%. 
I wish to make the average mass equal to exp_M + // fix_mass_sample(state, exp_M, &n_halo_sampled, &M_prog, M_out); + + // *n_halo_out = n_halo_sampled; + // if (M_prog < exp_M){ + // *further_process = 1; + // return 1; + // } + *M_out = M_sample; + return 0; +} + +__device__ int stoc_sample(struct HaloSamplingConstants *hs_constants, curandState *state, float *M_out, int *sampleCondition){ + // TODO: really examine the case for number/mass sampling + // The poisson sample fails spectacularly for high delta (from_catalogs or dense cells) + // and excludes the correlation between number and mass (e.g many small halos or few large ones) + // The mass sample underperforms at low exp_M/M_max by excluding stochasticity in the total collapsed fraction + // and excluding larger halos (e.g if exp_M is 0.1*M_max we can effectively never sample the large halos) + // i.e there is some case for a delta cut between these two methods however I have no intuition for the exact levels + + int err; + + // If the expected mass is below our minimum saved mass, don't bother calculating + // NOTE: some of these conditions are redundant with set_consts_cond() + if (hs_constants->delta <= DELTA_MIN || hs_constants->expected_M < d_simulation_options.SAMPLER_MIN_MASS) + { + // *n_halo_out = 0; + *sampleCondition = 0; + return 0; + } + // if delta is above critical, form one big halo + if (hs_constants->delta >= MAX_DELTAC_FRAC * get_delta_crit(d_matter_options.HMF, hs_constants->sigma_cond, hs_constants->growth_out)){ + // *n_halo_out = 1; + + // Expected mass takes into account potential dexm overlap + *M_out = hs_constants->expected_M; + *sampleCondition = 1; + return 0; + } + + // todo: implement callee functions for SAMPLE_METHOD (1,2,3) + // We always use Number-Limited sampling for grid-based cases + if (d_matter_options.SAMPLE_METHOD == 1 || !hs_constants->from_catalog) + { + // err = stoc_halo_sample(hs_constants, rng, n_halo_out, M_out); + return 0; + } + else if 
(d_matter_options.SAMPLE_METHOD == 0) + { + err = stoc_mass_sample(hs_constants, state, M_out); + } + else if (d_matter_options.SAMPLE_METHOD == 2) + { + // err = stoc_partition_sample(hs_constants, rng, n_halo_out, M_out); + return 0; + } + else if (d_matter_options.SAMPLE_METHOD == 3) + { + // err = stoc_split_sample(hs_constants, rng, n_halo_out, M_out); + return 0; + } + else + { + printf("Invalid sampling method \n"); + return 0; + // todo: check how to throw error in cuda + // LOG_ERROR("Invalid sampling method"); + // Throw(ValueError); + } + // if (*n_halo_out > MAX_HALO_CELL) + // { + // printf("too many halos in conditin, buffer overflow\n"); + // // todo: check how to throw error in cuda + // // LOG_ERROR("too many halos in condition, buffer overflow"); + // // Throw(ValueError); + // } + return err; +} + +// todo: implement condense_sparse_halolist +// // todo: just copied the original function here, need to verify it works with cuda +// __device__ void condense_sparse_halolist(HaloField *halofield, unsigned long long int *istart_threads, unsigned long long int *nhalo_threads) +// { +// int i = 0; +// unsigned long long int count_total = 0; +// for (i = 0; i < user_params_global->N_THREADS; i++) +// { +// memmove(&halofield->halo_masses[count_total], &halofield->halo_masses[istart_threads[i]], sizeof(float) * nhalo_threads[i]); +// memmove(&halofield->star_rng[count_total], &halofield->star_rng[istart_threads[i]], sizeof(float) * nhalo_threads[i]); +// memmove(&halofield->sfr_rng[count_total], &halofield->sfr_rng[istart_threads[i]], sizeof(float) * nhalo_threads[i]); +// memmove(&halofield->xray_rng[count_total], &halofield->xray_rng[istart_threads[i]], sizeof(float) * nhalo_threads[i]); +// memmove(&halofield->halo_coords[3 * count_total], &halofield->halo_coords[3 * istart_threads[i]], sizeof(int) * 3 * nhalo_threads[i]); +// LOG_SUPER_DEBUG("Moved array (start,count) (%llu, %llu) to position %llu", istart_threads[i], nhalo_threads[i], count_total); 
+// count_total += nhalo_threads[i]; +// } +// halofield->n_halos = count_total; + +// // replace the rest with zeros for clarity +// memset(&halofield->halo_masses[count_total], 0, (halofield->buffer_size - count_total) * sizeof(float)); +// memset(&halofield->halo_coords[3 * count_total], 0, 3 * (halofield->buffer_size - count_total) * sizeof(int)); +// memset(&halofield->star_rng[count_total], 0, (halofield->buffer_size - count_total) * sizeof(float)); +// memset(&halofield->sfr_rng[count_total], 0, (halofield->buffer_size - count_total) * sizeof(float)); +// memset(&halofield->xray_rng[count_total], 0, (halofield->buffer_size - count_total) * sizeof(float)); +// LOG_SUPER_DEBUG("Set %llu elements beyond %llu to zero", halofield->buffer_size - count_total, count_total); +// } + +__device__ void set_prop_rng(curandState *state, bool from_catalog, double *interp, float *input, float *output) +{ + float rng_star, rng_sfr, rng_xray; + + // Correlate properties by interpolating between the sampled and descendant gaussians + rng_star = d_astro_params.SIGMA_STAR > 0. ? curand_normal(state) : 0.; + rng_sfr = d_astro_params.SIGMA_SFR_LIM > 0. ? curand_normal(state) : 0.; + rng_xray = d_astro_params.SIGMA_LX > 0. ? 
curand_normal(state) : 0.; + + if (from_catalog) + { + // this transforms the sample to one from the multivariate Gaussian, conditioned on the first sample + rng_star = sqrt(1 - interp[0] * interp[0]) * rng_star + interp[0] * input[0]; + rng_sfr = sqrt(1 - interp[1] * interp[1]) * rng_sfr + interp[1] * input[1]; + rng_xray = sqrt(1 - interp[2] * interp[2]) * rng_xray + interp[2] * input[2]; + } + + output[0] = rng_star; + output[1] = rng_sfr; + output[2] = rng_xray; + return; +} + +__global__ void update_halo_constants(float *d_halo_masses, float *d_star_rng_in, float *d_sfr_rng_in, float *d_xray_rng_in, + int *d_halo_coords_in, float *d_y_arr, double x_min, double x_width, + unsigned long long int n_halos, int n_bin, struct HaloSamplingConstants d_hs_constants, + int HMF, + float *d_halo_masses_out, float *d_star_rng_out, + float *d_sfr_rng_out, float *d_xray_rng_out, int *d_halo_coords_out, int *d_sum_check, + int *d_further_process, int *d_nprog_predict, int sparsity, unsigned long long int write_offset, + double *expected_mass, int *d_n_prog, int offset_shared) +{ + // Define shared memory for block-level reduction + extern __shared__ float shared_memory[]; + // __shared__ float shared_mass[256]; + + // partition shared memory + float *shared_mass = shared_memory; + float *shared_prop_rng = shared_memory + offset_shared; + + // get local thread idx + int tid = threadIdx.x; + + // initialize shared_mass + shared_mass[tid] = 0.0f; + + // initialize shared_prop_rng + for (int i=0;i<3;i++){ + shared_prop_rng[tid+i*offset_shared] = 0.0f; + } + + + // get global thread idx + int ind = blockIdx.x * blockDim.x + threadIdx.x; + + // get halo idx + int hid = ind / sparsity; + if (hid >= n_halos) + { + // printf("Out of halo range.\n"); + return; + } + + // get halo mass + float M = d_halo_masses[hid]; + + // get stoc properties from in halo + float prop_in[3] = {d_star_rng_in[hid], d_sfr_rng_in[hid], d_xray_rng_in[hid]}; + + // get correction + double corr_arr[3] = 
{d_hs_constants.corr_star, d_hs_constants.corr_sfr, d_hs_constants.corr_xray}; + + // get coordinate + int coords_in[3] = {d_halo_coords_in[hid*3], d_halo_coords_in[hid*3+1], d_halo_coords_in[hid*3+2]}; + + // idx of d_halo_masses_out and other halo field arrays + int out_id = write_offset + ind; + + // set condition-dependent variables for sampling + stoc_set_consts_cond(&d_hs_constants, M, HMF, x_min, x_width, d_y_arr, n_bin, &expected_mass[hid]); + // if (hid == 1){ + // printf("check here. \n"); + // } + + // if (hid == 2){ + // printf("check here. \n"); + // } + + // tmp: just to verify the tables have been copied correctly + // if (ind == 0) + // { + // printf("The first element of Nhalo y_arr: %e (%e) \n", d_Nhalo_yarr[0], d_Nhalo_table.y_arr[0]); + // printf("The nhalo table n_bin: %d\n", d_Nhalo_table.n_bin); + // printf("The nhalo_inv table nx_bin: %d\n", d_Nhalo_inv_table.nx_bin); + // printf("HII_DIM: %d \n", d_user_params.HII_DIM); + // printf("test params: %f \n", d_test_params); + // printf("A_VCB: %f \n", d_astro_params.A_VCB); + // printf("SIGMA_8: %f \n", d_cosmo_params.SIGMA_8); + // printf("number of rng states: %d\n", g_numRNGStates); + // // tiger tmp: debug (start) + // double res1, res2, res3, res4; + // res1 = EvaluateNhaloInv(18.694414138793945, 0.0046723012881037529); + // printf("tmp res1 on gpu: %.17f \n", res1); + // res2 = EvaluateNhaloInv(20.084152221679688, 0.32153863360286256); + // printf("tmp res2 on gpu: %.17f \n", res2); + // res3 = EvaluateNhaloInv(26.806314468383789, 0.8698794976081996); + // printf("tmp res3 on gpu: %.17f \n", res3); + // res4 = EvaluateNhaloInv(19.00053596496582, 0.83130413049947305); + // printf("tmp res4 on gpu: %.17f \n", res4); + // // tiger tmp: debug (end) + // } + + curandState local_state = d_randStates[ind]; + // if (blockIdx.x > 100000){ + // // printf("check here. 
\n"); + // } + // tmp: for validation only + // sample_dndM_inverse(0.38, &d_hs_constants, &local_state); + // int tmp1 = 20; + // double tmp2 = 681273355217.0; + // float tmp3 = 101976856.0; + // remove_random_halo(&local_state, 59, &tmp1, &tmp2, &tmp3); + + // check sample condition + // condition 0: no sampling; condition 1: use expected_M; condition 2: sampling + int sampleCondition = 2; + stoc_sample(&d_hs_constants, &local_state, &shared_mass[tid], &sampleCondition); + + // get stochastic halo properties + set_prop_rng(&local_state, true, corr_arr, prop_in, &shared_prop_rng[tid*3]); + + + + __syncthreads(); + + if (tid % sparsity == 0){ + if (sampleCondition == 0){ + d_n_prog[hid] = 0; + } + if (sampleCondition == 1){ + if(shared_mass[tid] >= d_simulation_options.SAMPLER_MIN_MASS){ + d_halo_masses_out[out_id] = shared_mass[tid]; + d_n_prog[hid] = 1; + d_star_rng_out[out_id] = shared_prop_rng[3 * tid]; + d_sfr_rng_out[out_id] = shared_prop_rng[3 * tid + 1]; + d_xray_rng_out[out_id] = shared_prop_rng[3 * tid + 2]; + d_halo_coords_out[out_id*3] = coords_in[0]; + d_halo_coords_out[out_id*3+1] = coords_in[1]; + d_halo_coords_out[out_id*3+2] = coords_in[2]; + + } + } + if (sampleCondition == 2){ + float Mprog = 0.0; + int write_limit = 0; + int meetCondition = 0; + + for (int i = 0; i < sparsity; ++i){ + Mprog += shared_mass[tid + i]; + if (Mprog >= d_hs_constants.expected_M) + { + write_limit = i; + meetCondition = 1; + break; + } + } + + if (meetCondition){ + // correct the mass samples + int n_prog = write_limit +1; + + fix_mass_sample(&local_state, d_hs_constants.expected_M, &Mprog, &shared_mass[tid], write_limit, &n_prog); + + // record number of progenitors + d_n_prog[hid] = min(100,n_prog); + + for (int i = 0; i < write_limit + 1; ++i) + { + if(shared_mass[tid + i] < d_simulation_options.SAMPLER_MIN_MASS) continue; + // write the final mass sample to array in global memory + d_halo_masses_out[out_id + i] = shared_mass[tid + i]; + d_star_rng_out[out_id + i] = 
shared_prop_rng[3*(tid +i)]; + d_sfr_rng_out[out_id + i] = shared_prop_rng[3*(tid+i) + 1]; + d_xray_rng_out[out_id + i] = shared_prop_rng[3*(tid+i) + 2]; + d_halo_coords_out[(out_id+i) * 3] = coords_in[0]; + d_halo_coords_out[(out_id+i) * 3 + 1] = coords_in[1]; + d_halo_coords_out[(out_id+i) * 3 + 2] = coords_in[2]; + } + } + else{ + d_further_process[hid] = 1; + d_nprog_predict[hid] = ceil(d_hs_constants.expected_M * sparsity / Mprog); + + } + } + } + + // Perform reduction within the block + // for (int stride = blockDim.x / 2; stride > 0; stride /= 2) + // { + // if (tid < stride) + // { + // shared_check[tid] += shared_check[tid + stride]; + // } + // __syncthreads(); // Ensure all threads have completed each stage of reduction + // } + + // Write the result from each block to the global sum + // if (tid == 0) + // { + // atomicAdd(d_sum_check, shared_check[0]); + // } + + // Sample the CMF set by the descendant + // stoc_sample(&hs_constants, &local_state, &n_prog, prog_buf); + + // double sigma = EvaluateSigma(log(M), x_min, x_width, d_y_arr, n_bin); + // double delta = get_delta_crit(HMF, sigma, d_hs_constants.growth_in)\ + // / d_hs_constants.growth_in * d_hs_constants.growth_out; + + d_randStates[ind] = local_state; + return; +} + +// function to launch kernel grids +int updateHaloOut(float *halo_masses, float *star_rng, float *sfr_rng, float *xray_rng, int *halo_coords, + unsigned long long int n_halos, float *y_arr, int n_bin_y, double x_min, double x_width, + struct HaloSamplingConstants hs_constants, unsigned long long int n_buffer, HaloField *halofield_out) +{ + // allocate memory and copy halo data to the device (halo in) + size_t size_halo = sizeof(float) * n_halos; + float *d_halo_masses; + CALL_CUDA(cudaMalloc(&d_halo_masses, size_halo)); + CALL_CUDA(cudaMemcpy(d_halo_masses, halo_masses, size_halo, cudaMemcpyHostToDevice)); + + float *d_star_rng; + CALL_CUDA(cudaMalloc(&d_star_rng, size_halo)); + CALL_CUDA(cudaMemcpy(d_star_rng, star_rng, 
size_halo, cudaMemcpyHostToDevice)); + + float *d_sfr_rng; + CALL_CUDA(cudaMalloc(&d_sfr_rng, size_halo)); + CALL_CUDA(cudaMemcpy(d_sfr_rng, sfr_rng, size_halo, cudaMemcpyHostToDevice)); + + float *d_xray_rng; + CALL_CUDA(cudaMalloc(&d_xray_rng, size_halo)); + CALL_CUDA(cudaMemcpy(d_xray_rng, xray_rng, size_halo, cudaMemcpyHostToDevice)); + + int *d_halo_coords; + size_t size_halo_coords = 3 * sizeof(int) * n_halos; + CALL_CUDA(cudaMalloc(&d_halo_coords, size_halo_coords)); + CALL_CUDA(cudaMemcpy(d_halo_coords, halo_coords, size_halo_coords, cudaMemcpyHostToDevice)); + + // allocate memory and copy y_arr of sigma_table to the device + size_t size_yarr = sizeof(float) * n_bin_y; + float *d_y_arr; + CALL_CUDA(cudaMalloc(&d_y_arr, size_yarr)); + CALL_CUDA(cudaMemcpy(d_y_arr, y_arr, size_yarr, cudaMemcpyHostToDevice)); + + // allocate memory for d_check_sum (tmp) + int *d_sum_check; + CALL_CUDA(cudaMalloc((void **)&d_sum_check, sizeof(int))); + CALL_CUDA(cudaMemset(d_sum_check, 0, sizeof(int))); + + // allocate memory to store list of halo index need further process + int *d_further_process; + CALL_CUDA(cudaMalloc(&d_further_process, sizeof(int)*n_halos)); + CALL_CUDA(cudaMemset(d_further_process, 0, sizeof(int)*n_halos)); + + // allocate memory to store number of progenitors per halo + int *d_n_prog; + CALL_CUDA(cudaMalloc(&d_n_prog, sizeof(int) * n_halos)); + initializeArray(d_n_prog, n_halos, 32); + + // allocate memory to store estimated n_prog after the first kernel launch + int *d_nprog_predict; + CALL_CUDA(cudaMalloc(&d_nprog_predict, sizeof(int) * n_halos)); + CALL_CUDA(cudaMemset(d_nprog_predict, 0, sizeof(int) * n_halos)); + + // tmp: check expected_M + double *d_expected_mass, *h_expected_mass; + CALL_CUDA(cudaMalloc(&d_expected_mass, sizeof(double) * n_halos)); + CALL_CUDA(cudaMemset(d_expected_mass, 0, sizeof(double) * n_halos)); + CALL_CUDA(cudaHostAlloc((void **)&h_expected_mass, sizeof(double) * n_halos, cudaHostAllocDefault)); + + // get parameters 
needed by the kernel + int HMF = user_params_global->HMF; + + // set buffer size (hard-coded) + int scale = 5; + size_t d_n_buffer = n_halos * scale; + size_t buffer_size = sizeof(float) * d_n_buffer; + + // allocate memory for out halos (just allocate once at each call of this grid launch function) + float *d_halo_masses_out; + CALL_CUDA(cudaMalloc(&d_halo_masses_out, buffer_size)); + CALL_CUDA(cudaMemset(d_halo_masses_out, 0, buffer_size)); + // initializeArray(d_halo_masses_out, d_n_buffer, -1.2f); + + float *d_star_rng_out; + CALL_CUDA(cudaMalloc(&d_star_rng_out, buffer_size)); + CALL_CUDA(cudaMemset(d_star_rng_out, 0, buffer_size)); + // initializeArray(d_halo_masses_out, d_n_buffer, -1.2f); + + float *d_sfr_rng_out; + CALL_CUDA(cudaMalloc(&d_sfr_rng_out, buffer_size)); + CALL_CUDA(cudaMemset(d_sfr_rng_out, 0, buffer_size)); + + float *d_xray_rng_out; + CALL_CUDA(cudaMalloc(&d_xray_rng_out, buffer_size)); + CALL_CUDA(cudaMemset(d_xray_rng_out, 0, buffer_size)); + + int *d_halo_coords_out; + CALL_CUDA(cudaMalloc(&d_halo_coords_out, sizeof(int) * d_n_buffer * 3)); + initializeArray(d_halo_coords_out, d_n_buffer * 3, -1000); + + // initiate n_halo check + // unsigned long long int n_halo_check = n_halos; + + // initiate offset for writing output data + unsigned long long int write_offset = 0; + + // initialize n filter halo + unsigned long long int n_halos_tbp = n_halos; + + // initialize number of progenitors processed + unsigned long long int n_processed_prog; + + // todo: add the following to debug + cudaFuncAttributes attr; + cudaFuncGetAttributes(&attr, update_halo_constants); + // printf("Kernel Shared Memory per Block: %zu bytes\n", attr.sharedSizeBytes); + // printf("Kernel Registers per Thread: %d\n", attr.numRegs); + // printf("Kernel Max Threads per Block: %d\n", attr.maxThreadsPerBlock); + + // start with 4 threads work with one halo + int sparsity = 4; + + // Check if sparsity is smaller than scale + if (sparsity >= scale) + { + throw 
std::runtime_error("'sparsity' must be smaller than 'scale'."); + } + + // initial kernel grid + GridLayout grids = getWorkload(sparsity, n_halos); + + // launch kernel grid + while (n_halos_tbp > 0){ + size_t shared_size = grids.n_threads * sizeof(float) * 4; + int offset_shared = grids.n_threads; + printf("start launching kernel function.\n"); + update_halo_constants<<>>(d_halo_masses, d_star_rng, d_sfr_rng, d_xray_rng, d_halo_coords, + d_y_arr, x_min, x_width, n_halos_tbp, n_bin_y, hs_constants, HMF, d_halo_masses_out, d_star_rng_out, + d_sfr_rng_out, d_xray_rng_out, d_halo_coords_out, d_sum_check, d_further_process, d_nprog_predict, sparsity, write_offset, d_expected_mass, + d_n_prog, offset_shared); + + // Check kernel launch errors + CALL_CUDA(cudaGetLastError()); + + CALL_CUDA(cudaDeviceSynchronize()); + + // filter device halo masses in-place + n_halos_tbp = filterWithMask(d_halo_masses, d_further_process, n_halos_tbp); + printf("The number of halos for further processing: %d \n", n_halos_tbp); + + // // tmp 2025-01-19: check d_halo_masses_out writing out + // float *h_halo_masses_out_check; + // CALL_CUDA(cudaHostAlloc((void **)&h_halo_masses_out_check, buffer_size, cudaHostAllocDefault)); + // CALL_CUDA(cudaMemcpy(h_halo_masses_out_check, d_halo_masses_out, buffer_size, cudaMemcpyDeviceToHost)); + + // number of progenitors per halo + int *h_n_prog; + CALL_CUDA(cudaHostAlloc((void **)&h_n_prog, sizeof(int)*n_halos, cudaHostAllocDefault)); + CALL_CUDA(cudaMemcpy(h_n_prog, d_n_prog, sizeof(int)*n_halos, cudaMemcpyDeviceToHost)); + + // debug only + // // Values to count + // std::vector values_to_count = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100,32}; + + // // Count and display occurrences + // countElements(h_n_prog, n_halos, values_to_count); + + // condense halo mass array on the device + n_processed_prog = condenseDeviceArray(d_halo_masses_out, d_n_buffer, 0.0f); + printf("The number of progenitors written in out halo field so far: %d \n", 
n_processed_prog); + + // condense other halo field arrays on the device + unsigned long long int n_processed_star_rng = condenseDeviceArray(d_star_rng_out, d_n_buffer, 0.0f); + printf("The number of star prop rng written in out halo field so far: %d \n", n_processed_star_rng); + + unsigned long long int n_processed_sfr_rng = condenseDeviceArray(d_sfr_rng_out, d_n_buffer, 0.0f); + printf("The number of sfr prop rng written in out halo field so far: %d \n", n_processed_sfr_rng); + + unsigned long long int n_processed_xray_rng = condenseDeviceArray(d_xray_rng_out, d_n_buffer, 0.0f); + printf("The number of xray prop rng written in out halo field so far: %d \n", n_processed_xray_rng); + + unsigned long long int n_processed_coords = condenseDeviceArray(d_halo_coords_out, d_n_buffer*3, -1000); + printf("The number of halo coords written in out halo field so far: %d \n", n_processed_coords); + + // tmp: the following is just needed for debugging purpose + // float *h_filter_halos; + // CALL_CUDA(cudaHostAlloc((void **)&h_filter_halos, sizeof(float) * n_halos_tbp, cudaHostAllocDefault)); + // CALL_CUDA(cudaMemcpy(h_filter_halos, d_halo_masses, sizeof(float) * n_halos_tbp, cudaMemcpyDeviceToHost)); + + // int *h_nprog_predict; + // CALL_CUDA(cudaHostAlloc((void **)&h_nprog_predict, sizeof(int) * n_halos, cudaHostAllocDefault)); + // CALL_CUDA(cudaMemcpy(h_nprog_predict, d_nprog_predict, sizeof(int) * n_halos, cudaMemcpyDeviceToHost)); + + if (n_halos_tbp > 0){ + // update sparsity value + unsigned long long int available_n_buffer = d_n_buffer - n_processed_prog; + sparsity = getSparsity(available_n_buffer, n_halos_tbp); + + + // sparsity should not exceed the max threads per block + // sparsity = 256; + sparsity = std::min(sparsity, 512); + + // reset grids layout + grids = getWorkload(sparsity, n_halos_tbp); + + // update write offset + write_offset = n_processed_prog; + + // reset mask array + CALL_CUDA(cudaMemset(d_further_process, 0, sizeof(int) * n_halos)); + + // 
copy data from device to host + int h_sum_check; + CALL_CUDA(cudaMemcpy(&h_sum_check, d_sum_check, sizeof(int), cudaMemcpyDeviceToHost)); + } + // tmp: for debug only + // CALL_CUDA(cudaFreeHost(h_filter_halos)); + // CALL_CUDA(cudaFreeHost(h_sum_check)); + + } + + // write data back to the host + halofield_out->n_halos = n_processed_prog; + size_t out_size = sizeof(float) * n_processed_prog; + + // float *h_halo_masses_out; + // CALL_CUDA(cudaHostAlloc((void **)&h_halo_masses_out, out_size, cudaHostAllocDefault)); + CALL_CUDA(cudaGetLastError()); + CALL_CUDA(cudaDeviceSynchronize()); + + CALL_CUDA(cudaMemcpy(halofield_out->halo_masses, d_halo_masses_out, out_size, cudaMemcpyDeviceToHost)); + + + CALL_CUDA(cudaMemcpy(halofield_out->star_rng, d_star_rng_out, out_size, cudaMemcpyDeviceToHost)); + CALL_CUDA(cudaMemcpy(halofield_out->sfr_rng, d_sfr_rng_out, out_size, cudaMemcpyDeviceToHost)); + CALL_CUDA(cudaMemcpy(halofield_out->xray_rng, d_xray_rng_out, out_size, cudaMemcpyDeviceToHost)); + + size_t out_coords_size = sizeof(int) * n_processed_prog * 3; + CALL_CUDA(cudaMemcpy(halofield_out->halo_coords, d_halo_coords_out, out_coords_size, cudaMemcpyDeviceToHost)); + + + // Free device memory + CALL_CUDA(cudaFree(d_halo_masses)); + CALL_CUDA(cudaFree(d_y_arr)); + CALL_CUDA(cudaFree(d_halo_masses_out)); + CALL_CUDA(cudaFree(d_star_rng_out)); + CALL_CUDA(cudaFree(d_sfr_rng_out)); + CALL_CUDA(cudaFree(d_xray_rng_out)); + CALL_CUDA(cudaFree(d_halo_coords_out)); + CALL_CUDA(cudaFree(d_further_process)); + + validate_thrust(); + + condense_device_vector(); + + testCondenseDeviceArray(); + + testFilterWithMask(); + + CALL_CUDA(cudaGetLastError()); + CALL_CUDA(cudaDeviceSynchronize()); + return 0; +} diff --git a/src/py21cmfast/src/Stochasticity.cuh b/src/py21cmfast/src/Stochasticity.cuh new file mode 100644 index 000000000..25a1670aa --- /dev/null +++ b/src/py21cmfast/src/Stochasticity.cuh @@ -0,0 +1,17 @@ +#ifndef _STOCHASTICITY_CUH +#define _STOCHASTICITY_CUH + +#define 
HALO_CUDA_THREAD_FACTOR (int) (4) + +#ifdef __cplusplus +extern "C" +{ +#endif + int updateHaloOut(float *halo_masses, float *star_rng, float *sfr_rng, float *xray_rng, int *halo_coords, + unsigned long long int n_halos, float *y_arr, int n_bin_y, double x_min, double x_width, + struct HaloSamplingConstants hs_constants, unsigned long long int n_buffer, HaloField *halofield_out); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/py21cmfast/src/Stochasticity.h b/src/py21cmfast/src/Stochasticity.h index ea6260a6b..63d44de74 100644 --- a/src/py21cmfast/src/Stochasticity.h +++ b/src/py21cmfast/src/Stochasticity.h @@ -39,6 +39,11 @@ struct HaloSamplingConstants { double expected_M; }; +#ifdef __cplusplus +extern "C" { + +#endif + int stochastic_halofield(unsigned long long int seed, float redshift_desc, float redshift, float *dens_field, float *halo_overlap_box, HaloField *halos_desc, HaloField *halos); @@ -59,4 +64,7 @@ void stoc_set_consts_z(struct HaloSamplingConstants *const_struct, double redshi double redshift_desc); void stoc_set_consts_cond(struct HaloSamplingConstants *const_struct, double cond_val); +#ifdef __cplusplus +} #endif +#endif //_STOCHASTICITY_H diff --git a/src/py21cmfast/src/_functionprototypes_wrapper.h b/src/py21cmfast/src/_functionprototypes_wrapper.h index 4732dcfd4..eb552f07e 100644 --- a/src/py21cmfast/src/_functionprototypes_wrapper.h +++ b/src/py21cmfast/src/_functionprototypes_wrapper.h @@ -1,7 +1,3 @@ -/* This file contains the repeated function prototypes which are needed by CFFI - to be included explicitly via ffi.cdef(), These are the only functions which - are visible to the python wrapper */ - /* OutputStruct COMPUTE FUNCTIONS */ int ComputeInitialConditions(unsigned long long random_seed, InitialConditions *boxes); @@ -25,9 +21,8 @@ int ComputeIonizedBox(float redshift, float prev_redshift, PerturbedField *pertu int ComputeBrightnessTemp(float redshift, TsBox *spin_temp, IonizedBox *ionized_box, PerturbedField 
*perturb_field, BrightnessTemp *box); -int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbedField *perturbed_field, - PerturbHaloField *halos, TsBox *previous_spin_temp, - IonizedBox *previous_ionize_box, HaloBox *grids); +int ComputeHaloBox(double redshift, InitialConditions *ini_boxes, PerturbHaloField *halos, + TsBox *previous_spin_temp, IonizedBox *previous_ionize_box, HaloBox *grids); int UpdateXraySourceBox(HaloBox *halobox, double R_inner, double R_outer, int R_ct, XraySourceBox *source_box); diff --git a/src/py21cmfast/src/_inputparams_wrapper.h b/src/py21cmfast/src/_inputparams_wrapper.h deleted file mode 100644 index 0927b3c2a..000000000 --- a/src/py21cmfast/src/_inputparams_wrapper.h +++ /dev/null @@ -1,182 +0,0 @@ -/*We need to explicitly define the types used by the warpper using ffi.cdef() - However, that function does not take directives, so we separate the types here -*/ -// WARNING: DO NOT #include THIS FILE IN THE C CODE EXCEPT FOR IN InputParameters.h - -typedef struct CosmoParams { - float SIGMA_8; - float hlittle; - float OMm; - float OMl; - float OMb; - float POWER_INDEX; - - float OMn; - float OMk; - float OMr; - float OMtot; - float Y_He; - float wl; - -} CosmoParams; - -typedef struct SimulationOptions { - // Parameters taken from INIT_PARAMS.H - int HII_DIM; - int DIM; - float BOX_LEN; - float NON_CUBIC_FACTOR; - int N_THREADS; - double Z_HEAT_MAX; - double ZPRIME_STEP_FACTOR; - - // Halo Sampler Options - float SAMPLER_MIN_MASS; - double SAMPLER_BUFFER_FACTOR; - int N_COND_INTERP; - int N_PROB_INTERP; - double MIN_LOGPROB; - double HALOMASS_CORRECTION; - double PARKINSON_G0; - double PARKINSON_y1; - double PARKINSON_y2; - - float INITIAL_REDSHIFT; - double DELTA_R_FACTOR; - double DENSITY_SMOOTH_RADIUS; - - double DEXM_OPTIMIZE_MINMASS; - double DEXM_R_OVERLAP; - - double CORR_STAR; - double CORR_SFR; - double CORR_LX; -} SimulationOptions; - -typedef struct MatterOptions { - bool USE_FFTW_WISDOM; - int HMF; - int 
USE_RELATIVE_VELOCITIES; - int POWER_SPECTRUM; - int USE_INTERPOLATION_TABLES; - bool PERTURB_ON_HIGH_RES; - int PERTURB_ALGORITHM; - bool MINIMIZE_MEMORY; - bool KEEP_3D_VELOCITIES; - bool DEXM_OPTIMIZE; - int FILTER; - int HALO_FILTER; - bool SMOOTH_EVOLVED_DENSITY_FIELD; - - bool USE_HALO_FIELD; - bool HALO_STOCHASTICITY; - bool FIXED_HALO_GRIDS; - int SAMPLE_METHOD; -} MatterOptions; - -typedef struct AstroParams { - float HII_EFF_FACTOR; - - // SHMR - float F_STAR10; - float ALPHA_STAR; - float ALPHA_STAR_MINI; - float SIGMA_STAR; - double UPPER_STELLAR_TURNOVER_MASS; - double UPPER_STELLAR_TURNOVER_INDEX; - float F_STAR7_MINI; - - // SFMS - float t_STAR; - double SIGMA_SFR_INDEX; - double SIGMA_SFR_LIM; - - // L_X/SFR - double L_X; - double L_X_MINI; - double SIGMA_LX; - - // Escape Fraction - float F_ESC10; - float ALPHA_ESC; - float F_ESC7_MINI; - - float T_RE; - - float M_TURN; - float R_BUBBLE_MAX; - float ION_Tvir_MIN; - double F_H2_SHIELD; - float NU_X_THRESH; - float X_RAY_SPEC_INDEX; - float X_RAY_Tvir_MIN; - - double A_LW; - double BETA_LW; - double A_VCB; - double BETA_VCB; - - double FIXED_VAVG; - double POP2_ION; - double POP3_ION; - - double PHOTONCONS_CALIBRATION_END; - double CLUMPING_FACTOR; - double ALPHA_UVB; - - float R_MAX_TS; - int N_STEP_TS; - double DELTA_R_HII_FACTOR; - float R_BUBBLE_MIN; - double MAX_DVDR; - double NU_X_MAX; - double NU_X_BAND_MAX; -} AstroParams; - -typedef struct AstroOptions { - bool USE_MINI_HALOS; - bool USE_CMB_HEATING; // CMB Heating Flag - bool USE_LYA_HEATING; // Lya Heating Flag - bool USE_MASS_DEPENDENT_ZETA; - bool INHOMO_RECO; - bool USE_TS_FLUCT; - bool M_MIN_in_Mass; - bool FIX_VCB_AVG; - bool USE_EXP_FILTER; - bool CELL_RECOMB; - int PHOTON_CONS_TYPE; - bool USE_UPPER_STELLAR_TURNOVER; - bool HALO_SCALING_RELATIONS_MEDIAN; - int HII_FILTER; - int HEAT_FILTER; - bool IONISE_ENTIRE_SPHERE; - bool AVG_BELOW_SAMPLER; - int INTEGRATION_METHOD_ATOMIC; - int INTEGRATION_METHOD_MINI; -} AstroOptions; - 
-typedef struct ConfigSettings { - double HALO_CATALOG_MEM_FACTOR; - - char *external_table_path; - char *wisdoms_path; -} ConfigSettings; - -/* Previously, we had a few structures spread throughout the code e.g simulation_options_ufunc which - were all globally defined and separately broadcast at different times. Several of these were used - across different files and some inside #defines (e.g indexing.h), so for now I've combined - the parameter structures to avoid confusion (we shouldn't have the possibility of two files using - different parameters). - - In future we should have a parameter structure in each .c file containing ONLY parameters - relevant to it (look at HaloBox.c), and force the broadcast at each _compute() step (or even - decorate any library call) However this would require us to be very careful about initialising - the globals when ANY function from that file is called */ -// The structs declared here defined in InputParameters.c -extern SimulationOptions *simulation_options_global; -extern MatterOptions *matter_options_global; -extern CosmoParams *cosmo_params_global; -extern AstroParams *astro_params_global; -extern AstroOptions *astro_options_global; - -extern ConfigSettings config_settings; diff --git a/src/py21cmfast/src/_outputstructs_wrapper.h b/src/py21cmfast/src/_outputstructs_wrapper.h deleted file mode 100644 index c347df68f..000000000 --- a/src/py21cmfast/src/_outputstructs_wrapper.h +++ /dev/null @@ -1,97 +0,0 @@ -/*We need to explicitly define the types used by the warpper using ffi.cdef() - However, that function does not take directives, so we separate the types here -*/ -// WARNING: DO NOT #include THIS FILE IN THE C CODE EXCEPT FOR IN OutputStructs.h - -typedef struct InitialConditions { - float *lowres_density, *lowres_vx, *lowres_vy, *lowres_vz, *lowres_vx_2LPT, *lowres_vy_2LPT, - *lowres_vz_2LPT; - float *hires_density, *hires_vx, *hires_vy, *hires_vz, *hires_vx_2LPT, *hires_vy_2LPT, - *hires_vz_2LPT; // cw addition - 
float *lowres_vcb; -} InitialConditions; - -typedef struct PerturbedField { - float *density, *velocity_x, *velocity_y, *velocity_z; -} PerturbedField; - -typedef struct HaloField { - long long unsigned int n_halos; - long long unsigned int buffer_size; - float *halo_masses; - float *halo_coords; - - // Halo properties for stochastic model - float *star_rng; - float *sfr_rng; - float *xray_rng; -} HaloField; - -typedef struct PerturbHaloField { - long long unsigned int n_halos; - long long unsigned int buffer_size; - float *halo_masses; - float *halo_coords; - - // Halo properties for stochastic model - float *star_rng; - float *sfr_rng; - float *xray_rng; -} PerturbHaloField; - -typedef struct HaloBox { - // Things that aren't used in radiation fields but useful outputs - float *halo_mass; - float *halo_stars; - float *halo_stars_mini; - int *count; - - // For IonisationBox.c and SpinTemperatureBox.c - float *n_ion; // weighted by F_ESC*PopN_ion - float *halo_sfr; // for x-rays and Ts stuff - float *halo_xray; - float *halo_sfr_mini; // for x-rays and Ts stuff - float *whalo_sfr; // SFR weighted by PopN_ion and F_ESC, used for Gamma12 - - // Average volume-weighted log10 Turnover masses are kept in order to compare with the expected - // MF integrals - double log10_Mcrit_ACG_ave; - double log10_Mcrit_MCG_ave; -} HaloBox; - -typedef struct XraySourceBox { - float *filtered_sfr; - float *filtered_xray; - float *filtered_sfr_mini; - - double *mean_log10_Mcrit_LW; - double *mean_sfr; - double *mean_sfr_mini; -} XraySourceBox; - -typedef struct TsBox { - float *spin_temperature; - float *xray_ionised_fraction; - float *kinetic_temp_neutral; - float *J_21_LW; -} TsBox; - -typedef struct IonizedBox { - double mean_f_coll; - double mean_f_coll_MINI; - double log10_Mturnover_ave; - double log10_Mturnover_MINI_ave; - float *neutral_fraction; - float *ionisation_rate_G12; - float *mean_free_path; - float *z_reion; - float *cumulative_recombinations; - float 
*kinetic_temperature; - float *unnormalised_nion; - float *unnormalised_nion_mini; -} IonizedBox; - -typedef struct BrightnessTemp { - float *brightness_temp; - float *tau_21; -} BrightnessTemp; diff --git a/src/py21cmfast/src/_wrapper.cpp b/src/py21cmfast/src/_wrapper.cpp new file mode 100644 index 000000000..b6099fef0 --- /dev/null +++ b/src/py21cmfast/src/_wrapper.cpp @@ -0,0 +1,686 @@ +#include +#include +#include + +namespace nb = nanobind; + +extern "C" { +#include "21cmFAST.h" +} + +NB_MODULE(c_21cmfast, m) { + m.doc() = "This is the docstring for the 21cmFAST Python extension."; + + // Bind input parameters + + // Bind CosmoParams + nb::class_(m, "CosmoParams") + .def(nb::init<>()) + .def_rw("SIGMA_8", &CosmoParams::SIGMA_8) + .def_rw("hlittle", &CosmoParams::hlittle) + .def_rw("OMm", &CosmoParams::OMm) + .def_rw("OMl", &CosmoParams::OMl) + .def_rw("OMb", &CosmoParams::OMb) + .def_rw("OMn", &CosmoParams::OMn) + .def_rw("OMk", &CosmoParams::OMk) + .def_rw("OMr", &CosmoParams::OMr) + .def_rw("OMtot", &CosmoParams::OMtot) + .def_rw("Y_He", &CosmoParams::Y_He) + .def_rw("wl", &CosmoParams::wl) + .def_rw("POWER_INDEX", &CosmoParams::POWER_INDEX); + + // Bind SimulationOptions + nb::class_(m, "SimulationOptions") + .def(nb::init<>()) + .def_rw("HII_DIM", &SimulationOptions::HII_DIM) + .def_rw("DIM", &SimulationOptions::DIM) + .def_rw("BOX_LEN", &SimulationOptions::BOX_LEN) + .def_rw("NON_CUBIC_FACTOR", &SimulationOptions::NON_CUBIC_FACTOR) + .def_rw("N_THREADS", &SimulationOptions::N_THREADS) + .def_rw("Z_HEAT_MAX", &SimulationOptions::Z_HEAT_MAX) + .def_rw("ZPRIME_STEP_FACTOR", &SimulationOptions::ZPRIME_STEP_FACTOR) + .def_rw("SAMPLER_MIN_MASS", &SimulationOptions::SAMPLER_MIN_MASS) + .def_rw("SAMPLER_BUFFER_FACTOR", &SimulationOptions::SAMPLER_BUFFER_FACTOR) + .def_rw("N_COND_INTERP", &SimulationOptions::N_COND_INTERP) + .def_rw("N_PROB_INTERP", &SimulationOptions::N_PROB_INTERP) + .def_rw("MIN_LOGPROB", &SimulationOptions::MIN_LOGPROB) + 
.def_rw("HALOMASS_CORRECTION", &SimulationOptions::HALOMASS_CORRECTION) + .def_rw("PARKINSON_G0", &SimulationOptions::PARKINSON_G0) + .def_rw("PARKINSON_y1", &SimulationOptions::PARKINSON_y1) + .def_rw("PARKINSON_y2", &SimulationOptions::PARKINSON_y2) + .def_rw("INITIAL_REDSHIFT", &SimulationOptions::INITIAL_REDSHIFT) + .def_rw("DELTA_R_FACTOR", &SimulationOptions::DELTA_R_FACTOR) + .def_rw("DENSITY_SMOOTH_RADIUS", &SimulationOptions::DENSITY_SMOOTH_RADIUS) + .def_rw("DEXM_OPTIMIZE_MINMASS", &SimulationOptions::DEXM_OPTIMIZE_MINMASS) + .def_rw("DEXM_R_OVERLAP", &SimulationOptions::DEXM_R_OVERLAP) + .def_rw("CORR_STAR", &SimulationOptions::CORR_STAR) + .def_rw("CORR_SFR", &SimulationOptions::CORR_SFR) + .def_rw("CORR_LX", &SimulationOptions::CORR_LX); + + // Bind MatterOptions + nb::class_(m, "MatterOptions") + .def(nb::init<>()) + .def_rw("USE_FFTW_WISDOM", &MatterOptions::USE_FFTW_WISDOM) + .def_rw("HMF", &MatterOptions::HMF) + .def_rw("USE_RELATIVE_VELOCITIES", &MatterOptions::USE_RELATIVE_VELOCITIES) + .def_rw("POWER_SPECTRUM", &MatterOptions::POWER_SPECTRUM) + .def_rw("USE_INTERPOLATION_TABLES", &MatterOptions::USE_INTERPOLATION_TABLES) + .def_rw("PERTURB_ON_HIGH_RES", &MatterOptions::PERTURB_ON_HIGH_RES) + .def_rw("PERTURB_ALGORITHM", &MatterOptions::PERTURB_ALGORITHM) + .def_rw("MINIMIZE_MEMORY", &MatterOptions::MINIMIZE_MEMORY) + .def_rw("KEEP_3D_VELOCITIES", &MatterOptions::KEEP_3D_VELOCITIES) + .def_rw("DEXM_OPTIMIZE", &MatterOptions::DEXM_OPTIMIZE) + .def_rw("FILTER", &MatterOptions::FILTER) + .def_rw("HALO_FILTER", &MatterOptions::HALO_FILTER) + .def_rw("SMOOTH_EVOLVED_DENSITY_FIELD", &MatterOptions::SMOOTH_EVOLVED_DENSITY_FIELD) + .def_rw("USE_HALO_FIELD", &MatterOptions::USE_HALO_FIELD) + .def_rw("HALO_STOCHASTICITY", &MatterOptions::HALO_STOCHASTICITY) + .def_rw("FIXED_HALO_GRIDS", &MatterOptions::FIXED_HALO_GRIDS) + .def_rw("SAMPLE_METHOD", &MatterOptions::SAMPLE_METHOD); + + // Bind AstroParams + nb::class_(m, "AstroParams") + .def(nb::init<>()) + 
.def_rw("HII_EFF_FACTOR", &AstroParams::HII_EFF_FACTOR) + .def_rw("F_STAR10", &AstroParams::F_STAR10) + .def_rw("ALPHA_STAR", &AstroParams::ALPHA_STAR) + .def_rw("ALPHA_STAR_MINI", &AstroParams::ALPHA_STAR_MINI) + .def_rw("SIGMA_STAR", &AstroParams::SIGMA_STAR) + .def_rw("UPPER_STELLAR_TURNOVER_MASS", &AstroParams::UPPER_STELLAR_TURNOVER_MASS) + .def_rw("UPPER_STELLAR_TURNOVER_INDEX", &AstroParams::UPPER_STELLAR_TURNOVER_INDEX) + .def_rw("F_STAR7_MINI", &AstroParams::F_STAR7_MINI) + .def_rw("t_STAR", &AstroParams::t_STAR) + .def_rw("SIGMA_SFR_INDEX", &AstroParams::SIGMA_SFR_INDEX) + .def_rw("SIGMA_SFR_LIM", &AstroParams::SIGMA_SFR_LIM) + .def_rw("L_X", &AstroParams::L_X) + .def_rw("L_X_MINI", &AstroParams::L_X_MINI) + .def_rw("SIGMA_LX", &AstroParams::SIGMA_LX) + .def_rw("F_ESC10", &AstroParams::F_ESC10) + .def_rw("ALPHA_ESC", &AstroParams::ALPHA_ESC) + .def_rw("F_ESC7_MINI", &AstroParams::F_ESC7_MINI) + .def_rw("T_RE", &AstroParams::T_RE) + .def_rw("M_TURN", &AstroParams::M_TURN) + .def_rw("R_BUBBLE_MAX", &AstroParams::R_BUBBLE_MAX) + .def_rw("ION_Tvir_MIN", &AstroParams::ION_Tvir_MIN) + .def_rw("F_H2_SHIELD", &AstroParams::F_H2_SHIELD) + .def_rw("NU_X_THRESH", &AstroParams::NU_X_THRESH) + .def_rw("X_RAY_SPEC_INDEX", &AstroParams::X_RAY_SPEC_INDEX) + .def_rw("X_RAY_Tvir_MIN", &AstroParams::X_RAY_Tvir_MIN) + .def_rw("A_LW", &AstroParams::A_LW) + .def_rw("BETA_LW", &AstroParams::BETA_LW) + .def_rw("A_VCB", &AstroParams::A_VCB) + .def_rw("BETA_VCB", &AstroParams::BETA_VCB) + .def_rw("FIXED_VAVG", &AstroParams::FIXED_VAVG) + .def_rw("POP2_ION", &AstroParams::POP2_ION) + .def_rw("POP3_ION", &AstroParams::POP3_ION) + .def_rw("PHOTONCONS_CALIBRATION_END", &AstroParams::PHOTONCONS_CALIBRATION_END) + .def_rw("CLUMPING_FACTOR", &AstroParams::CLUMPING_FACTOR) + .def_rw("ALPHA_UVB", &AstroParams::ALPHA_UVB) + .def_rw("R_MAX_TS", &AstroParams::R_MAX_TS) + .def_rw("N_STEP_TS", &AstroParams::N_STEP_TS) + .def_rw("DELTA_R_HII_FACTOR", &AstroParams::DELTA_R_HII_FACTOR) + 
.def_rw("R_BUBBLE_MIN", &AstroParams::R_BUBBLE_MIN) + .def_rw("MAX_DVDR", &AstroParams::MAX_DVDR) + .def_rw("NU_X_MAX", &AstroParams::NU_X_MAX) + .def_rw("NU_X_BAND_MAX", &AstroParams::NU_X_BAND_MAX); + + // Bind AstroOptions + nb::class_(m, "AstroOptions") + .def(nb::init<>()) + .def_rw("USE_MINI_HALOS", &AstroOptions::USE_MINI_HALOS) + .def_rw("USE_CMB_HEATING", &AstroOptions::USE_CMB_HEATING) + .def_rw("USE_LYA_HEATING", &AstroOptions::USE_LYA_HEATING) + .def_rw("USE_MASS_DEPENDENT_ZETA", &AstroOptions::USE_MASS_DEPENDENT_ZETA) + .def_rw("INHOMO_RECO", &AstroOptions::INHOMO_RECO) + .def_rw("USE_TS_FLUCT", &AstroOptions::USE_TS_FLUCT) + .def_rw("M_MIN_in_Mass", &AstroOptions::M_MIN_in_Mass) + .def_rw("FIX_VCB_AVG", &AstroOptions::FIX_VCB_AVG) + .def_rw("USE_EXP_FILTER", &AstroOptions::USE_EXP_FILTER) + .def_rw("CELL_RECOMB", &AstroOptions::CELL_RECOMB) + .def_rw("PHOTON_CONS_TYPE", &AstroOptions::PHOTON_CONS_TYPE) + .def_rw("USE_UPPER_STELLAR_TURNOVER", &AstroOptions::USE_UPPER_STELLAR_TURNOVER) + .def_rw("HALO_SCALING_RELATIONS_MEDIAN", &AstroOptions::HALO_SCALING_RELATIONS_MEDIAN) + .def_rw("HII_FILTER", &AstroOptions::HII_FILTER) + .def_rw("HEAT_FILTER", &AstroOptions::HEAT_FILTER) + .def_rw("IONISE_ENTIRE_SPHERE", &AstroOptions::IONISE_ENTIRE_SPHERE) + .def_rw("AVG_BELOW_SAMPLER", &AstroOptions::AVG_BELOW_SAMPLER) + .def_rw("INTEGRATION_METHOD_ATOMIC", &AstroOptions::INTEGRATION_METHOD_ATOMIC) + .def_rw("INTEGRATION_METHOD_MINI", &AstroOptions::INTEGRATION_METHOD_MINI); + + // Bind ConfigSettings + nb::class_(m, "ConfigSettings") + .def(nb::init<>()) + .def_rw("HALO_CATALOG_MEM_FACTOR", &ConfigSettings::HALO_CATALOG_MEM_FACTOR) + .def("set_external_table_path", + [](ConfigSettings& self, const std::string& path) { + strcpy(self.external_table_path, path.c_str()); + }) + .def("get_external_table_path", + [](ConfigSettings& self) { return std::string(self.external_table_path); }) + .def("set_wisdoms_path", + [](ConfigSettings& self, const std::string& path) { + 
strcpy(self.wisdoms_path, path.c_str()); + }) + .def("get_wisdoms_path", + [](ConfigSettings& self) { return std::string(self.wisdoms_path); }); + + // Output Struct Bindings + // Bind InitialConditions + nb::class_(m, "InitialConditions") + .def(nb::init<>()) + .def("set_lowres_density", + [](InitialConditions& self, nb::ndarray array) { + self.lowres_density = array.data(); + }) + .def("set_lowres_vx", [](InitialConditions& self, + nb::ndarray array) { self.lowres_vx = array.data(); }) + .def("set_lowres_vy", [](InitialConditions& self, + nb::ndarray array) { self.lowres_vy = array.data(); }) + .def("set_lowres_vz", [](InitialConditions& self, + nb::ndarray array) { self.lowres_vz = array.data(); }) + .def("set_lowres_vx_2LPT", + [](InitialConditions& self, nb::ndarray array) { + self.lowres_vx_2LPT = array.data(); + }) + .def("set_lowres_vy_2LPT", + [](InitialConditions& self, nb::ndarray array) { + self.lowres_vy_2LPT = array.data(); + }) + .def("set_lowres_vz_2LPT", + [](InitialConditions& self, nb::ndarray array) { + self.lowres_vz_2LPT = array.data(); + }) + .def("set_hires_density", + [](InitialConditions& self, nb::ndarray array) { + self.hires_density = array.data(); + }) + .def("set_hires_vx", [](InitialConditions& self, + nb::ndarray array) { self.hires_vx = array.data(); }) + .def("set_hires_vy", [](InitialConditions& self, + nb::ndarray array) { self.hires_vy = array.data(); }) + .def("set_hires_vz", [](InitialConditions& self, + nb::ndarray array) { self.hires_vz = array.data(); }) + .def("set_hires_vx_2LPT", + [](InitialConditions& self, nb::ndarray array) { + self.hires_vx_2LPT = array.data(); + }) + .def("set_hires_vy_2LPT", + [](InitialConditions& self, nb::ndarray array) { + self.hires_vy_2LPT = array.data(); + }) + .def("set_hires_vz_2LPT", + [](InitialConditions& self, nb::ndarray array) { + self.hires_vz_2LPT = array.data(); + }) + .def("set_lowres_vcb", [](InitialConditions& self, nb::ndarray array) { + self.lowres_vcb = array.data(); + }); 
+ + // Bind PerturbedField + nb::class_(m, "PerturbedField") + .def(nb::init<>()) + .def("set_density", + [](PerturbedField& self, nb::ndarray array) { self.density = array.data(); }) + .def("set_velocity_x", + [](PerturbedField& self, nb::ndarray array) { self.velocity_x = array.data(); }) + .def("set_velocity_y", + [](PerturbedField& self, nb::ndarray array) { self.velocity_y = array.data(); }) + .def("set_velocity_z", [](PerturbedField& self, nb::ndarray array) { + self.velocity_z = array.data(); + }); + + // Bind HaloField + nb::class_(m, "HaloField") + .def(nb::init<>()) + .def_rw("n_halos", &HaloField::n_halos) + .def_rw("buffer_size", &HaloField::buffer_size) + .def("set_halo_masses", + [](HaloField& self, nb::ndarray array) { self.halo_masses = array.data(); }) + .def("set_halo_coords", + [](HaloField& self, nb::ndarray array) { self.halo_coords = array.data(); }) + .def("set_star_rng", + [](HaloField& self, nb::ndarray array) { self.star_rng = array.data(); }) + .def("set_sfr_rng", + [](HaloField& self, nb::ndarray array) { self.sfr_rng = array.data(); }) + .def("set_xray_rng", + [](HaloField& self, nb::ndarray array) { self.xray_rng = array.data(); }); + + // Bind PerturbHaloField + nb::class_(m, "PerturbHaloField") + .def(nb::init<>()) + .def_rw("n_halos", &PerturbHaloField::n_halos) + .def_rw("buffer_size", &PerturbHaloField::buffer_size) + .def("set_halo_masses", [](PerturbHaloField& self, + nb::ndarray array) { self.halo_masses = array.data(); }) + .def("set_halo_coords", [](PerturbHaloField& self, + nb::ndarray array) { self.halo_coords = array.data(); }) + .def("set_star_rng", + [](PerturbHaloField& self, nb::ndarray array) { self.star_rng = array.data(); }) + .def("set_sfr_rng", + [](PerturbHaloField& self, nb::ndarray array) { self.sfr_rng = array.data(); }) + .def("set_xray_rng", [](PerturbHaloField& self, nb::ndarray array) { + self.xray_rng = array.data(); + }); + + // Bind HaloBox + nb::class_(m, "HaloBox") + .def(nb::init<>()) + 
.def("set_halo_mass", + [](HaloBox& self, nb::ndarray array) { self.halo_mass = array.data(); }) + .def("set_halo_stars", + [](HaloBox& self, nb::ndarray array) { self.halo_stars = array.data(); }) + .def("set_halo_stars_mini", + [](HaloBox& self, nb::ndarray array) { self.halo_stars_mini = array.data(); }) + .def("set_count", [](HaloBox& self, nb::ndarray array) { self.count = array.data(); }) + .def("set_n_ion", + [](HaloBox& self, nb::ndarray array) { self.n_ion = array.data(); }) + .def("set_halo_sfr", + [](HaloBox& self, nb::ndarray array) { self.halo_sfr = array.data(); }) + .def("set_halo_xray", + [](HaloBox& self, nb::ndarray array) { self.halo_xray = array.data(); }) + .def("set_halo_sfr_mini", + [](HaloBox& self, nb::ndarray array) { self.halo_sfr_mini = array.data(); }) + .def("set_whalo_sfr", + [](HaloBox& self, nb::ndarray array) { self.whalo_sfr = array.data(); }) + .def_rw("log10_Mcrit_ACG_ave", &HaloBox::log10_Mcrit_ACG_ave) + .def_rw("log10_Mcrit_MCG_ave", &HaloBox::log10_Mcrit_MCG_ave); + + // Bind XraySourceBox + nb::class_(m, "XraySourceBox") + .def(nb::init<>()) + .def("set_filtered_sfr", [](XraySourceBox& self, + nb::ndarray array) { self.filtered_sfr = array.data(); }) + .def("set_filtered_xray", + [](XraySourceBox& self, nb::ndarray array) { + self.filtered_xray = array.data(); + }) + .def("set_filtered_sfr_mini", + [](XraySourceBox& self, nb::ndarray array) { + self.filtered_sfr_mini = array.data(); + }) + .def("set_mean_log10_Mcrit_LW", + [](XraySourceBox& self, nb::ndarray array) { + self.mean_log10_Mcrit_LW = array.data(); + }) + .def("set_mean_sfr", + [](XraySourceBox& self, nb::ndarray array) { self.mean_sfr = array.data(); }) + .def("set_mean_sfr_mini", [](XraySourceBox& self, nb::ndarray array) { + self.mean_sfr_mini = array.data(); + }); + + // Bind TsBox + nb::class_(m, "TsBox") + .def(nb::init<>()) + .def("set_spin_temperature", + [](TsBox& self, nb::ndarray array) { self.spin_temperature = array.data(); }) + 
.def("set_xray_ionised_fraction", + [](TsBox& self, nb::ndarray array) { + self.xray_ionised_fraction = array.data(); + }) + .def( + "set_kinetic_temp_neutral", + [](TsBox& self, nb::ndarray array) { self.kinetic_temp_neutral = array.data(); }) + .def("set_J_21_LW", + [](TsBox& self, nb::ndarray array) { self.J_21_LW = array.data(); }); + + // Bind IonizedBox + nb::class_(m, "IonizedBox") + .def(nb::init<>()) + .def_rw("mean_f_coll", &IonizedBox::mean_f_coll) + .def_rw("mean_f_coll_MINI", &IonizedBox::mean_f_coll_MINI) + .def_rw("log10_Mturnover_ave", &IonizedBox::log10_Mturnover_ave) + .def_rw("log10_Mturnover_MINI_ave", &IonizedBox::log10_Mturnover_MINI_ave) + .def("set_neutral_fraction", + [](IonizedBox& self, nb::ndarray array) { + self.neutral_fraction = array.data(); + }) + .def("set_ionisation_rate_G12", + [](IonizedBox& self, nb::ndarray array) { + self.ionisation_rate_G12 = array.data(); + }) + .def("set_mean_free_path", + [](IonizedBox& self, nb::ndarray array) { self.mean_free_path = array.data(); }) + .def("set_z_reion", + [](IonizedBox& self, nb::ndarray array) { self.z_reion = array.data(); }) + .def("set_cumulative_recombinations", + [](IonizedBox& self, nb::ndarray array) { + self.cumulative_recombinations = array.data(); + }) + .def("set_kinetic_temperature", + [](IonizedBox& self, nb::ndarray array) { + self.kinetic_temperature = array.data(); + }) + .def("set_unnormalised_nion", + [](IonizedBox& self, nb::ndarray array) { + self.unnormalised_nion = array.data(); + }) + .def("set_unnormalised_nion_mini", [](IonizedBox& self, nb::ndarray array) { + self.unnormalised_nion_mini = array.data(); + }); + + // Bind BrightnessTemp + nb::class_(m, "BrightnessTemp") + .def(nb::init<>()) + .def("set_brightness_temp", + [](BrightnessTemp& self, nb::ndarray array) { + self.brightness_temp = array.data(); + }) + .def("set_tau_21", + [](BrightnessTemp& self, nb::ndarray array) { self.tau_21 = array.data(); }); + + // Function Bindings + // OutputStruct COMPUTE 
FUNCTIONS + m.def("ComputeInitialConditions", &ComputeInitialConditions); + m.def("ComputePerturbField", &ComputePerturbField); + m.def("ComputeHaloField", &ComputeHaloField); + m.def("ComputePerturbHaloField", &ComputePerturbHaloField); + m.def("ComputeTsBox", &ComputeTsBox); + m.def("ComputeIonizedBox", &ComputeIonizedBox); + m.def("ComputeBrightnessTemp", &ComputeBrightnessTemp); + m.def("ComputeHaloBox", &ComputeHaloBox); + m.def("UpdateXraySourceBox", &UpdateXraySourceBox); + + // PHOTON CONSERVATION MODEL FUNCTIONS + m.def("InitialisePhotonCons", &InitialisePhotonCons); + m.def("PhotonCons_Calibration", + [](nb::ndarray z_estimate, nb::ndarray xH_estimate) { + int n_spline = z_estimate.size(); + if (xH_estimate.size() != n_spline) { + throw std::runtime_error("Array sizes do not match the specified NSpline."); + } + int status = PhotonCons_Calibration(z_estimate.data(), xH_estimate.data(), n_spline); + if (status != 0) { + throw std::runtime_error("PhotonCons_Calibration failed with status: " + + std::to_string(status)); + } + }); + m.def("ComputeZstart_PhotonCons", [](nb::ndarray zstart) { + if (zstart.size() != 1) { + throw std::runtime_error("zstart array must have size 1."); + } + int status = ComputeZstart_PhotonCons(zstart.data()); + if (status != 0) { + throw std::runtime_error("ComputeZstart_PhotonCons failed with status: " + + std::to_string(status)); + } + }); + m.def("adjust_redshifts_for_photoncons", + [](double z_step_factor, nb::ndarray redshift, nb::ndarray stored_redshift, + nb::ndarray absolute_delta_z) { + adjust_redshifts_for_photoncons(z_step_factor, redshift.data(), + stored_redshift.data(), absolute_delta_z.data()); + }); + m.def("determine_deltaz_for_photoncons", &determine_deltaz_for_photoncons); + m.def("ObtainPhotonConsData", + [](nb::ndarray z_at_Q_data, nb::ndarray Q_data, + nb::ndarray Ndata_analytic, nb::ndarray z_cal_data, + nb::ndarray nf_cal_data, nb::ndarray Ndata_calibration, + nb::ndarray PhotonCons_NFdata, nb::ndarray 
PhotonCons_deltaz, + nb::ndarray Ndata_PhotonCons) { + if (Ndata_analytic.size() != 1 || Ndata_calibration.size() != 1 || + Ndata_PhotonCons.size() != 1) { + throw std::runtime_error( + "Ndata_analytic, Ndata_calibration, and Ndata_PhotonCons must have size 1."); + } + int status = ObtainPhotonConsData( + z_at_Q_data.data(), Q_data.data(), Ndata_analytic.data(), z_cal_data.data(), + nf_cal_data.data(), Ndata_calibration.data(), PhotonCons_NFdata.data(), + PhotonCons_deltaz.data(), Ndata_PhotonCons.data()); + if (status != 0) { + throw std::runtime_error("ObtainPhotonConsData failed with status: " + + std::to_string(status)); + } + }); + m.def("FreePhotonConsMemory", &FreePhotonConsMemory); + m.def("set_alphacons_params", &set_alphacons_params); + + // Non-OutputStruct data products + m.def("ComputeLF", + [](int component, size_t n_bins_mass, nb::ndarray z_LF, nb::ndarray M_TURNs, + nb::ndarray M_uv_z, nb::ndarray M_h_z, nb::ndarray log10phi) { + size_t n_redshifts = z_LF.shape(0); + if (M_h_z.shape(0) != n_redshifts || M_h_z.shape(1) != n_bins_mass || + M_uv_z.shape(0) != n_redshifts || M_uv_z.shape(1) != n_bins_mass || + log10phi.shape(0) != n_redshifts || log10phi.shape(1) != n_bins_mass || + M_TURNs.shape(0) != n_redshifts) { + throw std::runtime_error( + "Array size mismatch: M_h_z shape: " + std::to_string(M_h_z.shape(0)) + "x" + + std::to_string(M_h_z.shape(1)) + ", M_uv_z shape: " + + std::to_string(M_uv_z.shape(0)) + "x" + std::to_string(M_uv_z.shape(1)) + + ", log10phi shape: " + std::to_string(log10phi.shape(0)) + "x" + + std::to_string(log10phi.shape(1)) + + ", M_TURNs shape: " + std::to_string(M_TURNs.shape(0))); + } + ComputeLF(n_bins_mass, component, n_redshifts, z_LF.data(), M_TURNs.data(), + M_h_z.data(), M_uv_z.data(), log10phi.data()); + }); + m.def("ComputeTau", + [](nb::ndarray redshifts, nb::ndarray global_xHI, float z_re_HeII) { + size_t n_redshifts = redshifts.shape(0); + if (global_xHI.shape(0) != n_redshifts) { + throw 
std::runtime_error("XHI array size" + std::to_string(global_xHI.shape(0)) + + "does not match the number of redshifts." + + std::to_string(n_redshifts)); + } + return ComputeTau(n_redshifts, redshifts.data(), global_xHI.data(), z_re_HeII); + }); + + // Initialisation functions needed in the wrapper + m.def("init_ps", &init_ps); + m.def("init_heat", &init_heat); + m.def("CreateFFTWWisdoms", &CreateFFTWWisdoms); + m.def("Broadcast_struct_global_noastro", &Broadcast_struct_global_noastro); + m.def("Broadcast_struct_global_all", &Broadcast_struct_global_all); + m.def("initialiseSigmaMInterpTable", &initialiseSigmaMInterpTable); + m.def("initialise_GL", &initialise_GL); + + // Integration routines + // TODO: it may be a better choice to rewrite integral_wrappers in C++ directly + m.def("get_sigma", [](nb::ndarray mass_values, nb::ndarray sigma_out, + nb::ndarray dsigmasqdm_out) { + size_t n_masses = mass_values.shape(0); + if (sigma_out.shape(0) != n_masses || dsigmasqdm_out.shape(0) != n_masses) { + throw std::runtime_error("Array sizes do not match the number of masses."); + } + get_sigma(n_masses, mass_values.data(), sigma_out.data(), dsigmasqdm_out.data()); + }); + + m.def("get_condition_integrals", + [](double redshift, double z_prev, nb::ndarray cond_values, + nb::ndarray out_n_exp, nb::ndarray out_m_exp) { + size_t n_conditions = cond_values.shape(0); + if (out_n_exp.shape(0) != n_conditions || out_m_exp.shape(0) != n_conditions) { + throw std::runtime_error("Array sizes do not match the number of conditions."); + } + get_condition_integrals(redshift, z_prev, n_conditions, cond_values.data(), + out_n_exp.data(), out_m_exp.data()); + }); + + m.def("get_halo_chmf_interval", + [](double redshift, double z_prev, nb::ndarray cond_values, + nb::ndarray lnM_lo, nb::ndarray lnM_hi, nb::ndarray out_n) { + size_t n_conditions = cond_values.shape(0); + size_t n_masslim = lnM_lo.shape(0); + if (lnM_hi.shape(0) != n_masslim || out_n.shape(0) != n_conditions || + 
out_n.shape(1) != n_masslim) { + throw std::runtime_error("Array sizes do not match the specified dimensions."); + } + get_halo_chmf_interval(redshift, z_prev, n_conditions, cond_values.data(), n_masslim, + lnM_lo.data(), lnM_hi.data(), out_n.data()); + }); + + m.def("get_halomass_at_probability", + [](double redshift, double z_prev, nb::ndarray cond_values, + nb::ndarray probabilities, nb::ndarray out_mass) { + size_t n_conditions = cond_values.shape(0) * cond_values.shape(1); + if (probabilities.shape(0) * probabilities.shape(1) != n_conditions || + out_mass.shape(0) * out_mass.shape(1) != n_conditions) { + throw std::runtime_error("Array sizes do not match the number of conditions."); + } + get_halomass_at_probability(redshift, z_prev, n_conditions, cond_values.data(), + probabilities.data(), out_mass.data()); + }); + + m.def("get_global_SFRD_z", + [](nb::ndarray redshifts, nb::ndarray log10_turnovers_mcg, + nb::ndarray out_sfrd, nb::ndarray out_sfrd_mini) { + size_t n_redshift = redshifts.size(); + if (log10_turnovers_mcg.size() != n_redshift || out_sfrd.size() != n_redshift || + out_sfrd_mini.size() != n_redshift) { + throw std::runtime_error("Array sizes do not match the number of redshifts."); + } + get_global_SFRD_z(n_redshift, redshifts.data(), log10_turnovers_mcg.data(), + out_sfrd.data(), out_sfrd_mini.data()); + }); + + m.def("get_global_Nion_z", + [](nb::ndarray redshifts, nb::ndarray log10_turnovers_mcg, + nb::ndarray out_nion, nb::ndarray out_nion_mini) { + size_t n_redshift = redshifts.size(); + if (log10_turnovers_mcg.size() != n_redshift || out_nion.size() != n_redshift || + out_nion_mini.size() != n_redshift) { + throw std::runtime_error("Array sizes do not match the number of redshifts."); + } + get_global_Nion_z(n_redshift, redshifts.data(), log10_turnovers_mcg.data(), + out_nion.data(), out_nion_mini.data()); + }); + + m.def("get_conditional_FgtrM", + [](double redshift, double R, nb::ndarray densities, + nb::ndarray out_fcoll, nb::ndarray 
out_dfcoll) { + size_t n_densities = densities.size(); + if (out_fcoll.size() != n_densities || out_dfcoll.size() != n_densities) { + throw std::runtime_error("Array sizes do not match the number of densities."); + } + get_conditional_FgtrM(redshift, R, n_densities, densities.data(), out_fcoll.data(), + out_dfcoll.data()); + }); + + m.def("get_conditional_SFRD", [](double redshift, double R, nb::ndarray densities, + nb::ndarray log10_mturns, nb::ndarray out_sfrd, + nb::ndarray out_sfrd_mini) { + size_t n_densities = densities.size(); + if (log10_mturns.size() != n_densities || out_sfrd.size() != n_densities || + out_sfrd_mini.size() != n_densities) { + throw std::runtime_error("Array sizes do not match the number of densities."); + } + get_conditional_SFRD(redshift, R, n_densities, densities.data(), log10_mturns.data(), + out_sfrd.data(), out_sfrd_mini.data()); + }); + + m.def("get_conditional_Nion", [](double redshift, double R, nb::ndarray densities, + nb::ndarray log10_mturns_acg, + nb::ndarray log10_mturns_mcg, + nb::ndarray out_nion, + nb::ndarray out_nion_mini) { + size_t n_densities = densities.size(); + if (log10_mturns_acg.size() != n_densities || log10_mturns_mcg.size() != n_densities || + out_nion.size() != n_densities || out_nion_mini.size() != n_densities) { + throw std::runtime_error("Array sizes do not match the number of densities."); + } + get_conditional_Nion(redshift, R, n_densities, densities.data(), log10_mturns_acg.data(), + log10_mturns_mcg.data(), out_nion.data(), out_nion_mini.data()); + }); + + m.def("get_conditional_Xray", + [](double redshift, double R, nb::ndarray densities, + nb::ndarray log10_mturns, nb::ndarray out_xray) { + size_t n_densities = densities.size(); + if (log10_mturns.size() != n_densities || out_xray.size() != n_densities) { + throw std::runtime_error("Array sizes do not match the number of densities."); + } + get_conditional_Xray(redshift, R, n_densities, densities.data(), log10_mturns.data(), + out_xray.data()); + 
}); + + // Error framework testing + m.def("SomethingThatCatches", &SomethingThatCatches); + m.def("FunctionThatCatches", [](bool sub_func, bool pass, nb::ndarray answer) { + return FunctionThatCatches(sub_func, pass, answer.data()); + }); + m.def("FunctionThatThrows", &FunctionThatThrows); + + m.def("single_test_sample", + [](unsigned long long int seed, nb::ndarray conditions, + nb::ndarray cond_crd, double z_out, double z_in, nb::ndarray out_n_tot, + nb::ndarray out_n_cell, nb::ndarray out_n_exp, + nb::ndarray out_m_cell, nb::ndarray out_m_exp, + nb::ndarray out_halo_masses, nb::ndarray out_halo_coords) { + size_t n_condition = conditions.shape(0); + if (cond_crd.shape(0) != n_condition || cond_crd.shape(1) != 3) { + throw std::runtime_error("cond_crd must have shape (n_condition, 3)."); + } + if (out_n_cell.shape(0) != n_condition || out_n_exp.shape(0) != n_condition || + out_m_cell.shape(0) != n_condition || out_m_exp.shape(0) != n_condition) { + throw std::runtime_error("Output arrays must match the number of conditions."); + } + int status = single_test_sample(seed, n_condition, conditions.data(), cond_crd.data(), + z_out, z_in, out_n_tot.data(), out_n_cell.data(), + out_n_exp.data(), out_m_cell.data(), out_m_exp.data(), + out_halo_masses.data(), out_halo_coords.data()); + if (status != 0) { + throw std::runtime_error("single_test_sample failed with status: " + + std::to_string(status)); + } + }); + + m.def("test_halo_props", [](double redshift, nb::ndarray vcb_grid, + nb::ndarray J21_LW_grid, nb::ndarray z_re_grid, + nb::ndarray Gamma12_ion_grid, nb::ndarray halo_masses, + nb::ndarray halo_coords, nb::ndarray star_rng, + nb::ndarray sfr_rng, nb::ndarray xray_rng, + nb::ndarray halo_props_out) { + size_t n_halos = halo_masses.shape(0); + if (halo_coords.shape(0) != n_halos || halo_coords.shape(1) != 3 || + star_rng.shape(0) != n_halos || sfr_rng.shape(0) != n_halos || + xray_rng.shape(0) != n_halos || halo_props_out.shape(0) != n_halos || + 
halo_props_out.shape(1) != 12) { + throw std::runtime_error( + "Input/output arrays must have the same shape as the number of halos. halo_coords " + "shape: " + + std::to_string(halo_coords.shape(0)) + "x" + std::to_string(halo_coords.shape(1)) + + ", " + "halo_masses shape: " + std::to_string(halo_masses.shape(0)) + ", " + + "star_rng shape: " + std::to_string(star_rng.shape(0)) + ", " + + "sfr_rng shape: " + std::to_string(sfr_rng.shape(0)) + ", " + + "halo_props_out shape: " + std::to_string(halo_props_out.shape(0)) + "x" + + std::to_string(halo_props_out.shape(1))); + } + int status = test_halo_props(redshift, vcb_grid.data(), J21_LW_grid.data(), + z_re_grid.data(), Gamma12_ion_grid.data(), n_halos, + halo_masses.data(), halo_coords.data(), star_rng.data(), + sfr_rng.data(), xray_rng.data(), halo_props_out.data()); + if (status != 0) { + throw std::runtime_error("test_halo_props failed with status: " + + std::to_string(status)); + } + }); + + m.def("test_filter", [](nb::ndarray input_box, double R, double R_param, int filter_flag, + nb::ndarray result) { + size_t n_elements = input_box.size(); + if (result.size() != n_elements) { + throw std::runtime_error("result array must have the same size as input_box."); + } + int status = test_filter(input_box.data(), R, R_param, filter_flag, result.data()); + if (status != 0) { + throw std::runtime_error("test_filter failed with status: " + std::to_string(status)); + } + }); + + // Functions required to access cosmology & mass functions directly + m.def("dicke", &dicke); + m.def("sigma_z0", &sigma_z0); + m.def("dsigmasqdm_z0", &dsigmasqdm_z0); + m.def("power_in_k", &power_in_k); + m.def("get_delta_crit", &get_delta_crit); + m.def("atomic_cooling_threshold", &atomic_cooling_threshold); + m.def("unconditional_hmf", &unconditional_hmf); + m.def("conditional_hmf", &conditional_hmf); + m.def("expected_nhalo", &expected_nhalo); + + m.def( + "get_config_settings", []() -> ConfigSettings& { return config_settings; }, + 
nb::rv_policy::reference); + + m.attr("photon_cons_allocated") = nb::cast(&photon_cons_allocated); +} diff --git a/src/py21cmfast/src/bubble_helper_progs.h b/src/py21cmfast/src/bubble_helper_progs.h index 8ea6087a8..91eedd24f 100644 --- a/src/py21cmfast/src/bubble_helper_progs.h +++ b/src/py21cmfast/src/bubble_helper_progs.h @@ -2,8 +2,14 @@ #ifndef _BUBBLEHELP_H #define _BUBBLEHELP_H +#ifdef __cplusplus +extern "C" { +#endif // NOTE: This file is only used for the old bubble finding algorithm which updates the whole sphere void update_in_sphere(float* box, int dimensions, int dimensions_ncf, float R, float xf, float yf, float zf); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/cosmology.c b/src/py21cmfast/src/cosmology.c index 6e489bfc5..b1cde18aa 100644 --- a/src/py21cmfast/src/cosmology.c +++ b/src/py21cmfast/src/cosmology.c @@ -126,7 +126,7 @@ double transfer_function_CLASS(double k, int flag_int, int flag_dv) { int gsl_status; FILE *F; - static bool warning_printed; + static bool warning_printed = false; static double eh_ratio_at_kmax; char filename[500]; @@ -137,7 +137,6 @@ double transfer_function_CLASS(double k, int flag_int, int flag_dv) { LOG_ERROR("Unable to open file: %s for reading.", filename); Throw(IOError); } - warning_printed = false; int nscans; for (i = 0; i < CLASS_LENGTH; i++) { @@ -190,9 +189,10 @@ double transfer_function_CLASS(double k, int flag_int, int flag_dv) { if (k > kclass[CLASS_LENGTH - 1]) { // k>kmax if (!warning_printed) { LOG_WARNING( - "Called transfer_function_CLASS with k=%f, larger than kmax! performing linear " + "Called transfer_function_CLASS with k=%f > %f, larger than kmax! 
performing " + "linear " "extrapolation with Eisenstein & Hu", - k); + k, kclass[CLASS_LENGTH - 1]); warning_printed = true; } if (flag_dv == 0) { // output is density diff --git a/src/py21cmfast/src/cosmology.h b/src/py21cmfast/src/cosmology.h index 7da11f2df..871ef1fd2 100644 --- a/src/py21cmfast/src/cosmology.h +++ b/src/py21cmfast/src/cosmology.h @@ -1,6 +1,9 @@ #ifndef _PS_H #define _PS_H +#ifdef __cplusplus +extern "C" { +#endif void init_ps(); double dicke(double z); double sigma_z0(double M); @@ -33,4 +36,7 @@ double hubble(float z); double t_hubble(float z); double M_J_WDM(); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/cuda_hello_world.cu b/src/py21cmfast/src/cuda_hello_world.cu new file mode 100644 index 000000000..f63633844 --- /dev/null +++ b/src/py21cmfast/src/cuda_hello_world.cu @@ -0,0 +1,31 @@ +#include +#include + +#include "cuda_utils.cuh" +#include "cuda_hello_world.cuh" + +__global__ void hello_kernel() { + printf("Hello World from GPU! BlockIdx: %d, ThreadIdx: %d\n", blockIdx.x, threadIdx.x); +} + +int call_cuda() { + hello_kernel<<<3, 3>>>(); + cudaDeviceSynchronize(); + return 0; +} + +// more members of deviceprop can be found in cura_runtime_api documentation +void print_key_device_properties(){ + int device; + CALL_CUDA(cudaGetDevice(&device)); + cudaDeviceProp deviceProp; + CALL_CUDA(cudaGetDeviceProperties(&deviceProp, device)); + printf("Device name: %s\n", deviceProp.name); + printf("Total global memory: %zu bytes \n", deviceProp.totalGlobalMem); + printf("Shared memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock); + printf("Registers per block: %d\n", deviceProp.regsPerBlock); + printf("Warp size: %d \n", deviceProp.warpSize); + printf("Memory pitch: %zu bytes \n", deviceProp.memPitch); + printf("Max threads per block: %d \n", deviceProp.maxThreadsPerBlock); + printf("Total constant memory: %zu bytes \n", deviceProp.totalConstMem); +} diff --git a/src/py21cmfast/src/cuda_hello_world.cuh 
b/src/py21cmfast/src/cuda_hello_world.cuh new file mode 100644 index 000000000..5a34921db --- /dev/null +++ b/src/py21cmfast/src/cuda_hello_world.cuh @@ -0,0 +1,14 @@ +#ifndef _CUDA_HELLO_WORLD_CUH +#define _CUDA_HELLO_WORLD_CUH + +#ifdef __cplusplus +extern "C" +{ +#endif + int call_cuda(); + void print_key_device_properties(); +#ifdef __cplusplus +} +#endif + +#endif // _CUDA_HELLO_WORLD_CUH diff --git a/src/py21cmfast/src/cuda_utils.cuh b/src/py21cmfast/src/cuda_utils.cuh new file mode 100644 index 000000000..f2e992651 --- /dev/null +++ b/src/py21cmfast/src/cuda_utils.cuh @@ -0,0 +1,18 @@ +#ifndef _CUDA_UTILS_CUH +#define _CUDA_UTILS_CUH + +#include +#include + +#define CALL_CUDA(x) \ + do \ + { \ + cudaError_t err = (x); \ + if (err != cudaSuccess) \ + { \ + printf("Error %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#endif diff --git a/src/py21cmfast/src/debugging.c b/src/py21cmfast/src/debugging.c index 71f4f0ea3..afeb5d794 100644 --- a/src/py21cmfast/src/debugging.c +++ b/src/py21cmfast/src/debugging.c @@ -147,8 +147,7 @@ void writeAstroParams(AstroParams *p) { " HII_EFF_FACTOR=%10.3e\n" " ION_Tvir_MIN=%10.3e\n" " X_RAY_Tvir_MIN=%10.3e\n", - p->HII_EFF_FACTOR, p->ION_Tvir_MIN, p->X_RAY_Tvir_MIN, p->R_BUBBLE_MAX, p->L_X, - p->NU_X_THRESH, p->X_RAY_SPEC_INDEX, p->F_STAR10, p->t_STAR); + p->HII_EFF_FACTOR, p->ION_Tvir_MIN, p->X_RAY_Tvir_MIN); } void writeAstroOptions(AstroOptions *p) { diff --git a/src/py21cmfast/src/debugging.h b/src/py21cmfast/src/debugging.h index c0c876257..bce8deb97 100644 --- a/src/py21cmfast/src/debugging.h +++ b/src/py21cmfast/src/debugging.h @@ -7,6 +7,9 @@ #include "InputParameters.h" #include "OutputStructs.h" +#ifdef __cplusplus +extern "C" { +#endif // Input debugging void writeAstroOptions(AstroOptions *p); void writeSimulationOptions(SimulationOptions *p); @@ -26,4 +29,7 @@ int SomethingThatCatches(bool sub_func); int FunctionThatCatches(bool sub_func, bool pass, 
double *result); void FunctionThatThrows(); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/device_rng.cu b/src/py21cmfast/src/device_rng.cu new file mode 100644 index 000000000..eda652e39 --- /dev/null +++ b/src/py21cmfast/src/device_rng.cu @@ -0,0 +1,83 @@ +#include +#include +#include +#include + +#include "cuda_utils.cuh" +#include "device_rng.cuh" + +__device__ curandState *d_randStates = nullptr; +__device__ int d_numStates = 0; + +// initiate random states +// use the same random seed, different sub-sequence, and with offset of 0 +__global__ void initRandStates(unsigned long long int random_seed, int totalStates) +{ + // get thread idx + int ind = blockIdx.x * blockDim.x + threadIdx.x; + + if (ind < totalStates){ + curand_init(random_seed, ind, 0, &d_randStates[ind]); + + // todo: add the following block to debug + if (ind < 2) + { + printf("temp check rng init.\n"); + printf("Thread %d: d = %u, v0 = %u, boxmuller_flag = %d, boxmuller_extra = %f\n", + ind, d_randStates[ind].d, d_randStates[ind].v[0], + d_randStates[ind].boxmuller_flag, d_randStates[ind].boxmuller_extra); + } + } +} + +// Function to initialize RNG states. 
+void init_rand_states(unsigned long long int seed, int numStates) +{ + // ensure previously allocated random states on the device are freed before allocating new ones + free_rand_states(); + + CALL_CUDA(cudaMemcpyToSymbol(d_numStates, &numStates, sizeof(int), 0, cudaMemcpyHostToDevice)); + + // todo: add the following block to debug + curandState *checkPtr0 = nullptr; + CALL_CUDA(cudaMemcpyFromSymbol(&checkPtr0, d_randStates, sizeof(checkPtr0), 0, cudaMemcpyDeviceToHost)); + printf("init device pointer = %p\n", checkPtr0); + + curandState *tmpPtr = nullptr; + CALL_CUDA(cudaMalloc((void **)&tmpPtr, numStates * sizeof(curandState))); + CALL_CUDA(cudaMemcpyToSymbol(d_randStates, &tmpPtr, sizeof(tmpPtr), 0, cudaMemcpyHostToDevice)); + tmpPtr = nullptr; + + // todo: add the following block to debug (verify device pointer has been updated successfully) + curandState *checkPtr = nullptr; + CALL_CUDA(cudaMemcpyFromSymbol(&checkPtr, d_randStates, sizeof(checkPtr), 0, cudaMemcpyDeviceToHost)); + printf("updated device pointer = %p\n", checkPtr); + + // define kernel grids + int threadsPerBlock = 256; + int blocks = (numStates + threadsPerBlock - 1) / threadsPerBlock; + + // launch kernel function + initRandStates<<>>(seed, numStates); + CALL_CUDA(cudaGetLastError()); + cudaDeviceSynchronize(); +} + +void free_rand_states() +{ + // copy device pointer/variable to the host + curandState *h_randStates = nullptr; + int h_numStates = 0; + CALL_CUDA(cudaMemcpyFromSymbol(&h_randStates, d_randStates, sizeof(d_randStates), 0, cudaMemcpyDeviceToHost)); + CALL_CUDA(cudaMemcpyFromSymbol(&h_numStates, d_numStates, sizeof(int), 0, cudaMemcpyDeviceToHost)); + if (h_randStates){ + CALL_CUDA(cudaFree(h_randStates)); + h_randStates = nullptr; + CALL_CUDA(cudaMemcpyToSymbol(d_randStates, &h_randStates, sizeof(h_randStates), 0, cudaMemcpyHostToDevice)); + } + + if (h_numStates){ + h_numStates = 0; + CALL_CUDA(cudaMemcpyToSymbol(d_numStates, &h_numStates, sizeof(int), 0, 
cudaMemcpyHostToDevice)); + } +} diff --git a/src/py21cmfast/src/device_rng.cuh b/src/py21cmfast/src/device_rng.cuh new file mode 100644 index 000000000..4d2b449d8 --- /dev/null +++ b/src/py21cmfast/src/device_rng.cuh @@ -0,0 +1,24 @@ +#ifndef _DEVICE_RNG_CUH +#define _DEVICE_RNG_CUH + +#ifdef __CUDACC__ +#include +// Declare the device variables as extern so that they can be shared across CUDA files. +extern __device__ curandState *d_randStates; +extern __device__ int d_numStates; +#endif + + +#ifdef __cplusplus +extern "C" +{ +#endif + // Function prototypes. + void init_rand_states(unsigned long long int seed, int numStates); + void free_rand_states(); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/py21cmfast/src/dft.h b/src/py21cmfast/src/dft.h index f8e977b68..e8f0b2afd 100644 --- a/src/py21cmfast/src/dft.h +++ b/src/py21cmfast/src/dft.h @@ -8,8 +8,14 @@ #include "InputParameters.h" +#ifdef __cplusplus +extern "C" { +#endif int dft_c2r_cube(bool use_wisdom, int dim, int dim_los, int n_threads, fftwf_complex *box); int dft_r2c_cube(bool use_wisdom, int dim, int dim_los, int n_threads, fftwf_complex *box); int CreateFFTWWisdoms(); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/elec_interp.h b/src/py21cmfast/src/elec_interp.h index 815998b7c..dfed888ad 100644 --- a/src/py21cmfast/src/elec_interp.h +++ b/src/py21cmfast/src/elec_interp.h @@ -5,6 +5,10 @@ #define x_int_NXHII 14 #define x_int_NENERGY 258 +#ifdef __cplusplus +extern "C" { +#endif + void initialize_interp_arrays(); // Primary functions to compute heating fractions and number of Lya photons or ionization produced, @@ -26,4 +30,7 @@ int locate_xHII_index(float xHII_call); // TODO: remove it and make it static in elec_interp.c extern float x_int_XHII[x_int_NXHII]; +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/exceptions.h b/src/py21cmfast/src/exceptions.h index d951d6cae..fcd62dd54 100644 --- a/src/py21cmfast/src/exceptions.h +++ 
b/src/py21cmfast/src/exceptions.h @@ -3,6 +3,9 @@ #include "cexcept.h" +#ifdef __cplusplus +extern "C" { +#endif define_exception_type(int); // NOTE: declaration here, definition in debugging.c @@ -19,11 +22,21 @@ extern struct exception_context the_exception_context[1]; #define InfinityorNaNError 7 #define MassDepZetaError 8 #define MemoryAllocError 9 +#define CUDAError 10 +#define ParallelError 11 #define CATCH_GSL_ERROR(status) \ if (status > 0) { \ LOG_ERROR("GSL Error Encountered (Code = %d): %s", status, gsl_strerror(status)); \ Throw(GSLError); \ } +#define CATCH_CUDA_ERROR(err) \ + if (err != cudaSuccess) { \ + LOG_ERROR("CUDA Error Encountered: %s", cudaGetErrorString(err)); \ + Throw(CUDAError); \ + } +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/filtering.c b/src/py21cmfast/src/filtering.c index 408e94ad4..ae17c9c92 100644 --- a/src/py21cmfast/src/filtering.c +++ b/src/py21cmfast/src/filtering.c @@ -1,3 +1,4 @@ +#include "filtering.h" #include #include @@ -115,7 +116,7 @@ double spherical_shell_filter(double k, double R_outer, double R_inner) { (sin(kR_outer) - cos(kR_outer) * kR_outer - sin(kR_inner) + cos(kR_inner) * kR_inner); } -void filter_box(fftwf_complex *box, int RES, int filter_type, float R, float R_param) { +void filter_box_cpu(fftwf_complex *box, int RES, int filter_type, float R, float R_param) { int dimension, midpoint; // TODO: figure out why defining as ULL breaks this switch (RES) { case 0: @@ -167,28 +168,23 @@ void filter_box(fftwf_complex *box, int RES, int filter_type, float R, float R_p grid_index = RES == 1 ? 
HII_C_INDEX(n_x, n_y, n_z) : C_INDEX(n_x, n_y, n_z); // TODO: it would be nice to combine these into the filter_function call, *but* - // since - // each can take different arguments more thought is needed + // since each can take different arguments more thought is needed if (filter_type == 0) { // real space top-hat kR = sqrt(k_mag_sq) * R; box[grid_index] *= real_tophat_filter(kR); } else if (filter_type == 1) { // k-space top hat - // NOTE: why was this commented???? - // This is actually (kR^2) but since we zero the value and find kR > 1 this - // is more computationally efficient kR = 0.17103765852*( k_x*k_x + k_y*k_y - // + k_z*k_z )*R*R; + // NOTE: Since it's a tophat we could just supply kr^2 for speed kR = sqrt(k_mag_sq) * R; box[grid_index] *= sharp_k_filter(kR); } else if (filter_type == 2) { // gaussian - // This is actually (kR^2) but since we zero the value and find kR > 1 this - // is more computationally efficient + // NOTE: This is actually (kR^2) kR = k_mag_sq * R * R; box[grid_index] *= gaussian_filter(kR); } // The next two filters are not given by the HII_FILTER global, but used for // specific grids - else if (filter_type == - 3) { // exponentially decaying tophat, param == scale of decay (MFP) + // exponentially decaying tophat, param == scale of decay (MFP) + else if (filter_type == 3) { // NOTE: This should be optimized, I havne't looked at it in a while box[grid_index] *= exp_mfp_filter(sqrt(k_mag_sq), R, R_param, R_const); } else if (filter_type == 4) { // spherical shell, R_param == inner radius @@ -206,8 +202,21 @@ void filter_box(fftwf_complex *box, int RES, int filter_type, float R, float R_p return; } +void filter_box(fftwf_complex *box, int RES, int filter_type, float R, float R_param) { + bool use_cuda = false; // pass this as a parameter later + if (use_cuda) { +#if CUDA_FOUND + filter_box_gpu(box, RES, filter_type, R, R_param); +#else + LOG_ERROR("CUDA version of filter_box() called but code was not compiled for CUDA."); +#endif 
+ } else { + filter_box_cpu(box, RES, filter_type, R, R_param); + } +} + // Test function to filter a box without computing a whole output box -int test_filter(float *input_box, double R, double R_param, int filter_flag, double *result) { +int test_filter_cpu(float *input_box, double R, double R_param, int filter_flag, double *result) { int i, j, k; unsigned long long int ii; @@ -232,7 +241,7 @@ int test_filter(float *input_box, double R, double R_param, int filter_flag, dou memcpy(box_filtered, box_unfiltered, sizeof(fftwf_complex) * HII_KSPACE_NUM_PIXELS); - filter_box(box_filtered, 1, filter_flag, R, R_param); + filter_box_cpu(box_filtered, 1, filter_flag, R, R_param); dft_c2r_cube(matter_options_global->USE_FFTW_WISDOM, simulation_options_global->HII_DIM, HII_D_PARA, simulation_options_global->N_THREADS, box_filtered); @@ -247,3 +256,17 @@ int test_filter(float *input_box, double R, double R_param, int filter_flag, dou return 0; } + +int test_filter(float *input_box, double R, double R_param, int filter_flag, double *result) { + bool use_cuda = false; // pass this as a parameter later + if (use_cuda) { +#if CUDA_FOUND + return test_filter_gpu(input_box, R, R_param, filter_flag, result); +#else + LOG_ERROR("CUDA version of test_filter() called but code was not compiled for CUDA."); + return 1; +#endif + } else { + return test_filter_cpu(input_box, R, R_param, filter_flag, result); + } +} diff --git a/src/py21cmfast/src/filtering.cu b/src/py21cmfast/src/filtering.cu new file mode 100644 index 000000000..fb4a3b859 --- /dev/null +++ b/src/py21cmfast/src/filtering.cu @@ -0,0 +1,255 @@ +#include +#include +#include +#include +#include +#include + +// GPU +#include +#include +#include +// #include +// #include + +#include "cexcept.h" +#include "exceptions.h" +#include "logger.h" + +#include "Constants.h" +#include "InputParameters.h" +#include "indexing.h" +#include "dft.h" +#include "filtering.h" + +__device__ inline double real_tophat_filter(double kR) { + // Second 
order taylor expansion around kR==0 + if (kR < 1e-4) + return 1 - kR*kR/10; + return 3.0*pow(kR, -3) * (sin(kR) - cos(kR)*kR); +} + +__device__ inline double sharp_k_filter(double kR) { + if (kR * 0.413566994 > 1) + return 0.; + return 1; +} + +__device__ inline double gaussian_filter(double kR_squared) { + return exp(-0.643 * 0.643 * kR_squared / 2.); +} + +__device__ inline double exp_mfp_filter(double k, double R, double mfp, double exp_term) { + double f; + double kR = k * R; + double ratio = mfp / R; + + // Second order taylor expansion around kR==0 + if (kR < 1e-4) { + double ts_0 = 6 * pow(ratio, 3) - exp_term * (6 * pow(ratio, 3) + 6 * pow(ratio, 2) + 3 * ratio); + return ts_0 + (exp_term * (2 * pow(ratio, 2) + 0.5 * ratio) - 2 * ts_0 * pow(ratio, 2)) * kR * kR; + } + // Davies & Furlanetto MFP-eps(r) window function + f = (kR * kR * pow(ratio, 2) + 2 * ratio + 1) * ratio * cos(kR); + f += (kR * kR * (pow(ratio, 2) - pow(ratio, 3)) + ratio + 1) * sin(kR) / kR; + f *= exp_term; + f -= 2 * pow(ratio, 2); + f *= -3 * ratio/pow(pow(kR * ratio, 2) + 1, 2); + return f; +} + +__device__ inline double spherical_shell_filter(double k, double R_outer, double R_inner) { + double kR_inner = k * R_inner; + double kR_outer = k * R_outer; + + // Second order taylor expansion around kR_outer==0 + if (kR_outer < 1e-4) + return 1. 
- kR_outer*kR_outer / 10 * \ + (pow(R_inner / R_outer, 5) - 1) / \ + (pow(R_inner / R_outer, 3) - 1); + + return 3.0 / (pow(kR_outer, 3) - pow(kR_inner, 3)) \ + * (sin(kR_outer) - cos(kR_outer) * kR_outer \ + - sin(kR_inner) + cos(kR_inner) * kR_inner); +} + +__global__ void filter_box_kernel(cuFloatComplex *box, int num_pixels, int dimension, int midpoint, int midpoint_para, double delta_k, float R, float R_param, double R_const, int filter_type) { + + // Get index of box (flattened k-box) + unsigned long long idx = blockIdx.x * blockDim.x + threadIdx.x; + + // Bound check (in case number of threads != multiple of block size) + if (idx >= num_pixels) { + return; + } + // Compute the 3D indices (n_x, n_y, n_z) for the k-box from the flattened index (idx) + // Based on convenience macros in indexing.h + int n_z = idx % (midpoint_para + 1); + unsigned long long remaining = idx / (midpoint_para + 1); + int n_y = remaining % dimension; + int n_x = remaining / dimension; + + // Compute wave vector components + float k_x = (n_x - dimension * (n_x > midpoint)) * delta_k; // Wrap around midpoint + float k_y = (n_y - dimension * (n_y > midpoint)) * delta_k; + float k_z = n_z * delta_k; + + // TODO: Try alternative vectorised coords & wave vector components? + // int *cell_coords = (int[]) {idx % (midpoint_para + 1), (idx / (midpoint_para + 1)) % dimension, (idx / (midpoint_para + 1)) / dimension)}; // (as above and * delta_k to vector at end) + // int *wave_vector = (float[]) { ... 
} + + // Compute squared magnitude of wave vector + float k_mag_sq = k_x*k_x + k_y*k_y + k_z*k_z; + + float kR; + if (filter_type == 0) { // real space top-hat + kR = sqrt(k_mag_sq) * R; + // box[idx] *= real_tophat_filter(kR); + box[idx] = cuCmulf(box[idx], make_cuFloatComplex((float)real_tophat_filter(kR), 0.f)); + } + else if (filter_type == 1) { // k-space top hat + kR = sqrt(k_mag_sq) * R; + // box[idx] *= sharp_k_filter(kR); + box[idx] = cuCmulf(box[idx], make_cuFloatComplex((float)sharp_k_filter(kR), 0.f)); + } + else if (filter_type == 2) { // gaussian + kR = k_mag_sq * R * R; + // box[idx] *= gaussian_filter(kR); + box[idx] = cuCmulf(box[idx], make_cuFloatComplex((float)gaussian_filter(kR), 0.f)); + } + else if (filter_type == 3) { // exponentially decaying tophat + // box[idx] *= exp_mfp_filter(sqrt(k_mag_sq), R, R_param, R_const); + box[idx] = cuCmulf(box[idx], make_cuFloatComplex((float)exp_mfp_filter(sqrt(k_mag_sq), R, R_param, R_const), 0.f)); + } + else if (filter_type == 4) { //spherical shell + // box[idx] *= spherical_shell_filter(sqrt(k_mag_sq), R, R_param); + box[idx] = cuCmulf(box[idx], make_cuFloatComplex((float)spherical_shell_filter(sqrt(k_mag_sq), R, R_param), 0.f)); + } +} + +void filter_box_gpu(fftwf_complex *box, int RES, int filter_type, float R, float R_param) { + + // Check for valid filter type + if (filter_type < 0 || filter_type > 4) { + LOG_WARNING("Filter type %i is undefined. 
Box is unfiltered.", filter_type); + return; + } + + // Get required values + int dimension, midpoint, midpoint_para, num_pixels; + switch(RES) { + case 0: + dimension = user_params_global->DIM; + midpoint = MIDDLE; // midpoint of x,y = DIM / 2 + midpoint_para = MID_PARA; // midpoint of z = NON_CUBIC_FACTOR * HII_DIM / 2 + num_pixels = KSPACE_NUM_PIXELS; + break; + case 1: + dimension = user_params_global->HII_DIM; + midpoint = HII_MIDDLE; // midpoint of x,y = HII_DIM / 2 + midpoint_para = HII_MID_PARA; // midpoint of z = NON_CUBIC_FACTOR * HII_DIM / 2 + num_pixels = HII_KSPACE_NUM_PIXELS; + break; + default: + LOG_ERROR("Resolution for filter functions must be 0(DIM) or 1(HII_DIM)"); + Throw(ValueError); + break; + } + double delta_k = DELTA_K; + double R_const; + if (filter_type == 3) { + R_const = exp(-R / R_param); + } + + // Get size of flattened array + size_t size = num_pixels * sizeof(fftwf_complex); + + cudaError_t err; + + // Allocate device memory + fftwf_complex* d_box; + err = cudaMalloc(&d_box, size); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Copy array from host to device + err = cudaMemcpy(d_box, box, size, cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Invoke kernel + int threadsPerBlock = 256; + int numBlocks = (num_pixels + threadsPerBlock - 1) / threadsPerBlock; + // d_box must be cast to cuFloatComplex (from fftwf_complex) for CUDA + filter_box_kernel<<>>(reinterpret_cast(d_box), num_pixels, dimension, midpoint, midpoint_para, delta_k, R, R_param, R_const, filter_type); + + // // Only use during development! 
+ err = cudaDeviceSynchronize(); + CATCH_CUDA_ERROR(err); + + err = cudaGetLastError(); + if (err != cudaSuccess) { + LOG_ERROR("Kernel launch error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Copy results from device to host + err = cudaMemcpy(box, d_box, size, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } + + // Deallocate device memory + err = cudaFree(d_box); + if (err != cudaSuccess) { + LOG_ERROR("CUDA error: %s", cudaGetErrorString(err)); + Throw(CUDAError); + } +} + +// Test function to filter a box without computing a whole output box +//TODO: set device constants here +int test_filter_gpu(float *input_box, double R, double R_param, int filter_flag, double *result) { + int i,j,k; + unsigned long long int ii; + + //setup the box + fftwf_complex *box_unfiltered = (fftwf_complex *) fftwf_malloc(sizeof(fftwf_complex)*HII_KSPACE_NUM_PIXELS); + fftwf_complex *box_filtered = (fftwf_complex *) fftwf_malloc(sizeof(fftwf_complex)*HII_KSPACE_NUM_PIXELS); + + for (i=0; iHII_DIM; i++) + for (j=0; jHII_DIM; j++) + for (k=0; kUSE_FFTW_WISDOM, user_params->HII_DIM, HII_D_PARA, user_params->N_THREADS, box_unfiltered); + + // Convert to CUDA complex type + cuFloatComplex* box_unfiltered_cu = reinterpret_cast(box_unfiltered); + + for(ii=0;iiUSE_FFTW_WISDOM, user_params->HII_DIM, HII_D_PARA, user_params->N_THREADS, box_filtered); + + for (i=0; iHII_DIM; i++) + for (j=0; jHII_DIM; j++) + for (k=0; k #include -#include "InputParameters.h" +#ifdef __cplusplus +extern "C" { +#endif void filter_box(fftwf_complex *box, int RES, int filter_type, float R, float R_param); +void filter_box_cpu(fftwf_complex *box, int RES, int filter_type, float R, float R_param); +void filter_box_gpu(fftwf_complex *box, int RES, int filter_type, float R, float R_param); int test_filter(float *input_box, double R, double R_param, int filter_flag, double *result); +int test_filter_cpu(float 
*input_box, double R, double R_param, int filter_flag, double *result); +int test_filter_gpu(float *input_box, double R, double R_param, int filter_flag, double *result); double filter_function(double k, int filter_type); double dwdm_filter(double k, double R, int filter_type); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/heating_helper_progs.c b/src/py21cmfast/src/heating_helper_progs.c index 1552c1747..17bff67ea 100644 --- a/src/py21cmfast/src/heating_helper_progs.c +++ b/src/py21cmfast/src/heating_helper_progs.c @@ -857,7 +857,7 @@ typedef struct { double ion_eff; double ion_eff_MINI; double log10_Mturn_MINI; - struct ScalingConstants *scale_consts; + ScalingConstants *scale_consts; } tauX_params; double tauX_integrand_MINI(double zhat, void *params) { @@ -919,8 +919,7 @@ double tauX_integrand(double zhat, void *params) { return drpropdz * n * HI_filling_factor_zhat * sigma_tilde; } double tauX_MINI(double nu, double x_e, double x_e_ave, double zp, double zpp, - double HI_filling_factor_zp, double log10_Mturn_MINI, - struct ScalingConstants *sc) { + double HI_filling_factor_zp, double log10_Mturn_MINI, ScalingConstants *sc) { double result, error; gsl_function F; @@ -963,7 +962,7 @@ double tauX_MINI(double nu, double x_e, double x_e_ave, double zp, double zpp, } double tauX(double nu, double x_e, double x_e_ave, double zp, double zpp, - double HI_filling_factor_zp, struct ScalingConstants *sc) { + double HI_filling_factor_zp, ScalingConstants *sc) { double result, error, fcoll; gsl_function F; double rel_tol = 0.005; //<- relative tolerance @@ -1026,7 +1025,7 @@ typedef struct { double zpp; double HI_filling_factor_zp; double log10_Mturn_MINI; - struct ScalingConstants *scale_consts; + ScalingConstants *scale_consts; } nu_tau_one_params; double nu_tau_one_helper_MINI(double nu, void *params) { nu_tau_one_params *p = (nu_tau_one_params *)params; @@ -1039,7 +1038,7 @@ double nu_tau_one_helper(double nu, void *params) { return tauX(nu, 
p->x_e, p->x_e, p->zp, p->zpp, p->HI_filling_factor_zp, p->scale_consts) - 1; } double nu_tau_one_MINI(double zp, double zpp, double x_e, double HI_filling_factor_zp, - double log10_Mturn_MINI, struct ScalingConstants *sc) { + double log10_Mturn_MINI, ScalingConstants *sc) { int status, iter, max_iter; const gsl_root_fsolver_type *T; gsl_root_fsolver *s; @@ -1107,7 +1106,7 @@ double nu_tau_one_MINI(double zp, double zpp, double x_e, double HI_filling_fact } double nu_tau_one(double zp, double zpp, double x_e, double HI_filling_factor_zp, - struct ScalingConstants *sc) { + ScalingConstants *sc) { int status, iter, max_iter; const gsl_root_fsolver_type *T; gsl_root_fsolver *s; diff --git a/src/py21cmfast/src/heating_helper_progs.h b/src/py21cmfast/src/heating_helper_progs.h index cbd0f1c08..97b806a8e 100644 --- a/src/py21cmfast/src/heating_helper_progs.h +++ b/src/py21cmfast/src/heating_helper_progs.h @@ -1,6 +1,9 @@ #ifndef _HEATHELPER_H #define _HEATHELPER_H +#ifdef __cplusplus +extern "C" { +#endif #include "scaling_relations.h" // * initialization routine * // @@ -45,11 +48,14 @@ double Energy_Lya_heating(double Tk, double Ts, double tau_gp, int flag); // rootfind to get the distance at which GP optical depth tau==1 double nu_tau_one_MINI(double zp, double zpp, double x_e, double HI_filling_factor_zp, - double log10_Mturn_MINI, struct ScalingConstants *sc); + double log10_Mturn_MINI, ScalingConstants *sc); double nu_tau_one(double zp, double zpp, double x_e, double HI_filling_factor_zp, - struct ScalingConstants *sc); + ScalingConstants *sc); // xray heating integrals over frequency double integrate_over_nu(double zp, double local_x_e, double lower_int_limit, int FLAG); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/hmf.c b/src/py21cmfast/src/hmf.c index ccac26017..4ec577e49 100644 --- a/src/py21cmfast/src/hmf.c +++ b/src/py21cmfast/src/hmf.c @@ -842,7 +842,7 @@ double Fcoll_General(double z, double lnM_min, double lnM_max) { } double 
Nion_General(double z, double lnM_Min, double lnM_Max, double MassTurnover, - struct ScalingConstants *sc) { + ScalingConstants *sc) { struct parameters_gsl_MF_integrals params = { .redshift = z, .growthf = dicke(z), @@ -860,7 +860,7 @@ double Nion_General(double z, double lnM_Min, double lnM_Max, double MassTurnove } double Nion_General_MINI(double z, double lnM_Min, double lnM_Max, double MassTurnover, - struct ScalingConstants *sc) { + ScalingConstants *sc) { struct parameters_gsl_MF_integrals params = { .redshift = z, .growthf = dicke(z), @@ -879,7 +879,7 @@ double Nion_General_MINI(double z, double lnM_Min, double lnM_Max, double MassTu } double Xray_General(double z, double lnM_Min, double lnM_Max, double mturn_acg, double mturn_mcg, - struct ScalingConstants *sc) { + ScalingConstants *sc) { // NOTE:in the _General functions, we don't use the scaling relation constants // that are z-dependent so we can evaluate them at multiple redshifts without redoing the // constants @@ -954,7 +954,7 @@ double Mcoll_Conditional(double growthf, double lnM1, double lnM2, double lnM_co double Nion_ConditionalM_MINI(double growthf, double lnM1, double lnM2, double lnM_cond, double sigma2, double delta2, double MassTurnover, - struct ScalingConstants *sc, int method) { + ScalingConstants *sc, int method) { struct parameters_gsl_MF_integrals params = { .growthf = growthf, .Mturn_mcg = MassTurnover, @@ -992,8 +992,7 @@ double Nion_ConditionalM_MINI(double growthf, double lnM1, double lnM2, double l } double Nion_ConditionalM(double growthf, double lnM1, double lnM2, double lnM_cond, double sigma2, - double delta2, double MassTurnover, struct ScalingConstants *sc, - int method) { + double delta2, double MassTurnover, ScalingConstants *sc, int method) { struct parameters_gsl_MF_integrals params = { .growthf = growthf, .Mturn_acg = MassTurnover, @@ -1029,7 +1028,7 @@ double Nion_ConditionalM(double growthf, double lnM1, double lnM2, double lnM_co double Xray_ConditionalM(double 
redshift, double growthf, double lnM1, double lnM2, double lnM_cond, double sigma2, double delta2, double mturn_acg, double mturn_mcg, - struct ScalingConstants *sc, int method) { + ScalingConstants *sc, int method) { // re-using escape fraction for minihalo parameters struct parameters_gsl_MF_integrals params = { .redshift = redshift, diff --git a/src/py21cmfast/src/hmf.cu b/src/py21cmfast/src/hmf.cu new file mode 100644 index 000000000..379a90a62 --- /dev/null +++ b/src/py21cmfast/src/hmf.cu @@ -0,0 +1,21 @@ +#include +#include + +#include "Constants.h" +#include "hmf.cuh" + +__device__ double sheth_delc_fixed(double del, double sig) +{ + return sqrt(JENKINS_a) * del * (1. + JENKINS_b * pow(sig * sig / (JENKINS_a * del * del), JENKINS_c)); +} + +// Get the relevant excursion set barrier density given the user-specified HMF +__device__ double get_delta_crit(int HMF, double sigma, double growthf) +{ + if (HMF == 4) + return DELTAC_DELOS; + if (HMF == 1) + return sheth_delc_fixed(Deltac / growthf, sigma) * growthf; + + return Deltac; +} diff --git a/src/py21cmfast/src/hmf.cuh b/src/py21cmfast/src/hmf.cuh new file mode 100644 index 000000000..84316a5ef --- /dev/null +++ b/src/py21cmfast/src/hmf.cuh @@ -0,0 +1,24 @@ +#include + +#ifndef _HMF_CUH +#define _HMF_CUH + +// define macros +#ifndef JENKINS_a +#define JENKINS_a (0.73) // Jenkins+01, SMT has 0.707 +#endif + +#ifndef JENKINS_b +#define JENKINS_b (0.34) // Jenkins+01 fit from Barkana+01, SMT has 0.5 +#endif + +#ifndef JENKINS_c +#define JENKINS_c (0.81) // Jenkins+01 from from Barkana+01, SMT has 0.6 +#endif + +// #ifdef __CUDA_ARCH__ +__device__ double sheth_delc_fixed(double del, double sig); +__device__ double get_delta_crit(int HMF, double sigma, double growthf); +// #endif + +#endif diff --git a/src/py21cmfast/src/hmf.h b/src/py21cmfast/src/hmf.h index 57796e592..3db969bb4 100644 --- a/src/py21cmfast/src/hmf.h +++ b/src/py21cmfast/src/hmf.h @@ -5,6 +5,10 @@ #include "scaling_relations.h" // integrals 
+#ifdef __cplusplus +extern "C" { +#endif + #define MAX_DELTAC_FRAC (float)0.99 // max delta/deltac for the mass function integrals #define DELTA_MIN -1 // minimum delta for Lagrangian mass function integrals #define M_MIN_INTEGRAL 1e5 @@ -14,11 +18,11 @@ void initialise_GL(double lnM_Min, double lnM_Max); double Nion_General(double z, double lnM_Min, double lnM_Max, double MassTurnover, - struct ScalingConstants *sc); + ScalingConstants *sc); double Nion_General_MINI(double z, double lnM_Min, double lnM_Max, double MassTurnover, - struct ScalingConstants *sc); + ScalingConstants *sc); double Xray_General(double z, double lnM_Min, double lnM_Max, double mturn_acg, double mturn_mcg, - struct ScalingConstants *sc); + ScalingConstants *sc); double Fcoll_General(double z, double lnM_min, double lnM_max); double Nhalo_General(double z, double lnM_min, double lnM_max); @@ -28,13 +32,12 @@ double Mcoll_Conditional(double growthf, double lnM1, double lnM2, double lnM_co double delta, int method); double Nion_ConditionalM_MINI(double growthf, double lnM1, double lnM2, double lnM_cond, double sigma2, double delta2, double MassTurnover, - struct ScalingConstants *sc, int method); + ScalingConstants *sc, int method); double Nion_ConditionalM(double growthf, double lnM1, double lnM2, double lnM_cond, double sigma2, - double delta2, double MassTurnover, struct ScalingConstants *sc, - int method); + double delta2, double MassTurnover, ScalingConstants *sc, int method); double Xray_ConditionalM(double redshift, double growthf, double lnM1, double lnM2, double lnM_cond, double sigma2, double delta2, double mturn_acg, double mturn_mcg, - struct ScalingConstants *sc, int method); + ScalingConstants *sc, int method); double unconditional_hmf(double growthf, double lnM, double z, int HMF); double conditional_hmf(double growthf, double lnM, double delta_cond, double sigma_cond, int HMF); @@ -54,4 +57,7 @@ double sheth_delc_dexm(double del, double sig); float Mass_limit_bisection(float 
Mmin, float Mmax, float PL, float FRAC); double euler_to_lagrangian_delta(double delta); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/integral_wrappers.c b/src/py21cmfast/src/integral_wrappers.c index 4481dc98a..a25f5c759 100644 --- a/src/py21cmfast/src/integral_wrappers.c +++ b/src/py21cmfast/src/integral_wrappers.c @@ -116,7 +116,7 @@ void get_global_SFRD_z(int n_redshift, double *redshifts, double *log10_turnover if (matter_options_global->USE_INTERPOLATION_TABLES > 0) initialiseSigmaMInterpTable(M_min, 1e20); - struct ScalingConstants sc; + ScalingConstants sc; set_scaling_constants(redshifts[0], &sc, false); int i; @@ -146,7 +146,7 @@ void get_global_Nion_z(int n_redshift, double *redshifts, double *log10_turnover if (matter_options_global->USE_INTERPOLATION_TABLES > 0) initialiseSigmaMInterpTable(M_min, 1e20); - struct ScalingConstants sc; + ScalingConstants sc; set_scaling_constants(redshifts[0], &sc, false); int i; @@ -217,7 +217,7 @@ void get_conditional_SFRD(double redshift, double R, int n_densities, double *de astro_options_global->INTEGRATION_METHOD_MINI == 1)) initialise_GL(log(M_min), log(M_cond)); - struct ScalingConstants sc; + ScalingConstants sc; set_scaling_constants(redshift, &sc, false); int i; @@ -260,7 +260,7 @@ void get_conditional_Nion(double redshift, double R, int n_densities, double *de astro_options_global->INTEGRATION_METHOD_MINI == 1)) initialise_GL(log(M_min), log(M_cond)); - struct ScalingConstants sc; + ScalingConstants sc; set_scaling_constants(redshift, &sc, false); int i; @@ -315,7 +315,7 @@ void get_conditional_Xray(double redshift, double R, int n_densities, double *de astro_options_global->INTEGRATION_METHOD_MINI == 1)) initialise_GL(log(M_min), log(M_cond)); - struct ScalingConstants sc; + ScalingConstants sc; set_scaling_constants(redshift, &sc, false); int i; diff --git a/src/py21cmfast/src/interp_tables.c b/src/py21cmfast/src/interp_tables.c index 88452b10e..185f502c5 100644 --- 
a/src/py21cmfast/src/interp_tables.c +++ b/src/py21cmfast/src/interp_tables.c @@ -43,9 +43,8 @@ static RGTable2D SFRD_z_table_MINI = {.allocated = false}; static RGTable2D Nion_z_table_MINI = {.allocated = false}; static RGTable2D Xray_z_table_2D = {.allocated = false}; // TODO: SFRD tables assume no reionisation feedback, this is self-inconsistent, but probably okay -// given -// it's used (mostly) in the SpinTemperature, which deals with neutral regions -// Will overestimate integral component of SFRD lightcones used in observation +// given it's used (mostly) in the SpinTemperature, which deals with neutral regions +// Will overestimate integral component of SFRD lightcones used in observation static RGTable1D_f SFRD_conditional_table = {.allocated = false}; static RGTable1D_f Nion_conditional_table1D = {.allocated = false}; static RGTable2D_f Nion_conditional_table2D = {.allocated = false}; @@ -92,7 +91,7 @@ static RGTable1D_f dSigmasqdm_InterpTable = { // NOTE: this table is initialised for up to N_redshift x N_Mturn, but only called N_filter times to // assign ST_over_PS in Spintemp. 
// It may be better to just do the integrals at each R -void initialise_SFRD_spline(int Nbin, float zmin, float zmax, struct ScalingConstants *sc) { +void initialise_SFRD_spline(int Nbin, float zmin, float zmax, ScalingConstants *sc) { int i, j; double Mmax = M_MAX_INTEGRAL; double lnMmax = log(Mmax); @@ -117,7 +116,7 @@ void initialise_SFRD_spline(int Nbin, float zmin, float zmax, struct ScalingCons #pragma omp parallel private(i, j) num_threads(simulation_options_global -> N_THREADS) { - struct ScalingConstants sc_sfrd; + ScalingConstants sc_sfrd; sc_sfrd = evolve_scaling_constants_sfr(sc); double mturn_mcg; double lnMmin; @@ -159,7 +158,7 @@ void initialise_SFRD_spline(int Nbin, float zmin, float zmax, struct ScalingCons // Unlike the SFRD spline, this one is used more due to the nu_tau_one() rootfind // although still ignores reionisation feedback -void initialise_Nion_Ts_spline(int Nbin, float zmin, float zmax, struct ScalingConstants *sc) { +void initialise_Nion_Ts_spline(int Nbin, float zmin, float zmax, ScalingConstants *sc) { int i, j; double Mmax = M_MAX_INTEGRAL; double lnMmax = log(Mmax); @@ -183,7 +182,7 @@ void initialise_Nion_Ts_spline(int Nbin, float zmin, float zmax, struct ScalingC #pragma omp parallel private(i, j) num_threads(simulation_options_global -> N_THREADS) { - struct ScalingConstants sc_z; + ScalingConstants sc_z; double mturn_mcg; double z_val; double lnMmin; @@ -290,7 +289,7 @@ void initialise_Nion_Conditional_spline(double z, double min_density, double max double Mmin, double Mmax, double Mcond, double log10Mturn_min, double log10Mturn_max, double log10Mturn_min_MINI, double log10Mturn_max_MINI, - struct ScalingConstants *sc, bool prev) { + ScalingConstants *sc, bool prev) { int i, j; double overdense_table[NDELTA]; double mturns[NMTURN], mturns_MINI[NMTURN]; @@ -412,7 +411,7 @@ void initialise_Nion_Conditional_spline(double z, double min_density, double max // This function initialises one table, for table Rx arrays I will call this 
function in a loop void initialise_SFRD_Conditional_table(double z, double min_density, double max_density, double Mmin, double Mmax, double Mcond, - struct ScalingConstants *sc) { + ScalingConstants *sc) { float sigma2; int i, k; @@ -449,7 +448,7 @@ void initialise_SFRD_Conditional_table(double z, double min_density, double max_ SFRD_conditional_table_MINI.y_width = (LOG10_MTURN_MAX - LOG10_MTURN_MIN) / (NMTURN - 1.); } - struct ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); + ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); #pragma omp parallel private(i, k) num_threads(simulation_options_global -> N_THREADS) { @@ -494,7 +493,7 @@ void initialise_SFRD_Conditional_table(double z, double min_density, double max_ // This function initialises one table, for table Rx arrays I will call this function in a loop void initialise_Xray_Conditional_table(double redshift, double min_density, double max_density, double Mmin, double Mmax, double Mcond, - struct ScalingConstants *sc) { + ScalingConstants *sc) { int i, k; LOG_SUPER_DEBUG("Initialising Xray conditional table at mass %.2e from delta %.2e to %.2e", @@ -884,7 +883,7 @@ void free_global_tables() { // JD: moving the interp table evaluations here since some of them are needed in nu_tau_one // NOTE: with !USE_MASS_DEPENDENT_ZETA both EvaluateNionTs and EvaluateSFRD return Fcoll -double EvaluateNionTs(double redshift, struct ScalingConstants *sc) { +double EvaluateNionTs(double redshift, ScalingConstants *sc) { // differences in turnover are handled by table setup if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { if (astro_options_global->USE_MASS_DEPENDENT_ZETA) @@ -898,7 +897,7 @@ double EvaluateNionTs(double redshift, struct ScalingConstants *sc) { double lnMmin = log(minimum_source_mass(redshift, true)); double lnMmax = log(M_MAX_INTEGRAL); - struct ScalingConstants sc_z = evolve_scaling_constants_to_redshift(redshift, sc, false); + ScalingConstants sc_z = 
evolve_scaling_constants_to_redshift(redshift, sc, false); // minihalos uses a different turnover mass if (astro_options_global->USE_MASS_DEPENDENT_ZETA) @@ -907,19 +906,18 @@ double EvaluateNionTs(double redshift, struct ScalingConstants *sc) { return Fcoll_General(redshift, lnMmin, lnMmax); } -double EvaluateNionTs_MINI(double redshift, double log10_Mturn_LW_ave, - struct ScalingConstants *sc) { +double EvaluateNionTs_MINI(double redshift, double log10_Mturn_LW_ave, ScalingConstants *sc) { if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { return EvaluateRGTable2D(redshift, log10_Mturn_LW_ave, &Nion_z_table_MINI); } double lnMmin = log(minimum_source_mass(redshift, true)); double lnMmax = log(M_MAX_INTEGRAL); - struct ScalingConstants sc_z = evolve_scaling_constants_to_redshift(redshift, sc, false); + ScalingConstants sc_z = evolve_scaling_constants_to_redshift(redshift, sc, false); return Nion_General_MINI(redshift, lnMmin, lnMmax, pow(10., log10_Mturn_LW_ave), &sc_z); } -double EvaluateSFRD(double redshift, struct ScalingConstants *sc) { +double EvaluateSFRD(double redshift, ScalingConstants *sc) { if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { if (astro_options_global->USE_MASS_DEPENDENT_ZETA) return EvaluateRGTable1D(redshift, &SFRD_z_table); @@ -934,7 +932,7 @@ double EvaluateSFRD(double redshift, struct ScalingConstants *sc) { // The SFRD calls the same function as N_ion but sets escape fractions to unity // NOTE: since this only occurs on integration, the struct copy shouldn't be a bottleneck - struct ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); + ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); sc_sfrd = evolve_scaling_constants_to_redshift(redshift, &sc_sfrd, false); if (astro_options_global->USE_MASS_DEPENDENT_ZETA) @@ -942,7 +940,7 @@ double EvaluateSFRD(double redshift, struct ScalingConstants *sc) { return Fcoll_General(redshift, lnMmin, lnMmax); } -double EvaluateSFRD_MINI(double redshift, double 
log10_Mturn_LW_ave, struct ScalingConstants *sc) { +double EvaluateSFRD_MINI(double redshift, double log10_Mturn_LW_ave, ScalingConstants *sc) { if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { return EvaluateRGTable2D(redshift, log10_Mturn_LW_ave, &SFRD_z_table_MINI); } @@ -950,19 +948,19 @@ double EvaluateSFRD_MINI(double redshift, double log10_Mturn_LW_ave, struct Scal double lnMmin = log(minimum_source_mass(redshift, true)); double lnMmax = log(M_MAX_INTEGRAL); - struct ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); + ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); sc_sfrd = evolve_scaling_constants_to_redshift(redshift, &sc_sfrd, false); return Nion_General_MINI(redshift, lnMmin, lnMmax, pow(10., log10_Mturn_LW_ave), &sc_sfrd); } double EvaluateSFRD_Conditional(double delta, double growthf, double M_min, double M_max, - double M_cond, double sigma_max, struct ScalingConstants *sc) { + double M_cond, double sigma_max, ScalingConstants *sc) { if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { return exp(EvaluateRGTable1D_f(delta, &SFRD_conditional_table)); } - struct ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); + ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); // SFRD in Ts assumes no (reion) feedback on ACG return Nion_ConditionalM(growthf, log(M_min), log(M_max), log(M_cond), sigma_max, delta, sc_sfrd.mturn_a_nofb, &sc_sfrd, @@ -971,20 +969,20 @@ double EvaluateSFRD_Conditional(double delta, double growthf, double M_min, doub double EvaluateSFRD_Conditional_MINI(double delta, double log10Mturn_m, double growthf, double M_min, double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc) { + ScalingConstants *sc) { if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { return exp(EvaluateRGTable2D_f(delta, log10Mturn_m, &SFRD_conditional_table_MINI)); } - struct ScalingConstants sc_sfrd = evolve_scaling_constants_sfr(sc); + ScalingConstants sc_sfrd = 
evolve_scaling_constants_sfr(sc); return Nion_ConditionalM_MINI(growthf, log(M_min), log(M_max), log(M_cond), sigma_max, delta, pow(10, log10Mturn_m), &sc_sfrd, astro_options_global->INTEGRATION_METHOD_MINI); } double EvaluateNion_Conditional(double delta, double log10Mturn, double growthf, double M_min, - double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc, bool prev) { + double M_max, double M_cond, double sigma_max, ScalingConstants *sc, + bool prev) { RGTable2D_f *table = prev ? &Nion_conditional_table_prev : &Nion_conditional_table2D; if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { if (astro_options_global->USE_MINI_HALOS) @@ -1001,7 +999,7 @@ double EvaluateNion_Conditional(double delta, double log10Mturn, double growthf, double EvaluateNion_Conditional_MINI(double delta, double log10Mturn_m, double growthf, double M_min, double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc, bool prev) { + ScalingConstants *sc, bool prev) { RGTable2D_f *table = prev ? &Nion_conditional_table_MINI_prev : &Nion_conditional_table_MINI; if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { return exp(EvaluateRGTable2D_f(delta, log10Mturn_m, table)); @@ -1014,7 +1012,7 @@ double EvaluateNion_Conditional_MINI(double delta, double log10Mturn_m, double g double EvaluateXray_Conditional(double delta, double log10Mturn_m, double redshift, double growthf, double M_min, double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc) { + ScalingConstants *sc) { if (matter_options_global->USE_INTERPOLATION_TABLES > 1) { if (astro_options_global->USE_MINI_HALOS) return exp(EvaluateRGTable2D_f(delta, log10Mturn_m, &Xray_conditional_table_2D)); @@ -1183,3 +1181,34 @@ double EvaluatedSigmasqdm(double lnM) { } return dsigmasqdm_z0(exp(lnM)); } + +// Accessor function for the GPU SpinTemp kernel to access table. 
+RGTable1D_f *get_SFRD_conditional_table(void) { return &SFRD_conditional_table; } + +// Accessor function for the GPU Ionisation kernel to access table. +RGTable1D_f *get_Nion_conditional_table1D(void) { return &Nion_conditional_table1D; } + +// Accessor function for GPU memory allocation functions to access nbins. +int get_nbins(void) { return NDELTA; } + +// todo: only return when it's been initialized +RGTable1D *GetNhaloTable() { + printf("The number of bins: %d; x_min: %f\n", Nhalo_table.n_bin, Nhalo_table.x_min); + return &Nhalo_table; +} + +RGTable1D *GetMcollTable() { + printf("The number of bins: %d; x_min: %f\n", Mcoll_table.n_bin, Mcoll_table.x_min); + return &Mcoll_table; +} + +RGTable2D *GetNhaloInvTable() { + printf("The number of nx bins: %d; the number of ny bins: %d \n", Nhalo_inv_table.nx_bin, + Nhalo_inv_table.ny_bin); + return &Nhalo_inv_table; +} + +RGTable1D_f *GetSigmaInterpTable() { + printf("The number of bins: %d; x_min: %f\n", Sigma_InterpTable.n_bin, Sigma_InterpTable.x_min); + return &Sigma_InterpTable; +} diff --git a/src/py21cmfast/src/interp_tables.cu b/src/py21cmfast/src/interp_tables.cu new file mode 100644 index 000000000..1f0702af5 --- /dev/null +++ b/src/py21cmfast/src/interp_tables.cu @@ -0,0 +1,157 @@ +#include + +#include +// #include + +// #include "InputParameters.h" +#include "interpolation_types.h" + +#include "cuda_utils.cuh" +#include "interp_tables.cuh" +#include "DeviceConstants.cuh" + +#include "interpolation.cu" + +// define relevant variables stored in constant memory +__constant__ RGTable1D d_Nhalo_table; +__constant__ RGTable1D d_Mcoll_table; +__constant__ RGTable2D d_Nhalo_inv_table; + +// specify a max size of yarr +const int device_n_max = 200; +__constant__ double d_Nhalo_yarr[device_n_max]; +__constant__ double d_Mcoll_yarr[device_n_max]; + + +// copy tables to gpu +void copyTablesToDevice(RGTable1D h_Nhalo_table, RGTable1D h_Mcoll_table, RGTable2D h_Nhalo_inv_table) +{ + // copy Nhalo table and its member 
y_arr + size_t size_Nhalo_yarr = sizeof(double) * h_Nhalo_table.n_bin; + // get a copy of the Nhalo table + RGTable1D h_Nhalo_table_to_device = h_Nhalo_table; + if (h_Nhalo_table.n_bin > device_n_max){ + // double *d_Nhalo_yarr; + // todo: declare device yarr (not using constant) + return; + } + else{ + CALL_CUDA(cudaMemcpyToSymbol(d_Nhalo_yarr, h_Nhalo_table.y_arr, size_Nhalo_yarr, 0, cudaMemcpyHostToDevice)); + // get memory address on the device + double *d_Nhalo_yarr_device; + CALL_CUDA(cudaGetSymbolAddress((void **)&d_Nhalo_yarr_device, d_Nhalo_yarr)); + + h_Nhalo_table_to_device.y_arr = d_Nhalo_yarr_device; + } + CALL_CUDA(cudaMemcpyToSymbol(d_Nhalo_table, &h_Nhalo_table_to_device, sizeof(RGTable1D), 0, cudaMemcpyHostToDevice)); + + // copy Mcoll table and its member y_arr + size_t size_Mcoll_yarr = sizeof(double) * h_Mcoll_table.n_bin; + // get a copy of Mcoll table + RGTable1D h_Mcoll_table_to_device = h_Mcoll_table; + if (h_Mcoll_table.n_bin > device_n_max){ + return; + } + else{ + CALL_CUDA(cudaMemcpyToSymbol(d_Mcoll_yarr, h_Mcoll_table.y_arr, size_Mcoll_yarr, 0, cudaMemcpyHostToDevice)); + // get memory address on the device + double *d_Mcoll_yarr_device; + CALL_CUDA(cudaGetSymbolAddress((void **)&d_Mcoll_yarr_device, d_Mcoll_yarr)); + + h_Mcoll_table_to_device.y_arr = d_Mcoll_yarr_device; + } + CALL_CUDA(cudaMemcpyToSymbol(d_Mcoll_table, &h_Mcoll_table_to_device, sizeof(RGTable1D), 0, cudaMemcpyHostToDevice)); + + // copy Nhalo_inv table and its member flatten_data + size_t size_Nhalo_inv_flatten_data = sizeof(double) * h_Nhalo_inv_table.nx_bin * h_Nhalo_inv_table.ny_bin; + // get a copy of Nhalo_inv_table + RGTable2D h_Nhalo_inv_table_to_device = h_Nhalo_inv_table; + + double *d_Nhalo_flatten_data; + CALL_CUDA(cudaMalloc(&d_Nhalo_flatten_data, size_Nhalo_inv_flatten_data)); + CALL_CUDA(cudaMemcpy(d_Nhalo_flatten_data, h_Nhalo_inv_table.flatten_data, size_Nhalo_inv_flatten_data, cudaMemcpyHostToDevice)); + + double **d_z_arr, **z_arr_to_device; + size_t 
size_z_arr = sizeof(double *) * h_Nhalo_inv_table.nx_bin; + CALL_CUDA(cudaHostAlloc((void **)&z_arr_to_device, size_z_arr, cudaHostAllocDefault)); + // get the address of flatten data on the device + int i; + for (i=0;i= n_bin - 1) + { + return 0.0; // Out-of-bounds handling + } + + double table_val = x_min + x_width * (float)idx; + double interp_point = (x - table_val) / x_width; + + return y_arr[idx] * (1 - interp_point) + y_arr[idx + 1] * (interp_point); +} + +__device__ double extrapolate_dNdM_inverse(double condition, double lnp) +{ + double x_min = d_Nhalo_inv_table.x_min; + double x_width = d_Nhalo_inv_table.x_width; + // printf("condition: %f; lnp: %f \n", condition, lnp); //tmp + int x_idx = (int)floor((condition - x_min) / x_width); + double x_table = x_min + x_idx * x_width; + double interp_point_x = (condition - x_table) / x_width; + + double extrap_point_y = (lnp - d_user_params.MIN_LOGPROB) / d_Nhalo_inv_table.y_width; + + // find the log-mass at the edge of the table for this condition + double xlimit = d_Nhalo_inv_table.z_arr[x_idx][0] * (interp_point_x) + d_Nhalo_inv_table.z_arr[x_idx + 1][0] * (1 - interp_point_x); + double xlimit_m1 = d_Nhalo_inv_table.z_arr[x_idx][1] * (interp_point_x) + d_Nhalo_inv_table.z_arr[x_idx + 1][1] * (1 - interp_point_x); + + double result = xlimit + (xlimit_m1 - xlimit) * (extrap_point_y); + + return result; +} + +__device__ double EvaluateNhaloInv(double condition, double prob) +{ + if (prob == 0.) 
+ return 1.; // q == 1 -> condition mass + double lnp = log(prob); + if (lnp < d_user_params.MIN_LOGPROB) + return extrapolate_dNdM_inverse(condition, lnp); + return EvaluateRGTable2D(condition, lnp, &d_Nhalo_inv_table); +} + +__device__ double EvaluateMcoll(double condition, double growthf, double lnMmin, double lnMmax, double M_cond, double sigma, double delta) +{ + if (d_user_params.USE_INTERPOLATION_TABLES) + return EvaluateRGTable1D(condition, &d_Mcoll_table); + // todo: implement Mcoll_Conditional + return 0; +} + +__device__ double EvaluateNhalo(double condition, double growthf, double lnMmin, double lnMmax, double M_cond, double sigma, double delta) +{ + if (d_user_params.USE_INTERPOLATION_TABLES) + return EvaluateRGTable1D(condition, &d_Nhalo_table); + // todo: implement Nhalo_Conditional + return 0; +} diff --git a/src/py21cmfast/src/interp_tables.cuh b/src/py21cmfast/src/interp_tables.cuh new file mode 100644 index 000000000..8616f691e --- /dev/null +++ b/src/py21cmfast/src/interp_tables.cuh @@ -0,0 +1,23 @@ +#ifndef _INTERP_TABLES_CUH +#define _INTERP_TABLES_CUH + +#include "interpolation_types.h" + +#ifdef __CUDA_ARCH__ +__device__ double EvaluateSigma(float x, double x_min, double x_width, float *y_arr, int n_bin); +__device__ double extrapolate_dNdM_inverse(double condition, double lnp); +__device__ double EvaluateNhaloInv(double condition, double prob); +__device__ double EvaluateMcoll(double condition, double growthf, double lnMmin, double lnMmax, double M_cond, double sigma, double delta); +__device__ double EvaluateNhalo(double condition, double growthf, double lnMmin, double lnMmax, double M_cond, double sigma, double delta); +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + void copyTablesToDevice(RGTable1D h_Nhalo_table, RGTable1D h_Mcoll_table, RGTable2D h_Nhalo_inv_table); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/py21cmfast/src/interp_tables.h b/src/py21cmfast/src/interp_tables.h index 0fe068a49..56d98ea5b 100644 --- 
a/src/py21cmfast/src/interp_tables.h +++ b/src/py21cmfast/src/interp_tables.h @@ -2,19 +2,23 @@ #define _INTERP_TABLES_H #include "InputParameters.h" +#include "interpolation.h" #include "scaling_relations.h" // Functions within interp_tables.c need the parameter structures, but we don't want to pass them // all down the chain, so we broadcast them -// TODO: in future it would be better to use a context struct. See `HaloBox.c` -void initialise_SFRD_spline(int Nbin, float zmin, float zmax, struct ScalingConstants *sc); -double EvaluateSFRD(double redshift, struct ScalingConstants *sc); -double EvaluateSFRD_MINI(double redshift, double log10_Mturn_LW_ave, struct ScalingConstants *sc); +#ifdef __cplusplus +extern "C" { +#endif + +void initialise_SFRD_spline(int Nbin, float zmin, float zmax, ScalingConstants *sc); +double EvaluateSFRD(double redshift, ScalingConstants *sc); +double EvaluateSFRD_MINI(double redshift, double log10_Mturn_LW_ave, ScalingConstants *sc); -void initialise_Nion_Ts_spline(int Nbin, float zmin, float zmax, struct ScalingConstants *sc); -double EvaluateNionTs(double redshift, struct ScalingConstants *sc); -double EvaluateNionTs_MINI(double redshift, double log10_Mturn_LW_ave, struct ScalingConstants *sc); +void initialise_Nion_Ts_spline(int Nbin, float zmin, float zmax, ScalingConstants *sc); +double EvaluateNionTs(double redshift, ScalingConstants *sc); +double EvaluateNionTs_MINI(double redshift, double log10_Mturn_LW_ave, ScalingConstants *sc); void initialise_FgtrM_delta_table(double min_dens, double max_dens, double zpp, double growth_zpp, double smin_zpp, double smax_zpp); @@ -27,27 +31,27 @@ void initialise_Nion_Conditional_spline(double z, double min_density, double max double Mmin, double Mmax, double Mcond, double log10Mturn_min, double log10Mturn_max, double log10Mturn_min_MINI, double log10Mturn_max_MINI, - struct ScalingConstants *sc, bool prev); + ScalingConstants *sc, bool prev); double EvaluateNion_Conditional(double delta, double 
log10Mturn, double growthf, double M_min, - double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc, bool prev); + double M_max, double M_cond, double sigma_max, ScalingConstants *sc, + bool prev); double EvaluateNion_Conditional_MINI(double delta, double log10Mturn_m, double growthf, double M_min, double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc, bool prev); + ScalingConstants *sc, bool prev); void initialise_Xray_Conditional_table(double redshift, double min_density, double max_density, double Mmin, double Mmax, double Mcond, - struct ScalingConstants *sc); + ScalingConstants *sc); double EvaluateXray_Conditional(double delta, double log10Mturn_m, double redshift, double growthf, double M_min, double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc); + ScalingConstants *sc); void initialise_SFRD_Conditional_table(double z, double min_density, double max_density, double Mmin, double Mmax, double Mcond, - struct ScalingConstants *sc); + ScalingConstants *sc); double EvaluateSFRD_Conditional(double delta, double growthf, double M_min, double M_max, - double M_cond, double sigma_max, struct ScalingConstants *sc); + double M_cond, double sigma_max, ScalingConstants *sc); double EvaluateSFRD_Conditional_MINI(double delta, double log10Mturn_m, double growthf, double M_min, double M_max, double M_cond, double sigma_max, - struct ScalingConstants *sc); + ScalingConstants *sc); void initialise_dNdM_tables(double xmin, double xmax, double ymin, double ymax, double growth1, double param, bool from_catalog); @@ -75,4 +79,17 @@ void free_conditional_tables(); void free_global_tables(); void free_dNdM_tables(); +RGTable1D_f *get_SFRD_conditional_table(void); +RGTable1D_f *get_Nion_conditional_table1D(void); +int get_nbins(void); + +RGTable1D *GetNhaloTable(); +RGTable1D *GetMcollTable(); +RGTable2D *GetNhaloInvTable(); +RGTable1D_f *GetSigmaInterpTable(); + +#ifdef __cplusplus +} #endif + +#endif 
//_INTERP_TABLES_H diff --git a/src/py21cmfast/src/interpolation.c b/src/py21cmfast/src/interpolation.c index 180ab1836..6f4810d92 100644 --- a/src/py21cmfast/src/interpolation.c +++ b/src/py21cmfast/src/interpolation.c @@ -43,10 +43,12 @@ void allocate_RGTable2D(int n_x, int n_y, RGTable2D *ptr) { ptr->nx_bin = n_x; ptr->ny_bin = n_y; + ptr->flatten_data = (double *)calloc(n_x * n_y, sizeof(double)); ptr->z_arr = calloc(n_x, sizeof(double *)); for (i = 0; i < n_x; i++) { - ptr->z_arr[i] = calloc(n_y, sizeof(double)); + ptr->z_arr[i] = &ptr->flatten_data[i * n_y]; } + ptr->allocated = true; } @@ -74,7 +76,7 @@ void free_RGTable2D_f(RGTable2D_f *ptr) { void free_RGTable2D(RGTable2D *ptr) { int i; if (ptr->allocated) { - for (i = 0; i < ptr->nx_bin; i++) free(ptr->z_arr[i]); + free(ptr->flatten_data); free(ptr->z_arr); ptr->allocated = false; } diff --git a/src/py21cmfast/src/interpolation.cu b/src/py21cmfast/src/interpolation.cu new file mode 100644 index 000000000..761ea6b94 --- /dev/null +++ b/src/py21cmfast/src/interpolation.cu @@ -0,0 +1,42 @@ +#include + +#include "interpolation.cuh" + +__device__ double EvaluateRGTable1D(double x, RGTable1D *table) +{ + double x_min = table->x_min; + double x_width = table->x_width; + int idx = (int)floor((x - x_min) / x_width); + double table_val = x_min + x_width * (double)idx; + double interp_point = (x - table_val) / x_width; + + // a + f(a-b) is one fewer operation but less precise + double result = table->y_arr[idx] * (1 - interp_point) + table->y_arr[idx + 1] * (interp_point); + + return result; +} + +__device__ double EvaluateRGTable2D(double x, double y, RGTable2D *table) +{ + double x_min = table->x_min; + double x_width = table->x_width; + double y_min = table->y_min; + double y_width = table->y_width; + int x_idx = (int)floor((x - x_min) / x_width); + int y_idx = (int)floor((y - y_min) / y_width); + + double x_table = x_min + x_width * (double)x_idx; + double y_table = y_min + y_width * (double)y_idx; + + double 
interp_point_x = (x - x_table) / x_width; + double interp_point_y = (y - y_table) / y_width; + + double left_edge, right_edge, result; + + left_edge = table->z_arr[x_idx][y_idx] * (1 - interp_point_y) + table->z_arr[x_idx][y_idx + 1] * (interp_point_y); + right_edge = table->z_arr[x_idx + 1][y_idx] * (1 - interp_point_y) + table->z_arr[x_idx + 1][y_idx + 1] * (interp_point_y); + + result = left_edge * (1 - interp_point_x) + right_edge * (interp_point_x); + + return result; +} diff --git a/src/py21cmfast/src/interpolation.cuh b/src/py21cmfast/src/interpolation.cuh new file mode 100644 index 000000000..2ea3bffb7 --- /dev/null +++ b/src/py21cmfast/src/interpolation.cuh @@ -0,0 +1,14 @@ +#ifndef _INTERPOLATION_CUH +#define _INTERPOLATION_CUH + +#include +#include "interpolation_types.h" + +#ifdef __CUDA_ARCH__ + +__device__ double EvaluateRGTable1D(double x, RGTable1D *table); +__device__ double EvaluateRGTable2D(double x, double y, RGTable2D *table); + +#endif + +#endif diff --git a/src/py21cmfast/src/interpolation.h b/src/py21cmfast/src/interpolation.h index 02fa6d411..a3251068d 100644 --- a/src/py21cmfast/src/interpolation.h +++ b/src/py21cmfast/src/interpolation.h @@ -3,46 +3,11 @@ #include -typedef struct RGTable1D { - int n_bin; - double x_min; - double x_width; - - double *y_arr; - bool allocated; -} RGTable1D; - -typedef struct RGTable2D { - int nx_bin, ny_bin; - double x_min, y_min; - double x_width, y_width; - - double **z_arr; - - double saved_ll, saved_ul; // for future acceleration - bool allocated; -} RGTable2D; - -typedef struct RGTable1D_f { - int n_bin; - double x_min; - double x_width; - - float *y_arr; - bool allocated; -} RGTable1D_f; - -typedef struct RGTable2D_f { - int nx_bin, ny_bin; - double x_min, y_min; - double x_width, y_width; - - float **z_arr; - - double saved_ll, saved_ul; // for future acceleration - bool allocated; -} RGTable2D_f; +#include "interpolation_types.h" +#ifdef __cplusplus +extern "C" { +#endif void allocate_RGTable1D(int 
n_bin, RGTable1D *ptr); void allocate_RGTable1D_f(int n_bin, RGTable1D_f *ptr); void allocate_RGTable2D(int n_x, int n_y, RGTable2D *ptr); @@ -57,6 +22,9 @@ double EvaluateRGTable1D(double x, RGTable1D *table); double EvaluateRGTable2D(double x, double y, RGTable2D *table); double EvaluateRGTable1D_f(double x, RGTable1D_f *table); double EvaluateRGTable2D_f(double x, double y, RGTable2D_f *table); +#ifdef __cplusplus +} +#endif bool RGTable2D_out_of_bounds(RGTable2D *table, double x_val, double y_val); bool RGTable2Df_out_of_bounds(RGTable2D_f *table, double x_val, double y_val); diff --git a/src/py21cmfast/src/interpolation_types.h b/src/py21cmfast/src/interpolation_types.h new file mode 100644 index 000000000..c869dc248 --- /dev/null +++ b/src/py21cmfast/src/interpolation_types.h @@ -0,0 +1,45 @@ +#ifndef _INTERPOLATION_TYPES_H +#define _INTERPOLATION_TYPES_H + +typedef struct RGTable1D { + int n_bin; + double x_min; + double x_width; + + double *y_arr; + bool allocated; +} RGTable1D; + +typedef struct RGTable2D { + int nx_bin, ny_bin; + double x_min, y_min; + double x_width, y_width; + + double **z_arr; + double *flatten_data; + + double saved_ll, saved_ul; // for future acceleration + bool allocated; +} RGTable2D; + +typedef struct RGTable1D_f { + int n_bin; + double x_min; + double x_width; + + float *y_arr; + bool allocated; +} RGTable1D_f; + +typedef struct RGTable2D_f { + int nx_bin, ny_bin; + double x_min, y_min; + double x_width, y_width; + + float **z_arr; + + double saved_ll, saved_ul; // for future acceleration + bool allocated; +} RGTable2D_f; + +#endif diff --git a/src/py21cmfast/src/logger.h b/src/py21cmfast/src/logger.h index 45872477f..501e21d3c 100644 --- a/src/py21cmfast/src/logger.h +++ b/src/py21cmfast/src/logger.h @@ -49,7 +49,7 @@ #include // === auxiliary functions -static inline char *timenow(); +// static inline char *timenow(); #define _FILE strrchr(__FILE__, '/') ? 
strrchr(__FILE__, '/') + 1 : __FILE__ @@ -130,6 +130,9 @@ static inline char *timenow(); #define LOG_IF_ERROR(condition, message, args...) #endif +#ifdef __cplusplus +extern "C" { +#endif static inline char *timenow() { static char buffer[64]; time_t rawtime; @@ -143,4 +146,8 @@ static inline char *timenow() { return buffer; } +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/py21cmfast/src/map_mass.c b/src/py21cmfast/src/map_mass.c new file mode 100644 index 000000000..ef0d62018 --- /dev/null +++ b/src/py21cmfast/src/map_mass.c @@ -0,0 +1,294 @@ +// Functions in this file map units of mass from Lagrangian (IC) +// coordinates to their real (Eulerian) Locations, these can sum +// masses or galaxy properties from grids or from coordinate catalogues + +#include "map_mass.h" + +#include +#include +#include +#include + +#include "Constants.h" +#include "HaloBox.h" +#include "InputParameters.h" +#include "cosmology.h" +#include "indexing.h" +#include "logger.h" + +#define do_cic_interpolation(arr, ...) 
\ + _Generic((arr), float *: do_cic_interpolation_float, double *: do_cic_interpolation_double)( \ + arr, __VA_ARGS__) + +static inline void do_cic_interpolation_double(double *resampled_box, double pos[3], int box_dim[3], + double curr_dens) { + // get the CIC indices and distances + int ipos[3], iposp1[3]; + double dist[3]; + // NOTE: assumes the cell at idx == 0 is *centred* at (0,0,0) + for (int axis = 0; axis < 3; axis++) { + ipos[axis] = (int)floor(pos[axis]); + iposp1[axis] = ipos[axis] + 1; + dist[axis] = pos[axis] - ipos[axis]; + } + + wrap_coord(ipos, box_dim); + wrap_coord(iposp1, box_dim); + + unsigned long long int cic_indices[8] = { + grid_index_general(ipos[0], ipos[1], ipos[2], box_dim), + grid_index_general(iposp1[0], ipos[1], ipos[2], box_dim), + grid_index_general(ipos[0], iposp1[1], ipos[2], box_dim), + grid_index_general(iposp1[0], iposp1[1], ipos[2], box_dim), + grid_index_general(ipos[0], ipos[1], iposp1[2], box_dim), + grid_index_general(iposp1[0], ipos[1], iposp1[2], box_dim), + grid_index_general(ipos[0], iposp1[1], iposp1[2], box_dim), + grid_index_general(iposp1[0], iposp1[1], iposp1[2], box_dim)}; + + double cic_weights[8] = {(1. - dist[0]) * (1. - dist[1]) * (1. - dist[2]), + dist[0] * (1. - dist[1]) * (1. - dist[2]), + (1. - dist[0]) * dist[1] * (1. - dist[2]), + dist[0] * dist[1] * (1. - dist[2]), + (1. - dist[0]) * (1. - dist[1]) * dist[2], + dist[0] * (1. - dist[1]) * dist[2], + (1. 
- dist[0]) * dist[1] * dist[2], + dist[0] * dist[1] * dist[2]}; + + for (int i = 0; i < 8; i++) { +#pragma omp atomic update + resampled_box[cic_indices[i]] += curr_dens * cic_weights[i]; + } +} + +// Identical code as above, using a single precision output +static inline void do_cic_interpolation_float(float *resampled_box, double pos[3], int box_dim[3], + double curr_dens) { + // get the CIC indices and distances + int ipos[3], iposp1[3]; + double dist[3]; + // NOTE: assumes the cell at idx == 0 is *centred* at (0,0,0) + for (int axis = 0; axis < 3; axis++) { + ipos[axis] = (int)floor(pos[axis]); + iposp1[axis] = ipos[axis] + 1; + dist[axis] = pos[axis] - ipos[axis]; + } + + wrap_coord(ipos, box_dim); + wrap_coord(iposp1, box_dim); + + unsigned long long int cic_indices[8] = { + grid_index_general(ipos[0], ipos[1], ipos[2], box_dim), + grid_index_general(iposp1[0], ipos[1], ipos[2], box_dim), + grid_index_general(ipos[0], iposp1[1], ipos[2], box_dim), + grid_index_general(iposp1[0], iposp1[1], ipos[2], box_dim), + grid_index_general(ipos[0], ipos[1], iposp1[2], box_dim), + grid_index_general(iposp1[0], ipos[1], iposp1[2], box_dim), + grid_index_general(ipos[0], iposp1[1], iposp1[2], box_dim), + grid_index_general(iposp1[0], iposp1[1], iposp1[2], box_dim)}; + + double cic_weights[8] = {(1. - dist[0]) * (1. - dist[1]) * (1. - dist[2]), + dist[0] * (1. - dist[1]) * (1. - dist[2]), + (1. - dist[0]) * dist[1] * (1. - dist[2]), + dist[0] * dist[1] * (1. - dist[2]), + (1. - dist[0]) * (1. - dist[1]) * dist[2], + dist[0] * (1. - dist[1]) * dist[2], + (1. 
- dist[0]) * dist[1] * dist[2], + dist[0] * dist[1] * dist[2]}; + + for (int i = 0; i < 8; i++) { +#pragma omp atomic update + resampled_box[cic_indices[i]] += curr_dens * cic_weights[i]; + } +} + +// Function that maps a IC density grid to the perturbed density grid +void move_grid_masses(double redshift, float *dens_pointer, int dens_dim[3], float *vel_pointers[3], + float *vel_pointers_2LPT[3], int vel_dim[3], double *resampled_box, + int out_dim[3]) { + // grid dimension constants + double boxlen = simulation_options_global->BOX_LEN; + double boxlen_z = boxlen * simulation_options_global->NON_CUBIC_FACTOR; + double box_size[3] = {boxlen, boxlen, boxlen_z}; + double dim_ratio_vel = (double)vel_dim[0] / (double)dens_dim[0]; + double dim_ratio_out = (double)out_dim[0] / (double)dens_dim[0]; + + // Setup IC velocity factors + double growth_factor = dicke(redshift); + double displacement_factor_2LPT = -(3.0 / 7.0) * growth_factor * growth_factor; // 2LPT eq. D8 + + double init_growth_factor = dicke(simulation_options_global->INITIAL_REDSHIFT); + double init_displacement_factor_2LPT = + -(3.0 / 7.0) * init_growth_factor * init_growth_factor; // 2LPT eq. 
D8 + + double velocity_displacement_factor[3] = { + (growth_factor - init_growth_factor) / box_size[0] * dens_dim[0], + (growth_factor - init_growth_factor) / box_size[1] * dens_dim[1], + (growth_factor - init_growth_factor) / box_size[2] * dens_dim[2]}; + double velocity_displacement_factor_2LPT[3] = { + (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[0] * dens_dim[0], + (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[1] * dens_dim[1], + (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[2] * dens_dim[2]}; +#pragma omp parallel num_threads(simulation_options_global->N_THREADS) + { + int i, j, k, axis; + double pos[3], curr_dens; + int ipos[3]; + unsigned long long vel_index, dens_index; +#pragma omp for + for (i = 0; i < dens_dim[0]; i++) { + for (j = 0; j < dens_dim[1]; j++) { + for (k = 0; k < dens_dim[2]; k++) { + // Transform position to units of box size + pos[0] = i; + pos[1] = j; + pos[2] = k; + resample_index((int[3]){i, j, k}, dim_ratio_vel, ipos); + wrap_coord(ipos, vel_dim); + vel_index = grid_index_general(ipos[0], ipos[1], ipos[2], vel_dim); + for (axis = 0; axis < 3; axis++) { + pos[axis] += + vel_pointers[axis][vel_index] * velocity_displacement_factor[axis]; + // add 2LPT second order corrections + if (matter_options_global->PERTURB_ALGORITHM == 2) { + pos[axis] -= vel_pointers_2LPT[axis][vel_index] * + velocity_displacement_factor_2LPT[axis]; + } + pos[axis] *= dim_ratio_out; + } + + // CIC interpolation + dens_index = grid_index_general(i, j, k, dens_dim); + curr_dens = 1.0 + dens_pointer[dens_index] * init_growth_factor; + do_cic_interpolation(resampled_box, pos, out_dim, curr_dens); + } + } + } + } +} + +// Function that maps a IC density grid to the perturbed density grid +// TODO: This shares a lot of code with move_grid_masses and (future) move_cat_galprops. 
+// I should look into combining elements, however since the differences +// are on the innermost loops, any generalisation is likely to slow things down. +void move_grid_galprops(double redshift, float *dens_pointer, int dens_dim[3], + float *vel_pointers[3], float *vel_pointers_2LPT[3], int vel_dim[3], + HaloBox *boxes, int out_dim[3], float *mturn_a_grid, float *mturn_m_grid, + ScalingConstants *consts, IntegralCondition *integral_cond) { + // grid dimension constants + double boxlen = simulation_options_global->BOX_LEN; + double boxlen_z = boxlen * simulation_options_global->NON_CUBIC_FACTOR; + double box_size[3] = {boxlen, boxlen, boxlen_z}; + double dim_ratio_vel = (double)vel_dim[0] / (double)dens_dim[0]; + double dim_ratio_out = (double)out_dim[0] / (double)dens_dim[0]; + + double prefactor_mass = RHOcrit * cosmo_params_global->OMm; + double prefactor_stars = RHOcrit * cosmo_params_global->OMb * consts->fstar_10; + double prefactor_stars_mini = RHOcrit * cosmo_params_global->OMb * consts->fstar_7; + double prefactor_sfr = prefactor_stars / consts->t_star / consts->t_h; + double prefactor_sfr_mini = prefactor_stars_mini / consts->t_star / consts->t_h; + double prefactor_nion = prefactor_stars * consts->fesc_10 * consts->pop2_ion; + double prefactor_nion_mini = prefactor_stars_mini * consts->fesc_7 * consts->pop3_ion; + double prefactor_xray = RHOcrit * cosmo_params_global->OMm; + + // Setup IC velocity factors + double growth_factor = dicke(redshift); + double displacement_factor_2LPT = -(3.0 / 7.0) * growth_factor * growth_factor; // 2LPT eq. D8 + + double init_growth_factor = dicke(simulation_options_global->INITIAL_REDSHIFT); + double init_displacement_factor_2LPT = + -(3.0 / 7.0) * init_growth_factor * init_growth_factor; // 2LPT eq. 
D8 + + double velocity_displacement_factor[3] = { + (growth_factor - init_growth_factor) / box_size[0] * dens_dim[0], + (growth_factor - init_growth_factor) / box_size[1] * dens_dim[1], + (growth_factor - init_growth_factor) / box_size[2] * dens_dim[2]}; + double velocity_displacement_factor_2LPT[3] = { + (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[0] * dens_dim[0], + (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[1] * dens_dim[1], + (displacement_factor_2LPT - init_displacement_factor_2LPT) / box_size[2] * dens_dim[2]}; +#pragma omp parallel num_threads(simulation_options_global->N_THREADS) + { + int i, j, k, axis; + double pos[3], curr_dens; + int ipos[3]; + unsigned long long vel_index, dens_index; + double l10_mturn_a = log10(consts->mturn_a_nofb); + double l10_mturn_m = log10(consts->mturn_m_nofb); + HaloProperties properties; +#pragma omp for + for (i = 0; i < dens_dim[0]; i++) { + for (j = 0; j < dens_dim[1]; j++) { + for (k = 0; k < dens_dim[2]; k++) { + // Transform position to units of box size + pos[0] = i; + pos[1] = j; + pos[2] = k; + resample_index((int[3]){i, j, k}, dim_ratio_vel, ipos); + wrap_coord(ipos, vel_dim); + vel_index = grid_index_general(ipos[0], ipos[1], ipos[2], vel_dim); + for (axis = 0; axis < 3; axis++) { + pos[axis] += + vel_pointers[axis][vel_index] * velocity_displacement_factor[axis]; + // add 2LPT second order corrections + if (matter_options_global->PERTURB_ALGORITHM == 2) { + pos[axis] -= vel_pointers_2LPT[axis][vel_index] * + velocity_displacement_factor_2LPT[axis]; + } + pos[axis] *= dim_ratio_out; + } + + // CIC interpolation + dens_index = grid_index_general(i, j, k, dens_dim); + curr_dens = dens_pointer[dens_index] * growth_factor; + if (astro_options_global->USE_MINI_HALOS) { + l10_mturn_a = mturn_a_grid[dens_index]; + l10_mturn_m = mturn_m_grid[dens_index]; + } + + get_cell_integrals(curr_dens, l10_mturn_a, l10_mturn_m, consts, integral_cond, + &properties); + + // using 
the properties struct: + // stellar_mass --> no F_esc integral ACG + // stellar_mass_mini --> no F_esc integral MCG + // n_ion --> F_esc integral ACG + // fescweighted_sfr --> F_esc integral MCG + // halo_xray --> Xray integral + do_cic_interpolation(boxes->halo_sfr, pos, out_dim, + properties.stellar_mass * prefactor_sfr); + do_cic_interpolation(boxes->n_ion, pos, out_dim, + properties.n_ion * prefactor_nion + + properties.fescweighted_sfr * prefactor_nion_mini); + + if (astro_options_global->USE_MINI_HALOS) { + do_cic_interpolation(boxes->halo_sfr_mini, pos, out_dim, + properties.stellar_mass_mini * prefactor_sfr_mini); + } + if (astro_options_global->USE_TS_FLUCT) { + do_cic_interpolation(boxes->halo_xray, pos, out_dim, + properties.halo_xray * prefactor_xray); + } + + if (config_settings.EXTRA_HALOBOX_FIELDS) { + do_cic_interpolation(boxes->halo_mass, pos, out_dim, + properties.halo_mass * prefactor_mass); + do_cic_interpolation(boxes->halo_stars, pos, out_dim, + properties.stellar_mass * prefactor_stars); + if (astro_options_global->USE_MINI_HALOS) { + do_cic_interpolation( + boxes->halo_stars_mini, pos, out_dim, + properties.stellar_mass_mini * prefactor_stars_mini); + } + } + } + } + } + } + // Without stochasticity, these grids are the same to a constant + double prefactor_wsfr = 1 / consts->t_h / consts->t_star; + if (astro_options_global->INHOMO_RECO) { + for (int i = 0; i < HII_TOT_NUM_PIXELS; i++) { + boxes->whalo_sfr[i] = boxes->n_ion[i] * prefactor_wsfr; + } + } +} diff --git a/src/py21cmfast/src/map_mass.h b/src/py21cmfast/src/map_mass.h new file mode 100644 index 000000000..75e5d55da --- /dev/null +++ b/src/py21cmfast/src/map_mass.h @@ -0,0 +1,24 @@ + +#include "HaloBox.h" +#include "OutputStructs.h" +#include "scaling_relations.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void move_grid_masses(double redshift, float *dens_pointer, int dens_dim[3], float *vel_pointers[3], + float *vel_pointers_2LPT[3], int vel_dim[3], double *resampled_box, + 
int out_dim[3]); + +void move_grid_galprops(double redshift, float *dens_pointer, int dens_dim[3], + float *vel_pointers[3], float *vel_pointers_2LPT[3], int vel_dim[3], + HaloBox *boxes, int out_dim[3], float *mturn_a_grid, float *mturn_m_grid, + ScalingConstants *consts, IntegralCondition *integral_cond); + +double *MapMass_gpu(InitialConditions *boxes, double *resampled_box, int dimension, + float f_pixel_factor, float init_growth_factor); + +#ifdef __cplusplus +} +#endif diff --git a/src/py21cmfast/src/meson.build b/src/py21cmfast/src/meson.build new file mode 100644 index 000000000..9eb82ed63 --- /dev/null +++ b/src/py21cmfast/src/meson.build @@ -0,0 +1,151 @@ +# Define the C source files +c_source_files = files([ + 'BrightnessTemperatureBox.c', + 'HaloBox.c', + 'HaloField.c', + 'InitialConditions.c', + 'InputParameters.c', + 'IonisationBox.c', + 'LuminosityFunction.c', + 'PerturbField.c', + 'PerturbHaloField.c', + 'SpinTemperatureBox.c', + 'Stochasticity.c', + 'bubble_helper_progs.c', + 'cosmology.c', + 'debugging.c', + 'dft.c', + 'elec_interp.c', + 'filtering.c', + 'heating_helper_progs.c', + 'hmf.c', + 'indexing.c', + 'integral_wrappers.c', + 'interp_tables.c', + 'interpolation.c', + 'photoncons.c', + 'recombinations.c', + 'rng.c', + 'scaling_relations.c', + 'thermochem.c', + 'map_mass.c', +]) + +# Define the CUDA source files +cuda_source_files = files([ + 'HaloField.cu', + 'IonisationBox.cu', + 'SpinTemperatureBox.cu', + 'Stochasticity.cu', + 'filtering.cu', + 'hmf.cu', + 'interp_tables.cu', + 'interpolation.cu', + 'device_rng.cu', + 'MapMass_gpu.cu', + 'test_Stochasticity.cu', + 'cuda_hello_world.cu', +]) + +# C++ wrapper file +cpp_source_files = files(['_wrapper.cpp']) + +# Define the 21cmFast dependencies +omp = dependency('openmp') +gsl = dependency('gsl') +nanobind = dependency('nanobind', static: true) + +# If/when fftw gets added to Meson WrapDB, we'll be able to use this: +# fftw = dependency('fftw3f_threads') +# ... 
but until then, we need to jump through some hoops: +cc = meson.get_compiler ('c') +search_paths = [ '/usr/lib', '/usr/local/lib', '/opt/homebrew/lib' ] +fftw = cc.find_library ('fftw3f', required: true, dirs: search_paths) +fftw_threads = cc.find_library ('fftw3f_threads', required: true, dirs: search_paths) + +# Base dependencies (always needed) +deps = [omp, gsl, nanobind, fftw, fftw_threads] + +# CUDA dependency (optional) +# The root meson.build already checked for USE_CUDA environment variable and nvcc availability +cuda_dep = dependency('cuda', version: '>=10.0', required: false) + +# Check if CUDA language is available (this will be true only if root meson.build successfully added it) +if cuda_dep.found() + message('Using CUDA compilation in source build') + cuda_compiler = meson.get_compiler('cuda', required: false) + add_project_arguments('-DUSE_CUDA=1', language: ['c', 'cpp', 'cuda']) + + # CUDA compiler arguments + # Try to detect GPU architecture, fall back to a reasonable default + detect_arch_cmd = run_command('nvidia-smi', '--query-gpu=compute_cap', '--format=csv,noheader,nounits', check: false) + + if detect_arch_cmd.returncode() == 0 and detect_arch_cmd.stdout().strip() != '' + # Parse the compute capability (e.g., "7.5" -> "sm_75") + compute_cap = detect_arch_cmd.stdout().strip().split('\n')[0] + arch_major = compute_cap.split('.')[0] + arch_minor = compute_cap.split('.')[1] + detected_arch = 'sm_' + arch_major + arch_minor + message('Detected GPU architecture: ' + detected_arch) + cuda_arch = detected_arch + else + # Fall back to a widely compatible architecture + cuda_arch = 'sm_60' # Pascal - widely compatible + message('Could not detect GPU architecture, using default: ' + cuda_arch) + endif + + cuda_args = [ + '-arch=' + cuda_arch, + '--extended-lambda', + '--expt-relaxed-constexpr', + ] + add_project_arguments(cuda_args, language: 'cuda') + + # Add CUDA runtime library + cuda_rt = cuda_compiler.find_library('cudart', required: true) + deps 
= deps + [cuda_dep, cuda_rt] + + # Include CUDA source files + all_source_files = c_source_files + cuda_source_files + cpp_source_files +else + message('Using CPU-only compilation in source build') + add_project_arguments('-DUSE_CUDA=0', language: ['c', 'cpp']) + # Only include C/C++ files, no CUDA files + all_source_files = c_source_files + cpp_source_files +endif + +# Define a mapping of log level strings to integers +log_level_map = { + 'NO_LOG': 0, + 'ERROR': 1, + 'WARNING': 2, + 'INFO': 3, + 'DEBUG': 4, + 'SUPER_DEBUG': 5, + 'ULTRA_DEBUG': 6, +} + +# Check for environment variable first, then fall back to meson option +env_log_level = run_command('printenv', 'LOG_LEVEL', check: false).stdout().strip() +if env_log_level != '' + log_level_str = env_log_level +else + log_level_str = get_option('log_level') +endif + +# Convert the string to an integer using the map, defaulting to 2 (warnings) if the key is invalid +log_level = log_level_map.get(log_level_str, 2) + +# Print the selected log level for debugging purposes +message('Selected log level: ' + log_level.to_string()) + +add_project_arguments('-DLOG_LEVEL=' + log_level.to_string(), language: 'c') + +# Define the Python extension module +py.extension_module( + 'c_21cmfast', + all_source_files, + dependencies: deps, + install: true, + subdir:'py21cmfast', +) diff --git a/src/py21cmfast/src/photoncons.c b/src/py21cmfast/src/photoncons.c index 9f96a352d..9af217f01 100644 --- a/src/py21cmfast/src/photoncons.c +++ b/src/py21cmfast/src/photoncons.c @@ -111,7 +111,7 @@ int InitialisePhotonCons() { z_arr = calloc(Nmax, sizeof(double)); Q_arr = calloc(Nmax, sizeof(double)); - struct ScalingConstants sc_i, sc_0, sc_1; + ScalingConstants sc_i, sc_0, sc_1; set_scaling_constants(a_end, &sc_i, false); // set the minimum source mass diff --git a/src/py21cmfast/src/photoncons.h b/src/py21cmfast/src/photoncons.h index 5f428b6dc..91a5e2de5 100644 --- a/src/py21cmfast/src/photoncons.h +++ b/src/py21cmfast/src/photoncons.h @@ 
-5,6 +5,9 @@ #include "InputParameters.h" +#ifdef __cplusplus +extern "C" { +#endif // This is directly accessed in the wrapper currently // TODO: remove this global declaration and make an internal checking function extern bool photon_cons_allocated; @@ -29,4 +32,7 @@ int ObtainPhotonConsData(double *z_at_Q_data, double *Q_data, int *Ndata_analyti void set_alphacons_params(double norm, double slope); double get_fesc_fit(double redshift); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/recombinations.h b/src/py21cmfast/src/recombinations.h index 788f9e28e..1d0aa9645 100644 --- a/src/py21cmfast/src/recombinations.h +++ b/src/py21cmfast/src/recombinations.h @@ -1,8 +1,14 @@ #ifndef _RECOMB_H #define _RECOMB_H +#ifdef __cplusplus +extern "C" { +#endif double splined_recombination_rate(double z_eff, double gamma12_bg); /*initializes the lookup table for the PDF density integral in MHR00 model at redshift z*/ void init_MHR(); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/src/scaling_relations.c b/src/py21cmfast/src/scaling_relations.c index 37c452ec6..c567a4a85 100644 --- a/src/py21cmfast/src/scaling_relations.c +++ b/src/py21cmfast/src/scaling_relations.c @@ -18,7 +18,7 @@ #include "photoncons.h" #include "thermochem.h" -void print_sc_consts(struct ScalingConstants *c) { +void print_sc_consts(ScalingConstants *c) { LOG_DEBUG("Printing scaling relation constants z = %.3f....", c->redshift); LOG_DEBUG("SHMR: f10 %.2e a %.2e f7 %.2e a_mini %.2e sigma %.2e", c->fstar_10, c->alpha_star, c->fstar_7, c->alpha_star_mini, c->sigma_star); @@ -33,11 +33,11 @@ void print_sc_consts(struct ScalingConstants *c) { return; } -void set_scaling_constants(double redshift, struct ScalingConstants *consts, bool use_photoncons) { +void set_scaling_constants(double redshift, ScalingConstants *consts, bool use_photoncons) { consts->redshift = redshift; // Set on for the fixed grid case since we are missing halos above the cell mass - consts->fix_mean = 
matter_options_global->FIXED_HALO_GRIDS; + consts->fix_mean = matter_options_global->HMF == 2 || matter_options_global->HMF == 3; // whether to fix *integrated* (not sampled) galaxy properties to the expected mean consts->scaling_median = astro_options_global->HALO_SCALING_RELATIONS_MEDIAN; @@ -103,8 +103,8 @@ void set_scaling_constants(double redshift, struct ScalingConstants *consts, boo } // It's often useful to create a copy of scaling constants without F_ESC -struct ScalingConstants evolve_scaling_constants_sfr(struct ScalingConstants *sc) { - struct ScalingConstants sc_sfrd = *sc; +ScalingConstants evolve_scaling_constants_sfr(ScalingConstants *sc) { + ScalingConstants sc_sfrd = *sc; sc_sfrd.fesc_10 = 1.; sc_sfrd.fesc_7 = 1.; sc_sfrd.alpha_esc = 0.; @@ -115,10 +115,9 @@ struct ScalingConstants evolve_scaling_constants_sfr(struct ScalingConstants *sc } // It's often useful to create a copy of scaling relations at a different z -struct ScalingConstants evolve_scaling_constants_to_redshift(double redshift, - struct ScalingConstants *sc, - bool use_photoncons) { - struct ScalingConstants sc_z = *sc; +ScalingConstants evolve_scaling_constants_to_redshift(double redshift, ScalingConstants *sc, + bool use_photoncons) { + ScalingConstants sc_z = *sc; sc_z.redshift = redshift; sc_z.t_h = t_hubble(redshift); @@ -268,7 +267,7 @@ double get_lx_on_sfr(double sfr, double metallicity, double lx_constant) { } void get_halo_stellarmass(double halo_mass, double mturn_acg, double mturn_mcg, double star_rng, - struct ScalingConstants *consts, double *star_acg, double *star_mcg) { + ScalingConstants *consts, double *star_acg, double *star_mcg) { // low-mass ACG power-law parameters double f_10 = consts->fstar_10; double f_a = consts->alpha_star; @@ -320,7 +319,7 @@ void get_halo_stellarmass(double halo_mass, double mturn_acg, double mturn_mcg, } void get_halo_sfr(double stellar_mass, double stellar_mass_mini, double sfr_rng, - struct ScalingConstants *consts, double *sfr, double 
*sfr_mini) { + ScalingConstants *consts, double *sfr, double *sfr_mini) { double sfr_mean, sfr_mean_mini; double sfr_sample, sfr_sample_mini; @@ -376,7 +375,7 @@ void get_halo_metallicity(double sfr, double stellar, double redshift, double *z } void get_halo_xray(double sfr, double sfr_mini, double metallicity, double xray_rng, - struct ScalingConstants *consts, double *xray_out) { + ScalingConstants *consts, double *xray_out) { double sigma_xray = consts->sigma_xray; // adjustment to the mean for lognormal scatter diff --git a/src/py21cmfast/src/scaling_relations.h b/src/py21cmfast/src/scaling_relations.h index 17163ee72..afee70521 100644 --- a/src/py21cmfast/src/scaling_relations.h +++ b/src/py21cmfast/src/scaling_relations.h @@ -9,7 +9,7 @@ // These are just the values which come from the InputStruct objects and don't change within the // snapshot using this reduces the use of the global parameter structs and allows fewer exp/log // unit changes -struct ScalingConstants { +typedef struct ScalingConstants { double redshift; bool fix_mean; bool scaling_median; @@ -49,28 +49,27 @@ struct ScalingConstants { double Mlim_Fesc; double Mlim_Fstar_mini; double Mlim_Fesc_mini; -}; +} ScalingConstants; -void set_scaling_constants(double redshift, struct ScalingConstants *consts, bool use_photoncons); +void set_scaling_constants(double redshift, ScalingConstants *consts, bool use_photoncons); double get_lx_on_sfr(double sfr, double metallicity, double lx_constant); void get_halo_stellarmass(double halo_mass, double mturn_acg, double mturn_mcg, double star_rng, - struct ScalingConstants *consts, double *star_acg, double *star_mcg); + ScalingConstants *consts, double *star_acg, double *star_mcg); void get_halo_sfr(double stellar_mass, double stellar_mass_mini, double sfr_rng, - struct ScalingConstants *consts, double *sfr, double *sfr_mini); + ScalingConstants *consts, double *sfr, double *sfr_mini); void get_halo_metallicity(double sfr, double stellar, double redshift, 
double *z_out); void get_halo_xray(double sfr, double sfr_mini, double metallicity, double xray_rng, - struct ScalingConstants *consts, double *xray_out); + ScalingConstants *consts, double *xray_out); double scaling_PL_limit(double M, double norm, double alpha, double pivot, double limit); double log_scaling_PL_limit(double lnM, double ln_norm, double alpha, double ln_pivot, double ln_limit); double scaling_double_PL(double M, double alpha_lo, double pivot_ratio, double alpha_hi, double pivot_hi); -struct ScalingConstants evolve_scaling_constants_sfr(struct ScalingConstants *sc); -struct ScalingConstants evolve_scaling_constants_to_redshift(double redshift, - struct ScalingConstants *sc, - bool use_photoncons); -void print_sc_consts(struct ScalingConstants *c); +ScalingConstants evolve_scaling_constants_sfr(ScalingConstants *sc); +ScalingConstants evolve_scaling_constants_to_redshift(double redshift, ScalingConstants *sc, + bool use_photoncons); +void print_sc_consts(ScalingConstants *c); #endif diff --git a/src/py21cmfast/src/test_Stochasticity.cu b/src/py21cmfast/src/test_Stochasticity.cu new file mode 100644 index 000000000..c1bbdec12 --- /dev/null +++ b/src/py21cmfast/src/test_Stochasticity.cu @@ -0,0 +1,46 @@ +#include +#include + +#include "Stochasticity.cu" + +void testCondenseDeviceArray() +{ + // Input data + float h_array[] = {1.0f, 0.0f, 2.0f, 3.0f, 0.0f, 4.0f}; + float mask_value = 0.0f; + int original_size = 6; + + // Expected outputs + float expected_array[] = {1.0f, 2.0f, 3.0f, 4.0f, 0.0f, 0.0f}; + int expected_valid_size = 4; + + // Allocate and copy to device + float *d_array; + cudaMalloc(&d_array, original_size * sizeof(float)); + cudaMemcpy(d_array, h_array, original_size * sizeof(float), cudaMemcpyHostToDevice); + + // Call the function from Stochasticity.cu + int valid_size = condenseDeviceArray(d_array, original_size, mask_value); + + // Copy the results back to the host + float h_result[original_size]; + cudaMemcpy(h_result, d_array, 
original_size * sizeof(float), cudaMemcpyDeviceToHost); + + // Validate the results + assert(valid_size == expected_valid_size); + for (int i = 0; i < original_size; ++i) + { + assert(h_result[i] == expected_array[i]); + } + + std::cout << "Test passed: condenseDeviceArray\n"; + + // Free device memory + cudaFree(d_array); +} + +int main() +{ + testCondenseDeviceArray(); + return 0; +} diff --git a/src/py21cmfast/src/thermochem.h b/src/py21cmfast/src/thermochem.h index 4a18642d2..f2e92d35f 100644 --- a/src/py21cmfast/src/thermochem.h +++ b/src/py21cmfast/src/thermochem.h @@ -3,6 +3,9 @@ #include "InputParameters.h" +#ifdef __cplusplus +extern "C" { +#endif float ComputeTau(int Npoints, float *redshifts, float *global_xHI, float z_re_HeII); double molecular_cooling_threshold(float z); double atomic_cooling_threshold(float z); @@ -21,4 +24,7 @@ double HeII_ion_crosssec(double nu); double HI_ion_crosssec(double nu); double neutral_fraction(double density, double T4, double gamma, int usecaseB); +#ifdef __cplusplus +} +#endif #endif diff --git a/src/py21cmfast/wrapper/_utils.py b/src/py21cmfast/wrapper/_utils.py index 24b599c83..9b28e745b 100644 --- a/src/py21cmfast/wrapper/_utils.py +++ b/src/py21cmfast/wrapper/_utils.py @@ -3,12 +3,10 @@ import logging import numpy as np -from cffi import FFI -from ..c_21cmfast import ffi, lib -from .exceptions import _process_exitcode +import py21cmfast.c_21cmfast as lib -_ffi = FFI() +from .exceptions import _process_exitcode logger = logging.getLogger(__name__) @@ -30,33 +28,55 @@ def asarray(ptr, shape): """Get the canonical C type of the elements of ptr as a string.""" - ctype = _ffi.getctype(_ffi.typeof(ptr).item).split("*")[0].strip() + ctype = type(ptr).__name__ # TODO: check if ctype not in ctype2dtype: raise RuntimeError( f"Cannot create an array for element type: {ctype}. Can do {list(ctype2dtype.values())}." 
) - array = np.frombuffer( - _ffi.buffer(ptr, _ffi.sizeof(ctype) * np.prod(shape)), ctype2dtype[ctype] - ) + array = np.frombuffer(ptr, ctype2dtype[ctype]) # TODO: check array.shape = shape return array +def _nb_initialise_return_value(arg_string, out_shape=(1,)): + """Return a zero-initialised object of the correct type given a nanobind signature. + + Currently only works with wrapped structures or numpy arrays. + """ + # If it's a wrapped class, return the class + if "py21cmfast.c_21cmfast" in arg_string: + return getattr(lib, arg_string.split("py21cmfast.c_21cmfast")[-1])() + + if "*" in arg_string or "ndarray" in arg_string: + base_type = arg_string.split("dtype=")[1].split("]")[0] + return np.zeros(out_shape, dtype=getattr(np, base_type)) + + raise ValueError( + f"Cannot create a zero-initialised object of type {arg_string}." + "As it is not a pointer, array or class. Please check the function signature." + ) + + def _call_c_simple(fnc, *args): """Call a simple C function that just returns an object. - Any such function should be defined such that the last argument is an int pointer generating - the status. + Assumes that the last argument is a pointer to an object that will be filled in by the C function. + This argument is initialised here and returned. """ # Parse the function to get the type of the last argument - cdata = str(ffi.addressof(lib, fnc.__name__)) - kind = cdata.split("(")[-1].split(")")[0].split(",")[-1] - result = ffi.new(kind) + cdata = fnc.__nb_signature__[0][0] + # Nanobind signature is 'def fnc.__name__(arg0: type0, arg1: type1, ..., argN: typeN, /) -> returntype' + # We wish to extract the type of the last argument only. 
+ signature_string = ( + cdata.split("(")[-1].split(")")[0].split(",")[-2].replace("arg: ", "").strip() + ) + # NOTE: This uses the default return size == 1 for arrays + result = _nb_initialise_return_value(signature_string) status = fnc(*args, result) _process_exitcode(status, fnc, args) - return result[0] + return result def camel_to_snake(word: str, depublicize: bool = False): diff --git a/src/py21cmfast/wrapper/cfuncs.py b/src/py21cmfast/wrapper/cfuncs.py index 285efdef8..a1e3e7208 100644 --- a/src/py21cmfast/wrapper/cfuncs.py +++ b/src/py21cmfast/wrapper/cfuncs.py @@ -9,8 +9,9 @@ from numpy.typing import NDArray from scipy.interpolate import interp1d +import py21cmfast.c_21cmfast as lib + from .._cfg import config -from ..c_21cmfast import ffi, lib from ._utils import _process_exitcode from .inputs import ( InputParameters, @@ -21,7 +22,16 @@ # Ideally, backend functions that we access here should do all the broadcasting/initialisation themselves # These decorators are for lower functions which are called directly in one or two lines, like delta_crit -# TODO: a lot of these assume input as numpy arrays via use of .shape, explicitly require this +# NOTE: On casting to C pointers: +# ------------------------------- +# Currently our wrapper functions directly take C type pointers, which +# requires us to cast the data to the correct type before passing it to the C. +# This is made annoying by the fact that CAMB (which is indirectly imported somewhere) +# appears to have overwritten the ctypes library pointer types which cause errors. +# We will use the nanobind ndarray casters, which allow us to pass +# numpy arrays directly to C++ functions, with size and type information. +# We will have to translate the `integral_wrapper.c` functions to C++ and (maybe?) define +# some wrapper layer functions in C++ for the output struct functions to parse the array data. 
def broadcast_input_struct(inputs: InputParameters): @@ -191,14 +201,10 @@ def compute_tau( redshifts = np.array(redshifts, dtype="float32") global_xHI = np.array(global_xHI, dtype="float32") - z = ffi.cast("float *", ffi.from_buffer(redshifts)) - xHI = ffi.cast("float *", ffi.from_buffer(global_xHI)) - # Run the C code return lib.ComputeTau( - len(redshifts), - z, - xHI, + redshifts, + global_xHI, z_re_HeII, ) @@ -280,85 +286,61 @@ def compute_luminosity_function( ) component = "acg" - lfunc = np.zeros(len(redshifts) * nbins) - Muvfunc = np.zeros(len(redshifts) * nbins) - Mhfunc = np.zeros(len(redshifts) * nbins) - - lfunc.shape = (len(redshifts), nbins) - Muvfunc.shape = (len(redshifts), nbins) - Mhfunc.shape = (len(redshifts), nbins) - - c_Muvfunc = ffi.cast("double *", ffi.from_buffer(Muvfunc)) - c_Mhfunc = ffi.cast("double *", ffi.from_buffer(Mhfunc)) - c_lfunc = ffi.cast("double *", ffi.from_buffer(lfunc)) + lfunc = np.zeros((len(redshifts), nbins)) + Muvfunc = np.zeros((len(redshifts), nbins)) + Mhfunc = np.zeros((len(redshifts), nbins)) - lfunc_MINI = np.zeros(len(redshifts) * nbins) - Muvfunc_MINI = np.zeros(len(redshifts) * nbins) - Mhfunc_MINI = np.zeros(len(redshifts) * nbins) - - lfunc_MINI.shape = (len(redshifts), nbins) - Muvfunc_MINI.shape = (len(redshifts), nbins) - Mhfunc_MINI.shape = (len(redshifts), nbins) - - c_Muvfunc_MINI = ffi.cast("double *", ffi.from_buffer(Muvfunc_MINI)) - c_Mhfunc_MINI = ffi.cast("double *", ffi.from_buffer(Mhfunc_MINI)) - c_lfunc_MINI = ffi.cast("double *", ffi.from_buffer(lfunc_MINI)) + lfunc_MINI = np.zeros((len(redshifts), nbins)) + Muvfunc_MINI = np.zeros((len(redshifts), nbins)) + Mhfunc_MINI = np.zeros((len(redshifts), nbins)) if component in ("both", "acg"): # Run the C code errcode = lib.ComputeLF( - nbins, 1, - len(redshifts), - ffi.cast("float *", ffi.from_buffer(redshifts)), - ffi.cast("float *", ffi.from_buffer(mturnovers)), - c_Muvfunc, - c_Mhfunc, - c_lfunc, + nbins, + redshifts, + mturnovers, + Muvfunc, + 
Mhfunc, + lfunc, ) _process_exitcode( errcode, lib.ComputeLF, ( - nbins, 1, - len(redshifts), + nbins, ), ) if component in ("both", "mcg"): # Run the C code errcode = lib.ComputeLF( - nbins, 2, - len(redshifts), - ffi.cast("float *", ffi.from_buffer(redshifts)), - ffi.cast("float *", ffi.from_buffer(mturnovers_mini)), - c_Muvfunc_MINI, - c_Mhfunc_MINI, - c_lfunc_MINI, + nbins, + redshifts, + mturnovers_mini, + Muvfunc_MINI, + Mhfunc_MINI, + lfunc_MINI, ) _process_exitcode( errcode, lib.ComputeLF, ( - nbins, 2, - len(redshifts), + nbins, ), ) if component == "both": # redo the Muv range using the faintest (most likely MINI) and the brightest (most likely massive) - lfunc_all = np.zeros(len(redshifts) * nbins) - Muvfunc_all = np.zeros(len(redshifts) * nbins) - Mhfunc_all = np.zeros(len(redshifts) * nbins * 2) - - lfunc_all.shape = (len(redshifts), nbins) - Muvfunc_all.shape = (len(redshifts), nbins) - Mhfunc_all.shape = (len(redshifts), nbins, 2) + lfunc_all = np.zeros((len(redshifts), nbins)) + Muvfunc_all = np.zeros((len(redshifts), nbins)) + Mhfunc_all = np.zeros((len(redshifts), nbins, 2)) for iz in range(len(redshifts)): Muvfunc_all[iz] = np.linspace( @@ -439,7 +421,7 @@ def get_matter_power_values( def evaluate_sigma( *, inputs: InputParameters, - masses: NDArray[float], + masses: NDArray[np.float64], ): """ Evaluate the variance of a mass scale. 
@@ -447,14 +429,13 @@ def evaluate_sigma( Uses the 21cmfast backend """ masses = masses.astype("f8") - sigma = np.zeros_like(masses) - dsigmasq = np.zeros_like(masses) + sigma = np.zeros_like(masses, dtype="f8") + dsigmasq = np.zeros_like(masses, dtype="f8") lib.get_sigma( - masses.size, - ffi.cast("double *", ffi.from_buffer(masses)), - ffi.cast("double *", ffi.from_buffer(sigma)), - ffi.cast("double *", ffi.from_buffer(dsigmasq)), + masses, + sigma, + dsigmasq, ) return sigma, dsigmasq @@ -507,7 +488,7 @@ def get_delta_crit_nu(hmf_int_flag: int, sigma: float, growth: float): @broadcast_params def evaluate_condition_integrals( inputs: InputParameters, - cond_array: NDArray[float], + cond_array: NDArray[np.float64], redshift: float, redshift_prev: float | None = None, ): @@ -524,10 +505,9 @@ def evaluate_condition_integrals( lib.get_condition_integrals( redshift, redshift_prev if redshift_prev is not None else -1, - cond_array.size, - ffi.cast("double *", ffi.from_buffer(cond_array)), - ffi.cast("double *", ffi.from_buffer(n_halo)), - ffi.cast("double *", ffi.from_buffer(m_coll)), + cond_array, + n_halo, + m_coll, ) return n_halo, m_coll @@ -537,9 +517,9 @@ def evaluate_condition_integrals( def integrate_chmf_interval( inputs: InputParameters, redshift: float, - lnm_lower: NDArray[float], - lnm_upper: NDArray[float], - cond_values: NDArray[float], + lnm_lower: NDArray[np.float64], + lnm_upper: NDArray[np.float64], + cond_values: NDArray[np.float64], redshift_prev: float | None = None, ): """Evaluate conditional mass function integrals at a range of mass intervals.""" @@ -555,12 +535,10 @@ def integrate_chmf_interval( lib.get_halo_chmf_interval( redshift, redshift_prev if redshift_prev is not None else -1, - len(cond_values), - ffi.cast("double *", ffi.from_buffer(cond_values)), - len(lnm_lower), - ffi.cast("double *", ffi.from_buffer(lnm_lower)), - ffi.cast("double *", ffi.from_buffer(lnm_upper)), - ffi.cast("double *", ffi.from_buffer(out_prob)), + cond_values, + 
lnm_lower, + lnm_upper, + out_prob, ) return out_prob @@ -569,8 +547,8 @@ def integrate_chmf_interval( @broadcast_params def evaluate_inverse_table( inputs: InputParameters, - cond_array: NDArray[float], - probabilities: NDArray[float], + cond_array: NDArray[np.float64], + probabilities: NDArray[np.float64], redshift: float, redshift_prev: float | None = None, ): @@ -591,10 +569,9 @@ def evaluate_inverse_table( lib.get_halomass_at_probability( redshift, redshift_prev, - cond_array.size, - ffi.cast("double *", ffi.from_buffer(cond_array)), - ffi.cast("double *", ffi.from_buffer(probabilities)), - ffi.cast("double *", ffi.from_buffer(masses)), + cond_array, + probabilities, + masses, ) return masses @@ -603,7 +580,7 @@ def evaluate_inverse_table( @broadcast_params def evaluate_FgtrM_cond( inputs: InputParameters, - densities: NDArray[float], + densities: NDArray[np.float64], redshift: float, R: float, ): @@ -615,10 +592,9 @@ def evaluate_FgtrM_cond( lib.get_conditional_FgtrM( redshift, R, - densities.size, - ffi.cast("double *", ffi.from_buffer(densities)), - ffi.cast("double *", ffi.from_buffer(fcoll)), - ffi.cast("double *", ffi.from_buffer(dfcoll)), + densities, + fcoll, + dfcoll, ) return fcoll, dfcoll @@ -627,8 +603,8 @@ def evaluate_FgtrM_cond( def evaluate_SFRD_z( *, inputs: InputParameters, - redshifts: NDArray[float], - log10mturns: NDArray[float], + redshifts: NDArray[np.float64], + log10mturns: NDArray[np.float64], ): """Evaluate the global star formation rate density expected at a range of redshifts.""" if redshifts.shape != log10mturns.shape: @@ -643,11 +619,10 @@ def evaluate_SFRD_z( sfrd_mini = np.zeros_like(redshifts) lib.get_global_SFRD_z( - redshifts.size, - ffi.cast("double *", ffi.from_buffer(redshifts)), - ffi.cast("double *", ffi.from_buffer(log10mturns)), - ffi.cast("double *", ffi.from_buffer(sfrd)), - ffi.cast("double *", ffi.from_buffer(sfrd_mini)), + redshifts, + log10mturns, + sfrd, + sfrd_mini, ) return sfrd, sfrd_mini @@ -657,8 +632,8 @@ 
def evaluate_SFRD_z( def evaluate_Nion_z( *, inputs: InputParameters, - redshifts: NDArray[float], - log10mturns: NDArray[float], + redshifts: NDArray[np.float64], + log10mturns: NDArray[np.float64], ): """Evaluate the global ionising emissivity expected at a range of redshifts.""" if redshifts.shape != log10mturns.shape: @@ -673,11 +648,10 @@ def evaluate_Nion_z( nion_mini = np.zeros_like(redshifts) lib.get_global_Nion_z( - redshifts.size, - ffi.cast("double *", ffi.from_buffer(redshifts)), - ffi.cast("double *", ffi.from_buffer(log10mturns)), - ffi.cast("double *", ffi.from_buffer(nion)), - ffi.cast("double *", ffi.from_buffer(nion_mini)), + redshifts, + log10mturns, + nion, + nion_mini, ) return nion, nion_mini @@ -689,8 +663,8 @@ def evaluate_SFRD_cond( inputs: InputParameters, redshift: float, radius: float, - densities: NDArray[float], - log10mturns: NDArray[float], + densities: NDArray[np.float64], + log10mturns: NDArray[np.float64], ): """Evaluate the conditional star formation rate density expected at a range of densities.""" if densities.shape != log10mturns.shape: @@ -706,11 +680,10 @@ def evaluate_SFRD_cond( lib.get_conditional_SFRD( redshift, radius, - densities.size, - ffi.cast("double *", ffi.from_buffer(densities)), - ffi.cast("double *", ffi.from_buffer(log10mturns)), - ffi.cast("double *", ffi.from_buffer(sfrd)), - ffi.cast("double *", ffi.from_buffer(sfrd_mini)), + densities, + log10mturns, + sfrd, + sfrd_mini, ) return sfrd, sfrd_mini @@ -722,9 +695,9 @@ def evaluate_Nion_cond( inputs: InputParameters, redshift: float, radius: float, - densities: NDArray[float], - l10mturns_acg: NDArray[float], - l10mturns_mcg: NDArray[float], + densities: NDArray[np.float64], + l10mturns_acg: NDArray[np.float64], + l10mturns_mcg: NDArray[np.float64], ): """Evaluate the conditional ionising emissivity expected at a range of densities.""" if not (densities.shape == l10mturns_mcg.shape == l10mturns_acg.shape): @@ -741,12 +714,11 @@ def evaluate_Nion_cond( 
lib.get_conditional_Nion( redshift, radius, - densities.size, - ffi.cast("double *", ffi.from_buffer(densities)), - ffi.cast("double *", ffi.from_buffer(l10mturns_acg)), - ffi.cast("double *", ffi.from_buffer(l10mturns_mcg)), - ffi.cast("double *", ffi.from_buffer(nion)), - ffi.cast("double *", ffi.from_buffer(nion_mini)), + densities, + l10mturns_acg, + l10mturns_mcg, + nion, + nion_mini, ) return nion, nion_mini @@ -758,8 +730,8 @@ def evaluate_Xray_cond( inputs: InputParameters, redshift: float, radius: float, - densities: NDArray[float], - log10mturns: NDArray[float], + densities: NDArray[np.float64], + log10mturns: NDArray[np.float64], ): """Evaluate the conditional star formation rate density expected at a range of densities.""" if densities.shape != log10mturns.shape: @@ -775,10 +747,9 @@ def evaluate_Xray_cond( lib.get_conditional_Xray( redshift, radius, - densities.size, - ffi.cast("double *", ffi.from_buffer(densities)), - ffi.cast("double *", ffi.from_buffer(log10mturns)), - ffi.cast("double *", ffi.from_buffer(xray)), + densities, + log10mturns, + xray, ) return xray @@ -800,8 +771,7 @@ def sample_halos_from_conditions( n_cond = cond_array.size # all coordinates zero - crd_in = np.zeros(3 * n_cond).astype("f4") - + crd_in = np.zeros((n_cond, 3)).astype("f4") cond_array = cond_array.astype("f4") nhalo_out = np.zeros(1).astype("i4") N_out = np.zeros(n_cond).astype("i4") @@ -809,22 +779,21 @@ def sample_halos_from_conditions( exp_M = np.zeros(n_cond).astype("f8") exp_N = np.zeros(n_cond).astype("f8") halomass_out = np.zeros(buffer_size).astype("f4") - halocrd_out = np.zeros(int(3 * buffer_size)).astype("i4") + halocrd_out = np.zeros((buffer_size, 3)).astype("i4") lib.single_test_sample( inputs.random_seed, - n_cond, - ffi.cast("float *", cond_array.ctypes.data), - ffi.cast("float *", crd_in.ctypes.data), + cond_array, + crd_in, redshift, z_prev, - ffi.cast("int *", nhalo_out.ctypes.data), - ffi.cast("int *", N_out.ctypes.data), - ffi.cast("double *", 
exp_N.ctypes.data), - ffi.cast("double *", M_out.ctypes.data), - ffi.cast("double *", exp_M.ctypes.data), - ffi.cast("float *", halomass_out.ctypes.data), - ffi.cast("float *", halocrd_out.ctypes.data), + nhalo_out, + N_out, + exp_N, + M_out, + exp_M, + halomass_out, + halocrd_out, ) return { @@ -842,15 +811,15 @@ def convert_halo_properties( *, redshift: float, inputs: InputParameters, - halo_masses: NDArray[float], - star_rng: NDArray[float], - sfr_rng: NDArray[float], - xray_rng: NDArray[float], - halo_coords: NDArray[float] | None = None, - vcb_grid: NDArray[float] | None = None, - J_21_LW_grid: NDArray[float] | None = None, - z_re_grid: NDArray[float] | None = None, - Gamma12_grid: NDArray[float] | None = None, + halo_masses: NDArray[np.float64], + star_rng: NDArray[np.float64], + sfr_rng: NDArray[np.float64], + xray_rng: NDArray[np.float64], + halo_coords: NDArray[np.float64] | None = None, + vcb_grid: NDArray[np.float64] | None = None, + J_21_LW_grid: NDArray[np.float64] | None = None, + z_re_grid: NDArray[np.float64] | None = None, + Gamma12_grid: NDArray[np.float64] | None = None, ): """ Convert a halo catalogue's mass and RNG fields to halo properties. 
@@ -876,11 +845,12 @@ def convert_halo_properties( raise ValueError("Halo masses and rng shapes must be identical.") n_halos = halo_masses.size + orig_shape = halo_masses.shape out_buffer = np.zeros((n_halos, 12), dtype="f4") lo_dim = (inputs.simulation_options.HII_DIM,) * 3 if halo_coords is None: - halo_coords = np.zeros(3 * n_halos) + halo_coords = np.zeros((n_halos, 3)) if vcb_grid is None: vcb_grid = np.zeros(lo_dim) if J_21_LW_grid is None: @@ -895,42 +865,41 @@ def convert_halo_properties( z_re_grid = z_re_grid.astype("f4") Gamma12_grid = Gamma12_grid.astype("f4") - halo_masses = halo_masses.astype("f4") - halo_coords = halo_coords.astype("f4") - star_rng = star_rng.astype("f4") - sfr_rng = sfr_rng.astype("f4") - xray_rng = xray_rng.astype("f4") + halo_masses = halo_masses.reshape(n_halos).astype("f4") + halo_coords = halo_coords.reshape(n_halos, 3).astype("f4") + star_rng = star_rng.reshape(n_halos).astype("f4") + sfr_rng = sfr_rng.reshape(n_halos).astype("f4") + xray_rng = xray_rng.reshape(n_halos).astype("f4") lib.test_halo_props( redshift, - ffi.cast("float *", vcb_grid.ctypes.data), - ffi.cast("float *", J_21_LW_grid.ctypes.data), - ffi.cast("float *", z_re_grid.ctypes.data), - ffi.cast("float *", Gamma12_grid.ctypes.data), - n_halos, - ffi.cast("float *", halo_masses.ctypes.data), - ffi.cast("float *", halo_coords.ctypes.data), - ffi.cast("float *", star_rng.ctypes.data), - ffi.cast("float *", sfr_rng.ctypes.data), - ffi.cast("float *", xray_rng.ctypes.data), - ffi.cast("float *", out_buffer.ctypes.data), + vcb_grid, + J_21_LW_grid, + z_re_grid, + Gamma12_grid, + halo_masses, + halo_coords, + star_rng, + sfr_rng, + xray_rng, + out_buffer, ) out_buffer = out_buffer.reshape(n_halos, 12) return { - "halo_mass": out_buffer[:, 0].reshape(halo_masses.shape), - "halo_stars": out_buffer[:, 1].reshape(halo_masses.shape), - "halo_sfr": out_buffer[:, 2].reshape(halo_masses.shape), - "halo_xray": out_buffer[:, 3].reshape(halo_masses.shape), - "n_ion": 
out_buffer[:, 4].reshape(halo_masses.shape), - "halo_wsfr": out_buffer[:, 5].reshape(halo_masses.shape), - "halo_stars_mini": out_buffer[:, 6].reshape(halo_masses.shape), - "halo_sfr_mini": out_buffer[:, 7].reshape(halo_masses.shape), - "mturn_a": out_buffer[:, 8].reshape(halo_masses.shape), - "mturn_m": out_buffer[:, 9].reshape(halo_masses.shape), - "mturn_r": out_buffer[:, 10].reshape(halo_masses.shape), - "metallicity": out_buffer[:, 11].reshape(halo_masses.shape), + "halo_mass": out_buffer[:, 0].reshape(orig_shape), + "halo_stars": out_buffer[:, 1].reshape(orig_shape), + "halo_sfr": out_buffer[:, 2].reshape(orig_shape), + "halo_xray": out_buffer[:, 3].reshape(orig_shape), + "n_ion": out_buffer[:, 4].reshape(orig_shape), + "halo_wsfr": out_buffer[:, 5].reshape(orig_shape), + "halo_stars_mini": out_buffer[:, 6].reshape(orig_shape), + "halo_sfr_mini": out_buffer[:, 7].reshape(orig_shape), + "mturn_a": out_buffer[:, 8].reshape(orig_shape), + "mturn_m": out_buffer[:, 9].reshape(orig_shape), + "mturn_r": out_buffer[:, 10].reshape(orig_shape), + "metallicity": out_buffer[:, 11].reshape(orig_shape), } diff --git a/src/py21cmfast/wrapper/exceptions.py b/src/py21cmfast/wrapper/exceptions.py index 6bfed3749..e64982551 100644 --- a/src/py21cmfast/wrapper/exceptions.py +++ b/src/py21cmfast/wrapper/exceptions.py @@ -81,6 +81,12 @@ class MemoryAllocError(FatalCError): default_message = """An error has occured while attempting to allocate memory! (check the LOG for more info)""" +class CUDAError(FatalCError): + """An exception when an error occurs with CUDA.""" + + default_message = """A CUDA error has occured! 
(check the LOG for more info)""" + + SUCCESS = 0 IOERROR = 1 GSLERROR = 2 @@ -91,12 +97,15 @@ class MemoryAllocError(FatalCError): INFINITYORNANERROR = 7 MASSDEPZETAERROR = 8 MEMORYALLOCERROR = 9 +CUDAERROR = 10 def _process_exitcode(exitcode, fnc, args): """Determine what happens for different values of the (integer) exit code from a C function.""" if exitcode != SUCCESS: - logger.error(f"In function: {fnc.__name__}. Arguments: {args}") + logger.error( + f"Error code {exitcode} in function: {fnc.__name__}. Arguments: {args}" + ) if exitcode: try: @@ -110,6 +119,7 @@ def _process_exitcode(exitcode, fnc, args): INFINITYORNANERROR: InfinityorNaNError, MASSDEPZETAERROR: MassDepZetaError, MEMORYALLOCERROR: MemoryAllocError, + CUDAERROR: CUDAError, }[exitcode] except KeyError as e: # pragma: no cover raise FatalCError( diff --git a/src/py21cmfast/wrapper/inputs.py b/src/py21cmfast/wrapper/inputs.py index a007a119e..690b47c5b 100644 --- a/src/py21cmfast/wrapper/inputs.py +++ b/src/py21cmfast/wrapper/inputs.py @@ -119,11 +119,6 @@ class InputStruct: .. warning:: This class will *not* deal well with parameters of the struct which are pointers. All parameters should be primitive types, except for strings, which are dealt with specially. - - Parameters - ---------- - ffi : cffi object - The ffi object from any cffi-wrapped library. """ _subclasses: ClassVar = {} @@ -188,11 +183,6 @@ def cstruct(self) -> StructWrapper: cdict = self.cdict for k in self.struct.fieldnames: val = cdict[k] - - if isinstance(val, str): - # If it is a string, need to convert it to C string ourselves. 
- val = self.ffi.new("char[]", val.encode()) - setattr(self.struct.cstruct, k, val) return self.struct.cstruct @@ -349,7 +339,7 @@ def OMl(self): """Omega lambda, dark energy density.""" return 1 - self.OMm - @property + @cached_property def cosmo(self): """An astropy cosmology object for this cosmology.""" return self._base_cosmo.clone( @@ -1385,7 +1375,7 @@ def _astro_options_validator(self, att, val): ) elif ( val.INTEGRATION_METHOD_ATOMIC == "GAMMA-APPROX" - and self.matter_options.HMF != 0 + and self.matter_options.HMF != "PS" ): warnings.warn( "The 'GAMMA-APPROX' integration method uses the EPS conditional mass function" diff --git a/src/py21cmfast/wrapper/outputs.py b/src/py21cmfast/wrapper/outputs.py index 96556e937..3ed46cdfa 100644 --- a/src/py21cmfast/wrapper/outputs.py +++ b/src/py21cmfast/wrapper/outputs.py @@ -28,7 +28,9 @@ from astropy.cosmology import z_at_value from bidict import bidict -from ..c_21cmfast import lib +import py21cmfast.c_21cmfast as lib + +from .._cfg import config from .arrays import Array from .exceptions import _process_exitcode from .inputs import ( @@ -889,10 +891,10 @@ class HaloBox(OutputStructZ): _meta = False _c_compute_function = lib.ComputeHaloBox - halo_mass = _arrayfield() - halo_stars = _arrayfield() + count = _arrayfield(optional=True) + halo_mass = _arrayfield(optional=True) + halo_stars = _arrayfield(optional=True) halo_stars_mini = _arrayfield(optional=True) - count = _arrayfield() halo_sfr = _arrayfield() halo_sfr_mini = _arrayfield(optional=True) halo_xray = _arrayfield(optional=True) @@ -922,15 +924,11 @@ def new(cls, inputs: InputParameters, redshift: float, **kw) -> Self: shape = (dim, dim, int(inputs.simulation_options.NON_CUBIC_FACTOR * dim)) out = { - "halo_mass": Array(shape, dtype=np.float32), - "halo_stars": Array(shape, dtype=np.float32), - "count": Array(shape, dtype=np.int32), "halo_sfr": Array(shape, dtype=np.float32), "n_ion": Array(shape, dtype=np.float32), } if inputs.astro_options.USE_MINI_HALOS: 
- out["halo_stars_mini"] = Array(shape, dtype=np.float32) out["halo_sfr_mini"] = Array(shape, dtype=np.float32) if inputs.astro_options.INHOMO_RECO: @@ -939,6 +937,13 @@ def new(cls, inputs: InputParameters, redshift: float, **kw) -> Self: if inputs.astro_options.USE_TS_FLUCT: out["halo_xray"] = Array(shape, dtype=np.float32) + if config["EXTRA_HALOBOX_FIELDS"]: + out["count"] = Array(shape, dtype=np.int32) + out["halo_mass"] = Array(shape, dtype=np.float32) + out["halo_stars"] = Array(shape, dtype=np.float32) + if inputs.astro_options.USE_MINI_HALOS: + out["halo_stars_mini"] = Array(shape, dtype=np.float32) + return cls( inputs=inputs, redshift=redshift, @@ -958,20 +963,24 @@ def get_required_input_arrays(self, input_box: OutputStruct) -> list[str]: "sfr_rng", "xray_rng", ] - elif isinstance(input_box, PerturbedField): - if self.matter_options.FIXED_HALO_GRIDS: - required += ["density"] elif isinstance(input_box, TsBox): if self.astro_options.USE_MINI_HALOS: required += ["J_21_LW"] elif isinstance(input_box, IonizedBox): required += ["ionisation_rate_G12", "z_reion"] elif isinstance(input_box, InitialConditions): - if ( - self.matter_options.HALO_STOCHASTICITY - and self.astro_options.AVG_BELOW_SAMPLER - ): - required += ["lowres_density"] + required += [ + "lowres_density", + "lowres_vx", + "lowres_vy", + "lowres_vz", + ] + if self.matter_options.PERTURB_ALGORITHM == "2LPT": + required += [ + "lowres_vx_2LPT", + "lowres_vy_2LPT", + "lowres_vz_2LPT", + ] if self.matter_options.USE_RELATIVE_VELOCITIES: required += ["lowres_vcb"] else: @@ -984,7 +993,6 @@ def compute( *, initial_conditions: InitialConditions, pt_halos: PerturbHaloField, - perturbed_field: PerturbedField, previous_spin_temp: TsBox, previous_ionize_box: IonizedBox, allow_already_computed: bool = False, @@ -994,7 +1002,6 @@ def compute( allow_already_computed, self.redshift, initial_conditions, - perturbed_field, pt_halos, previous_spin_temp, previous_ionize_box, diff --git 
a/src/py21cmfast/wrapper/photoncons.py b/src/py21cmfast/wrapper/photoncons.py index 9970b305c..19febdba5 100644 --- a/src/py21cmfast/wrapper/photoncons.py +++ b/src/py21cmfast/wrapper/photoncons.py @@ -55,7 +55,8 @@ import numpy as np from scipy.optimize import curve_fit -from ..c_21cmfast import ffi, lib +import py21cmfast.c_21cmfast as lib + from ._utils import _process_exitcode from .cfuncs import broadcast_params from .inputs import InputParameters @@ -79,7 +80,7 @@ def c_memory_allocated(self) -> bool: @c_memory_allocated.setter def c_memory_allocated(self, val): - lib.photon_cons_allocated = ffi.cast("bool", val) + lib.photon_cons_allocated = val _photoncons_state = _PhotonConservationState() @@ -93,20 +94,18 @@ def _init_photon_conservation_correction(*, inputs): return lib.InitialisePhotonCons() -def _calibrate_photon_conservation_correction( - *, redshifts_estimate, nf_estimate, NSpline -): +def _calibrate_photon_conservation_correction(*, redshifts_estimate, nf_estimate): # This function passes the calibration simulation results to C, # Storing a clipped version in global arrays nf_vals and z_vals, # and constructing the GSL interpolator z_NFHistory_spline redshifts_estimate = np.array(redshifts_estimate, dtype="float64") nf_estimate = np.array(nf_estimate, dtype="float64") - z = ffi.cast("double *", ffi.from_buffer(redshifts_estimate)) - xHI = ffi.cast("double *", ffi.from_buffer(nf_estimate)) + z = redshifts_estimate + xHI = nf_estimate logger.debug(f"PhotonCons nf estimates: {nf_estimate}") - return lib.PhotonCons_Calibration(z, xHI, NSpline) + return lib.PhotonCons_Calibration(z, xHI) def _calc_zstart_photon_cons(): @@ -114,7 +113,7 @@ def _calc_zstart_photon_cons(): # Set by neutral fraction astro_params.PHOTONCONS_ZSTART from ._utils import _call_c_simple - return _call_c_simple(lib.ComputeZstart_PhotonCons) + return _call_c_simple(lib.ComputeZstart_PhotonCons)[0] def _get_photon_nonconservation_data() -> dict: @@ -149,16 +148,16 @@ def 
_get_photon_nonconservation_data() -> dict: IntVal2 = np.array(np.zeros(1), dtype="int32") IntVal3 = np.array(np.zeros(1), dtype="int32") - c_z_at_Q = ffi.cast("double *", ffi.from_buffer(data[0])) - c_Qval = ffi.cast("double *", ffi.from_buffer(data[1])) - c_z_cal = ffi.cast("double *", ffi.from_buffer(data[2])) - c_nf_cal = ffi.cast("double *", ffi.from_buffer(data[3])) - c_PC_nf = ffi.cast("double *", ffi.from_buffer(data[4])) - c_PC_deltaz = ffi.cast("double *", ffi.from_buffer(data[5])) + c_z_at_Q = data[0] + c_Qval = data[1] + c_z_cal = data[2] + c_nf_cal = data[3] + c_PC_nf = data[4] + c_PC_deltaz = data[5] - c_int_NQ = ffi.cast("int *", ffi.from_buffer(IntVal1)) - c_int_NC = ffi.cast("int *", ffi.from_buffer(IntVal2)) - c_int_NP = ffi.cast("int *", ffi.from_buffer(IntVal3)) + c_int_NQ = IntVal1 + c_int_NC = IntVal2 + c_int_NP = IntVal3 # Run the C code errcode = lib.ObtainPhotonConsData( @@ -299,7 +298,8 @@ def calibrate_photon_cons( prev_perturb = None # Arrays for redshift and neutral fraction for the calibration curve - neutral_fraction_photon_cons = [] + # TODO: double check, this was empty before, was that a bug? 
+ neutral_fraction_photon_cons = [1.0] # Initialise the analytic expression for the reionisation history logger.info("About to start photon conservation correction") @@ -361,7 +361,6 @@ def calibrate_photon_cons( _calibrate_photon_conservation_correction( redshifts_estimate=fast_node_redshifts, nf_estimate=neutral_fraction_photon_cons, - NSpline=len(fast_node_redshifts), ) @@ -374,9 +373,9 @@ def get_photoncons_dz(inputs, redshift): redshift_pc_in = np.array([redshift]).astype("f4") stored_redshift_pc_in = np.array([redshift]).astype("f4") lib.adjust_redshifts_for_photoncons( - ffi.cast("float *", redshift_pc_in.ctypes.data), - ffi.cast("float *", stored_redshift_pc_in.ctypes.data), - ffi.cast("float *", deltaz.ctypes.data), + redshift_pc_in, + stored_redshift_pc_in, + deltaz, ) return redshift_pc_in[0], stored_redshift_pc_in[0], deltaz[0] @@ -452,7 +451,8 @@ def photoncons_alpha(inputs): # ratio of given alpha with calibration ratio_ref = (1 - ref_pc_data["nf_calibration"]) / ref_interp - ratio_diff = ratio_test - 1 / ratio_ref[None, :] # find N(alpha)/ref == ref/cal + # find N(alpha)/ref == ref/cal + ratio_diff = ratio_test - 1 / ratio_ref[None, :] diff_test = ( (test_pc_data) + (1 - ref_pc_data["nf_calibration"])[None, ...] 
diff --git a/src/py21cmfast/wrapper/structs.py b/src/py21cmfast/wrapper/structs.py index e8a603701..729ee4cf4 100644 --- a/src/py21cmfast/wrapper/structs.py +++ b/src/py21cmfast/wrapper/structs.py @@ -2,14 +2,13 @@ from __future__ import annotations -import contextlib import logging from typing import Any import attrs -from bidict import bidict -from ..c_21cmfast import ffi +import py21cmfast.c_21cmfast as lib + from .arrays import Array logger = logging.getLogger(__name__) @@ -29,9 +28,8 @@ class StructWrapper: _name: str = attrs.field(converter=str) cstruct = attrs.field(default=None) - _ffi = attrs.field(default=ffi) - _TYPEMAP = bidict({"float32": "float *", "float64": "double *", "int32": "int *"}) + primitive_types = (bool, str, int, float) @_name.default def _name_default(self): @@ -43,16 +41,21 @@ def __init__(self, *args): This instantiates the memory associated with the C struct, attached to this inst. """ self.__attrs_init__(*args) - self.cstruct = self._new() + self._cobj = getattr(lib, self._name) # The wrapped class + self.cstruct = self._new() # The instance of the wrapped class def _new(self): """Return a new empty C structure corresponding to this class.""" - return self._ffi.new(f"struct {self._name}*") + return self._cobj() @property def fields(self) -> dict[str, Any]: """A list of fields of the underlying C struct (a list of tuples of "name, type").""" - return dict(self._ffi.typeof(self.cstruct[0]).fields) + result = {} + for attr in dir(self.cstruct): + if not attr.startswith("__"): + result[attr] = type(getattr(self.cstruct, attr)) + return result @property def fieldnames(self) -> list[str]: @@ -62,19 +65,17 @@ def fieldnames(self) -> list[str]: @property def pointer_fields(self) -> list[str]: """A list of names of fields which have pointer type in the C struct.""" - return [f for f, t in self.fields.items() if t.type.kind == "pointer"] + return [f.split("set_")[1] for f in self.fields if f.startswith("set_")] @property def 
primitive_fields(self) -> list[str]: """The list of names of fields which have primitive type in the C struct.""" - return [f for f, t in self.fields.items() if t.type.kind == "primitive"] + return [f for f, t in self.fields.items() if t in self.primitive_types] def __getstate__(self): """Return the current state of the class without pointers.""" return { - k: v - for k, v in self.__dict__.items() - if k not in ["_strings", "cstruct", "_ffi"] + k: v for k, v in self.__dict__.items() if k not in ["_strings", "cstruct"] } def expose_to_c(self, array: Array, name: str): @@ -82,15 +83,13 @@ def expose_to_c(self, array: Array, name: str): if not array.state.initialized: raise ValueError("Array must be initialized before exposing to C") - def _ary2buf(ary): - return self._ffi.cast( - self._TYPEMAP[ary.dtype.name], self._ffi.from_buffer(ary) - ) - try: - setattr(self.cstruct, name, _ary2buf(array.value)) - except TypeError as e: - raise TypeError(f"Error setting {name}") from e + setter = getattr(self.cstruct, "set_" + name) + setter(array.value) + except AttributeError as e: + raise TypeError( + f"Error setting {name} on {self.__class__.__name__}, no setter found" + ) from e class StructInstanceWrapper: @@ -101,35 +100,63 @@ class StructInstanceWrapper: Parameters ---------- wrapped : - The reference to the C object to wrap (contained in the ``cffi.lib`` object). - ffi : - The ``cffi.ffi`` object. + The reference to the C object to wrap. """ - def __init__(self, wrapped, ffi): + # NOTE: currently assumes that the C object is not internally changed + # We get all the values from C on initialization, and pass changes back to C + # The StructInstanceWrapper holds the attributes as they appear in python, + # whereas ._cobj holds primitives and getters/setters for pointers. 
+ # TODO: we should ditch the object attributes and just use the C object + # with a custom __getattr__ + def __init__(self, wrapped): self._cobj = wrapped - self._ffi = ffi - - for nm, _tp in self._ffi.typeof(self._cobj).fields: - setattr(self, nm, getattr(self._cobj, nm)) + # nanobind does not supply a list of fields like CFFI does, so we do + # this instead to return a list of members + for attr in dir(self._cobj): + # ignore dunders + if not attr.startswith("__"): + if attr.startswith("get_"): + # If the attribute is a getter, we need to set the value in python + # to the value of the C++ attribute without the "get_" prefix + setattr(self, attr[4:], getattr(self._cobj, attr)()) + elif not callable(getattr(self._cobj, attr)): + # Otherwise, we just set the attribute to the value + setattr(self, attr, getattr(self._cobj, attr)) # Get the name of the structure - self._ctype = self._ffi.typeof(self._cobj).cname.split()[-1] + self._ctype = type(self._cobj).__name__ def __setattr__(self, name, value): """Set an attribute of the instance, attempting to change it in the C struct as well.""" - with contextlib.suppress(AttributeError): - setattr(self._cobj, name, value) + # use the non-overridden __setattr__ to set the attribute in Python object.__setattr__(self, name, value) + # Set the attribute in the C struct + if not name.startswith("_"): + if "set_" + name in dir(self._cobj): + getattr(self._cobj, "set_" + name)(value) + elif name in dir(self._cobj): + setattr(self._cobj, name, value) + else: + raise AttributeError( + f"Attribute {name} not found in {self.__class__.__name__}" + ) + def items(self): """Yield (name, value) pairs for each element of the struct.""" - for nm, _tp in self._ffi.typeof(self._cobj).fields: - yield nm, getattr(self, nm) + # nanobind does not supply a list of fields like CFFI does, so we do + # this instead to return a list of members + for attr in dir(self._cobj): + if not attr.startswith("__"): + if attr.startswith("get_"): + yield 
attr[4:], getattr(self._cobj, attr)() + elif not attr.startswith("set_"): + yield attr, getattr(self._cobj, attr) def keys(self): """Return a list of names of elements in the struct.""" - return [nm for nm, tp in self.items()] + return [nm for nm, _ in self.items()] def __iter__(self): """Iterate over the object like a dict.""" diff --git a/subprojects/.gitignore b/subprojects/.gitignore new file mode 100644 index 000000000..9a1d1e46e --- /dev/null +++ b/subprojects/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!*.wrap diff --git a/subprojects/nanobind.wrap b/subprojects/nanobind.wrap new file mode 100644 index 000000000..78e2e7c5d --- /dev/null +++ b/subprojects/nanobind.wrap @@ -0,0 +1,13 @@ +[wrap-file] +directory = nanobind-2.4.0 +source_url = https://github.com/wjakob/nanobind/archive/refs/tags/v2.4.0.tar.gz +source_filename = nanobind-2.4.0.tar.gz +source_hash = bb35deaed7efac5029ed1e33880a415638352f757d49207a8e6013fefb6c49a7 +patch_filename = nanobind_2.4.0-2_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/nanobind_2.4.0-2/get_patch +patch_hash = cf493bda0b11ea4e8d9dd42229c3bbdd52af88cc4aedac75a1eccb102b86dd4a +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/nanobind_2.4.0-2/nanobind-2.4.0.tar.gz +wrapdb_version = 2.4.0-2 + +[provide] +nanobind = nanobind_dep diff --git a/subprojects/robin-map.wrap b/subprojects/robin-map.wrap new file mode 100644 index 000000000..3da2993bb --- /dev/null +++ b/subprojects/robin-map.wrap @@ -0,0 +1,13 @@ +[wrap-file] +directory = robin-map-1.3.0 +source_url = https://github.com/Tessil/robin-map/archive/refs/tags/v1.3.0.tar.gz +source_filename = robin-map-1.3.0.tar.gz +source_hash = a8424ad3b0affd4c57ed26f0f3d8a29604f0e1f2ef2089f497f614b1c94c7236 +patch_filename = robin-map_1.3.0-1_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/robin-map_1.3.0-1/get_patch +patch_hash = 6d090f988541ffb053512607e0942cbd0dbc2a4fa0563e44ff6a37e810b8c739 +source_fallback_url = 
https://github.com/mesonbuild/wrapdb/releases/download/robin-map_1.3.0-1/robin-map-1.3.0.tar.gz +wrapdb_version = 1.3.0-1 + +[provide] +robin-map = robin_map_dep diff --git a/tests/test_data/power_spectra_dexm.h5 b/tests/test_data/power_spectra_dexm.h5 index 1c830080b..b87c7e6db 100644 Binary files a/tests/test_data/power_spectra_dexm.h5 and b/tests/test_data/power_spectra_dexm.h5 differ diff --git a/tests/test_data/power_spectra_fixed_halogrids.h5 b/tests/test_data/power_spectra_fixed_halogrids.h5 index 913dfb5e5..9dff2b705 100644 Binary files a/tests/test_data/power_spectra_fixed_halogrids.h5 and b/tests/test_data/power_spectra_fixed_halogrids.h5 differ diff --git a/tests/test_data/power_spectra_sampler.h5 b/tests/test_data/power_spectra_sampler.h5 index 328320c5f..a1ea2c15e 100644 Binary files a/tests/test_data/power_spectra_sampler.h5 and b/tests/test_data/power_spectra_sampler.h5 differ diff --git a/tests/test_data/power_spectra_sampler_ir.h5 b/tests/test_data/power_spectra_sampler_ir.h5 index 3c6952e7e..a70e356f7 100644 Binary files a/tests/test_data/power_spectra_sampler_ir.h5 and b/tests/test_data/power_spectra_sampler_ir.h5 differ diff --git a/tests/test_data/power_spectra_sampler_mini.h5 b/tests/test_data/power_spectra_sampler_mini.h5 index 88cd31c8e..645a0d0b1 100644 Binary files a/tests/test_data/power_spectra_sampler_mini.h5 and b/tests/test_data/power_spectra_sampler_mini.h5 differ diff --git a/tests/test_data/power_spectra_sampler_noncubic.h5 b/tests/test_data/power_spectra_sampler_noncubic.h5 index 1a9920dad..3606f45eb 100644 Binary files a/tests/test_data/power_spectra_sampler_noncubic.h5 and b/tests/test_data/power_spectra_sampler_noncubic.h5 differ diff --git a/tests/test_data/power_spectra_sampler_ts.h5 b/tests/test_data/power_spectra_sampler_ts.h5 index d01c17845..9db1f2e96 100644 Binary files a/tests/test_data/power_spectra_sampler_ts.h5 and b/tests/test_data/power_spectra_sampler_ts.h5 differ diff --git 
a/tests/test_data/power_spectra_sampler_ts_ir.h5 b/tests/test_data/power_spectra_sampler_ts_ir.h5 index e0c67b4bf..c7418c7c1 100644 Binary files a/tests/test_data/power_spectra_sampler_ts_ir.h5 and b/tests/test_data/power_spectra_sampler_ts_ir.h5 differ diff --git a/tests/test_data/power_spectra_sampler_ts_ir_onethread.h5 b/tests/test_data/power_spectra_sampler_ts_ir_onethread.h5 index 5722abbd9..b8b00a7b3 100644 Binary files a/tests/test_data/power_spectra_sampler_ts_ir_onethread.h5 and b/tests/test_data/power_spectra_sampler_ts_ir_onethread.h5 differ diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py index d0d39fb57..1b5a968c0 100644 --- a/tests/test_exceptions.py +++ b/tests/test_exceptions.py @@ -3,10 +3,10 @@ import numpy as np import pytest -from py21cmfast.c_21cmfast import ffi, lib +import py21cmfast.c_21cmfast as lib from py21cmfast.wrapper.exceptions import ( PHOTONCONSERROR, - ParameterError, + PhotonConsError, _process_exitcode, ) @@ -21,19 +21,16 @@ def test_basic(subfunc): def test_simple(subfunc): answer = np.array([0], dtype="f8") - status = lib.FunctionThatCatches( - subfunc, False, ffi.cast("double *", ffi.from_buffer(answer)) - ) - - with pytest.raises(ParameterError): + status = lib.FunctionThatCatches(subfunc, False, answer) + with pytest.raises(PhotonConsError): _process_exitcode( status, lib.FunctionThatCatches, - (False, ffi.cast("double *", ffi.from_buffer(answer))), + (subfunc, False, answer), ) def test_pass(): answer = np.array([0], dtype="f8") - lib.FunctionThatCatches(True, True, ffi.cast("double *", ffi.from_buffer(answer))) + lib.FunctionThatCatches(True, True, answer) assert answer == 5.0 diff --git a/tests/test_filtering.py b/tests/test_filtering.py index adbd9a00e..9e92224e2 100644 --- a/tests/test_filtering.py +++ b/tests/test_filtering.py @@ -6,7 +6,7 @@ from matplotlib.colors import Normalize from scipy.stats import binned_statistic as binstat -from py21cmfast.c_21cmfast import ffi, lib +import 
py21cmfast.c_21cmfast as lib from py21cmfast.wrapper.cfuncs import broadcast_input_struct from . import produce_integration_test_data as prd @@ -98,19 +98,19 @@ def test_filters(filter_flag, R, plt): output_box_centre = np.zeros((up.HII_DIM,) * 3, dtype="f8") # use MFP=20 for the exp filter, use a 4 cell shell for the annular filter if filter_flag == 3: - R_param = 20 + R_param = 20.0 elif filter_flag == 4: - R_param = max(R - 4 * (up.BOX_LEN / up.HII_DIM), 0) + R_param = max(R - 4 * (up.BOX_LEN / up.HII_DIM), 0.0) else: - R_param = 0 + R_param = 0.0 broadcast_input_struct(inputs) lib.test_filter( - ffi.cast("float *", input_box_centre.ctypes.data), + input_box_centre, R, R_param, filter_flag, - ffi.cast("double *", output_box_centre.ctypes.data), + output_box_centre, ) # expected outputs given in cell units diff --git a/tests/test_halo_sampler.py b/tests/test_halo_sampler.py index 961417b19..54c9c611d 100644 --- a/tests/test_halo_sampler.py +++ b/tests/test_halo_sampler.py @@ -4,11 +4,6 @@ import numpy as np import pytest -from py21cmfast import ( - compute_halo_grid, - compute_initial_conditions, - perturb_field, -) from py21cmfast.wrapper import cfuncs as cf from . import test_c_interpolation_tables as cint @@ -231,112 +226,6 @@ def test_halo_prop_sampling(default_input_struct_ts, plt): np.testing.assert_allclose(exp_LX, sim_LX, rtol=1e-4) -# testing that the integrals in HaloBox.c are done correctly by -# using the fixed grids -# TODO: extend test to minihalos w/o feedback -# TODO: maybe let this run with the default ics and perturbed field, -# even though they have different flag options? 
-def test_fixed_grids(default_input_struct_ts, plt): - inputs = default_input_struct_ts.evolve_input_structs( - USE_HALO_FIELD=True, - FIXED_HALO_GRIDS=True, - USE_UPPER_STELLAR_TURNOVER=False, - ) - - ic = compute_initial_conditions( - inputs=inputs, - ) - perturbed_field = perturb_field(initial_conditions=ic, redshift=10.0, inputs=inputs) - dens = perturbed_field.get("density") - - hbox = compute_halo_grid( - initial_conditions=ic, - inputs=inputs, - perturbed_field=perturbed_field, - ) - - cell_radius = 0.620350491 * ( - inputs.simulation_options.BOX_LEN / inputs.simulation_options.HII_DIM - ) - mt_grid = np.full_like(dens, inputs.astro_params.M_TURN) - - integral_sfrd, _ = cf.evaluate_SFRD_cond( - inputs=inputs, - redshift=perturbed_field.redshift, - radius=cell_radius, - densities=dens, - log10mturns=mt_grid, - ) - integral_sfrd *= 1 + dens - - integral_nion, _ = cf.evaluate_Nion_cond( - inputs=inputs, - redshift=perturbed_field.redshift, - radius=cell_radius, - densities=dens, - l10mturns_acg=mt_grid, - l10mturns_mcg=mt_grid, - ) - integral_nion *= 1 + dens - - integral_xray = cf.evaluate_Xray_cond( - inputs=inputs, - redshift=perturbed_field.redshift, - radius=cell_radius, - densities=perturbed_field.density.value, - log10mturns=mt_grid, - ) - integral_xray *= 1 + dens - - # mean-fixing and prefactor numerics results in 1-to-1 comparisons being more difficult - # for now we just test the relative values - integral_sfrd *= hbox.get("halo_sfr").mean() / integral_sfrd.mean() - integral_nion *= hbox.get("n_ion").mean() / integral_nion.mean() - integral_xray *= hbox.get("halo_xray").mean() / integral_xray.mean() - - if plt == mpl.pyplot: - plot_scatter_comparison( - [integral_sfrd, integral_nion, integral_xray], - [hbox.get("halo_sfr"), hbox.get("n_ion"), hbox.get("halo_xray")], - [dens, dens, dens], - ["SFRD", "Nion", "LX"], - plt=plt, - ) - - # TODO: a 5% tolerance isn't fantastic here since they should be the same to a constant factor. 
- # this happens near the GL integration transition (<1%) and delta_crit (~4%), examine plots - rtol = 5e-2 - print(f"{hbox.get('halo_sfr').shape} {integral_sfrd.shape}", flush=True) - print_failure_stats( - hbox.get("halo_sfr"), - integral_sfrd, - [dens], - 0.0, - rtol, - "sfr", - ) - print_failure_stats( - hbox.get("n_ion"), - integral_nion, - [dens], - 0.0, - rtol, - "nion", - ) - print_failure_stats( - hbox.get("halo_xray"), - integral_xray, - [dens], - 0.0, - rtol, - "LX", - ) - - np.testing.assert_allclose(hbox.get("halo_sfr"), integral_sfrd, rtol=rtol) - np.testing.assert_allclose(hbox.get("n_ion"), integral_nion, rtol=rtol) - np.testing.assert_allclose(hbox.get("halo_xray"), integral_xray, rtol=rtol) - - # very basic scatter comparison def plot_scatter_comparison( truths, tests, inputs, names, log_vals=True, log_inp=False, plt=None diff --git a/tests/test_integration_features.py b/tests/test_integration_features.py index 880d927da..b873b9247 100644 --- a/tests/test_integration_features.py +++ b/tests/test_integration_features.py @@ -76,10 +76,9 @@ def test_power_spectra_coeval(name, module_direc, plt): [test_k], abs_tol=0, rel_tol=1e-4, - name=key, + name=f"{name} - {key}", ) - any_failed = True # TODO:remove this testing line if plt == mpl.pyplot and any_failed: make_coeval_comparison_plot(true_k, test_k, true_powers, test_powers, plt) diff --git a/tests/test_output_structs.py b/tests/test_output_structs.py index 812db02ad..bdd6726cb 100644 --- a/tests/test_output_structs.py +++ b/tests/test_output_structs.py @@ -9,8 +9,10 @@ from py21cmfast import ( InitialConditions, # An example of an output struct InputParameters, + config, ) from py21cmfast.wrapper import outputs as ox +from py21cmfast.wrapper.arrays import Array @pytest.fixture @@ -76,3 +78,157 @@ def test_all_fields_exist(struct: ox.OutputStruct): for name in cstruct.primitive_fields: assert name in this + + +# NOTE: These do not test every field, but does test every conditional in the +# 
OutputStruct constructors, a better approach would probably be to have a +# comprehensive list of {"field_name": {"flag": value}} conditions for the fields +# in the output module which is checked in the constructors +def test_optional_field_ic(default_input_struct_lc: InputParameters): + """Ensure that the correct InitialConditions fields are set based on the parameters.""" + ic = ox.InitialConditions.new(inputs=default_input_struct_lc) + assert isinstance(ic.lowres_vx, Array) + assert isinstance(ic.lowres_vx_2LPT, Array) + assert ic.hires_vx is None + assert isinstance(ic.hires_vx_2LPT, Array) # Python requires it, check the C + assert ic.lowres_vcb is None + + ic = ox.InitialConditions.new( + inputs=default_input_struct_lc.evolve_input_structs( + PERTURB_ALGORITHM="ZELDOVICH" + ) + ) + assert isinstance(ic.lowres_vy, Array) + assert ic.lowres_vy_2LPT is None + assert ic.hires_vy is None + assert ic.hires_vy_2LPT is None + + ic = ox.InitialConditions.new( + inputs=default_input_struct_lc.evolve_input_structs(PERTURB_ON_HIGH_RES=True) + ) + assert ic.lowres_vz is None + assert ic.lowres_vz_2LPT is None + assert isinstance(ic.hires_vz, Array) + assert isinstance(ic.hires_vz_2LPT, Array) + + ic = ox.InitialConditions.new( + inputs=default_input_struct_lc.evolve_input_structs( + USE_RELATIVE_VELOCITIES=True, + POWER_SPECTRUM="CLASS", + ) + ) + assert isinstance(ic.lowres_vx, Array) + assert isinstance(ic.lowres_vx_2LPT, Array) + assert ic.hires_vx is None + assert isinstance(ic.hires_vx_2LPT, Array) + assert isinstance(ic.lowres_vcb, Array) + + +def test_optional_field_perturb(default_input_struct_lc: InputParameters): + """Ensure that the correct PerturbedField fields are set based on the parameters.""" + pt = ox.PerturbedField.new(redshift=0.0, inputs=default_input_struct_lc) + assert isinstance(pt.density, Array) + assert isinstance(pt.velocity_z, Array) + assert isinstance(pt.velocity_x, Array) + assert isinstance(pt.velocity_y, Array) + + pt = 
ox.PerturbedField.new( + redshift=0.0, + inputs=default_input_struct_lc.evolve_input_structs(KEEP_3D_VELOCITIES=False), + ) + assert isinstance(pt.density, Array) + assert isinstance(pt.velocity_z, Array) + assert pt.velocity_x is None + assert pt.velocity_y is None + + +def test_optional_field_halobox(default_input_struct_lc: InputParameters): + """Ensure that the correct HaloBox fields are set based on the parameters.""" + hb = ox.HaloBox.new(redshift=0.0, inputs=default_input_struct_lc) + assert hb.halo_mass is None + assert isinstance(hb.halo_sfr, Array) + assert isinstance(hb.n_ion, Array) + assert hb.halo_sfr_mini is None + assert hb.halo_xray is None + assert hb.whalo_sfr is None + + with config.use(EXTRA_HALOBOX_FIELDS=True): + hb = ox.HaloBox.new(redshift=0.0, inputs=default_input_struct_lc) + assert isinstance(hb.halo_mass, Array) + + inputs = default_input_struct_lc.evolve_input_structs(INHOMO_RECO=True) + hb = ox.HaloBox.new(redshift=0.0, inputs=inputs) + assert isinstance(hb.whalo_sfr, Array) + + inputs = inputs.evolve_input_structs(USE_TS_FLUCT=True) + hb = ox.HaloBox.new(redshift=0.0, inputs=inputs) + assert isinstance(hb.halo_xray, Array) + + inputs = inputs.evolve_input_structs(USE_MINI_HALOS=True) + hb = ox.HaloBox.new(redshift=0.0, inputs=inputs) + assert isinstance(hb.halo_sfr_mini, Array) + + +def test_optional_field_xrs(default_input_struct_lc: InputParameters): + """Ensure that the correct XraySourceBox fields are set based on the parameters.""" + xr = ox.XraySourceBox.new(redshift=0.0, inputs=default_input_struct_lc) + assert isinstance(xr.filtered_sfr, Array) + assert isinstance(xr.filtered_xray, Array) + assert xr.filtered_sfr_mini is None + + inputs = default_input_struct_lc.evolve_input_structs( + USE_TS_FLUCT=True, + USE_MINI_HALOS=True, + INHOMO_RECO=True, + ) + xr = ox.XraySourceBox.new(redshift=0.0, inputs=inputs) + assert isinstance(xr.filtered_sfr_mini, Array) + + +def test_optional_field_ts(default_input_struct_lc: 
InputParameters): + """Ensure that the correct TsBox fields are set based on the parameters.""" + ts = ox.TsBox.new(redshift=0.0, inputs=default_input_struct_lc) + assert isinstance(ts.spin_temperature, Array) + assert isinstance(ts.xray_ionised_fraction, Array) + assert isinstance(ts.kinetic_temp_neutral, Array) + assert ts.J_21_LW is None + + inputs = default_input_struct_lc.evolve_input_structs( + USE_TS_FLUCT=True, + INHOMO_RECO=True, + USE_MINI_HALOS=True, + ) + ts = ox.TsBox.new(redshift=0.0, inputs=inputs) + assert isinstance(ts.J_21_LW, Array) + + +def test_optional_field_ion(default_input_struct_lc: InputParameters): + """Ensure that the correct IonizedBox fields are set based on the parameters.""" + ion = ox.IonizedBox.new(redshift=0.0, inputs=default_input_struct_lc) + assert isinstance(ion.neutral_fraction, Array) + assert ion.unnormalised_nion_mini is None + assert ion.cumulative_recombinations is None + + inputs = default_input_struct_lc.evolve_input_structs( + INHOMO_RECO=True, + ) + ion = ox.IonizedBox.new(redshift=0.0, inputs=inputs) + assert isinstance(ion.cumulative_recombinations, Array) + + inputs = inputs.evolve_input_structs( + USE_TS_FLUCT=True, + USE_MINI_HALOS=True, + ) + ion = ox.IonizedBox.new(redshift=0.0, inputs=inputs) + assert isinstance(ion.unnormalised_nion_mini, Array) + + +def test_optional_field_bt(default_input_struct_lc: InputParameters): + """Ensure that the correct BrightnessTemp fields are set based on the parameters.""" + bt = ox.BrightnessTemp.new(redshift=0.0, inputs=default_input_struct_lc) + assert isinstance(bt.brightness_temp, Array) + assert bt.tau_21 is None + + inputs = default_input_struct_lc.evolve_input_structs(USE_TS_FLUCT=True) + bt = ox.BrightnessTemp.new(redshift=0.0, inputs=inputs) + assert isinstance(bt.tau_21, Array) diff --git a/tests/test_perturb.py b/tests/test_perturb.py new file mode 100644 index 000000000..3871ce3ac --- /dev/null +++ b/tests/test_perturb.py @@ -0,0 +1,229 @@ +"""Contains the tests 
for the Perturbation algorithm (Linear, Zel'dovich, 2LPT). + +Including perturbation of galaxy properties +""" + +import numpy as np +import pytest + +from py21cmfast import ( + InitialConditions, + compute_halo_grid, + perturb_field, +) +from py21cmfast.wrapper import cfuncs as cf + + +class TestPerturb: + """Tests regarding the perturbation algorithms.""" + + @pytest.fixture(scope="class") + def test_pt_z(self): + """Set redshift at which to test the 2LPT.""" + return 8.0 + + @pytest.fixture(scope="class") + def inputs_low(self, default_input_struct_ts): + """Parameters for 2LPT tests.""" + # using 3-1 ratio for testing + return default_input_struct_ts.evolve_input_structs( + DIM=12, + HII_DIM=4, + BOX_LEN=8, + USE_HALO_FIELD=True, + FIXED_HALO_GRIDS=True, + PERTURB_ON_HIGH_RES=False, + R_BUBBLE_MAX=1.0, + ) + + @pytest.fixture(scope="class") + def inputs_zel(self, inputs_low): + """Parameters for Zel'dovich test.""" + return inputs_low.evolve_input_structs( + PERTURB_ALGORITHM="ZELDOVICH", + ) + + @pytest.fixture(scope="class") + def inputs_linear(self, inputs_low): + """Parameters for Linear test.""" + return inputs_low.evolve_input_structs( + PERTURB_ALGORITHM="LINEAR", + ) + + def get_fake_ics(self, inputs, test_pt_z): + """Make an IC instance for the testing. + + These are inconsistent and strange values for real ICS but + very trackable. 
+ """ + ics = InitialConditions.new(inputs=inputs) + d_z = cf.get_growth_factor(inputs=inputs, redshift=test_pt_z) + d_z_i = cf.get_growth_factor( + inputs=inputs, redshift=inputs.simulation_options.INITIAL_REDSHIFT + ) + + res_fac = int(inputs.simulation_options.HIRES_TO_LOWRES_FACTOR) + lo_dim = inputs.simulation_options.HII_DIM + hi_dim = inputs.simulation_options.DIM + fac_1lpt = inputs.simulation_options.cell_size / (d_z - d_z_i) + fac_2lpt = inputs.simulation_options.cell_size / ( + (-3.0 / 7.0) * (d_z**2 - d_z_i**2) + ) + for name, array in ics.arrays.items(): + setattr(ics, name, array.initialize().computed()) + + # setup the velocities + # NOTE: IC velocities are in Mpc + if not inputs.matter_options.PERTURB_ON_HIGH_RES: + fake_v = np.ones_like(ics.get("lowres_vx")) + ics.set("lowres_vx", 0 * fake_v) + ics.set("lowres_vy", fac_1lpt * fake_v) + ics.set("lowres_vz", 0 * fake_v) + if inputs.matter_options.PERTURB_ALGORITHM == "2LPT": + ics.set("lowres_vx_2LPT", 0 * fake_v) + ics.set("lowres_vy_2LPT", 0 * fake_v) + ics.set("lowres_vz_2LPT", fac_2lpt * fake_v) + else: + fake_v = np.ones_like(ics.get("hires_vx")) + ics.set("hires_vx", 0 * fake_v) + ics.set("hires_vy", -fac_1lpt * fake_v) + ics.set("hires_vz", 0 * fake_v) + if inputs.matter_options.PERTURB_ALGORITHM == "2LPT": + ics.set("hires_vx_2LPT", 0 * fake_v) + ics.set("hires_vy_2LPT", 0 * fake_v) + ics.set("hires_vz_2LPT", -fac_2lpt * fake_v) + + # set some densities that can be easily tracked + d_lo = np.zeros_like(ics.get("lowres_density")) + d_lo[0, 0, 0] = 1 + d_lo[lo_dim // 2, lo_dim // 2, lo_dim // 2] = -1 + ics.set("lowres_density", d_lo) + # make similar hires densities + d_hi = np.zeros_like(ics.get("hires_density")) + d_hi[0, 0, 0] = res_fac**3 + d_hi[hi_dim // 2, hi_dim // 2, hi_dim // 2] = -(res_fac**3) + ics.set("hires_density", d_hi) + + return ics + + @pytest.mark.parametrize("inputs", ["inputs_low", "inputs_zel"]) + def test_lowres_perturb(self, inputs, test_pt_z, request): + """Tests 
low-resolution perturbation.""" + inputs = request.getfixturevalue(inputs) + ics = self.get_fake_ics(inputs, test_pt_z) + z_d = ( + test_pt_z + if inputs.matter_options.PERTURB_ALGORITHM == "LINEAR" + else inputs.simulation_options.INITIAL_REDSHIFT + ) + roll_var = { + "LINEAR": (0, 0, 0), + "ZELDOVICH": (0, 1, 0), + "2LPT": (0, 1, -1), + }[inputs.matter_options.PERTURB_ALGORITHM] + d_z = cf.get_growth_factor(inputs=inputs, redshift=z_d) + + expected_dens = np.roll(ics.get("lowres_density"), roll_var, (0, 1, 2)) + expected_dens *= d_z + pt = perturb_field( + initial_conditions=ics, + redshift=test_pt_z, + regenerate=True, + write=False, + ) + np.testing.assert_allclose(pt.get("density"), expected_dens, atol=1e-3) + + @pytest.mark.skip( + reason="aliasing in downsampling makes hires 2lpt unit tests difficult" + ) + def test_hires_perturb(self, inputs_hi, test_pt_z): + """Tests the high resolution perturbation.""" + ics = self.get_fake_ics(inputs_hi, test_pt_z) + expected_dens = np.roll(ics.get("lowres_density"), (0, -1, 1), (0, 1, 2)) + d_z_i = cf.get_growth_factor(inputs=inputs_hi, redshift=test_pt_z) + expected_dens *= d_z_i + pt = perturb_field( + initial_conditions=ics, + redshift=test_pt_z, + regenerate=True, + write=False, + ) + np.testing.assert_allclose(pt.get("density"), expected_dens, atol=1e-3) + + # TODO: include minihalo properties + # TODO: include linear (for some reason) + @pytest.mark.parametrize("inputs", ["inputs_low", "inputs_zel"]) + def test_hb_perturb(self, inputs, test_pt_z, request): + """Tests the halo property perturbation.""" + inputs = request.getfixturevalue(inputs) + ics = self.get_fake_ics(inputs, test_pt_z) + hbox = compute_halo_grid( + redshift=test_pt_z, + initial_conditions=ics, + inputs=inputs, + ) + cell_radius = 0.620350491 * ( + inputs.simulation_options.BOX_LEN / inputs.simulation_options.HII_DIM + ) + d_z = cf.get_growth_factor( + inputs=inputs, + redshift=test_pt_z, + ) + roll_var = { + "LINEAR": (0, 0, 0), + "ZELDOVICH": 
(0, 1, 0), + "2LPT": (0, 1, -1), + }[inputs.matter_options.PERTURB_ALGORITHM] + dens = np.roll(ics.get("lowres_density"), roll_var, (0, 1, 2)) * d_z + mt_grid = np.full_like(dens, inputs.astro_params.M_TURN) + + prefac_sfr = ( + inputs.cosmo_params.cosmo.critical_density(0).to("Msun Mpc-3").value + * inputs.astro_params.cdict["F_STAR10"] + * inputs.cosmo_params.OMb + * inputs.cosmo_params.cosmo.H(test_pt_z).to("s-1").value + / inputs.astro_params.t_STAR + ) + prefac_nion = ( + inputs.cosmo_params.cosmo.critical_density(0).to("Msun Mpc-3").value + * inputs.astro_params.cdict["F_STAR10"] + * inputs.cosmo_params.OMb + * inputs.astro_params.cdict["F_ESC10"] + * inputs.astro_params.cdict["POP2_ION"] + ) + prefac_xray = ( + inputs.cosmo_params.cosmo.critical_density(0).to("Msun Mpc-3").value + * inputs.cosmo_params.OMm + ) + integral_sfrd, _ = cf.evaluate_SFRD_cond( + inputs=inputs, + redshift=test_pt_z, + radius=cell_radius, + densities=dens, + log10mturns=mt_grid, + ) + integral_sfrd *= prefac_sfr + + integral_nion, _ = cf.evaluate_Nion_cond( + inputs=inputs, + redshift=test_pt_z, + radius=cell_radius, + densities=dens, + l10mturns_acg=mt_grid, + l10mturns_mcg=mt_grid, + ) + integral_nion *= prefac_nion + + integral_xray = cf.evaluate_Xray_cond( + inputs=inputs, + redshift=test_pt_z, + radius=cell_radius, + densities=dens, + log10mturns=mt_grid, + ) + integral_xray *= prefac_xray + + rtol = 1e-2 + np.testing.assert_allclose(hbox.get("halo_sfr"), integral_sfrd, rtol=rtol) + np.testing.assert_allclose(hbox.get("n_ion"), integral_nion, rtol=rtol) + np.testing.assert_allclose(hbox.get("halo_xray"), integral_xray, rtol=rtol) diff --git a/tests/test_tables.py b/tests/test_tables.py index 6fc9349f8..8d3a3ad94 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -1,6 +1,6 @@ """Test initializing tables in C.""" -from py21cmfast.c_21cmfast import lib +import py21cmfast.c_21cmfast as lib from py21cmfast.wrapper.cfuncs import broadcast_input_struct