Changes from 17 commits

Commits (54)
7d2c62b
add precommit
Intron7 Sep 11, 2025
7bc5367
add first implementation
Intron7 Sep 11, 2025
dc76b24
clang format
Intron7 Sep 11, 2025
dc3648b
format
Intron7 Sep 11, 2025
ef8a756
format c++
Intron7 Sep 11, 2025
76ae7aa
Merge branch 'main' into move-to-nanobind
Intron7 Sep 11, 2025
7ed481d
change clang-format
Intron7 Sep 11, 2025
48ba592
fix version
Intron7 Sep 11, 2025
1d2f12a
test docs
Intron7 Sep 11, 2025
60c4863
fix yml
Intron7 Sep 11, 2025
9a2b113
fix sparse to dense kernel launch
Intron7 Sep 11, 2025
2d5ea85
test read the docs
Intron7 Sep 11, 2025
9878e5d
try env
Intron7 Sep 11, 2025
6b46e8a
test cmakeargs
Intron7 Sep 11, 2025
55027f7
add system back
Intron7 Sep 11, 2025
b780405
add failsafe
Intron7 Sep 11, 2025
24104ff
remove print and slim down toml
Intron7 Sep 12, 2025
dddd9e8
Add almost unchanged cibw
flying-sheep Sep 15, 2025
5981d50
No macOS
flying-sheep Sep 15, 2025
b3c3853
test build wheels
Intron7 Sep 15, 2025
b24bf7b
next
Intron7 Sep 15, 2025
56aca24
remove wheels workflow
Intron7 Sep 15, 2025
7068b19
remove windows
Intron7 Sep 15, 2025
9a58ff0
remove optional parts
flying-sheep Sep 15, 2025
7f65657
test publish
Intron7 Sep 15, 2025
56f837a
3.12
flying-sheep Sep 15, 2025
3a9a9f1
fix path
Intron7 Sep 15, 2025
474de68
remove bad/useless
flying-sheep Sep 15, 2025
646ba23
fix container
Intron7 Sep 15, 2025
ae57cb1
try CUDA_PATH
flying-sheep Sep 15, 2025
33ac5af
skip musl again
flying-sheep Sep 15, 2025
a529a58
add next kernels
Intron7 Sep 15, 2025
0685436
add pca and make safe docs
Intron7 Sep 16, 2025
24551bd
Merge branch 'main' into move-to-nanobind
Intron7 Sep 16, 2025
5d327bd
make aggr safe
Intron7 Sep 16, 2025
30414ab
add harmony
Intron7 Sep 16, 2025
d46ab83
make qc smaller
Intron7 Sep 16, 2025
d45d6bf
add ligrec
Intron7 Sep 16, 2025
20cf11e
move decoupler
Intron7 Sep 16, 2025
134d2e0
remove rawkernels
Intron7 Sep 16, 2025
a872962
add release note
Intron7 Sep 16, 2025
2825de7
fix shape qc
Intron7 Sep 16, 2025
66e930f
fix entropy
Intron7 Sep 16, 2025
d386000
fix version
Intron7 Sep 16, 2025
cfdec19
add streams
Intron7 Sep 17, 2025
4876400
Merge branch 'main' into move-to-nanobind
Intron7 Sep 17, 2025
3a20dc2
fix pointer
Intron7 Sep 17, 2025
948b86a
fix test
Intron7 Sep 17, 2025
3fdde98
terse args
flying-sheep Sep 18, 2025
8abaab0
kw-only for aggr.cu
flying-sheep Sep 18, 2025
84a34c4
remaining cleanup
flying-sheep Sep 18, 2025
e53c87a
add keywords
Intron7 Sep 18, 2025
ad7ed53
fix keywords ligrec
Intron7 Sep 18, 2025
a62a596
add 120
Intron7 Sep 22, 2025
22 changes: 22 additions & 0 deletions .clang-format
@@ -0,0 +1,22 @@
BasedOnStyle: Google
Language: Cpp

# Keep braces on the same line
BreakBeforeBraces: Attach
AllowShortFunctionsOnASingleLine: None

# Bin-pack parameter and argument lists
BinPackParameters: true
BinPackArguments: true

# Typical CUDA/C++ ergonomics
IndentWidth: 2
ColumnLimit: 100
PointerAlignment: Left
DerivePointerAlignment: false

# Don't reorder #includes, to avoid churn in unrelated lines
SortIncludes: false

# Always break after template declarations
AlwaysBreakTemplateDeclarations: Yes
20 changes: 20 additions & 0 deletions .gitignore
@@ -17,3 +17,23 @@ __pycache__/

# Venvs
*venv/

# Build artifacts (CMake / scikit-build)
/build/
/CMakeFiles/
/CMakeCache.txt
/cmake_install.cmake
/build.ninja
/.ninja_deps
/.ninja_log
/libnanobind-static.a
/install_manifest__mean_var_cuda.txt
/_skbuild/
/*.egg-info/

# Compiled CUDA extension copied for editable installs
*.so

# Test/coverage caches
/.pytest_cache/
/.coverage
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -32,3 +32,9 @@ repos:
- id: codespell
additional_dependencies:
- tomli
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.8
hooks:
- id: clang-format
args: [--style=file, -i]
types_or: [c, c++, cuda]
3 changes: 2 additions & 1 deletion .readthedocs.yml
@@ -6,14 +6,15 @@ build:
   os: ubuntu-24.04
   tools:
     python: "3.12"

   commands:
     # Install and set up uv
     - asdf plugin add uv
     - asdf install uv latest
     - asdf global uv latest

     # Install the package with docs dependencies (no compiled extensions)
-    - uv pip install --system .[doc]
+    - CMAKE_ARGS="-DRSC_BUILD_EXTENSIONS=OFF" uv pip install --system ".[doc]"

     # Build documentation using sphinx
     - python -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs $READTHEDOCS_OUTPUT/html
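Setting RSC_BUILD_EXTENSIONS=OFF lets Read the Docs install the package without a CUDA toolchain, which means the Python side has to tolerate missing extension modules. A hypothetical guard sketching the idea (this exact code is not part of the diff):

# Hypothetical import guard (not in this diff): with RSC_BUILD_EXTENSIONS=OFF
# the compiled modules are never built, so a docs/RTD install must not fail
# at import time.
try:
    from rapids_singlecell._cuda import _mean_var_cuda
except ImportError:  # docs build without compiled extensions
    _mean_var_cuda = None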
48 changes: 48 additions & 0 deletions CMakeLists.txt
@@ -0,0 +1,48 @@
cmake_minimum_required(VERSION 3.24)

project(rapids_singlecell_cuda LANGUAGES CXX)

# Option to disable building compiled extensions (for docs/RTD)
option(RSC_BUILD_EXTENSIONS "Build CUDA/C++ extensions" ON)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

if (RSC_BUILD_EXTENSIONS)
enable_language(CUDA)
find_package(Python REQUIRED COMPONENTS Interpreter Development.Module)
find_package(nanobind CONFIG REQUIRED)
find_package(CUDAToolkit REQUIRED)
else()
message(STATUS "RSC_BUILD_EXTENSIONS=OFF -> skipping compiled extensions for docs")
endif()

# Helper to declare a nanobind CUDA module uniformly
function(add_nb_cuda_module target src)
if (RSC_BUILD_EXTENSIONS)
nanobind_add_module(${target} STABLE_ABI LTO
${src}
)
target_link_libraries(${target} PRIVATE CUDA::cudart)
set_target_properties(${target} PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
)
install(TARGETS ${target} LIBRARY DESTINATION rapids_singlecell/_cuda)
# Also copy built module into source tree for editable installs
add_custom_command(TARGET ${target} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:${target}>
${PROJECT_SOURCE_DIR}/src/rapids_singlecell/_cuda/$<TARGET_FILE_NAME:${target}>
)
endif()
endfunction()

if (RSC_BUILD_EXTENSIONS)
# CUDA modules
add_nb_cuda_module(_mean_var_cuda src/rapids_singlecell/_cuda/mean_var/mean_var.cu)
add_nb_cuda_module(_sparse2dense_cuda src/rapids_singlecell/_cuda/sparse2dense/sparse2dense.cu)
add_nb_cuda_module(_scale_cuda src/rapids_singlecell/_cuda/scale/scale.cu)
add_nb_cuda_module(_qc_cuda src/rapids_singlecell/_cuda/qc/qc.cu)
add_nb_cuda_module(_qc_dask_cuda src/rapids_singlecell/_cuda/qc_dask/qc_kernels_dask.cu)
endif()
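The install rule above places each extension under rapids_singlecell/_cuda, so after a full build (RSC_BUILD_EXTENSIONS=ON with a CUDA toolchain) the modules should import like any other extension. A minimal sketch:

# Minimal import check for the five modules declared above; assumes a full
# (non-docs) build in which the extensions were compiled and installed.
import importlib

for name in ("_mean_var_cuda", "_sparse2dense_cuda", "_scale_cuda", "_qc_cuda", "_qc_dask_cuda"):
    mod = importlib.import_module(f"rapids_singlecell._cuda.{name}")
    print(name, "->", mod.__file__)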
36 changes: 21 additions & 15 deletions pyproject.toml
@@ -1,6 +1,11 @@
 [build-system]
-requires = [ "hatchling", "hatch-vcs" ]
-build-backend = "hatchling.build"
+requires = [
+    "scikit-build-core>=0.10",
+    "nanobind>=2.0.0",
+    "pybind11-stubgen; python_version>='3.11'",
+    "setuptools-scm>=8",
+]
+build-backend = "scikit_build_core.build"

[project]
name = "rapids_singlecell"
@@ -107,21 +112,22 @@ markers = [
     "gpu: tests that use a GPU (currently unused, but needs to be specified here as we import anndata.tests.helpers, which uses it)",
 ]

-[tool.hatch.build]
-# exclude big files that don't need to be installed
-exclude = [
-    "tests",
-    "docs",
-    "notebooks",
-]
-[tool.hatch.build.hooks.vcs]
-version-file = "src/rapids_singlecell/_version.py"
+[tool.setuptools_scm]
+write_to = "src/rapids_singlecell/_version.py"
+# Optional but useful:
+version_scheme = "guess-next-dev"
+local_scheme = "node-and-date"

-[tool.hatch.version]
-source = "vcs"
+[tool.scikit-build]
+wheel.packages = [ "src/rapids_singlecell", "src/testing" ]
+cmake.version = ">=3.24"
+cmake.build-type = "Release"
+ninja.version = ">=1.10"
+experimental = false
+cmake.args = [ "-DCMAKE_CUDA_ARCHITECTURES=75;80;86;89;90;100" ]
+build-dir = "build"
+metadata.version.provider = "scikit_build_core.metadata.setuptools_scm"

-[tool.hatch.build.targets.wheel]
-packages = [ 'src/rapids_singlecell', 'src/testing' ]

[tool.codespell]
skip = '*.ipynb,*.csv'
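With metadata.version.provider delegating to setuptools-scm, the package version is derived from git metadata at build time and written to src/rapids_singlecell/_version.py. A quick sanity check (sketch; the printed string depends on the repository state):

# Sketch: setuptools-scm's write_to generates this module at build time.
from rapids_singlecell._version import __version__

print(__version__)  # e.g. "<next-version>.devN+g<hash>.d<date>" under guess-next-dev / node-and-date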
3 changes: 3 additions & 0 deletions src/rapids_singlecell/_cuda/__init__.py
@@ -0,0 +1,3 @@
from __future__ import annotations

# Subpackage for CUDA extensions (built via scikit-build-core/nanobind)
53 changes: 53 additions & 0 deletions src/rapids_singlecell/_cuda/mean_var/kernels.cuh
@@ -0,0 +1,53 @@
#pragma once

#include <cuda_runtime.h>

template <typename T>
__global__ void mean_var_major_kernel(const int* __restrict__ indptr,
const int* __restrict__ indices, const T* __restrict__ data,
double* __restrict__ means, double* __restrict__ vars,
int major, int /*minor*/) {
int major_idx = blockIdx.x;
if (major_idx >= major) return;

int start_idx = indptr[major_idx];
int stop_idx = indptr[major_idx + 1];

  // Per-thread partial sums of x and x*x, reduced across the block below
  __shared__ double mean_place[64];
  __shared__ double var_place[64];

mean_place[threadIdx.x] = 0.0;
var_place[threadIdx.x] = 0.0;
__syncthreads();

for (int minor_idx = start_idx + threadIdx.x; minor_idx < stop_idx; minor_idx += blockDim.x) {
double value = static_cast<double>(data[minor_idx]);
mean_place[threadIdx.x] += value;
var_place[threadIdx.x] += value * value;
}
__syncthreads();

  // Block-wide tree reduction into element 0
  for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (threadIdx.x < s) {
mean_place[threadIdx.x] += mean_place[threadIdx.x + s];
var_place[threadIdx.x] += var_place[threadIdx.x + s];
}
__syncthreads();
}
  if (threadIdx.x == 0) {
    // These are raw sums of x and x*x; the caller derives mean and variance
    means[major_idx] = mean_place[0];
    vars[major_idx] = var_place[0];
  }
}

template <typename T>
__global__ void mean_var_minor_kernel(const int* __restrict__ indices, const T* __restrict__ data,
double* __restrict__ means, double* __restrict__ vars,
int nnz) {
  // One thread per nonzero: accumulate per-column sums of x and x*x via atomics
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= nnz) return;
double value = static_cast<double>(data[idx]);
int minor_pos = indices[idx];
atomicAdd(&means[minor_pos], value);
atomicAdd(&vars[minor_pos], value * value);
}
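Both kernels accumulate raw sums (sum of x and sum of x*x) rather than finished statistics, so the caller finalizes mean and variance afterwards. A NumPy reference for what the major-axis kernel computes per CSR row (the finalization step is an assumption about the Python caller; it is not part of this diff):

# NumPy reference for mean_var_major_kernel: per-row sum(x) and sum(x*x) over
# a CSR matrix, finalized to mean/variance on the host. The finalization is
# assumed to live in the Python caller; it is not shown in this diff.
import numpy as np
from scipy import sparse

X = sparse.random(5, 8, density=0.4, format="csr", dtype=np.float32)
minor = X.shape[1]

sums = np.zeros(X.shape[0])
sqsums = np.zeros(X.shape[0])
for row in range(X.shape[0]):  # mirrors one CUDA block per row
    vals = X.data[X.indptr[row]:X.indptr[row + 1]].astype(np.float64)
    sums[row] = vals.sum()             # what the kernel stores in `means`
    sqsums[row] = (vals * vals).sum()  # what the kernel stores in `vars`

means = sums / minor                   # zeros count toward the denominator
variances = sqsums / minor - means**2  # population variance of each dense row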
76 changes: 76 additions & 0 deletions src/rapids_singlecell/_cuda/mean_var/mean_var.cu
@@ -0,0 +1,76 @@
#include <cuda_runtime.h>
#include <nanobind/nanobind.h>
#include <cstdint>

#include "kernels.cuh"

namespace nb = nanobind;
using nb::handle;

template <typename T>
static inline void launch_mean_var_major(std::uintptr_t indptr_ptr, std::uintptr_t indices_ptr,
std::uintptr_t data_ptr, std::uintptr_t means_ptr,
std::uintptr_t vars_ptr, int major, int minor) {
dim3 block(64);
dim3 grid(major);
const int* indptr = reinterpret_cast<const int*>(indptr_ptr);
const int* indices = reinterpret_cast<const int*>(indices_ptr);
const T* data = reinterpret_cast<const T*>(data_ptr);
double* means = reinterpret_cast<double*>(means_ptr);
double* vars = reinterpret_cast<double*>(vars_ptr);
mean_var_major_kernel<T><<<grid, block>>>(indptr, indices, data, means, vars, major, minor);
}

template <typename T>
static inline void launch_mean_var_minor(std::uintptr_t indices_ptr, std::uintptr_t data_ptr,
std::uintptr_t means_ptr, std::uintptr_t vars_ptr,
int nnz) {
int block = 256;
int grid = (nnz + block - 1) / block;
const int* indices = reinterpret_cast<const int*>(indices_ptr);
const T* data = reinterpret_cast<const T*>(data_ptr);
double* means = reinterpret_cast<double*>(means_ptr);
double* vars = reinterpret_cast<double*>(vars_ptr);
mean_var_minor_kernel<T><<<grid, block>>>(indices, data, means, vars, nnz);
}

template <typename T>
void mean_var_major_api(std::uintptr_t indptr, std::uintptr_t indices, std::uintptr_t data,
std::uintptr_t means, std::uintptr_t vars, int major, int minor) {
launch_mean_var_major<T>(indptr, indices, data, means, vars, major, minor);
}

template <typename T>
void mean_var_minor_api(std::uintptr_t indices, std::uintptr_t data, std::uintptr_t means,
std::uintptr_t vars, int nnz) {
launch_mean_var_minor<T>(indices, data, means, vars, nnz);
}

NB_MODULE(_mean_var_cuda, m) {
m.def("mean_var_major_f32", &mean_var_major_api<float>);
m.def("mean_var_major_f64", &mean_var_major_api<double>);
m.def("mean_var_minor_f32", &mean_var_minor_api<float>);
m.def("mean_var_minor_f64", &mean_var_minor_api<double>);

m.def("mean_var_major",
[](std::uintptr_t indptr, std::uintptr_t indices, std::uintptr_t data, std::uintptr_t means,
std::uintptr_t vars, int major, int minor, int itemsize) {
if (itemsize == 4) {
mean_var_major_api<float>(indptr, indices, data, means, vars, major, minor);
} else if (itemsize == 8) {
mean_var_major_api<double>(indptr, indices, data, means, vars, major, minor);
} else {
throw nb::value_error("Unsupported itemsize for mean_var_major (expected 4 or 8)");
}
});
m.def("mean_var_minor", [](std::uintptr_t indices, std::uintptr_t data, std::uintptr_t means,
std::uintptr_t vars, int nnz, int itemsize) {
if (itemsize == 4) {
mean_var_minor_api<float>(indices, data, means, vars, nnz);
} else if (itemsize == 8) {
mean_var_minor_api<double>(indices, data, means, vars, nnz);
} else {
throw nb::value_error("Unsupported itemsize for mean_var_minor (expected 4 or 8)");
}
});
}
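A minimal usage sketch for the itemsize-dispatching entry point, assuming a CuPy CSR matrix and the install layout from CMakeLists.txt above. Device pointers are passed as plain integers to match the std::uintptr_t parameters, and the sum-to-mean/variance finalization is the caller's job, as noted for the kernels:

# Usage sketch (assumes the module installed as
# rapids_singlecell._cuda._mean_var_cuda, per the CMake install rule above).
import cupy as cp
from cupyx.scipy import sparse as cpx_sparse

from rapids_singlecell._cuda import _mean_var_cuda

X = cpx_sparse.random(1000, 200, density=0.1, format="csr", dtype=cp.float32)
major, minor = X.shape

sums = cp.zeros(major, dtype=cp.float64)    # receives sum(x) per row
sqsums = cp.zeros(major, dtype=cp.float64)  # receives sum(x*x) per row

_mean_var_cuda.mean_var_major(
    X.indptr.data.ptr, X.indices.data.ptr, X.data.data.ptr,
    sums.data.ptr, sqsums.data.ptr,
    major, minor, X.data.dtype.itemsize,  # itemsize (4 or 8) picks the float/double kernel
)
cp.cuda.runtime.deviceSynchronize()  # the kernel launch is asynchronous

means = sums / minor
variances = sqsums / minor - means**2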