Changes from 17 commits

Commits (54)
7d2c62b
add precommit
Intron7 Sep 11, 2025
7bc5367
add first implementation
Intron7 Sep 11, 2025
dc76b24
clang format
Intron7 Sep 11, 2025
dc3648b
format
Intron7 Sep 11, 2025
ef8a756
format c++
Intron7 Sep 11, 2025
76ae7aa
Merge branch 'main' into move-to-nanobind
Intron7 Sep 11, 2025
7ed481d
change clang-format
Intron7 Sep 11, 2025
48ba592
fix version
Intron7 Sep 11, 2025
1d2f12a
test docs
Intron7 Sep 11, 2025
60c4863
fix yml
Intron7 Sep 11, 2025
9a2b113
fix sparse to dense kernel launch
Intron7 Sep 11, 2025
2d5ea85
test read the docs
Intron7 Sep 11, 2025
9878e5d
try env
Intron7 Sep 11, 2025
6b46e8a
test cmakeargs
Intron7 Sep 11, 2025
55027f7
add system back
Intron7 Sep 11, 2025
b780405
add failsafe
Intron7 Sep 11, 2025
24104ff
remove print and slim down toml
Intron7 Sep 12, 2025
dddd9e8
Add almost unchanged cibw
flying-sheep Sep 15, 2025
5981d50
No macOS
flying-sheep Sep 15, 2025
b3c3853
test build wheels
Intron7 Sep 15, 2025
b24bf7b
next
Intron7 Sep 15, 2025
56aca24
remove wheels workflow
Intron7 Sep 15, 2025
7068b19
remove windows
Intron7 Sep 15, 2025
9a58ff0
remove optional parts
flying-sheep Sep 15, 2025
7f65657
test publish
Intron7 Sep 15, 2025
56f837a
3.12
flying-sheep Sep 15, 2025
3a9a9f1
fix path
Intron7 Sep 15, 2025
474de68
remove bad/useless
flying-sheep Sep 15, 2025
646ba23
fix container
Intron7 Sep 15, 2025
ae57cb1
try CUDA_PATH
flying-sheep Sep 15, 2025
33ac5af
skip musl again
flying-sheep Sep 15, 2025
a529a58
add next kernels
Intron7 Sep 15, 2025
0685436
add pca and make safe docs
Intron7 Sep 16, 2025
24551bd
Merge branch 'main' into move-to-nanobind
Intron7 Sep 16, 2025
5d327bd
make aggr safe
Intron7 Sep 16, 2025
30414ab
add harmony
Intron7 Sep 16, 2025
d46ab83
make qc smaller
Intron7 Sep 16, 2025
d45d6bf
add ligrec
Intron7 Sep 16, 2025
20cf11e
move decoupler
Intron7 Sep 16, 2025
134d2e0
remove rawkernels
Intron7 Sep 16, 2025
a872962
add release note
Intron7 Sep 16, 2025
2825de7
fix shape qc
Intron7 Sep 16, 2025
66e930f
fix entropy
Intron7 Sep 16, 2025
d386000
fix version
Intron7 Sep 16, 2025
cfdec19
add streams
Intron7 Sep 17, 2025
4876400
Merge branch 'main' into move-to-nanobind
Intron7 Sep 17, 2025
3a20dc2
fix pointer
Intron7 Sep 17, 2025
948b86a
fix test
Intron7 Sep 17, 2025
3fdde98
terse args
flying-sheep Sep 18, 2025
8abaab0
kw-only for aggr.cu
flying-sheep Sep 18, 2025
84a34c4
remaining cleanup
flying-sheep Sep 18, 2025
e53c87a
add keywords
Intron7 Sep 18, 2025
ad7ed53
fix keywords ligrec
Intron7 Sep 18, 2025
a62a596
add 120
Intron7 Sep 22, 2025
22 changes: 22 additions & 0 deletions .clang-format
@@ -0,0 +1,22 @@
BasedOnStyle: Google
Language: Cpp

# Keep braces on the same line
BreakBeforeBraces: Attach
AllowShortFunctionsOnASingleLine: None

# Bin-pack parameter and argument lists
BinPackParameters: true
BinPackArguments: true

# Typical CUDA/C++ ergonomics
IndentWidth: 2
ColumnLimit: 100
PointerAlignment: Left
DerivePointerAlignment: false

# Don't reorder #includes, to avoid churn in unrelated lines
SortIncludes: false

# Always break after template declarations
AlwaysBreakTemplateDeclarations: Yes
20 changes: 20 additions & 0 deletions .gitignore
@@ -17,3 +17,23 @@ __pycache__/

# Venvs
*venv/

# Build artifacts (CMake / scikit-build)
/build/
/CMakeFiles/
/CMakeCache.txt
/cmake_install.cmake
/build.ninja
/.ninja_deps
/.ninja_log
/libnanobind-static.a
/install_manifest__mean_var_cuda.txt
/_skbuild/
/*.egg-info/

# Compiled CUDA extension copied for editable installs
*.so

# Test/coverage caches
/.pytest_cache/
/.coverage
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -32,3 +32,9 @@ repos:
- id: codespell
additional_dependencies:
- tomli
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.8
hooks:
- id: clang-format
args: [--style=file, -i]
types_or: [c, c++, cuda]
3 changes: 2 additions & 1 deletion .readthedocs.yml
@@ -6,14 +6,15 @@ build:
   os: ubuntu-24.04
   tools:
     python: "3.12"

   commands:
     # Install and set up uv
     - asdf plugin add uv
     - asdf install uv latest
     - asdf global uv latest

     # Install the package with docs dependencies (no compiled extensions)
-    - uv pip install --system .[doc]
+    - CMAKE_ARGS="-DRSC_BUILD_EXTENSIONS=OFF" uv pip install --system ".[doc]"

     # Build documentation using sphinx
     - python -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs $READTHEDOCS_OUTPUT/html
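Setting RSC_BUILD_EXTENSIONS=OFF lets Read the Docs install the package without a CUDA toolchain, which means the Python side has to tolerate missing extension modules. A hypothetical guard sketching the idea (this exact code is not part of the diff):

# Hypothetical import guard (not in this diff): with RSC_BUILD_EXTENSIONS=OFF
# the compiled modules are never built, so a docs/RTD install must not fail
# at import time.
try:
    from rapids_singlecell._cuda import _mean_var_cuda
except ImportError:  # docs build without compiled extensions
    _mean_var_cuda = None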
48 changes: 48 additions & 0 deletions CMakeLists.txt
@@ -0,0 +1,48 @@
cmake_minimum_required(VERSION 3.24)

project(rapids_singlecell_cuda LANGUAGES CXX)

# Option to disable building compiled extensions (for docs/RTD)
option(RSC_BUILD_EXTENSIONS "Build CUDA/C++ extensions" ON)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

if (RSC_BUILD_EXTENSIONS)
enable_language(CUDA)
find_package(Python REQUIRED COMPONENTS Interpreter Development.Module)
find_package(nanobind CONFIG REQUIRED)
find_package(CUDAToolkit REQUIRED)
else()
message(STATUS "RSC_BUILD_EXTENSIONS=OFF -> skipping compiled extensions for docs")
endif()

# Helper to declare a nanobind CUDA module uniformly
function(add_nb_cuda_module target src)
if (RSC_BUILD_EXTENSIONS)
nanobind_add_module(${target} STABLE_ABI LTO
${src}
)
target_link_libraries(${target} PRIVATE CUDA::cudart)
set_target_properties(${target} PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
)
install(TARGETS ${target} LIBRARY DESTINATION rapids_singlecell/_cuda)
# Also copy built module into source tree for editable installs
add_custom_command(TARGET ${target} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:${target}>
${PROJECT_SOURCE_DIR}/src/rapids_singlecell/_cuda/$<TARGET_FILE_NAME:${target}>
)
endif()
endfunction()

if (RSC_BUILD_EXTENSIONS)
# CUDA modules
add_nb_cuda_module(_mean_var_cuda src/rapids_singlecell/_cuda/mean_var/mean_var.cu)
add_nb_cuda_module(_sparse2dense_cuda src/rapids_singlecell/_cuda/sparse2dense/sparse2dense.cu)
add_nb_cuda_module(_scale_cuda src/rapids_singlecell/_cuda/scale/scale.cu)
add_nb_cuda_module(_qc_cuda src/rapids_singlecell/_cuda/qc/qc.cu)
add_nb_cuda_module(_qc_dask_cuda src/rapids_singlecell/_cuda/qc_dask/qc_kernels_dask.cu)
endif()
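The install rule above places each extension under rapids_singlecell/_cuda, so after a full build (RSC_BUILD_EXTENSIONS=ON with a CUDA toolchain) the modules should import like any other extension. A minimal sketch:

# Minimal import check for the five modules declared above; assumes a full
# (non-docs) build in which the extensions were compiled and installed.
import importlib

for name in ("_mean_var_cuda", "_sparse2dense_cuda", "_scale_cuda", "_qc_cuda", "_qc_dask_cuda"):
    mod = importlib.import_module(f"rapids_singlecell._cuda.{name}")
    print(name, "->", mod.__file__)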
36 changes: 21 additions & 15 deletions pyproject.toml
@@ -1,6 +1,11 @@
 [build-system]
-requires = [ "hatchling", "hatch-vcs" ]
-build-backend = "hatchling.build"
+requires = [
+    "scikit-build-core>=0.10",
+    "nanobind>=2.0.0",
+    "pybind11-stubgen; python_version>='3.11'",
+    "setuptools-scm>=8",
+]
+build-backend = "scikit_build_core.build"

[project]
name = "rapids_singlecell"
@@ -107,21 +112,22 @@ markers = [
     "gpu: tests that use a GPU (currently unused, but needs to be specified here as we import anndata.tests.helpers, which uses it)",
 ]

-[tool.hatch.build]
-# exclude big files that don't need to be installed
-exclude = [
-    "tests",
-    "docs",
-    "notebooks",
-]
-[tool.hatch.build.hooks.vcs]
-version-file = "src/rapids_singlecell/_version.py"
+[tool.setuptools_scm]
+write_to = "src/rapids_singlecell/_version.py"
+# Optional but useful:
+version_scheme = "guess-next-dev"
+local_scheme = "node-and-date"

-[tool.hatch.version]
-source = "vcs"
+[tool.scikit-build]
+wheel.packages = [ "src/rapids_singlecell", "src/testing" ]
+cmake.version = ">=3.24"
+cmake.build-type = "Release"
+ninja.version = ">=1.10"
+experimental = false
+cmake.args = [ "-DCMAKE_CUDA_ARCHITECTURES=75;80;86;89;90;100" ]
+build-dir = "build"
+metadata.version.provider = "scikit_build_core.metadata.setuptools_scm"

-[tool.hatch.build.targets.wheel]
-packages = [ 'src/rapids_singlecell', 'src/testing' ]

[tool.codespell]
skip = '*.ipynb,*.csv'
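With metadata.version.provider delegating to setuptools-scm, the package version is derived from git metadata at build time and written to src/rapids_singlecell/_version.py. A quick sanity check (sketch; the printed string depends on the repository state):

# Sketch: setuptools-scm's write_to generates this module at build time.
from rapids_singlecell._version import __version__

print(__version__)  # e.g. "<next-version>.devN+g<hash>.d<date>" under guess-next-dev / node-and-date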
3 changes: 3 additions & 0 deletions src/rapids_singlecell/_cuda/__init__.py
@@ -0,0 +1,3 @@
from __future__ import annotations

# Subpackage for CUDA extensions (built via scikit-build-core/nanobind)
53 changes: 53 additions & 0 deletions src/rapids_singlecell/_cuda/mean_var/kernels.cuh
@@ -0,0 +1,53 @@
#pragma once

#include <cuda_runtime.h>

template <typename T>
__global__ void mean_var_major_kernel(const int* __restrict__ indptr,
const int* __restrict__ indices, const T* __restrict__ data,
double* __restrict__ means, double* __restrict__ vars,
int major, int /*minor*/) {
int major_idx = blockIdx.x;
if (major_idx >= major) return;

int start_idx = indptr[major_idx];
int stop_idx = indptr[major_idx + 1];

  // Per-thread partial sums of x and x*x, reduced across the block below
  __shared__ double mean_place[64];
  __shared__ double var_place[64];

mean_place[threadIdx.x] = 0.0;
var_place[threadIdx.x] = 0.0;
__syncthreads();

for (int minor_idx = start_idx + threadIdx.x; minor_idx < stop_idx; minor_idx += blockDim.x) {
double value = static_cast<double>(data[minor_idx]);
mean_place[threadIdx.x] += value;
var_place[threadIdx.x] += value * value;
}
__syncthreads();

  // Block-wide tree reduction into element 0
  for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (threadIdx.x < s) {
mean_place[threadIdx.x] += mean_place[threadIdx.x + s];
var_place[threadIdx.x] += var_place[threadIdx.x + s];
}
__syncthreads();
}
  if (threadIdx.x == 0) {
    // These are raw sums of x and x*x; the caller derives mean and variance
    means[major_idx] = mean_place[0];
    vars[major_idx] = var_place[0];
  }
}

template <typename T>
__global__ void mean_var_minor_kernel(const int* __restrict__ indices, const T* __restrict__ data,
double* __restrict__ means, double* __restrict__ vars,
int nnz) {
  // One thread per nonzero: accumulate per-column sums of x and x*x via atomics
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= nnz) return;
double value = static_cast<double>(data[idx]);
int minor_pos = indices[idx];
atomicAdd(&means[minor_pos], value);
atomicAdd(&vars[minor_pos], value * value);
}
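Both kernels accumulate raw sums (sum of x and sum of x*x) rather than finished statistics, so the caller finalizes mean and variance afterwards. A NumPy reference for what the major-axis kernel computes per CSR row (the finalization step is an assumption about the Python caller; it is not part of this diff):

# NumPy reference for mean_var_major_kernel: per-row sum(x) and sum(x*x) over
# a CSR matrix, finalized to mean/variance on the host. The finalization is
# assumed to live in the Python caller; it is not shown in this diff.
import numpy as np
from scipy import sparse

X = sparse.random(5, 8, density=0.4, format="csr", dtype=np.float32)
minor = X.shape[1]

sums = np.zeros(X.shape[0])
sqsums = np.zeros(X.shape[0])
for row in range(X.shape[0]):  # mirrors one CUDA block per row
    vals = X.data[X.indptr[row]:X.indptr[row + 1]].astype(np.float64)
    sums[row] = vals.sum()             # what the kernel stores in `means`
    sqsums[row] = (vals * vals).sum()  # what the kernel stores in `vars`

means = sums / minor                   # zeros count toward the denominator
variances = sqsums / minor - means**2  # population variance of each dense row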
76 changes: 76 additions & 0 deletions src/rapids_singlecell/_cuda/mean_var/mean_var.cu
@@ -0,0 +1,76 @@
#include <cuda_runtime.h>
#include <nanobind/nanobind.h>
#include <cstdint>

#include "kernels.cuh"

namespace nb = nanobind;
using nb::handle;

template <typename T>
static inline void launch_mean_var_major(std::uintptr_t indptr_ptr, std::uintptr_t indices_ptr,
std::uintptr_t data_ptr, std::uintptr_t means_ptr,
std::uintptr_t vars_ptr, int major, int minor) {
dim3 block(64);
dim3 grid(major);
const int* indptr = reinterpret_cast<const int*>(indptr_ptr);
const int* indices = reinterpret_cast<const int*>(indices_ptr);
const T* data = reinterpret_cast<const T*>(data_ptr);
double* means = reinterpret_cast<double*>(means_ptr);
double* vars = reinterpret_cast<double*>(vars_ptr);
mean_var_major_kernel<T><<<grid, block>>>(indptr, indices, data, means, vars, major, minor);
}

template <typename T>
static inline void launch_mean_var_minor(std::uintptr_t indices_ptr, std::uintptr_t data_ptr,
std::uintptr_t means_ptr, std::uintptr_t vars_ptr,
int nnz) {
int block = 256;
int grid = (nnz + block - 1) / block;
const int* indices = reinterpret_cast<const int*>(indices_ptr);
const T* data = reinterpret_cast<const T*>(data_ptr);
double* means = reinterpret_cast<double*>(means_ptr);
double* vars = reinterpret_cast<double*>(vars_ptr);
mean_var_minor_kernel<T><<<grid, block>>>(indices, data, means, vars, nnz);
}

template <typename T>
void mean_var_major_api(std::uintptr_t indptr, std::uintptr_t indices, std::uintptr_t data,
std::uintptr_t means, std::uintptr_t vars, int major, int minor) {
launch_mean_var_major<T>(indptr, indices, data, means, vars, major, minor);
}

template <typename T>
void mean_var_minor_api(std::uintptr_t indices, std::uintptr_t data, std::uintptr_t means,
std::uintptr_t vars, int nnz) {
launch_mean_var_minor<T>(indices, data, means, vars, nnz);
}

NB_MODULE(_mean_var_cuda, m) {
m.def("mean_var_major_f32", &mean_var_major_api<float>);
m.def("mean_var_major_f64", &mean_var_major_api<double>);
m.def("mean_var_minor_f32", &mean_var_minor_api<float>);
m.def("mean_var_minor_f64", &mean_var_minor_api<double>);

m.def("mean_var_major",
[](std::uintptr_t indptr, std::uintptr_t indices, std::uintptr_t data, std::uintptr_t means,
std::uintptr_t vars, int major, int minor, int itemsize) {
if (itemsize == 4) {
mean_var_major_api<float>(indptr, indices, data, means, vars, major, minor);
} else if (itemsize == 8) {
mean_var_major_api<double>(indptr, indices, data, means, vars, major, minor);
} else {
throw nb::value_error("Unsupported itemsize for mean_var_major (expected 4 or 8)");
}
});
m.def("mean_var_minor", [](std::uintptr_t indices, std::uintptr_t data, std::uintptr_t means,
std::uintptr_t vars, int nnz, int itemsize) {
if (itemsize == 4) {
mean_var_minor_api<float>(indices, data, means, vars, nnz);
} else if (itemsize == 8) {
mean_var_minor_api<double>(indices, data, means, vars, nnz);
} else {
throw nb::value_error("Unsupported itemsize for mean_var_minor (expected 4 or 8)");
}
});
}
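A minimal usage sketch for the itemsize-dispatching entry point, assuming a CuPy CSR matrix and the install layout from CMakeLists.txt above. Device pointers are passed as plain integers to match the std::uintptr_t parameters, and the sum-to-mean/variance finalization is the caller's job, as noted for the kernels:

# Usage sketch (assumes the module installed as
# rapids_singlecell._cuda._mean_var_cuda, per the CMake install rule above).
import cupy as cp
from cupyx.scipy import sparse as cpx_sparse

from rapids_singlecell._cuda import _mean_var_cuda

X = cpx_sparse.random(1000, 200, density=0.1, format="csr", dtype=cp.float32)
major, minor = X.shape

sums = cp.zeros(major, dtype=cp.float64)    # receives sum(x) per row
sqsums = cp.zeros(major, dtype=cp.float64)  # receives sum(x*x) per row

_mean_var_cuda.mean_var_major(
    X.indptr.data.ptr, X.indices.data.ptr, X.data.data.ptr,
    sums.data.ptr, sqsums.data.ptr,
    major, minor, X.data.dtype.itemsize,  # itemsize (4 or 8) picks the float/double kernel
)
cp.cuda.runtime.deviceSynchronize()  # the kernel launch is asynchronous

means = sums / minor
variances = sqsums / minor - means**2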