diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..78bb5e2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +# Python bytecode +**/*.pyc +**/__pycache__/ +*.pyo +*.pyd + diff --git a/CMakeLists.txt b/CMakeLists.txt deleted file mode 100644 index f27d490..0000000 --- a/CMakeLists.txt +++ /dev/null @@ -1,70 +0,0 @@ -cmake_minimum_required(VERSION 3.24) - -project(cuslines LANGUAGES CUDA CXX VERSION 1.0) - -# Build settings -set(CMAKE_CXX_STANDARD 11) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -Werror=reorder") -set(CMAKE_CXX_FLAGS_RELEASE "-O3") - -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release") -endif() - -if (CMAKE_BUILD_TYPE STREQUAL "Debug" ) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}") -else() - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}") -endif() - -# CUDA -find_package(CUDAToolkit REQUIRED) - -# Set default CUDA compute capabilities if unset -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - include(FindCUDA/select_compute_arch.cmake) - cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS Auto) - set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_FLAGS}) -endif() -message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") - -# OpenMP -find_package(OpenMP) -if(OPENMP_FOUND) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - - # Set OMP runtime based on compiler - if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(OMP_RUNTIME "INTEL") - elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - set(OMP_RUNTIME "GNU") - elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") - set(OMP_RUNTIME "INTEL") - endif() - message(STATUS "OpenMP runtime used: ${OMP_RUNTIME}") -endif() - -# Find pybind11 -execute_process(COMMAND python -c "import pybind11; print(pybind11.get_cmake_dir())" - OUTPUT_VARIABLE pybind11_DIR - OUTPUT_STRIP_TRAILING_WHITESPACE) -list(APPEND CMAKE_PREFIX_PATH ${pybind11_DIR}) -find_package(pybind11 REQUIRED) - -# Build library and pybind11 module -add_library(cuslines_kernels) -target_sources(cuslines_kernels - PRIVATE - ${CMAKE_SOURCE_DIR}/cuslines/generate_streamlines_cuda.cu) -set_target_properties(cuslines_kernels PROPERTIES OUTPUT_NAME cuslines_kernels - POSITION_INDEPENDENT_CODE TRUE) - -pybind11_add_module(cuslines ${CMAKE_SOURCE_DIR}/cuslines/cuslines.cpp) -target_include_directories(cuslines PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" "${CUDAToolkit_INCLUDE_DIRS}") -target_link_libraries(cuslines PRIVATE cuslines_kernels CUDA::cudart_static) - -# Install -install(TARGETS cuslines cuslines_kernels LIBRARY DESTINATION .) diff --git a/Dockerfile b/Dockerfile index 06e9de9..889371d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,17 +16,20 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.0/cmake-3.24.0 && mkdir /opt/cmake \ && /tmp/cmake-install.sh --skip-license --prefix=/opt/cmake \ && rm /tmp/cmake-install.sh -ENV PATH /opt/cmake/bin:${PATH} +ENV PATH=/opt/cmake/bin:${PATH} RUN curl -L "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" \ -o "/tmp/Miniconda3.sh" RUN bash /tmp/Miniconda3.sh -b -p /opt/anaconda RUN rm -rf /tmp/Miniconda3.sh RUN cd /opt && eval "$(/opt/anaconda/bin/conda shell.bash hook)" -ENV PATH /opt/anaconda/bin:${PATH} -ENV LD_LIBRARY_PATH /opt/anaconda/lib:${LD_LIBRARY_PATH} +ENV PATH=/opt/anaconda/bin:${PATH} +ENV LD_LIBRARY_PATH=/opt/anaconda/lib:${LD_LIBRARY_PATH} # python prereqs +RUN conda tos accept --override-channels --channel conda-forge +RUN conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main +RUN conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r RUN conda install -c conda-forge git RUN pip install numpy>=2.0.0 RUN pip install scipy>=1.13.0 cython nibabel dipy tqdm diff --git a/README.md b/README.md index 5cda7f5..9e0275d 100644 --- a/README.md +++ b/README.md @@ -48,63 +48,7 @@ Destroy GPUTracker... Note that if you experience memory errors, you can adjust the `--chunk-size` flag. -To run on more seeds, we suggest enabling the `--use-fast-write` flag in the GPU script to not get bottlenecked by writing files. Here is a comparison running on 500K seeds on 1 GPU with and without this flag: - -Without `--use-fast-write`: -``` -$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1 -parsing arguments -Fitting Tensor -Computing anisotropy measures (FA,MD,RGB) -slowadcodf -Bootstrap direction getter -streamline gen -Creating GPUTracker with 1 GPUs... -Generated 143891 streamlines from 100000 seeds, time: 7.978902339935303 s -Saved streamlines to small.1_5.trk, time 11.439777851104736 s -Generated 151932 streamlines from 100000 seeds, time: 10.155118703842163 s -Saved streamlines to small.2_5.trk, time 12.438884019851685 s -Generated 146971 streamlines from 100000 seeds, time: 9.822870016098022 s -Saved streamlines to small.3_5.trk, time 12.377111673355103 s -Generated 153824 streamlines from 100000 seeds, time: 11.133368968963623 s -Saved streamlines to small.4_5.trk, time 13.317519187927246 s -Generated 162004 streamlines from 100000 seeds, time: 13.19784665107727 s -Saved streamlines to small.5_5.trk, time 14.21276593208313 s -Completed processing 500000 seeds. -Initialization time: 14.789637088775635 sec -Streamline generation total time: 116.0746865272522 sec - Streamline processing: 52.28810667991638 sec - File writing: 63.7860586643219 sec -Destroy GPUTracker... -``` - -With `--use-fast-write`: -``` -$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1 --use-fast-write -parsing arguments -Fitting Tensor -Computing anisotropy measures (FA,MD,RGB) -slowadcodf -Bootstrap direction getter -streamline gen -Creating GPUTracker with 1 GPUs... -Generated 143891 streamlines from 100000 seeds, time: 7.962322473526001 s -Saved streamlines to small.1_5_*.trk, time 0.1053612232208252 s -Generated 151932 streamlines from 100000 seeds, time: 10.148677825927734 s -Saved streamlines to small.2_5_*.trk, time 0.1606450080871582 s -Generated 146971 streamlines from 100000 seeds, time: 9.811130285263062 s -Saved streamlines to small.3_5_*.trk, time 0.571892499923706 s -Generated 153824 streamlines from 100000 seeds, time: 11.186563968658447 s -Saved streamlines to small.4_5_*.trk, time 0.3091111183166504 s -Generated 162004 streamlines from 100000 seeds, time: 13.282683610916138 s -Saved streamlines to small.5_5_*.trk, time 0.7107999324798584 s -Completed processing 500000 seeds. -Initialization time: 14.705361366271973 sec -Streamline generation total time: 54.24975609779358 sec - Streamline processing: 52.39137816429138 sec - File writing: 1.8578097820281982 sec -Destroy GPUTracker... -``` +To run on more seeds, we suggest setting the `--write-method trx` flag in the GPU script to not get bottlenecked by writing files. ## Running on AWS with Docker First, set up an AWS instance with GPU and ssh into it (we recommend a P3 instance with at least 1 V100 16 GB GPU and a Deep Learning AMI Ubuntu 18.04 v 33.0.). Then do the following: diff --git a/build_cuslines.sh b/build_cuslines.sh deleted file mode 100755 index 8375223..0000000 --- a/build_cuslines.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -build_dir=$(pwd)/build -install_dir=$(pwd)/install - -# set up build dir -mkdir -p ${build_dir} -cd ${build_dir} - -# configure -cmake -DCMAKE_INSTALL_PREFIX=${install_dir} \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_C_COMPILER=gcc \ - -DCMAKE_CXX_COMPILER=g++ \ - -DPYTHON_EXECUTABLE=$(which python) \ - .. - -# compile -make && make install diff --git a/cuslines/Makefile b/cuslines/Makefile deleted file mode 100644 index 1061a16..0000000 --- a/cuslines/Makefile +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -CUDA_HOME=/usr/local/cuda -CUDACC=$(CUDA_HOME)/bin/nvcc # -G -g -dopt=on -CXX=g++ -LD=g++ - -CXXFLAGS= -c -O3 -std=c++11 -fopenmp -fPIC `python3 -m pybind11 --includes` -I$(CUDA_HOME)/include - -SMS ?= 70 -CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM)) -LASTSM := $(lastword $(sort $(SMS))) -CUDA_ARCH += -gencode arch=compute_$(LASTSM),code=compute_$(LASTSM) -CUDACFLAGS=-c -O3 -lineinfo -Xptxas=-v -std=c++11 -Xcompiler -fPIC -Xcompiler=-fopenmp $(CUDA_ARCH) -LDFLAGS= -shared -fopenmp -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt - -all: cuslines - -cuslines: generate_streamlines_cuda.o cuslines.o - $(LD) cuslines.o generate_streamlines_cuda.o -o cuslines`python3-config --extension-suffix` $(LDFLAGS) - -%.o : %.cu - $(CUDACC) $(CUDACFLAGS) $< -o $@ - -%.o: %.cpp - $(CXX) $(CXXFLAGS) $< -o $@ - -clean: - rm *.o cuslines`python3-config --extension-suffix` __pycache__/*.pyc diff --git a/cuslines/__init__.py b/cuslines/__init__.py new file mode 100644 index 0000000..b96cca1 --- /dev/null +++ b/cuslines/__init__.py @@ -0,0 +1,13 @@ +from .cuda_python import ( + GPUTracker, + ProbDirectionGetter, + PttDirectionGetter, + BootDirectionGetter +) + +__all__ = [ + "GPUTracker", + "ProbDirectionGetter", + "PttDirectionGetter", + "BootDirectionGetter" +] diff --git a/cuslines/cuda_c/boot.cu b/cuslines/cuda_c/boot.cu new file mode 100644 index 0000000..133c43d --- /dev/null +++ b/cuslines/cuda_c/boot.cu @@ -0,0 +1,1066 @@ +//#define USE_FIXED_PERMUTATION +#ifdef USE_FIXED_PERMUTATION +//__device__ const int fixedPerm[] = {44, 47, 53, 0, 3, 3, 39, 9, 19, 21, 50, 36, 23, +// 6, 24, 24, 12, 1, 38, 39, 23, 46, 24, 17, 37, 25, +// 13, 8, 9, 20, 51, 16, 51, 5, 15, 47, 0, 18, 35, +// 24, 49, 51, 29, 19, 19, 14, 39, 32, 1, 9, 32, 31, +// 10, 52, 23}; +__device__ const int fixedPerm[] = { + 47, 117, 67, 103, 9, 21, 36, 87, 70, 88, 140, 58, 39, 87, 88, 81, 25, 77, + 72, 9, 148, 115, 79, 82, 99, 29, 147, 147, 142, 32, 9, 127, 32, 31, 114, 28, + 34, 128, 128, 53, 133, 38, 17, 79, 132, 105, 42, 31, 120, 1, 65, 57, 35, 102, + 119, 11, 82, 91, 128, 142, 99, 53, 140, 121, 84, 68, 6, 47, 127, 131, 100, 78, + 143, 148, 23, 141, 117, 85, 48, 49, 69, 95, 94, 0, 113, 36, 48, 93, 131, 98, + 42, 112, 149, 127, 0, 138, 114, 43, 127, 23, 130, 121, 98, 62, 123, 82, 148, 50, + 14, 41, 58, 36, 10, 86, 43, 104, 11, 2, 51, 80, 32, 128, 38, 19, 42, 115, + 77, 30, 24, 125, 2, 3, 94, 107, 13, 112, 40, 72, 19, 95, 72, 67, 61, 14, + 96, 4, 139, 86, 121, 109}; +#endif + +template +__device__ VAL_T avgMask(const int mskLen, + const int *__restrict__ mask, + const VAL_T *__restrict__ data) { + + const int tidx = threadIdx.x; + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + int __myCnt = 0; + VAL_T __mySum = 0; + + for(int i = tidx; i < mskLen; i += BDIM_X) { + if(mask[i]) { + __myCnt++; + __mySum += data[i]; + } + } + + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + __mySum += __shfl_xor_sync(WMASK, __mySum, i, BDIM_X); + __myCnt += __shfl_xor_sync(WMASK, __myCnt, i, BDIM_X); + } + + return __mySum/__myCnt; + +} + +template< + int BDIM_X, + typename LEN_T, + typename MSK_T, + typename VAL_T> +__device__ LEN_T maskGet(const LEN_T n, + const MSK_T *__restrict__ mask, + const VAL_T *__restrict__ plain, + VAL_T *__restrict__ masked) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const int __laneMask = (1 << tidx)-1; + + int woff = 0; + for(int j = 0; j < n; j += BDIM_X) { + + const int __act = (j+tidx < n) ? !mask[j+tidx] : 0; + const int __msk = __ballot_sync(WMASK, __act); + + const int toff = __popc(__msk & __laneMask); + if (__act) { + masked[woff+toff] = plain[j+tidx]; + } + woff += __popc(__msk); + } + return woff; +} + +template< + int BDIM_X, + typename LEN_T, + typename MSK_T, + typename VAL_T> +__device__ void maskPut(const LEN_T n, + const MSK_T *__restrict__ mask, + const VAL_T *__restrict__ masked, + VAL_T *__restrict__ plain) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const int __laneMask = (1 << tidx)-1; + + int woff = 0; + for(int j = 0; j < n; j += BDIM_X) { + + const int __act = (j+tidx < n) ? !mask[j+tidx] : 0; + const int __msk = __ballot_sync(WMASK, __act); + + const int toff = __popc(__msk & __laneMask); + if (__act) { + plain[j+tidx] = masked[woff+toff]; + } + woff += __popc(__msk); + } + return; +} + +template +__device__ int closest_peak_d(const REAL_T max_angle, + const REAL3_T direction, //dir + const int npeaks, + const REAL3_T *__restrict__ peaks, + REAL3_T *__restrict__ peak) {// dirs, + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + //const REAL_T cos_similarity = COS(MAX_ANGLE_P); + const REAL_T cos_similarity = COS(max_angle); +#if 0 + if (!threadIdx.y && !tidx) { + printf("direction: (%f, %f, %f)\n", + direction.x, direction.y, direction.z); + } + __syncwarp(WMASK); +#endif + REAL_T cpeak_dot = 0; + int cpeak_idx = -1; + for(int j = 0; j < npeaks; j += BDIM_X) { + if (j+tidx < npeaks) { +#if 0 + if (!threadIdx.y && !tidx) { + printf("j+tidx: %d, peaks[j+tidx]: (%f, %f, %f)\n", + j+tidx, peaks[j+tidx].x, peaks[j+tidx].y, peaks[j+tidx].z); + } +#endif + const REAL_T dot = direction.x*peaks[j+tidx].x+ + direction.y*peaks[j+tidx].y+ + direction.z*peaks[j+tidx].z; + + if (FABS(dot) > FABS(cpeak_dot)) { + cpeak_dot = dot; + cpeak_idx = j+tidx; + } + } + } +#if 0 + if (!threadIdx.y && !tidx) { + printf("cpeak_idx: %d, cpeak_dot: %f\n", cpeak_idx, cpeak_dot); + } + __syncwarp(WMASK); +#endif + + #pragma unroll + for(int j = BDIM_X/2; j; j /= 2) { + + const REAL_T dot = __shfl_xor_sync(WMASK, cpeak_dot, j, BDIM_X); + const int idx = __shfl_xor_sync(WMASK, cpeak_idx, j, BDIM_X); + if (FABS(dot) > FABS(cpeak_dot)) { + cpeak_dot = dot; + cpeak_idx = idx; + } + } +#if 0 + if (!threadIdx.y && !tidx) { + printf("cpeak_idx: %d, cpeak_dot: %f, cos_similarity: %f\n", cpeak_idx, cpeak_dot, cos_similarity); + } + __syncwarp(WMASK); +#endif + if (cpeak_idx >= 0) { + if (cpeak_dot >= cos_similarity) { + peak[0] = peaks[cpeak_idx]; + return 1; + } + if (cpeak_dot <= -cos_similarity) { + peak[0] = MAKE_REAL3(-peaks[cpeak_idx].x, + -peaks[cpeak_idx].y, + -peaks[cpeak_idx].z); + return 1; + } + } + return 0; +} + +template +__device__ void ndotp_d(const int N, + const int M, + const VAL_T *__restrict__ srcV, + const VAL_T *__restrict__ srcM, + VAL_T *__restrict__ dstV) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + //#pragma unroll + for(int i = 0; i < N; i++) { + + VAL_T __tmp = 0; + + //#pragma unroll + for(int j = 0; j < M; j += BDIM_X) { + if (j+tidx < M) { + __tmp += srcV[j+tidx]*srcM[i*M + j+tidx]; + } + } + #pragma unroll + for(int j = BDIM_X/2; j; j /= 2) { +#if 0 + __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); +#else + __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); +#endif + } + // values could be held by BDIM_X threads and written + // together every BDIM_X iterations... + + if (tidx == 0) { + dstV[i] = __tmp; + } + } + return; +} + + +template +__device__ void ndotp_log_opdt_d(const int N, + const int M, + const VAL_T *__restrict__ srcV, + const VAL_T *__restrict__ srcM, + VAL_T *__restrict__ dstV) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const VAL_T ONEP5 = static_cast(1.5); + + //#pragma unroll + for(int i = 0; i < N; i++) { + + VAL_T __tmp = 0; + + //#pragma unroll + for(int j = 0; j < M; j += BDIM_X) { + if (j+tidx < M) { + const VAL_T v = srcV[j+tidx]; + __tmp += -LOG(v)*(ONEP5+LOG(v))*v * srcM[i*M + j+tidx]; + } + } + #pragma unroll + for(int j = BDIM_X/2; j; j /= 2) { +#if 0 + __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); +#else + __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); +#endif + } + // values could be held by BDIM_X threads and written + // together every BDIM_X iterations... + + if (tidx == 0) { + dstV[i] = __tmp; + } + } + return; +} + +template +__device__ void ndotp_log_csa_d(const int N, + const int M, + const VAL_T *__restrict__ srcV, + const VAL_T *__restrict__ srcM, + VAL_T *__restrict__ dstV) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + // Clamp values + constexpr VAL_T min = .001; + constexpr VAL_T max = .999; + + //#pragma unroll + for(int i = 0; i < N; i++) { + + VAL_T __tmp = 0; + + //#pragma unroll + for(int j = 0; j < M; j += BDIM_X) { + if (j+tidx < M) { + const VAL_T v = MIN(MAX(srcV[j+tidx], min), max); + __tmp += LOG(-LOG(v)) * srcM[i*M + j+tidx]; + } + } + #pragma unroll + for(int j = BDIM_X/2; j; j /= 2) { +#if 0 + __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); +#else + __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); +#endif + } + // values could be held by BDIM_X threads and written + // together every BDIM_X iterations... + + if (tidx == 0) { + dstV[i] = __tmp; + } + } + return; +} + + +template +__device__ void fit_opdt(const int delta_nr, + const int hr_side, + const REAL_T *__restrict__ delta_q, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ __msk_data_sh, + REAL_T *__restrict__ __h_sh, + REAL_T *__restrict__ __r_sh) { + const int tidx = threadIdx.x; + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + ndotp_log_opdt_d(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh); + ndotp_d (delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh); + __syncwarp(WMASK); + #pragma unroll + for(int j = tidx; j < delta_nr; j += BDIM_X) { + __r_sh[j] -= __h_sh[j]; + } + __syncwarp(WMASK); +} + +template +__device__ void fit_csa(const int delta_nr, + const int hr_side, + const REAL_T *__restrict__ fit_matrix, + const REAL_T *__restrict__ __msk_data_sh, + REAL_T *__restrict__ __r_sh) { + const int tidx = threadIdx.x; + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + constexpr REAL _n0_const = 0.28209479177387814; // .5 / sqrt(pi) + ndotp_log_csa_d(delta_nr, hr_side, __msk_data_sh, fit_matrix, __r_sh); + __syncwarp(WMASK); + if (tidx == 0) { + __r_sh[0] = _n0_const; + } + __syncwarp(WMASK); +} + +template +__device__ void fit_model_coef(const int delta_nr, // delta_nr is number of ODF directions + const int hr_side, // hr_side is number of data directions + const REAL_T *__restrict__ delta_q, + const REAL_T *__restrict__ delta_b, // these are fit matrices the model can use, different for each model + const REAL_T *__restrict__ __msk_data_sh, // __msk_data_sh is the part of the data currently being operated on by this block + REAL_T *__restrict__ __h_sh, // these last two are modifications to the coefficients that will be returned + REAL_T *__restrict__ __r_sh) { + switch(MODEL_T) { + case OPDT: + fit_opdt(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh); + break; + case CSA: + fit_csa(delta_nr, hr_side, delta_q, __msk_data_sh, __r_sh); + break; + default: + printf("FATAL: Invalid Model Type.\n"); + break; + } +} + +template +__device__ int get_direction_boot_d( + curandStatePhilox4_32_10_t *st, + const REAL_T max_angle, + const REAL_T min_signal, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle, + REAL3_T dir, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask) + // but not clear if it will never be needed so + // we'll keep it here for now... + const REAL3_T point, + const REAL_T *__restrict__ H, + const REAL_T *__restrict__ R, + // model unused + // max_angle, pmf_threshold from global defines + // b0s_mask already passed + // min_signal from global defines + const int delta_nr, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ delta_q, // fit_matrix + const int samplm_nr, + const REAL_T *__restrict__ sampling_matrix, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + REAL3_T *__restrict__ dirs) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const int n32dimt = ((dimt+31)/32)*32; + + extern REAL_T __shared__ __sh[]; + + REAL_T *__vox_data_sh = reinterpret_cast(__sh); + REAL_T *__msk_data_sh = __vox_data_sh + BDIM_Y*n32dimt; + + REAL_T *__r_sh = __msk_data_sh + BDIM_Y*n32dimt; + REAL_T *__h_sh = __r_sh + BDIM_Y*MAX(n32dimt, samplm_nr); + + __vox_data_sh += tidy*n32dimt; + __msk_data_sh += tidy*n32dimt; + + __r_sh += tidy*MAX(n32dimt, samplm_nr); + __h_sh += tidy*MAX(n32dimt, samplm_nr); + + // compute hr_side (may be passed from python) + int hr_side = 0; + for(int j = tidx; j < dimt; j += BDIM_X) { + hr_side += !b0s_mask[j] ? 1 : 0; + } + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + hr_side += __shfl_xor_sync(WMASK, hr_side, i, BDIM_X); + } + + #pragma unroll + for(int i = 0; i < NATTEMPTS; i++) { + + const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, -1, dataf, point, __vox_data_sh); + + const int nmsk = maskGet(dimt, b0s_mask, __vox_data_sh, __msk_data_sh); + + //if (!tidx && !threadIdx.y && !blockIdx.x) { + // + // printf("interp of %f, %f, %f\n", point.x, point.y, point.z); + // printf("hr_side: %d\n", hr_side); + // printArray("vox_data", 6, dimt, __vox_data_sh[tidy]); + // printArray("msk_data", 6, nmsk, __msk_data_sh[tidy]); + //} + //break; + + __syncwarp(WMASK); + + if (rv == 0) { + + ndotp_d(hr_side, hr_side, __msk_data_sh, R, __r_sh); + //__syncwarp(); + //printArray("__r", 5, hr_side*hr_side, R); + //printArray("__r_sh", 6, hr_side, __r_sh[tidy]); + + ndotp_d(hr_side, hr_side, __msk_data_sh, H, __h_sh); + //__syncwarp(); + //printArray("__h_sh", 6, hr_side, __h_sh[tidy]); + + __syncwarp(WMASK); + + for(int j = 0; j < hr_side; j += BDIM_X) { + if (j+tidx < hr_side) { +#ifdef USE_FIXED_PERMUTATION + const int srcPermInd = fixedPerm[j+tidx]; +#else + const int srcPermInd = curand(st) % hr_side; +// if (srcPermInd < 0 || srcPermInd >= hr_side) { +// printf("srcPermInd: %d\n", srcPermInd); +// } +#endif + __h_sh[j+tidx] += __r_sh[srcPermInd]; + //__h_sh[j+tidx] += __r_sh[j+tidx]; + } + } + __syncwarp(WMASK); + + //printArray("h+perm(r):", 6, hr_side, __h_sh[tidy]); + //__syncwarp(); + + // vox_data[dwi_mask] = masked_data + maskPut(dimt, b0s_mask, __h_sh, __vox_data_sh); + __syncwarp(WMASK); + + //printArray("vox_data[dwi_mask]:", 6, dimt, __vox_data_sh[tidy]); + //__syncwarp(); + + for(int j = tidx; j < dimt; j += BDIM_X) { + //__vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]); + __vox_data_sh[j] = MAX(min_signal, __vox_data_sh[j]); + } + __syncwarp(WMASK); + + const REAL_T denom = avgMask(dimt, b0s_mask, __vox_data_sh); + + for(int j = tidx; j < dimt; j += BDIM_X) { + __vox_data_sh[j] /= denom; + } + __syncwarp(); + + //if (!tidx && !threadIdx.y && !blockIdx.x) { + // printf("denom: %f\n", denom); + //} + ////break; + //if (!tidx && !threadIdx.y && !blockIdx.x) { + // + // printf("__vox_data_sh:\n"); + // printArray("vox_data", 6, dimt, __vox_data_sh[tidy]); + //} + //break; + + maskGet(dimt, b0s_mask, __vox_data_sh, __msk_data_sh); + __syncwarp(WMASK); + + fit_model_coef(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh); + + // __r_sh[tidy] <- python 'coef' + + ndotp_d(samplm_nr, delta_nr, __r_sh, sampling_matrix, __h_sh); + + // __h_sh[tidy] <- python 'pmf' + } else { + #pragma unroll + for(int j = tidx; j < samplm_nr; j += BDIM_X) { + __h_sh[j] = 0; + } + // __h_sh[tidy] <- python 'pmf' + } + __syncwarp(WMASK); +#if 0 + if (!threadIdx.y && threadIdx.x == 0) { + for(int j = 0; j < samplm_nr; j++) { + printf("pmf[%d]: %f\n", j, __h_sh[tidy][j]); + } + } + //return; +#endif + const REAL_T abs_pmf_thr = PMF_THRESHOLD_P*max_d(samplm_nr, __h_sh, REAL_MIN); + __syncwarp(WMASK); + + #pragma unroll + for(int j = tidx; j < samplm_nr; j += BDIM_X) { + const REAL_T __v = __h_sh[j]; + if (__v < abs_pmf_thr) { + __h_sh[j] = 0; + } + } + __syncwarp(WMASK); +#if 0 + if (!threadIdx.y && threadIdx.x == 0) { + printf("abs_pmf_thr: %f\n", abs_pmf_thr); + for(int j = 0; j < samplm_nr; j++) { + printf("pmfNORM[%d]: %f\n", j, __h_sh[tidy][j]); + } + } + //return; +#endif +#if 0 + if init: + directions = peak_directions(pmf, sphere)[0] + return directions + else: + peaks = peak_directions(pmf, sphere)[0] + if (len(peaks) > 0): + return closest_peak(directions, peaks, cos_similarity) +#endif + const int ndir = peak_directions_d(__h_sh, dirs, + sphere_vertices, + sphere_edges, + num_edges, + samplm_nr, + reinterpret_cast(__r_sh), // reuse __r_sh as shInd in func which is large enough + relative_peak_thres, + min_separation_angle); + if (NATTEMPTS == 1) { // init=True... + return ndir; // and dirs; + } else { // init=False... + if (ndir > 0) { + /* + if (!threadIdx.y && threadIdx.x == 0 && ndir > 1) { + printf("NATTEMPTS=5 and ndir: %d!!!\n", ndir); + } + */ + REAL3_T peak; + const int foundPeak = closest_peak_d(max_angle, dir, ndir, dirs, &peak); + __syncwarp(WMASK); + if (foundPeak) { + if (tidx == 0) { + dirs[0] = peak; + } + return 1; + } + } + } + } + return 0; +} + +template +__global__ void getNumStreamlinesBoot_k( + const ModelType model_type, + const REAL_T max_angle, + const REAL_T min_signal, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle, + const long long rndSeed, + const int nseed, + const REAL3_T *__restrict__ seeds, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const REAL_T *__restrict__ H, + const REAL_T *__restrict__ R, + const int delta_nr, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ delta_q, + const int *__restrict__ b0s_mask, // change to int + const int samplm_nr, + const REAL_T *__restrict__ sampling_matrix, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + REAL3_T *__restrict__ shDir0, + int *slineOutOff) { + + const int tidx = threadIdx.x; + const int slid = blockIdx.x*blockDim.y + threadIdx.y; + const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x; + + if (slid >= nseed) { + return; + } + + REAL3_T seed = seeds[slid]; + // seed = lin_mat*seed + offset + + REAL3_T *__restrict__ __shDir = shDir0+slid*samplm_nr; + + // const int hr_side = dimt-1; + + curandStatePhilox4_32_10_t st; + //curand_init(rndSeed, slid + rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(hr_side/BDIM_X) + curand_init(rndSeed, gid, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X) + // elements of the same sequence + // python: + //directions = get_direction(None, dataf, dwi_mask, sphere, s, H, R, model, max_angle, + // pmf_threshold, b0s_mask, min_signal, fit_matrix, + // sampling_matrix, init=True) + + //if (!tidx && !threadIdx.y && !blockIdx.x) { + // printf("seed: %f, %f, %f\n", seed.x, seed.y, seed.z); + //} + + int ndir; + switch(model_type) { + case OPDT: + ndir = get_direction_boot_d( + &st, + max_angle, + min_signal, + relative_peak_thres, + min_separation_angle, + MAKE_REAL3(0,0,0), + dimx, dimy, dimz, dimt, dataf, + b0s_mask /* !dwi_mask */, + seed, + H, R, + // model unused + // max_angle, pmf_threshold from global defines + // b0s_mask already passed + // min_signal from global defines + delta_nr, + delta_b, delta_q, // fit_matrix + samplm_nr, + sampling_matrix, + sphere_vertices, + sphere_edges, + num_edges, + __shDir); + break; + case CSA: + ndir = get_direction_boot_d( + &st, + max_angle, + min_signal, + relative_peak_thres, + min_separation_angle, + MAKE_REAL3(0,0,0), + dimx, dimy, dimz, dimt, dataf, + b0s_mask /* !dwi_mask */, + seed, + H, R, + // model unused + // max_angle, pmf_threshold from global defines + // b0s_mask already passed + // min_signal from global defines + delta_nr, + delta_b, delta_q, // fit_matrix + samplm_nr, + sampling_matrix, + sphere_vertices, + sphere_edges, + num_edges, + __shDir); + break; + default: + printf("FATAL: Invalid Model Type.\n"); + break; + } + + if (tidx == 0) { + slineOutOff[slid] = ndir; + } + + return; +} + +template +__device__ int tracker_boot_d( + curandStatePhilox4_32_10_t *st, + const REAL_T max_angle, + const REAL_T tc_threshold, + const REAL_T step_size, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle, + REAL3_T seed, + REAL3_T first_step, + REAL3_T voxel_size, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const REAL_T *__restrict__ metric_map, + const int samplm_nr, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + /*BOOT specific params*/ + const REAL_T min_signal, + const int delta_nr, + const REAL_T *__restrict__ H, + const REAL_T *__restrict__ R, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ delta_q, + const REAL_T *__restrict__ sampling_matrix, + const int *__restrict__ b0s_mask, + /*BOOT specific params*/ + int *__restrict__ nsteps, + REAL3_T *__restrict__ streamline) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + int tissue_class = TRACKPOINT; + + REAL3_T point = seed; + REAL3_T direction = first_step; + __shared__ REAL3_T __sh_new_dir[BDIM_Y]; + + if (tidx == 0) { + streamline[0] = point; +#if 0 + if (threadIdx.y == 1) { + printf("streamline[0]: %f, %f, %f\n", point.x, point.y, point.z); + } +#endif + } + __syncwarp(WMASK); + + int step_frac = 1; + + int i; + for(i = 1; i < MAX_SLINE_LEN*step_frac; i++) { + int ndir = get_direction_boot_d( + st, + max_angle, + min_signal, + relative_peak_thres, + min_separation_angle, + direction, + dimx, dimy, dimz, dimt, dataf, + b0s_mask /* !dwi_mask */, + point, + H, R, + delta_nr, + delta_b, delta_q, // fit_matrix + samplm_nr, + sampling_matrix, + sphere_vertices, + sphere_edges, + num_edges, + __sh_new_dir + tidy); + __syncwarp(WMASK); + direction = __sh_new_dir[tidy]; + __syncwarp(WMASK); + + if (ndir == 0) { + break; + } + + point.x += (direction.x / voxel_size.x) * (step_size / step_frac); + point.y += (direction.y / voxel_size.y) * (step_size / step_frac); + point.z += (direction.z / voxel_size.z) * (step_size / step_frac); + + if ((tidx == 0) && ((i % step_frac) == 0)){ + streamline[i/step_frac] = point; + } + __syncwarp(WMASK); + + tissue_class = check_point_d(tc_threshold, point, dimx, dimy, dimz, metric_map); + + if (tissue_class == ENDPOINT || + tissue_class == INVALIDPOINT || + tissue_class == OUTSIDEIMAGE) { + break; + } + } + nsteps[0] = i/step_frac; + if (((i % step_frac) != 0) && i < step_frac*(MAX_SLINE_LEN - 1)){ + nsteps[0]++; + if (tidx == 0) { + streamline[nsteps[0]] = point; + } + } + + return tissue_class; +} + +template +__global__ void genStreamlinesMergeBoot_k( + const REAL_T max_angle, + const REAL_T tc_threshold, + const REAL_T step_size, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle, + const long long rndSeed, + const int rndOffset, + const int nseed, + const REAL3_T *__restrict__ seeds, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const REAL_T *__restrict__ metric_map, + const int samplm_nr, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + /*BOOT specific params*/ + const REAL_T min_signal, + const int delta_nr, + const REAL_T *__restrict__ H, + const REAL_T *__restrict__ R, + const REAL_T *__restrict__ delta_b, + const REAL_T *__restrict__ delta_q, + const REAL_T *__restrict__ sampling_matrix, + const int *__restrict__ b0s_mask, + /*BOOT specific params*/ + const int *__restrict__ slineOutOff, + REAL3_T *__restrict__ shDir0, + int *__restrict__ slineSeed, + int *__restrict__ slineLen, + REAL3_T *__restrict__ sline) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int slid = blockIdx.x*blockDim.y + threadIdx.y; + + const int lid = (tidy*BDIM_X + tidx) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + curandStatePhilox4_32_10_t st; + // const int gbid = blockIdx.y*gridDim.x + blockIdx.x; + const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x; + //curand_init(rndSeed, slid+rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(HR_SIDE/BDIM_X) + curand_init(rndSeed, gid+1, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X) + // elements of the same sequence + if (slid >= nseed) { + return; + } + + REAL3_T seed = seeds[slid]; + + int ndir = slineOutOff[slid+1]-slineOutOff[slid]; +#if 0 + if (threadIdx.y == 0 && threadIdx.x == 0) { + printf("%s: ndir: %d\n", __func__, ndir); + for(int i = 0; i < ndir; i++) { + printf("__shDir[%d][%d]: (%f, %f, %f)\n", + tidy, i, __shDir[tidy][i].x, __shDir[tidy][i].y, __shDir[tidy][i].z); + } + } +#endif + __syncwarp(WMASK); + + int slineOff = slineOutOff[slid]; + + for(int i = 0; i < ndir; i++) { + REAL3_T first_step = shDir0[slid*samplm_nr + i]; + + REAL3_T *__restrict__ currSline = sline + slineOff*MAX_SLINE_LEN*2; + + if (tidx == 0) { + slineSeed[slineOff] = slid; + } +#if 0 + if (threadIdx.y == 0 && threadIdx.x == 0) { + printf("calling trackerF from: (%f, %f, %f)\n", first_step.x, first_step.y, first_step.z); + } +#endif + + int stepsB; + const int tissue_classB = tracker_boot_d( + &st, + max_angle, + tc_threshold, + step_size, + relative_peak_thres, + min_separation_angle, + seed, + MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z), + MAKE_REAL3(1, 1, 1), + dimx, dimy, dimz, dimt, dataf, + metric_map, + samplm_nr, + sphere_vertices, + sphere_edges, + num_edges, + min_signal, + delta_nr, + H, + R, + delta_b, + delta_q, + sampling_matrix, + b0s_mask, + &stepsB, + currSline); + + // reverse backward sline + for(int j = 0; j < stepsB/2; j += BDIM_X) { + if (j+tidx < stepsB/2) { + const REAL3_T __p = currSline[j+tidx]; + currSline[j+tidx] = currSline[stepsB-1 - (j+tidx)]; + currSline[stepsB-1 - (j+tidx)] = __p; + } + } + + int stepsF; + const int tissue_classF = tracker_boot_d( + &st, + max_angle, + tc_threshold, + step_size, + relative_peak_thres, + min_separation_angle, + seed, + first_step, + MAKE_REAL3(1, 1, 1), + dimx, dimy, dimz, dimt, dataf, + metric_map, + samplm_nr, + sphere_vertices, + sphere_edges, + num_edges, + min_signal, + delta_nr, + H, + R, + delta_b, + delta_q, + sampling_matrix, + b0s_mask, + &stepsF, + currSline + stepsB-1); + if (tidx == 0) { + slineLen[slineOff] = stepsB-1+stepsF; + } + + slineOff += 1; +#if 0 + if (threadIdx.y == 0 && threadIdx.x == 0) { + printf("%s: stepsF: %d, tissue_classF: %d\n", __func__, stepsF, tissue_classF); + } + __syncwarp(WMASK); +#endif + //if (/* !return_all || */0 && + // tissue_classF != ENDPOINT && + // tissue_classF != OUTSIDEIMAGE) { + // continue; + //} + //if (/* !return_all || */ 0 && + // tissue_classB != ENDPOINT && + // tissue_classB != OUTSIDEIMAGE) { + // continue; + //} + } + return; +} diff --git a/cuslines/cudamacro.h b/cuslines/cuda_c/cudamacro.h similarity index 86% rename from cuslines/cudamacro.h rename to cuslines/cuda_c/cudamacro.h index 49ac24c..7f03c6c 100644 --- a/cuslines/cudamacro.h +++ b/cuslines/cuda_c/cudamacro.h @@ -45,6 +45,18 @@ exit(EXIT_FAILURE); \ }} +#if CUDART_VERSION >= 13000 +#define CUDA_MEM_ADVISE(devPtr, count, advice, device) \ + cudaMemLocation loc; \ + loc.type = cudaMemLocationTypeDevice; \ + loc.id = (device); \ + CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), loc)); +#else +#define CUDA_MEM_ADVISE(devPtr, count, advice, device) \ + CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), (device))) +#endif + + #ifdef USE_NVTX #include "nvToolsExt.h" diff --git a/cuslines/cuwsort.cuh b/cuslines/cuda_c/cuwsort.cuh similarity index 94% rename from cuslines/cuwsort.cuh rename to cuslines/cuda_c/cuwsort.cuh index 18858f0..aac70ac 100644 --- a/cuslines/cuwsort.cuh +++ b/cuslines/cuda_c/cuwsort.cuh @@ -79,12 +79,15 @@ int swap4[3][4] = {{ 2, 3, 0, 1}, __device__ __constant__ int swap2[1][2] = {{ 1, 0}}; -__device__ __constant__ const int *__swaps[] = {NULL, - reinterpret_cast(&swap2[0][0]), - reinterpret_cast(&swap4[0][0]), - reinterpret_cast(&swap8[0][0]), - reinterpret_cast(&swap16[0][0]), - reinterpret_cast(&swap32[0][0])}; +template +__device__ __forceinline__ const int* get_swap_ptr() { + if constexpr (GSIZE == 2) return (const int*)swap2; + else if constexpr (GSIZE == 4) return (const int*)swap4; + else if constexpr (GSIZE == 8) return (const int*)swap8; + else if constexpr (GSIZE == 16) return (const int*)swap16; + else if constexpr (GSIZE == 32) return (const int*)swap32; + else return nullptr; +} template struct STATIC_LOG2 { @@ -113,7 +116,7 @@ __device__ KEY_T warp_sort(KEY_T v) { const int gid = lid % GSIZE; - const int (*swap)[GSIZE] = reinterpret_cast(__swaps[LOG2_GSIZE]); + const int (*swap)[GSIZE] = reinterpret_cast(get_swap_ptr()); #pragma unroll for(int i = 0; i < NSWAP; i++) { @@ -140,7 +143,7 @@ __device__ void warp_sort(KEY_T *__restrict__ k, VAL_T *__restrict__ v) { const int gid = lid % GSIZE; - const int (*swap)[GSIZE] = reinterpret_cast(__swaps[LOG2_GSIZE]); + const int (*swap)[GSIZE] = reinterpret_cast(get_swap_ptr()); #pragma unroll for(int i = 0; i < NSWAP; i++) { diff --git a/cuslines/disc.h b/cuslines/cuda_c/disc.h similarity index 100% rename from cuslines/disc.h rename to cuslines/cuda_c/disc.h diff --git a/cuslines/cuda_c/generate_streamlines_cuda.cu b/cuslines/cuda_c/generate_streamlines_cuda.cu new file mode 100644 index 0000000..f5629e0 --- /dev/null +++ b/cuslines/cuda_c/generate_streamlines_cuda.cu @@ -0,0 +1,695 @@ +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "globals.h" +#include "cuwsort.cuh" +#include "ptt.cuh" + +#include "utils.cu" +#include "tracking_helpers.cu" +#include "boot.cu" +#include "ptt.cu" + +#define MAX_NUM_DIR (128) + +#define NTHR_GEN (128) + +#define MAX_DIMS (8) +#define MAX_STR_LEN (256) + +template +__device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st, + const REAL_T *__restrict__ pmf, + const REAL_T max_angle, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle, + REAL3_T dir, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL3_T point, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + REAL3_T *__restrict__ dirs) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const int n32dimt = ((dimt+31)/32)*32; + + extern __shared__ REAL_T __sh[]; + REAL_T *__pmf_data_sh = __sh + tidy*n32dimt; + + // pmf = self.pmf_gen.get_pmf_c(&point[0], pmf) + __syncwarp(WMASK); + const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, -1, pmf, point, __pmf_data_sh); + __syncwarp(WMASK); + if (rv != 0) { + return 0; + } + + // for i in range(_len): + // if pmf[i] > max_pmf: + // max_pmf = pmf[i] + // absolute_pmf_threshold = pmf_threshold * max_pmf + const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d(dimt, __pmf_data_sh, REAL_MIN); + __syncwarp(WMASK); + + // for i in range(_len): + // if pmf[i] < absolute_pmf_threshold: + // pmf[i] = 0.0 + #pragma unroll + for(int i = tidx; i < dimt; i += BDIM_X) { + if (__pmf_data_sh[i] < absolpmf_thresh) { + __pmf_data_sh[i] = 0.0; + } + } + __syncwarp(WMASK); + + if (IS_START) { + int *__shInd = reinterpret_cast(__sh + BDIM_Y*n32dimt) + tidy*n32dimt; + return peak_directions_d(__pmf_data_sh, + dirs, + sphere_vertices, + sphere_edges, + num_edges, + dimt, + __shInd, + relative_peak_thres, + min_separation_angle); + } else { + REAL_T __tmp; + #ifdef DEBUG + __syncwarp(WMASK); + if (tidx == 0) { + printArray("__pmf_data_sh initial", 8, dimt, __pmf_data_sh); + printf("absolpmf_thresh %10.8f\n", absolpmf_thresh); + printf("---> dir %10.8f, %10.8f, %10.8f\n", dir.x, dir.y, dir.z); + printf("---> point %10.8f, %10.8f, %10.8f\n", point.x, point.y, point.z); + } + __syncwarp(WMASK); + if (tidx == 15) { + printf("absolpmf_thresh %10.8f l15\n", absolpmf_thresh); + printf("---> dir %10.8f, %10.8f, %10.8f l15\n", dir.x, dir.y, dir.z); + printf("---> point %10.8f, %10.8f, %10.8f l15\n", point.x, point.y, point.z); + } + __syncwarp(WMASK); + if (tidx == 31) { + printArray("__pmf_data_sh initial l31", 8, dimt, __pmf_data_sh); + printf("absolpmf_thresh %10.8f l31\n", absolpmf_thresh); + printf("---> dir %10.8f, %10.8f, %10.8f l31\n", dir.x, dir.y, dir.z); + printf("---> point %10.8f, %10.8f, %10.8f l31\n", point.x, point.y, point.z); + } + __syncwarp(WMASK); + #endif + + // // These should not be relevant + // if norm(&direction[0]) == 0: + // return 1 + // normalize(&direction[0]) + + // for i in range(_len): + // cos_sim = self.vertices[i][0] * direction[0] \ + // + self.vertices[i][1] * direction[1] \ + // + self.vertices[i][2] * direction[2] + // if cos_sim < 0: + // cos_sim = cos_sim * -1 + // if cos_sim < self.cos_similarity: + // pmf[i] = 0 + const REAL_T cos_similarity = COS(max_angle); + + #pragma unroll + for(int i = tidx; i < dimt; i += BDIM_X) { + const REAL_T dot = dir.x*sphere_vertices[i].x+ + dir.y*sphere_vertices[i].y+ + dir.z*sphere_vertices[i].z; + + if (FABS(dot) < cos_similarity) { + __pmf_data_sh[i] = 0.0; + } + } + __syncwarp(WMASK); + + #ifdef DEBUG + __syncwarp(WMASK); + if (tidx == 0) { + printArray("__pmf_data_sh after filtering", 8, dimt, __pmf_data_sh); + } + __syncwarp(WMASK); + #endif + + // cumsum(pmf, pmf, _len) + prefix_sum_sh_d(__pmf_data_sh, dimt); + + #ifdef DEBUG + __syncwarp(WMASK); + if (tidx == 0) { + printArray("__pmf_data_sh after cumsum", 8, dimt, __pmf_data_sh); + } + __syncwarp(WMASK); + #endif + + // last_cdf = pmf[_len - 1] + // if last_cdf == 0: + // return 1 + REAL_T last_cdf = __pmf_data_sh[dimt - 1]; + if (last_cdf == 0) { + return 0; + } + + // idx = where_to_insert(pmf, random() * last_cdf, _len) + if (tidx == 0) { + __tmp = curand_uniform(st) * last_cdf; + } + REAL_T selected_cdf = __shfl_sync(WMASK, __tmp, 0, BDIM_X); +// Both these implementations work +#if 1 + int low = 0; + int high = dimt - 1; + while ((high - low) >= BDIM_X) { + const int mid = (low + high) / 2; + if (__pmf_data_sh[mid] < selected_cdf) { + low = mid; + } else { + high = mid; + } + } + const bool __ballot = (low+tidx <= high) ? (selected_cdf < __pmf_data_sh[low+tidx]) : 0; + const int __msk = __ballot_sync(WMASK, __ballot); + const int indProb = low + __ffs(__msk) - 1; +#else + int indProb = dimt - 1; + for (int ii = 0; ii < dimt; ii+=BDIM_X) { + int __is_greater = 0; + if (ii+tidx < dimt) { + __is_greater = selected_cdf < __pmf_data_sh[ii+tidx]; + } + const int __msk = __ballot_sync(WMASK, __is_greater); + if (__msk != 0) { + indProb = ii + __ffs(__msk) - 1; + break; + } + } +#endif + + #ifdef DEBUG + __syncwarp(WMASK); + if (tidx == 0) { + printf("last_cdf %10.8f\n", last_cdf); + printf("selected_cdf %10.8f\n", selected_cdf); + printf("indProb %i out of %i\n", indProb, dimt); + } + __syncwarp(WMASK); + #endif + + // newdir = self.vertices[idx] + // if (direction[0] * newdir[0] + // + direction[1] * newdir[1] + // + direction[2] * newdir[2] > 0): + // copy_point(&newdir[0], &direction[0]) + // else: + // newdir[0] = newdir[0] * -1 + // newdir[1] = newdir[1] * -1 + // newdir[2] = newdir[2] * -1 + // copy_point(&newdir[0], &direction[0]) + if (tidx == 0) { + if ((dir.x * sphere_vertices[indProb].x + + dir.y * sphere_vertices[indProb].y + + dir.z * sphere_vertices[indProb].z) > 0) { + *dirs = MAKE_REAL3(sphere_vertices[indProb].x, + sphere_vertices[indProb].y, + sphere_vertices[indProb].z); + } else { + *dirs = MAKE_REAL3(-sphere_vertices[indProb].x, + -sphere_vertices[indProb].y, + -sphere_vertices[indProb].z); + } + // printf("direction addr write %p, slid %i\n", dirs, blockIdx.x*blockDim.y+threadIdx.y); + } + + #ifdef DEBUG + __syncwarp(WMASK); + if (tidx == 0) { + printf("last_cdf %10.8f\n", last_cdf); + printf("selected_cdf %10.8f\n", selected_cdf); + printf("indProb %i out of %i\n", indProb, dimt); + } + __syncwarp(WMASK); + if (tidx == 15) { + printf("last_cdf %10.8f l15\n", last_cdf); + printf("selected_cdf %10.8f l15\n", selected_cdf); + printf("indProb %i out of %i l15\n", indProb, dimt); + } + __syncwarp(WMASK); + if (tidx == 31) { + printf("last_cdf %10.8f l31\n", last_cdf); + printf("selected_cdf %10.8f l31\n", selected_cdf); + printf("indProb %i out of %i l31\n", indProb, dimt); + } + __syncwarp(WMASK); + #endif + return 1; + } +} + +template +__device__ int tracker_d(curandStatePhilox4_32_10_t *st, + const REAL_T max_angle, + const REAL_T tc_threshold, + const REAL_T step_size, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle, + REAL3_T seed, + REAL3_T first_step, + REAL_T* ptt_frame, + REAL3_T voxel_size, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const REAL_T *__restrict__ metric_map, + const int samplm_nr, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + int *__restrict__ nsteps, + REAL3_T *__restrict__ streamline) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + int tissue_class = TRACKPOINT; + + REAL3_T point = seed; + REAL3_T direction = first_step; + __shared__ REAL3_T __sh_new_dir[BDIM_Y]; + + if (tidx == 0) { + streamline[0] = point; + } + __syncwarp(WMASK); + + int step_frac; + if (MODEL_T == PTT) { + step_frac = STEP_FRAC; + } else { + step_frac = 1; // STEP_FRAC could be useful in other models + } + + int i; + for(i = 1; i < MAX_SLINE_LEN*step_frac; i++) { + int ndir; + if constexpr (MODEL_T == PROB) { + ndir = get_direction_prob_d( + st, + dataf, + max_angle, + relative_peak_thres, + min_separation_angle, + direction, + dimx, dimy, dimz, dimt, + point, + sphere_vertices, + sphere_edges, + num_edges, + __sh_new_dir + tidy); + } else if constexpr (MODEL_T == PTT) { + ndir = get_direction_ptt_d( + st, + dataf, + max_angle, + step_size, + direction, + ptt_frame, + dimx, dimy, dimz, dimt, + point, + sphere_vertices, + __sh_new_dir + tidy); + } + __syncwarp(WMASK); + direction = __sh_new_dir[tidy]; + __syncwarp(WMASK); + + if (ndir == 0) { + break; + } +#if 0 + if (threadIdx.y == 1 && threadIdx.x == 0) { + printf("tracker: i: %d, direction: (%f, %f, %f)\n", i, direction.x, direction.y, direction.z); + } + //return; +#endif + + point.x += (direction.x / voxel_size.x) * (step_size / step_frac); + point.y += (direction.y / voxel_size.y) * (step_size / step_frac); + point.z += (direction.z / voxel_size.z) * (step_size / step_frac); + + if ((tidx == 0) && ((i % step_frac) == 0)){ + streamline[i/step_frac] = point; +#if 0 + if (threadIdx.y == 1) { + printf("streamline[%d]: %f, %f, %f\n", i, point.x, point.y, point.z); + } +#endif + } + __syncwarp(WMASK); + + tissue_class = check_point_d(tc_threshold, point, dimx, dimy, dimz, metric_map); + +#if 0 + __syncwarp(WMASK); + if (tidx == 0) { + printf("step_size %f\n", step_size); + printf("direction %f, %f, %f\n", direction.x, direction.y, direction.z); + printf("direction addr read %p, slid %i\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y); + printf("voxel_size %f, %f, %f\n", voxel_size.x, voxel_size.y, voxel_size.z); + printf("point %f, %f, %f\n", point.x, point.y, point.z); + printf("tc %i\n", tissue_class); + } + __syncwarp(WMASK); + if (tidx == 15) { + printf("step_size %f l15\n", step_size); + printf("direction %f, %f, %f l15\n", direction.x, direction.y, direction.z); + printf("direction addr read %p, slid %i l15\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y); + printf("voxel_size %f, %f, %f l15\n", voxel_size.x, voxel_size.y, voxel_size.z); + printf("point %f, %f, %f l15\n", point.x, point.y, point.z); + printf("tc %i l15\n", tissue_class); + } + __syncwarp(WMASK); + if (tidx == 31) { + printf("step_size %f l31\n", step_size); + printf("direction %f, %f, %f l31\n", direction.x, direction.y, direction.z); + printf("direction addr read %p, slid %i l31\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y); + printf("voxel_size %f, %f, %f l31\n", voxel_size.x, voxel_size.y, voxel_size.z); + printf("point %f, %f, %f l31\n", point.x, point.y, point.z); + printf("tc %i l31\n", tissue_class); + } + __syncwarp(WMASK); +#endif + + if (tissue_class == ENDPOINT || + tissue_class == INVALIDPOINT || + tissue_class == OUTSIDEIMAGE) { + break; + } + } + nsteps[0] = i/step_frac; + if (((i % step_frac) != 0) && i < step_frac*(MAX_SLINE_LEN - 1)){ + nsteps[0]++; + if (tidx == 0) { + streamline[nsteps[0]] = point; + } + } + + return tissue_class; +} + +template +__global__ void getNumStreamlinesProb_k(const REAL_T max_angle, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle, + const long long rndSeed, + const int nseed, + const REAL3_T *__restrict__ seeds, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + REAL3_T *__restrict__ shDir0, + int *slineOutOff) { + + const int tidx = threadIdx.x; + const int slid = blockIdx.x*blockDim.y + threadIdx.y; + const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x; + + if (slid >= nseed) { + return; + } + + REAL3_T *__restrict__ __shDir = shDir0+slid*dimt; + curandStatePhilox4_32_10_t st; + curand_init(rndSeed, gid, 0, &st); + + int ndir = get_direction_prob_d( + &st, + dataf, + max_angle, + relative_peak_thres, + min_separation_angle, + MAKE_REAL3(0,0,0), + dimx, dimy, dimz, dimt, + seeds[slid], + sphere_vertices, + sphere_edges, + num_edges, + __shDir); + if (tidx == 0) { + slineOutOff[slid] = ndir; + } + + return; +} + +template +__global__ void genStreamlinesMergeProb_k( + const REAL_T max_angle, + const REAL_T tc_threshold, + const REAL_T step_size, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle, + const long long rndSeed, + const int rndOffset, + const int nseed, + const REAL3_T *__restrict__ seeds, + const int dimx, + const int dimy, + const int dimz, + const int dimt, + const REAL_T *__restrict__ dataf, + const REAL_T *__restrict__ metric_map, + const int samplm_nr, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + const int *__restrict__ slineOutOff, + REAL3_T *__restrict__ shDir0, + int *__restrict__ slineSeed, + int *__restrict__ slineLen, + REAL3_T *__restrict__ sline) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + + const int slid = blockIdx.x*blockDim.y + threadIdx.y; + + const int lid = (tidy*BDIM_X + tidx) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + __shared__ REAL_T frame_sh[((MODEL_T == PTT) ? BDIM_Y*18 : 1)]; // Only used by PTT, TODO: way to remove this in other cases + REAL_T* __ptt_frame = frame_sh + tidy*18; + // const int hr_side = dimt-1; + + curandStatePhilox4_32_10_t st; + // const int gbid = blockIdx.y*gridDim.x + blockIdx.x; + const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x; + //curand_init(rndSeed, slid+rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(HR_SIDE/BDIM_X) + curand_init(rndSeed, gid+1, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X) + // elements of the same sequence + if (slid >= nseed) { + return; + } + + REAL3_T seed = seeds[slid]; + + int ndir = slineOutOff[slid+1]-slineOutOff[slid]; +#if 0 + if (threadIdx.y == 0 && threadIdx.x == 0) { + printf("%s: ndir: %d\n", __func__, ndir); + for(int i = 0; i < ndir; i++) { + printf("__shDir[%d][%d]: (%f, %f, %f)\n", + tidy, i, __shDir[tidy][i].x, __shDir[tidy][i].y, __shDir[tidy][i].z); + } + } +#endif + __syncwarp(WMASK); + + int slineOff = slineOutOff[slid]; + + for(int i = 0; i < ndir; i++) { + REAL3_T first_step = shDir0[slid*samplm_nr + i]; + + REAL3_T *__restrict__ currSline = sline + slineOff*MAX_SLINE_LEN*2; + + if (tidx == 0) { + slineSeed[slineOff] = slid; + } +#if 0 + if (threadIdx.y == 0 && threadIdx.x == 0) { + printf("calling trackerF from: (%f, %f, %f)\n", first_step.x, first_step.y, first_step.z); + } +#endif + + if (MODEL_T == PTT) { + if (!init_frame_ptt_d( + &st, + dataf, + max_angle, + step_size, + first_step, + dimx, dimy, dimz, dimt, + seed, + sphere_vertices, + __ptt_frame + )) { // this fails rarely + if (tidx == 0) { + slineLen[slineOff] = 1; + currSline[0] = seed; + } + __syncwarp(WMASK); + slineOff += 1; + continue; + } + } + + + int stepsB; + const int tissue_classB = tracker_d(&st, + max_angle, + tc_threshold, + step_size, + relative_peak_thres, + min_separation_angle, + seed, + MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z), + __ptt_frame, + MAKE_REAL3(1, 1, 1), + dimx, dimy, dimz, dimt, dataf, + metric_map, + samplm_nr, + sphere_vertices, + sphere_edges, + num_edges, + &stepsB, + currSline); + //if (tidx == 0) { + // slineLenB[slineOff] = stepsB; + //} + + // reverse backward sline + for(int j = 0; j < stepsB/2; j += BDIM_X) { + if (j+tidx < stepsB/2) { + const REAL3_T __p = currSline[j+tidx]; + currSline[j+tidx] = currSline[stepsB-1 - (j+tidx)]; + currSline[stepsB-1 - (j+tidx)] = __p; + } + } + + int stepsF; + const int tissue_classF = tracker_d(&st, + max_angle, + tc_threshold, + step_size, + relative_peak_thres, + min_separation_angle, + seed, + first_step, + __ptt_frame + 9, + MAKE_REAL3(1, 1, 1), + dimx, dimy, dimz, dimt, dataf, + metric_map, + samplm_nr, + sphere_vertices, + sphere_edges, + num_edges, + &stepsF, + currSline + stepsB-1); + if (tidx == 0) { + slineLen[slineOff] = stepsB-1+stepsF; + } + + slineOff += 1; +#if 0 + if (threadIdx.y == 0 && threadIdx.x == 0) { + printf("%s: stepsF: %d, tissue_classF: %d\n", __func__, stepsF, tissue_classF); + } + __syncwarp(WMASK); +#endif + //if (/* !return_all || */0 && + // tissue_classF != ENDPOINT && + // tissue_classF != OUTSIDEIMAGE) { + // continue; + //} + //if (/* !return_all || */ 0 && + // tissue_classB != ENDPOINT && + // tissue_classB != OUTSIDEIMAGE) { + // continue; + //} + } + return; +} diff --git a/cuslines/globals.h b/cuslines/cuda_c/globals.h similarity index 90% rename from cuslines/globals.h rename to cuslines/cuda_c/globals.h index 25c2f29..71bcd73 100644 --- a/cuslines/globals.h +++ b/cuslines/cuda_c/globals.h @@ -29,7 +29,7 @@ #ifndef __GLOBALS_H__ #define __GLOBALS_H__ -#define REAL_SIZE 8 +#define REAL_SIZE 4 #if REAL_SIZE == 4 @@ -40,7 +40,7 @@ #define FLOOR floorf #define LOG __logf #define EXP __expf -#define REAL_MAX (FLT_MAX) +#define REAL_MAX __int_as_float(0x7f7fffffU) #define REAL_MIN (-REAL_MAX) #define COS __cosf #define SIN __sinf @@ -58,7 +58,7 @@ #define FLOOR floor #define LOG log #define EXP exp -#define REAL_MAX (DBL_MAX) +#define REAL_MAX __longlong_as_double(0x7fefffffffffffffLL) #define REAL_MIN (-REAL_MAX) #define COS cos #define SIN sin @@ -68,9 +68,9 @@ #define ACOS acos #endif - +// TODO: half this in when WMGMI seeding #define MAX_SLINE_LEN (501) -#define PMF_THRESHOLD_P ((REAL)0.1) +#define PMF_THRESHOLD_P ((REAL)0.05) #define THR_X_BL (64) #define THR_X_SL (32) @@ -85,6 +85,8 @@ #define EXCESS_ALLOC_FACT 2 +#define NORM_EPS ((REAL)1e-8) + #if 0 #define DEBUG #endif @@ -96,4 +98,6 @@ enum ModelType { PTT = 3, }; +enum {OUTSIDEIMAGE, INVALIDPOINT, TRACKPOINT, ENDPOINT}; + #endif diff --git a/cuslines/ptt.cu b/cuslines/cuda_c/ptt.cu similarity index 52% rename from cuslines/ptt.cu rename to cuslines/cuda_c/ptt.cu index b36e747..5684272 100644 --- a/cuslines/ptt.cu +++ b/cuslines/cuda_c/ptt.cu @@ -1,18 +1,19 @@ template -__device__ void norm3_d(REAL_T *num, int fail_ind) { +__device__ __forceinline__ void norm3_d(REAL_T *num, int fail_ind) { const REAL_T scale = SQRT(num[0] * num[0] + num[1] * num[1] + num[2] * num[2]); - if (scale != 0) { + if (scale > NORM_EPS) { num[0] /= scale; num[1] /= scale; num[2] /= scale; } else { + num[0] = num[1] = num[2] = 0; num[fail_ind] = 1.0; // this can happen randomly during propogation, though is exceedingly rare } } template -__device__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, const REAL_T *src2, int fail_ind) { +__device__ __forceinline__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, const REAL_T *src2, int fail_ind) { dest[0] = src1[1] * src2[2] - src1[2] * src2[1]; dest[1] = src1[2] * src2[0] - src1[0] * src2[2]; dest[2] = src1[0] * src2[1] - src1[1] * src2[0]; @@ -20,13 +21,20 @@ __device__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, const REAL_T *src norm3_d(dest, fail_ind); } -template +template __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T *__restrict__ pmf, const int dimx, const int dimy, const int dimz, const int dimt, const REAL3_T *__restrict__ odf_sphere_vertices) { + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + int closest_odf_idx = 0; - REAL_T __max_cos = 0; - for (int ii = 0; ii < dimt; ii++) { + REAL_T __max_cos = REAL_T(0); + + #pragma unroll + for (int ii = tidx; ii < dimt; ii+= BDIM_X) { // TODO: I need to think about better ways of parallelizing this REAL_T cos_sim = FABS( odf_sphere_vertices[ii].x * frame[0] \ + odf_sphere_vertices[ii].y * frame[1] \ @@ -36,15 +44,31 @@ __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T closest_odf_idx = ii; } } + __syncwarp(WMASK); - const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, pos, &__max_cos); + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + const REAL_T __tmp = __shfl_xor_sync(WMASK, __max_cos, i, BDIM_X); + const int __tmp_idx = __shfl_xor_sync(WMASK, closest_odf_idx, i, BDIM_X); + if (__tmp > __max_cos || + (__tmp == __max_cos && __tmp_idx < closest_odf_idx)) { + __max_cos = __tmp; + closest_odf_idx = __tmp_idx; + } + } + __syncwarp(WMASK); #if 0 - printf("inerpolated %f at %f, %f, %f, %i\n", __max_cos, pos.x, pos.y, pos.z, closest_odf_idx); + if (closest_odf_idx >= dimt || closest_odf_idx < 0) { + printf("Error: closest_odf_idx out of bounds: %d (dimt: %d)\n", closest_odf_idx, dimt); + } #endif + // TODO: maybe this should be texture memory, I am not so sure + const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, pos, &__max_cos); + if (rv != 0) { - return -1; + return 0; // No support } else { return __max_cos; } @@ -87,24 +111,57 @@ __device__ void prepare_propagator_d(REAL_T k1, REAL_T k2, REAL_T arclength, } } +template +__device__ void random_normal(curandStatePhilox4_32_10_t *st, REAL_T* probing_frame) { + probing_frame[3] = curand_normal(st); + probing_frame[4] = curand_normal(st); + probing_frame[5] = curand_normal(st); + REAL_T dot = probing_frame[3]*probing_frame[0] + + probing_frame[4]*probing_frame[1] + + probing_frame[5]*probing_frame[2]; + + probing_frame[3] -= dot*probing_frame[0]; + probing_frame[4] -= dot*probing_frame[1]; + probing_frame[5] -= dot*probing_frame[2]; + REAL_T n2 = probing_frame[3]*probing_frame[3] + + probing_frame[4]*probing_frame[4] + + probing_frame[5]*probing_frame[5]; + + if (n2 < NORM_EPS) { + REAL_T abs_x = FABS(probing_frame[0]); + REAL_T abs_y = FABS(probing_frame[1]); + REAL_T abs_z = FABS(probing_frame[2]); + + if (abs_x <= abs_y && abs_x <= abs_z) { + probing_frame[3] = 0.0; + probing_frame[4] = probing_frame[2]; + probing_frame[5] = -probing_frame[1]; + } + else if (abs_y <= abs_z) { + probing_frame[3] = -probing_frame[2]; + probing_frame[4] = 0.0; + probing_frame[5] = probing_frame[0]; + } + else { + probing_frame[3] = probing_frame[1]; + probing_frame[4] = -probing_frame[0]; + probing_frame[5] = 0.0; + } + } +} + template __device__ void get_probing_frame_d(const REAL_T* frame, curandStatePhilox4_32_10_t *st, REAL_T* probing_frame) { if (IS_INIT) { for (int ii = 0; ii < 3; ii++) { // tangent probing_frame[ii] = frame[ii]; } - if ((probing_frame[0] != 0) && (probing_frame[1] != 0)) { // norm - probing_frame[3] = -probing_frame[1]; - probing_frame[4] = probing_frame[0]; - probing_frame[5] = 0; - } else { - probing_frame[3] = 0; - probing_frame[4] = -probing_frame[2]; - probing_frame[5] = 0; - } + norm3_d(probing_frame, 0); - norm3_d(probing_frame, 0); // tangent + random_normal(st, probing_frame); norm3_d(probing_frame + 3, 1); // norm + + // calculate binorm crossnorm3_d(probing_frame + 2*3, probing_frame, probing_frame + 3, 2); // binorm } else { for (int ii = 0; ii < 9; ii++) { @@ -114,7 +171,7 @@ __device__ void get_probing_frame_d(const REAL_T* frame, curandStatePhilox4_32_1 } template -__device__ void propogate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* direc) { +__device__ void propagate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* direc) { REAL_T __tmp[3]; for (int ii = 0; ii < 3; ii++) { @@ -123,50 +180,61 @@ __device__ void propogate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* dir frame[2*3 + ii] = propagator[6]*frame[ii] + propagator[7]*frame[3+ii] + propagator[8]*frame[6+ii]; } -#if 1 norm3_d(__tmp, 0); // normalize tangent crossnorm3_d(frame + 3, frame + 2*3, __tmp, 1); // calc normal crossnorm3_d(frame + 2*3, __tmp, frame + 3, 2); // calculate binorm from tangent, norm -#else - norm3_d(__tmp, 0); // normalize tangent - norm3_d(frame + 2*3, 2); // normalize binorm - crossnorm3_d(frame + 3, frame + 2*3, __tmp, 1); // calculate normal from binorm, tangent -#endif for (int ii = 0; ii < 3; ii++) { frame[ii] = __tmp[ii]; } } -template +template __device__ REAL_T calculate_data_support_d(REAL_T support, const REAL3_T pos, const REAL_T *__restrict__ pmf, const int dimx, const int dimy, const int dimz, const int dimt, const REAL_T probe_step_size, + const REAL_T absolpmf_thresh, const REAL3_T *__restrict__ odf_sphere_vertices, - REAL_T k1, REAL_T k2, - REAL_T* probing_frame) { - REAL_T probing_prop[9]; - REAL_T direc[3]; - REAL3_T probing_pos; - REAL_T fod_amp; - - prepare_propagator_d(k1, k2, probe_step_size, probing_prop); - probing_pos.x = pos.x; - probing_pos.y = pos.y; - probing_pos.z = pos.z; + REAL_T* probing_prop_sh, + REAL_T* direc_sh, + REAL3_T* probing_pos_sh, + REAL_T* k1_sh, REAL_T* k2_sh, + REAL_T* probing_frame_sh) { + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + if (tidx == 0) { + prepare_propagator_d( + *k1_sh, *k2_sh, + probe_step_size, probing_prop_sh); + probing_pos_sh->x = pos.x; + probing_pos_sh->y = pos.y; + probing_pos_sh->z = pos.z; + } + __syncwarp(WMASK); for (int ii = 0; ii < PROBE_QUALITY; ii++) { // we spend about 2/3 of our time in this loop when doing PTT - propogate_frame_d(probing_prop, probing_frame, direc); + if (tidx == 0) { + propagate_frame_d( + probing_prop_sh, + probing_frame_sh, + direc_sh); + + probing_pos_sh->x += direc_sh[0]; + probing_pos_sh->y += direc_sh[1]; + probing_pos_sh->z += direc_sh[2]; + } + __syncwarp(WMASK); - probing_pos.x += direc[0]; - probing_pos.y += direc[1]; - probing_pos.z += direc[2]; + const REAL_T fod_amp = interp4_d( // This is the most expensive call + *probing_pos_sh, probing_frame_sh, pmf, + dimx, dimy, dimz, dimt, + odf_sphere_vertices); - fod_amp = interp4_d(probing_pos, probing_frame, pmf, - dimx, dimy, dimz, dimt, - odf_sphere_vertices); - if (!ALLOW_WEAK_LINK && (fod_amp < PMF_THRESHOLD_P)) { + if (!ALLOW_WEAK_LINK && (fod_amp < absolpmf_thresh)) { return 0; } support += fod_amp; @@ -204,16 +272,37 @@ __device__ int get_direction_ptt_d( const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - REAL_T __shared__ face_cdf_sh[BDIM_Y*DISC_FACE_CNT]; - REAL_T __shared__ vert_pdf_sh[BDIM_Y*DISC_VERT_CNT]; - REAL_T __shared__ first_val_sh[BDIM_Y]; + __shared__ REAL_T face_cdf_sh[BDIM_Y*DISC_FACE_CNT]; + __shared__ REAL_T vert_pdf_sh[BDIM_Y*DISC_VERT_CNT]; + + __shared__ REAL_T probing_frame_sh[BDIM_Y*9]; + __shared__ REAL_T k1_probe_sh[BDIM_Y]; + __shared__ REAL_T k2_probe_sh[BDIM_Y]; + + __shared__ REAL_T probing_prop_sh[BDIM_Y*9]; + __shared__ REAL_T direc_sh[BDIM_Y*3]; + __shared__ REAL3_T probing_pos_sh[BDIM_Y]; REAL_T *__face_cdf_sh = face_cdf_sh + tidy*DISC_FACE_CNT; REAL_T *__vert_pdf_sh = vert_pdf_sh + tidy*DISC_VERT_CNT; - REAL_T *__first_val_sh = first_val_sh + tidy; - const REAL_T max_curvature = SIN(max_angle / 2) / step_size; // bigger numbers means wiggle more - const REAL_T probe_step_size = ((step_size / 2) / (PROBE_QUALITY - 1)); + REAL_T *__probing_frame_sh = probing_frame_sh + tidy*9; + REAL_T *__k1_probe_sh = k1_probe_sh + tidy; + REAL_T *__k2_probe_sh = k2_probe_sh + tidy; + + REAL_T *__probing_prop_sh = probing_prop_sh + tidy*9; + REAL_T *__direc_sh = direc_sh + tidy*3; + REAL3_T *__probing_pos_sh = probing_pos_sh + tidy; + + const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1)); + const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / (step_size / PROBE_FRAC); // This seems to work well + const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d(dimt, pmf, REAL_MIN); + +#if 0 + printf("absolpmf_thresh: %f, max_curvature: %f, probe_step_size: %f\n", absolpmf_thresh, max_curvature, probe_step_size); + printf("max_angle: %f\n", max_angle); + printf("step_size: %f\n", step_size); +#endif REAL_T __tmp; @@ -225,30 +314,32 @@ __device__ int get_direction_ptt_d( __frame_sh[2] = dir.z; } } - if (tidx==0) { - *__first_val_sh = interp4_d(pos, __frame_sh, pmf, - dimx, dimy, dimz, dimt, - odf_sphere_vertices); - } + + const REAL_T first_val = interp4_d( + pos, __frame_sh, pmf, + dimx, dimy, dimz, dimt, + odf_sphere_vertices); __syncwarp(WMASK); // Calculate __vert_pdf_sh - REAL_T probing_frame[9]; - REAL_T k1_probe, k2_probe; - bool support_found = 0; - for (int ii = tidx; ii < DISC_VERT_CNT; ii += BDIM_X) { - k1_probe = DISC_VERT[ii*2] * max_curvature; - k2_probe = DISC_VERT[ii*2+1] * max_curvature; - - get_probing_frame_d(__frame_sh, st, probing_frame); + bool support_found = false; + for (int ii = 0; ii < DISC_VERT_CNT; ii++) { + if (tidx == 0) { + *__k1_probe_sh = DISC_VERT[ii*2] * max_curvature; + *__k2_probe_sh = DISC_VERT[ii*2+1] * max_curvature; + get_probing_frame_d(__frame_sh, st, __probing_frame_sh); + } + __syncwarp(WMASK); - const REAL_T this_support = calculate_data_support_d( - *__first_val_sh, + const REAL_T this_support = calculate_data_support_d( + first_val, pos, pmf, dimx, dimy, dimz, dimt, probe_step_size, + absolpmf_thresh, odf_sphere_vertices, - k1_probe, k2_probe, - probing_frame); + __probing_prop_sh, __direc_sh, __probing_pos_sh, + __k1_probe_sh, __k2_probe_sh, + __probing_frame_sh); #if 0 if (threadIdx.y == 1 && ii == 0) { @@ -256,15 +347,18 @@ __device__ int get_direction_ptt_d( } #endif - if (this_support < NORM_MIN_SUPPORT) { - __vert_pdf_sh[ii] = 0; + if (this_support < PROBE_QUALITY * absolpmf_thresh) { + if (tidx == 0) { + __vert_pdf_sh[ii] = 0; + } } else { - __vert_pdf_sh[ii] = this_support; + if (tidx == 0) { + __vert_pdf_sh[ii] = this_support; + } support_found = 1; } } - const int __msk = __ballot_sync(WMASK, support_found); - if (__msk == 0) { + if (support_found == 0) { return 0; } @@ -323,82 +417,71 @@ __device__ int get_direction_ptt_d( #endif // Sample random valid faces randomly - REAL_T r1, r2; - for (int ii = 0; ii < TRIES_PER_REJECTION_SAMPLING / BDIM_X; ii++) { - r1 = curand_uniform(st); - r2 = curand_uniform(st); - if (r1 + r2 > 1) { - r1 = 1 - r1; - r2 = 1 - r2; - } - - __tmp = curand_uniform(st) * last_cdf; - int jj; - for (jj = 0; jj < DISC_FACE_CNT; jj++) { - if (__face_cdf_sh[jj] >= __tmp) - break; - } - - const REAL_T vx0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2]; - const REAL_T vx1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2]; - const REAL_T vx2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2]; - - const REAL_T vy0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2 + 1]; - const REAL_T vy1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2 + 1]; - const REAL_T vy2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2 + 1]; - - k1_probe = vx0 + r1 * (vx1 - vx0) + r2 * (vx2 - vx0); - k2_probe = vy0 + r1 * (vy1 - vy0) + r2 * (vy2 - vy0); - - get_probing_frame_d(__frame_sh, st, probing_frame); - - const REAL_T this_support = calculate_data_support_d(*__first_val_sh, - pos, pmf, dimx, dimy, dimz, dimt, - probe_step_size, - odf_sphere_vertices, - k1_probe, k2_probe, - probing_frame); + for (int ii = 0; ii < TRIES_PER_REJECTION_SAMPLING; ii++) { + if (tidx == 0) { + REAL_T r1 = curand_uniform(st); + REAL_T r2 = curand_uniform(st); + if (r1 + r2 > 1) { + r1 = 1 - r1; + r2 = 1 - r2; + } + + __tmp = curand_uniform(st) * last_cdf; + int jj; + for (jj = 0; jj < DISC_FACE_CNT; jj++) { // TODO: parallelize this + if (__face_cdf_sh[jj] >= __tmp) + break; + } + + const REAL_T vx0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2]; + const REAL_T vx1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2]; + const REAL_T vx2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2]; + const REAL_T vy0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2 + 1]; + const REAL_T vy1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2 + 1]; + const REAL_T vy2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2 + 1]; + *__k1_probe_sh = vx0 + r1 * (vx1 - vx0) + r2 * (vx2 - vx0); + *__k2_probe_sh = vy0 + r1 * (vy1 - vy0) + r2 * (vy2 - vy0); + get_probing_frame_d(__frame_sh, st, __probing_frame_sh); + } + __syncwarp(WMASK); + + const REAL_T this_support = calculate_data_support_d( + first_val, + pos, pmf, dimx, dimy, dimz, dimt, + probe_step_size, + absolpmf_thresh, + odf_sphere_vertices, + __probing_prop_sh, __direc_sh, __probing_pos_sh, + __k1_probe_sh, __k2_probe_sh, + __probing_frame_sh); __syncwarp(WMASK); - int winning_lane = -1; // -1 indicates nobody won - int __msk = __ballot_sync(WMASK, this_support >= NORM_MIN_SUPPORT); - if (__msk != 0) { - REAL_T group_max_support = this_support; - #pragma unroll - for(int j = 1; j < PROBABILISTIC_GROUP_SZ; j *= 2) { - __tmp = __shfl_xor_sync(WMASK, group_max_support, j, BDIM_X); - group_max_support = MAX(group_max_support, __tmp); - } - __msk &= __ballot_sync(WMASK, this_support == group_max_support); - winning_lane = __ffs(__msk) - 1; + if (this_support < PROBE_QUALITY * absolpmf_thresh) { + continue; } - if (winning_lane != -1) { - if (tidx == winning_lane) { -#if 0 - if (threadIdx.y == 1) { - printf("winning k1 %f, k2 %f, cdf %f, cdf_idx %i", k1_probe, k2_probe, __tmp, jj); - } -#endif - if (IS_INIT) { - dirs[0] = dir; - } else { - REAL_T __prop[9]; - REAL_T __dir[3]; - prepare_propagator_d(k1_probe, k2_probe, step_size/STEP_FRAC, __prop); - propogate_frame_d(__prop, probing_frame, __dir); - norm3_d(__dir, 0); // this will be scaled by the generic stepping code - dirs[0] = (REAL3_T) {__dir[0], __dir[1], __dir[2]}; - } - for (int jj = 0; jj < 9; jj++) { - __frame_sh[jj] = probing_frame[jj]; - } + if (tidx == 0) { + if (IS_INIT) { + dirs[0] = dir; + } else { + // propagate, but only 1/STEP_FRAC of a step + prepare_propagator_d( + *__k1_probe_sh, *__k2_probe_sh, + step_size/STEP_FRAC, __probing_prop_sh); + get_probing_frame_d<0>(__frame_sh, st, __probing_frame_sh); + propagate_frame_d(__probing_prop_sh, __probing_frame_sh, __direc_sh); + norm3_d(__direc_sh, 0); // this will be scaled by the generic stepping code + dirs[0] = MAKE_REAL3(__direc_sh[0], __direc_sh[1], __direc_sh[2]); } - __syncwarp(WMASK); - return 1; } + + if (tidx < 9) { + __frame_sh[tidx] = __probing_frame_sh[tidx]; + } + __syncwarp(WMASK); + return 1; } return 0; } diff --git a/cuslines/ptt.cuh b/cuslines/cuda_c/ptt.cuh similarity index 71% rename from cuslines/ptt.cuh rename to cuslines/cuda_c/ptt.cuh index d8986b5..9126250 100644 --- a/cuslines/ptt.cuh +++ b/cuslines/cuda_c/ptt.cuh @@ -4,18 +4,13 @@ #include "disc.h" #include "globals.h" -#define STEP_FRAC 20 // divides output step size (usually 0.5) into this many internal steps -#define PROBE_FRAC 2 // divides output step size (usually 0.5) to find probe length -#define PROBE_QUALITY 4 -#define SAMPLING_QUALITY 4 // can be 2-7 -#define PROBABILISTIC_BIAS 1 // 1 looks good. can be 0-log_2(N_WARPS) (typically 0-5). 0 is fully probabilistic, 4 is close to deterministic. -#define ALLOW_WEAK_LINK 1 -#define TRIES_PER_REJECTION_SAMPLING 1024 -#define DEFAULT_PTT_MINDATASUPPORT 0.05 -#define K_SMALL 0.0001 - -#define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT * PROBE_QUALITY) -#define PROBABILISTIC_GROUP_SZ POW2(PROBABILISTIC_BIAS) +#define STEP_FRAC (20) // divides output step size (usually 0.5) into this many internal steps +#define PROBE_FRAC (2) // divides output step size (usually 0.5) to find probe length +#define PROBE_QUALITY (4) // Number of probing steps +#define SAMPLING_QUALITY (2) // can be 2-7 +#define ALLOW_WEAK_LINK (0) +#define TRIES_PER_REJECTION_SAMPLING (1024) +#define K_SMALL ((REAL) 0.0001) #if SAMPLING_QUALITY == 2 #define DISC_VERT_CNT DISC_2_VERT_CNT diff --git a/cuslines/cuda_c/tracking_helpers.cu b/cuslines/cuda_c/tracking_helpers.cu new file mode 100644 index 0000000..21d5f67 --- /dev/null +++ b/cuslines/cuda_c/tracking_helpers.cu @@ -0,0 +1,290 @@ + +using namespace cuwsort; + +template +__device__ REAL_T interpolation_helper_d(const REAL_T*__restrict__ dataf, const REAL_T wgh[3][2], const long long coo[3][2], int dimy, int dimz, int dimt, int t) { + REAL_T __tmp = 0; + #pragma unroll + for (int i = 0; i < 2; i++) { + #pragma unroll + for (int j = 0; j < 2; j++) { + #pragma unroll + for (int k = 0; k < 2; k++) { + __tmp += wgh[0][i] * wgh[1][j] * wgh[2][k] * + dataf[coo[0][i] * dimy * dimz * dimt + + coo[1][j] * dimz * dimt + + coo[2][k] * dimt + + t]; + } + } + } + return __tmp; +} + +template +__device__ int trilinear_interp_d(const int dimx, + const int dimy, + const int dimz, + const int dimt, + int dimt_idx, // If -1, get all + const REAL_T *__restrict__ dataf, + const REAL3_T point, + REAL_T *__restrict__ __vox_data) { + const REAL_T HALF = static_cast(0.5); + + // all thr compute the same here + if (point.x < -HALF || point.x+HALF >= dimx || + point.y < -HALF || point.y+HALF >= dimy || + point.z < -HALF || point.z+HALF >= dimz) { + return -1; + } + + long long coo[3][2]; + REAL_T wgh[3][2]; // could use just one... + + const REAL_T ONE = static_cast(1.0); + + const REAL3_T fl = MAKE_REAL3(FLOOR(point.x), + FLOOR(point.y), + FLOOR(point.z)); + + wgh[0][1] = point.x - fl.x; + wgh[0][0] = ONE-wgh[0][1]; + coo[0][0] = MAX(0, fl.x); + coo[0][1] = MIN(dimx-1, coo[0][0]+1); + + wgh[1][1] = point.y - fl.y; + wgh[1][0] = ONE-wgh[1][1]; + coo[1][0] = MAX(0, fl.y); + coo[1][1] = MIN(dimy-1, coo[1][0]+1); + + wgh[2][1] = point.z - fl.z; + wgh[2][0] = ONE-wgh[2][1]; + coo[2][0] = MAX(0, fl.z); + coo[2][1] = MIN(dimz-1, coo[2][0]+1); + + if (dimt_idx == -1) { + for (int t = threadIdx.x; t < dimt; t += BDIM_X) { + __vox_data[t] = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, t); + } + } else { + *__vox_data = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, dimt_idx); + } + + // if (threadIdx.x == 0) { + // printf("point: %f, %f, %f\n", point.x, point.y, point.z); + // printf("dimt_idx: %d\n", dimt_idx); + // // for(int i = 0; i < dimt; i++) { + // // printf("__vox_data[%d]: %f\n", i, __vox_data[i]); + // // } + // } + return 0; +} + +template +__device__ int check_point_d(const REAL_T tc_threshold, + const REAL3_T point, + const int dimx, + const int dimy, + const int dimz, + const REAL_T *__restrict__ metric_map) { + + const int tidy = threadIdx.y; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + __shared__ REAL_T __shInterpOut[BDIM_Y]; + + const int rv = trilinear_interp_d(dimx, dimy, dimz, 1, 0, metric_map, point, __shInterpOut+tidy); + __syncwarp(WMASK); +#if 0 + if (threadIdx.y == 1 && threadIdx.x == 0) { + printf("__shInterpOut[tidy]: %f, TC_THRESHOLD_P: %f\n", __shInterpOut[tidy], TC_THRESHOLD_P); + } +#endif + if (rv != 0) { + return OUTSIDEIMAGE; + } + //return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT; + return (__shInterpOut[tidy] > tc_threshold) ? TRACKPOINT : ENDPOINT; +} + +template +__device__ int peak_directions_d(const REAL_T *__restrict__ odf, + REAL3_T *__restrict__ dirs, + const REAL3_T *__restrict__ sphere_vertices, + const int2 *__restrict__ sphere_edges, + const int num_edges, + int samplm_nr, + int *__restrict__ __shInd, + const REAL_T relative_peak_thres, + const REAL_T min_separation_angle) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + const unsigned int lmask = (1 << lid)-1; + +// __shared__ int __shInd[BDIM_Y][SAMPLM_NR]; + + #pragma unroll + for(int j = tidx; j < samplm_nr; j += BDIM_X) { + __shInd[j] = 0; + } + + REAL_T odf_min = min_d(samplm_nr, odf, REAL_MAX); + odf_min = MAX(0, odf_min); + + __syncwarp(WMASK); + + // local_maxima() + _compare_neighbors() + // selecting only the indices corrisponding to maxima Ms + // such that M-odf_min >= relative_peak_thres + //#pragma unroll + for(int j = 0; j < num_edges; j += BDIM_X) { + if (j+tidx < num_edges) { + const int u_ind = sphere_edges[j+tidx].x; + const int v_ind = sphere_edges[j+tidx].y; + + //if (u_ind >= NUM_EDGES || v_ind >= NUM_EDGES) { ERROR; } + + const REAL_T u_val = odf[u_ind]; + const REAL_T v_val = odf[v_ind]; + + //if (u_val != u_val || v_val != v_val) { ERROR_NANs; } + + // only check that they are not equal + //if (u_val != v_val) { + // __shInd[tidy][u_val < v_val ? u_ind : v_ind] = -1; // benign race conditions... + //} + if (u_val < v_val) { + atomicExch(__shInd+u_ind, -1); + atomicOr( __shInd+v_ind, 1); + } else if (v_val < u_val) { + atomicExch(__shInd+v_ind, -1); + atomicOr( __shInd+u_ind, 1); + } + } + } + __syncwarp(WMASK); + + const REAL_T compThres = relative_peak_thres*max_mask_transl_d(samplm_nr, __shInd, odf, -odf_min, REAL_MIN); +#if 1 +/* + if (!tidy && !tidx) { + for(int j = 0; j < SAMPLM_NR; j++) { + printf("local_max[%d]: %d (%f)\n", j, __shInd[tidy][j], odf[j]); + } + printf("maxMax with offset %f: %f\n", -odf_min, compThres); + } + __syncwarp(WMASK); +*/ + // compact indices of positive values to the right + int n = 0; + + for(int j = 0; j < samplm_nr; j += BDIM_X) { + + const int __v = (j+tidx < samplm_nr) ? __shInd[j+tidx] : -1; + const int __keep = (__v > 0) && ((odf[j+tidx]-odf_min) >= compThres); + const int __msk = __ballot_sync(WMASK, __keep); + +//__syncwarp(WMASK); // unnecessary + if (__keep) { + const int myoff = __popc(__msk & lmask); + __shInd[n + myoff] = j+tidx; + } + n += __popc(__msk); +//__syncwarp(WMASK); // should be unnecessary + } + __syncwarp(WMASK); +/* + if (!tidy && !tidx) { + for(int j = 0; j < n; j++) { + printf("local_max_compact[%d]: %d\n", j, __shInd[tidy][j]); + } + } + __syncwarp(WMASK); +*/ + + // sort local maxima indices + if (n < BDIM_X) { + REAL_T k = REAL_MIN; + int v = 0; + if (tidx < n) { + v = __shInd[tidx]; + k = odf[v]; + } + warp_sort<32, BDIM_X, WSORT_DIR_DEC>(&k, &v); + __syncwarp(WMASK); + + if (tidx < n) { + __shInd[tidx] = v; + } + } else { + // ERROR !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + } + __syncwarp(WMASK); + + // __shInd[tidy][] contains the indices in odf correspoding to + // normalized maxima NOT sorted! + if (n != 0) { + // remove_similar_vertices() + // PRELIMINARY INEFFICIENT, SINGLE TH, IMPLEMENTATION + if (tidx == 0) { + const REAL_T cos_similarity = COS(min_separation_angle); + + dirs[0] = sphere_vertices[__shInd[0]]; + + int k = 1; + for(int i = 1; i < n; i++) { + + const REAL3_T abc = sphere_vertices[__shInd[i]]; + + int j = 0; + for(; j < k; j++) { + const REAL_T cos = FABS(abc.x*dirs[j].x+ + abc.y*dirs[j].y+ + abc.z*dirs[j].z); + if (cos > cos_similarity) { + break; + } + } + if (j == k) { + dirs[k++] = abc; + } + } + n = k; + } + n = __shfl_sync(WMASK, n, 0, BDIM_X); + __syncwarp(WMASK); + + } +/* + if (!tidy && !tidx) { + for(int j = 0; j < n; j++) { + printf("local_max_compact_uniq[%d]: %d\n", j, __shInd[tidy][j]); + } + } + __syncwarp(WMASK); +*/ +#else + const int indMax = max_d(__shInd[tidy], -1); + if (indMax != -1) { + __ret = MAKE_REAL3(sphere_vertices[indMax][0], + sphere_vertices[indMax][1], + sphere_vertices[indMax][2]); + } +#endif + return n; +} diff --git a/cuslines/cuda_c/utils.cu b/cuslines/cuda_c/utils.cu new file mode 100644 index 0000000..8c5afe1 --- /dev/null +++ b/cuslines/cuda_c/utils.cu @@ -0,0 +1,138 @@ +template +__device__ VAL_T max_d(const int n, const VAL_T *__restrict__ src, const VAL_T minVal) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + VAL_T __m = minVal; + + for(int i = tidx; i < n; i += BDIM_X) { + __m = MAX(__m, src[i]); + } + + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); + __m = MAX(__m, __tmp); + } + + return __m; +} + +template +__device__ VAL_T max_mask_transl_d(const int n, + const LEN_T *__restrict__ srcMsk, + const VAL_T *__restrict__ srcVal, + const VAL_T offset, + const VAL_T minVal) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + VAL_T __m = minVal; + + for(int i = tidx; i < n; i += BDIM_X) { + const LEN_T sel = srcMsk[i]; + if (sel > 0) { + __m = MAX(__m, srcVal[i]+offset); + } + } + + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); + __m = MAX(__m, __tmp); + } + + return __m; +} + +template +__device__ VAL_T min_d(const int n, const VAL_T *__restrict__ src, const VAL_T maxVal) { + + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + + VAL_T __m = maxVal; + + for(int i = tidx; i < n; i += BDIM_X) { + __m = MIN(__m, src[i]); + } + + #pragma unroll + for(int i = BDIM_X/2; i; i /= 2) { + const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); + __m = MIN(__m, __tmp); + } + + return __m; +} + +template +__device__ void prefix_sum_sh_d(REAL_T *num_sh, int __len) { + const int tidx = threadIdx.x; + + const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; + const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); + +#if 0 + // for debugging + __syncwarp(WMASK); + if (tidx == 0) { + for (int j = 1; j < __len; j++) { + num_sh[j] += num_sh[j-1]; + } + } + __syncwarp(WMASK); +#else + for (int j = 0; j < __len; j += BDIM_X) { + if ((tidx == 0) && (j != 0)) { + num_sh[j] += num_sh[j-1]; + } + __syncwarp(WMASK); + + REAL_T __t_pmf; + if (j+tidx < __len) { + __t_pmf = num_sh[j+tidx]; + } + for (int i = 1; i < BDIM_X; i*=2) { + REAL_T __tmp = __shfl_up_sync(WMASK, __t_pmf, i, BDIM_X); + if ((tidx >= i) && (j+tidx < __len)) { + __t_pmf += __tmp; + } + } + if (j+tidx < __len) { + num_sh[j+tidx] = __t_pmf; + } + __syncwarp(WMASK); + } +#endif +} + +template +__device__ void printArrayAlways(const char *name, int ncol, int n, REAL_T *arr) { + printf("%s:\n", name); + + for(int j = 0; j < n; j++) { + if ((j%ncol)==0) printf("\n"); + printf("%10.8f ", arr[j]); + } + printf("\n"); +} + +template +__device__ void printArray(const char *name, int ncol, int n, REAL_T *arr) { + if (!threadIdx.x && !threadIdx.y && !blockIdx.x) { + printArrayAlways(name, ncol, n, arr); + } +} diff --git a/cuslines/cuda_python/__init__.py b/cuslines/cuda_python/__init__.py new file mode 100644 index 0000000..fd05c1e --- /dev/null +++ b/cuslines/cuda_python/__init__.py @@ -0,0 +1,13 @@ +from .cu_tractography import GPUTracker +from .cu_direction_getters import ( + ProbDirectionGetter, + PttDirectionGetter, + BootDirectionGetter, +) + +__all__ = [ + "GPUTracker", + "ProbDirectionGetter", + "PttDirectionGetter", + "BootDirectionGetter", +] diff --git a/cuslines/cuda_python/_globals.py b/cuslines/cuda_python/_globals.py new file mode 100644 index 0000000..c19368e --- /dev/null +++ b/cuslines/cuda_python/_globals.py @@ -0,0 +1,10 @@ +# AUTO-GENERATED FROM globals.h — DO NOT EDIT + +EXCESS_ALLOC_FACT = 2 +MAX_SLINES_PER_SEED = 10 +MAX_SLINE_LEN = 501 +NORM_EPS = 1e-08 +PMF_THRESHOLD_P = 0.05 +REAL_SIZE = 4 +THR_X_BL = 64 +THR_X_SL = 32 diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py new file mode 100644 index 0000000..617f893 --- /dev/null +++ b/cuslines/cuda_python/cu_direction_getters.py @@ -0,0 +1,472 @@ +import numpy as np +from abc import ABC, abstractmethod +import logging +from importlib.resources import files +from time import time + +from dipy.reconst import shm + +from cuda.core import Device, LaunchConfig, Program, launch, ProgramOptions +from cuda.pathfinder import find_nvidia_header_directory +from cuda.cccl import get_include_paths +from cuda.bindings import runtime, driver +from cuda.bindings.runtime import cudaMemcpyKind + +from cuslines.cuda_python.cutils import ( + REAL_SIZE, + REAL_DTYPE, + REAL_DTYPE_AS_STR, + REAL3_DTYPE_AS_STR, + checkCudaErrors, + ModelType, + THR_X_SL, + BLOCK_Y, +) + +logger = logging.getLogger("GPUStreamlines") + + +class GPUDirectionGetter(ABC): + @abstractmethod + def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): + pass + + @abstractmethod + def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): + pass + + def allocate_on_gpu(self, n): + pass + + def deallocate_on_gpu(self, n): + pass + + def compile_program(self, debug: bool = False): + start_time = time() + logger.info("Compiling GPUStreamlines") + + cuslines_cuda = files("cuslines").joinpath("cuda_c") + + if debug: + program_opts = { + "ptxas_options": ["-O0", "-v"], + "device_code_optimize": True, + "debug": True, + "lineinfo": True, + } + else: + program_opts = {"ptxas_options": ["-O3"]} + + program_options = ProgramOptions( + name="cuslines", + use_fast_math=True, + std="c++17", + define_macro="__NVRTC__", + include_path=[ + str(cuslines_cuda), + find_nvidia_header_directory("cudart"), + find_nvidia_header_directory("curand"), + get_include_paths().libcudacxx, + ], + **program_opts, + ) + + # Here we assume all devices are the same, + # so we compile once for any current device. + # I think this is reasonable + dev = Device() + dev.set_current() + cuda_path = cuslines_cuda.joinpath("generate_streamlines_cuda.cu") + with open(cuda_path, "r") as f: + prog = Program(f.read(), code_type="c++", options=program_options) + self.module = prog.compile( + "cubin", + name_expressions=( + self.getnum_kernel_name, + self.genstreamlines_kernel_name, + ), + ) + logger.info( + "GPUStreamlines compiled successfully in %.2f seconds", time() - start_time + ) + + +class BootDirectionGetter(GPUDirectionGetter): + def __init__( + self, + model_type: str, + min_signal: float, + H: np.ndarray, + R: np.ndarray, + delta_b: np.ndarray, + delta_q: np.ndarray, + sampling_matrix: np.ndarray, + b0s_mask: np.ndarray, + ): + if model_type.upper() == "OPDT": + self.model_type = int(ModelType.OPDT) + elif model_type.upper() == "CSA": + self.model_type = int(ModelType.CSA) + else: + raise ValueError( + f"Invalid model_type {model_type}, must be one of 'OPDT', 'CSA'" + ) + + checkCudaErrors(driver.cuInit(0)) + + self.H = np.ascontiguousarray(H, dtype=REAL_DTYPE) + self.R = np.ascontiguousarray(R, dtype=REAL_DTYPE) + self.delta_b = np.ascontiguousarray(delta_b, dtype=REAL_DTYPE) + self.delta_q = np.ascontiguousarray(delta_q, dtype=REAL_DTYPE) + self.delta_nr = int(delta_b.shape[0]) + self.min_signal = REAL_DTYPE(min_signal) + self.sampling_matrix = np.ascontiguousarray(sampling_matrix, dtype=REAL_DTYPE) + self.b0s_mask = np.ascontiguousarray(b0s_mask, dtype=np.int32) + + self.H_d = [] + self.R_d = [] + self.delta_b_d = [] + self.delta_q_d = [] + self.b0s_mask_d = [] + self.sampling_matrix_d = [] + + self.getnum_kernel_name = f"getNumStreamlinesBoot_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMergeBoot_k<{THR_X_SL},{BLOCK_Y},{model_type.upper()},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.compile_program() + + @classmethod + def from_dipy_opdt( + cls, + gtab, + sphere, + sh_order_max=6, + full_basis=False, + sh_lambda=0.006, + min_signal=1, + ): + sampling_matrix, _, _ = shm.real_sh_descoteaux( + sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=False + ) + + model = shm.OpdtModel( + gtab, sh_order_max=sh_order_max, smooth=sh_lambda, min_signal=min_signal + ) + fit_matrix = model._fit_matrix + delta_b, delta_q = fit_matrix + + b0s_mask = gtab.b0s_mask + dwi_mask = ~b0s_mask + x, y, z = model.gtab.gradients[dwi_mask].T + _, theta, phi = shm.cart2sphere(x, y, z) + B, _, _ = shm.real_sym_sh_basis(sh_order_max, theta, phi) + H = shm.hat(B) + R = shm.lcr_matrix(H) + + return cls( + model_type="OPDT", + min_signal=min_signal, + H=H, + R=R, + delta_b=delta_b, + delta_q=delta_q, + sampling_matrix=sampling_matrix, + b0s_mask=gtab.b0s_mask, + ) + + @classmethod + def from_dipy_csa( + cls, + gtab, + sphere, + sh_order_max=6, + full_basis=False, + sh_lambda=0.006, + min_signal=1, + ): + sampling_matrix, _, _ = shm.real_sh_descoteaux( + sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=False + ) + + model = shm.CsaOdfModel( + gtab, sh_order_max=sh_order_max, smooth=sh_lambda, min_signal=min_signal + ) + fit_matrix = model._fit_matrix + delta_b = fit_matrix + delta_q = fit_matrix + + b0s_mask = gtab.b0s_mask + dwi_mask = ~b0s_mask + x, y, z = model.gtab.gradients[dwi_mask].T + _, theta, phi = shm.cart2sphere(x, y, z) + B, _, _ = shm.real_sym_sh_basis(sh_order_max, theta, phi) + H = shm.hat(B) + R = shm.lcr_matrix(H) + + return cls( + model_type="CSA", + min_signal=min_signal, + H=H, + R=R, + delta_b=delta_b, + delta_q=delta_q, + sampling_matrix=sampling_matrix, + b0s_mask=gtab.b0s_mask, + ) + + def allocate_on_gpu(self, n): + self.H_d.append(checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.H.size))) + self.R_d.append(checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.R.size))) + self.delta_b_d.append( + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.delta_b.size)) + ) + self.delta_q_d.append( + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.delta_q.size)) + ) + self.b0s_mask_d.append( + checkCudaErrors(runtime.cudaMalloc(np.int32().nbytes * self.b0s_mask.size)) + ) + self.sampling_matrix_d.append( + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.sampling_matrix.size)) + ) + + checkCudaErrors( + runtime.cudaMemcpy( + self.H_d[n], + self.H.ctypes.data, + REAL_SIZE * self.H.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.R_d[n], + self.R.ctypes.data, + REAL_SIZE * self.R.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.delta_b_d[n], + self.delta_b.ctypes.data, + REAL_SIZE * self.delta_b.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.delta_q_d[n], + self.delta_q.ctypes.data, + REAL_SIZE * self.delta_q.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.b0s_mask_d[n], + self.b0s_mask.ctypes.data, + np.int32().nbytes * self.b0s_mask.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.sampling_matrix_d[n], + self.sampling_matrix.ctypes.data, + REAL_SIZE * self.sampling_matrix.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + + def deallocate_on_gpu(self, n): + if self.H_d[n]: + checkCudaErrors(runtime.cudaFree(self.H_d[n])) + if self.R_d[n]: + checkCudaErrors(runtime.cudaFree(self.R_d[n])) + if self.delta_b_d[n]: + checkCudaErrors(runtime.cudaFree(self.delta_b_d[n])) + if self.delta_q_d[n]: + checkCudaErrors(runtime.cudaFree(self.delta_q_d[n])) + if self.b0s_mask_d[n]: + checkCudaErrors(runtime.cudaFree(self.b0s_mask_d[n])) + if self.sampling_matrix_d[n]: + checkCudaErrors(runtime.cudaFree(self.sampling_matrix_d[n])) + + def _shared_mem_bytes(self, sp): + return ( + REAL_SIZE + * BLOCK_Y + * 2 + * ( + sp.gpu_tracker.n32dimt + + max(sp.gpu_tracker.n32dimt, sp.gpu_tracker.samplm_nr) + ) + + np.int32().nbytes * BLOCK_Y * sp.gpu_tracker.samplm_nr + ) + + def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): + ker = self.module.get_kernel(self.getnum_kernel_name) + shared_memory = self._shared_mem_bytes(sp) + config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) + + launch( + sp.gpu_tracker.streams[n], + config, + ker, + self.model_type, + sp.gpu_tracker.max_angle, + self.min_signal, + sp.gpu_tracker.relative_peak_thresh, + sp.gpu_tracker.min_separation_angle, + sp.gpu_tracker.rng_seed, + nseeds_gpu, + sp.seeds_d[n], + sp.gpu_tracker.dimx, + sp.gpu_tracker.dimy, + sp.gpu_tracker.dimz, + sp.gpu_tracker.dimt, + sp.gpu_tracker.dataf_d[n], + self.H_d[n], + self.R_d[n], + self.delta_nr, + self.delta_b_d[n], + self.delta_q_d[n], + self.b0s_mask_d[n], + sp.gpu_tracker.samplm_nr, + self.sampling_matrix_d[n], + sp.gpu_tracker.sphere_vertices_d[n], + sp.gpu_tracker.sphere_edges_d[n], + sp.gpu_tracker.nedges, + sp.shDirTemp0_d[n], + sp.slinesOffs_d[n], + ) + + def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): + ker = self.module.get_kernel(self.genstreamlines_kernel_name) + shared_memory = self._shared_mem_bytes(sp) + config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) + + launch( + sp.gpu_tracker.streams[n], + config, + ker, + sp.gpu_tracker.max_angle, + sp.gpu_tracker.tc_threshold, + sp.gpu_tracker.step_size, + sp.gpu_tracker.relative_peak_thresh, + sp.gpu_tracker.min_separation_angle, + sp.gpu_tracker.rng_seed, + sp.gpu_tracker.rng_offset + n * nseeds_gpu, + nseeds_gpu, + sp.seeds_d[n], + sp.gpu_tracker.dimx, + sp.gpu_tracker.dimy, + sp.gpu_tracker.dimz, + sp.gpu_tracker.dimt, + sp.gpu_tracker.dataf_d[n], + sp.gpu_tracker.metric_map_d[n], + sp.gpu_tracker.samplm_nr, + sp.gpu_tracker.sphere_vertices_d[n], + sp.gpu_tracker.sphere_edges_d[n], + sp.gpu_tracker.nedges, + self.min_signal, + self.delta_nr, + self.H_d[n], + self.R_d[n], + self.delta_b_d[n], + self.delta_q_d[n], + self.sampling_matrix_d[n], + self.b0s_mask_d[n], + sp.slinesOffs_d[n], + sp.shDirTemp0_d[n], + sp.slineSeed_d[n], + sp.slineLen_d[n], + sp.sline_d[n], + ) + + +class ProbDirectionGetter(GPUDirectionGetter): + def __init__(self): + checkCudaErrors(driver.cuInit(0)) + self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PROB,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.compile_program() + + def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp): + ker = self.module.get_kernel(self.getnum_kernel_name) + shared_memory = ( + REAL_SIZE * BLOCK_Y * sp.gpu_tracker.n32dimt + + np.int32().nbytes * BLOCK_Y * sp.gpu_tracker.n32dimt + ) + config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) + + launch( + sp.gpu_tracker.streams[n], + config, + ker, + sp.gpu_tracker.max_angle, + sp.gpu_tracker.relative_peak_thresh, + sp.gpu_tracker.min_separation_angle, + sp.gpu_tracker.rng_seed, + nseeds_gpu, + sp.seeds_d[n], + sp.gpu_tracker.dimx, + sp.gpu_tracker.dimy, + sp.gpu_tracker.dimz, + sp.gpu_tracker.dimt, + sp.gpu_tracker.dataf_d[n], + sp.gpu_tracker.sphere_vertices_d[n], + sp.gpu_tracker.sphere_edges_d[n], + sp.gpu_tracker.nedges, + sp.shDirTemp0_d[n], + sp.slinesOffs_d[n], + ) + + def _shared_mem_bytes(self, sp): + return REAL_SIZE * BLOCK_Y * sp.gpu_tracker.n32dimt + + def generateStreamlines(self, n, nseeds_gpu, block, grid, sp): + ker = self.module.get_kernel(self.genstreamlines_kernel_name) + shared_memory = self._shared_mem_bytes(sp) + config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory) + + launch( + sp.gpu_tracker.streams[n], + config, + ker, + sp.gpu_tracker.max_angle, + sp.gpu_tracker.tc_threshold, + sp.gpu_tracker.step_size, + sp.gpu_tracker.relative_peak_thresh, + sp.gpu_tracker.min_separation_angle, + sp.gpu_tracker.rng_seed, + sp.gpu_tracker.rng_offset + n * nseeds_gpu, + nseeds_gpu, + sp.seeds_d[n], + sp.gpu_tracker.dimx, + sp.gpu_tracker.dimy, + sp.gpu_tracker.dimz, + sp.gpu_tracker.dimt, + sp.gpu_tracker.dataf_d[n], + sp.gpu_tracker.metric_map_d[n], + sp.gpu_tracker.samplm_nr, + sp.gpu_tracker.sphere_vertices_d[n], + sp.gpu_tracker.sphere_edges_d[n], + sp.gpu_tracker.nedges, + sp.slinesOffs_d[n], + sp.shDirTemp0_d[n], + sp.slineSeed_d[n], + sp.slineLen_d[n], + sp.sline_d[n], + ) + + +class PttDirectionGetter(ProbDirectionGetter): + def __init__(self): + checkCudaErrors(driver.cuInit(0)) + self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PTT,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>" + self.compile_program() + + def _shared_mem_bytes(self, sp): + return 0 diff --git a/cuslines/cuda_python/cu_propagate_seeds.py b/cuslines/cuda_python/cu_propagate_seeds.py new file mode 100644 index 0000000..b8991e5 --- /dev/null +++ b/cuslines/cuda_python/cu_propagate_seeds.py @@ -0,0 +1,259 @@ +import numpy as np +import gc +from cuda.bindings import runtime +from cuda.bindings.runtime import cudaMemcpyKind + +from nibabel.streamlines.array_sequence import ArraySequence, MEGABYTE +import logging + +from cuslines.cuda_python.cutils import ( + REAL_SIZE, + REAL_DTYPE, + REAL3_DTYPE, + MAX_SLINE_LEN, + EXCESS_ALLOC_FACT, + THR_X_SL, + THR_X_BL, + DEV_PTR, + div_up, + checkCudaErrors, +) + + +logger = logging.getLogger("GPUStreamlines") + + +class SeedBatchPropagator: + def __init__(self, gpu_tracker): + self.gpu_tracker = gpu_tracker + self.ngpus = gpu_tracker.ngpus + + self.nSlines_old = np.zeros(self.ngpus, dtype=np.int32) + self.nSlines = np.zeros(self.ngpus, dtype=np.int32) + self.slines = np.zeros(self.ngpus, dtype=np.ndarray) + self.sline_lens = np.zeros(self.ngpus, dtype=np.ndarray) + + self.seeds_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.slineSeed_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.slinesOffs_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.shDirTemp0_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.slineLen_d = np.empty(self.ngpus, dtype=DEV_PTR) + self.sline_d = np.empty(self.ngpus, dtype=DEV_PTR) + + def _switch_device(self, n): + checkCudaErrors(runtime.cudaSetDevice(n)) + + nseeds_gpu = min( + self.nseeds_per_gpu, max(0, self.nseeds - n * self.nseeds_per_gpu) + ) + block = (THR_X_SL, THR_X_BL // THR_X_SL, 1) + grid = (div_up(nseeds_gpu, THR_X_BL // THR_X_SL), 1, 1) + + return nseeds_gpu, block, grid + + def _get_sl_buffer_size(self, n): + return REAL_SIZE * 2 * 3 * MAX_SLINE_LEN * self.nSlines[n].astype(np.int64) + + def _allocate_seed_memory(self, seeds): + # Move seeds to GPU + for ii in range(self.ngpus): + nseeds_gpu, _, _ = self._switch_device(ii) + self.seeds_d[ii] = checkCudaErrors( + runtime.cudaMalloc(REAL_SIZE * 3 * nseeds_gpu) + ) + seeds_host = np.ascontiguousarray( + seeds[ii * self.nseeds_per_gpu : ii * self.nseeds_per_gpu + nseeds_gpu], + dtype=REAL_DTYPE, + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.seeds_d[ii], + seeds_host.ctypes.data, + REAL_SIZE * 3 * nseeds_gpu, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + + for ii in range(self.ngpus): + nseeds_gpu, block, grid = self._switch_device(ii) + # Streamline offsets + self.slinesOffs_d[ii] = checkCudaErrors( + runtime.cudaMalloc(np.int32().nbytes * (nseeds_gpu + 1)) + ) + # Initial directions from each seed + self.shDirTemp0_d[ii] = checkCudaErrors( + runtime.cudaMalloc( + REAL3_DTYPE.itemsize + * self.gpu_tracker.samplm_nr + * grid[0] + * block[1] + ) + ) + + def _cumsum_offsets( + self, + ): # TODO: performance: do this on device? not crucial for performance now + for ii in range(self.ngpus): + nseeds_gpu, _, _ = self._switch_device(ii) + if nseeds_gpu == 0: + self.nSlines[ii] = 0 + continue + + slinesOffs_h = np.empty(nseeds_gpu + 1, dtype=np.int32) + checkCudaErrors( + runtime.cudaMemcpy( + slinesOffs_h.ctypes.data, + self.slinesOffs_d[ii], + slinesOffs_h.nbytes, + cudaMemcpyKind.cudaMemcpyDeviceToHost, + ) + ) + + __pval = slinesOffs_h[0] + slinesOffs_h[0] = 0 + for jj in range(1, nseeds_gpu + 1): + __cval = slinesOffs_h[jj] + slinesOffs_h[jj] = slinesOffs_h[jj - 1] + __pval + __pval = __cval + self.nSlines[ii] = int(slinesOffs_h[nseeds_gpu]) + + checkCudaErrors( + runtime.cudaMemcpy( + self.slinesOffs_d[ii], + slinesOffs_h.ctypes.data, + slinesOffs_h.nbytes, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + + def _allocate_tracking_memory(self): + for ii in range(self.ngpus): + self._switch_device(ii) + + self.slineSeed_d[ii] = checkCudaErrors( + runtime.cudaMalloc(self.nSlines[ii] * np.int32().nbytes) + ) + checkCudaErrors( + runtime.cudaMemset( + self.slineSeed_d[ii], -1, self.nSlines[ii] * np.int32().nbytes + ) + ) + + if self.nSlines[ii] > EXCESS_ALLOC_FACT * self.nSlines_old[ii]: + self.slines[ii] = 0 + self.sline_lens[ii] = 0 + gc.collect() + + buffer_size = self._get_sl_buffer_size(ii) + logger.debug(f"Streamline buffer size: {buffer_size}") + + if not self.slines[ii]: + self.slines[ii] = np.empty( + (EXCESS_ALLOC_FACT * self.nSlines[ii], MAX_SLINE_LEN * 2, 3), + dtype=REAL_DTYPE, + ) + if not self.sline_lens[ii]: + self.sline_lens[ii] = np.empty( + EXCESS_ALLOC_FACT * self.nSlines[ii], dtype=np.int32 + ) + + for ii in range(self.ngpus): + self._switch_device(ii) + buffer_size = self._get_sl_buffer_size(ii) + + self.slineLen_d[ii] = checkCudaErrors( + runtime.cudaMalloc(np.int32().nbytes * self.nSlines[ii]) + ) + self.sline_d[ii] = checkCudaErrors(runtime.cudaMalloc(buffer_size)) + + def _cleanup(self): + for ii in range(self.ngpus): + self._switch_device(ii) + checkCudaErrors( + runtime.cudaMemcpyAsync( + self.slines[ii], + self.sline_d[ii], + self._get_sl_buffer_size(ii), + cudaMemcpyKind.cudaMemcpyDeviceToHost, + self.gpu_tracker.streams[ii], + ) + ) + checkCudaErrors( + runtime.cudaMemcpyAsync( + self.sline_lens[ii], + self.slineLen_d[ii], + np.int32().nbytes * self.nSlines[ii], + cudaMemcpyKind.cudaMemcpyDeviceToHost, + self.gpu_tracker.streams[ii], + ) + ) + + for ii in range(self.ngpus): + self._switch_device(ii) + checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii])) + checkCudaErrors(runtime.cudaFree(self.seeds_d[ii])) + checkCudaErrors(runtime.cudaFree(self.slineSeed_d[ii])) + checkCudaErrors(runtime.cudaFree(self.slinesOffs_d[ii])) + checkCudaErrors(runtime.cudaFree(self.shDirTemp0_d[ii])) + checkCudaErrors(runtime.cudaFree(self.slineLen_d[ii])) + checkCudaErrors(runtime.cudaFree(self.sline_d[ii])) + + self.nSlines_old = self.nSlines + self.gpu_tracker.rng_offset += self.nseeds + + # TODO: performance: better queuing/batching of seeds, + # if more performance needed, + # given exponential nature of streamlines + # May be better to do in cuda code directly + def propagate(self, seeds): + self.nseeds = len(seeds) + self.nseeds_per_gpu = ( + self.nseeds + self.gpu_tracker.ngpus - 1 + ) // self.gpu_tracker.ngpus + + self._allocate_seed_memory(seeds) + + for ii in range(self.ngpus): + nseeds_gpu, block, grid = self._switch_device(ii) + if nseeds_gpu == 0: + continue + self.gpu_tracker.dg.getNumStreamlines(ii, nseeds_gpu, block, grid, self) + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii])) + + self._cumsum_offsets() + self._allocate_tracking_memory() + + for ii in range(self.ngpus): + nseeds_gpu, block, grid = self._switch_device(ii) + if nseeds_gpu == 0: + continue + self.gpu_tracker.dg.generateStreamlines(ii, nseeds_gpu, block, grid, self) + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii])) + + self._cleanup() + + def get_buffer_size(self): + buffer_size = 0 + for ii in range(self.ngpus): + lens = self.sline_lens[ii] + for jj in range(self.nSlines[ii]): + buffer_size += lens[jj] * 3 * REAL_SIZE + return buffer_size + + def as_generator(self): + def _yield_slines(): + for ii in range(self.ngpus): + this_sls = self.slines[ii] + this_len = self.sline_lens[ii] + + for jj in range(self.nSlines[ii]): + npts = this_len[jj] + + yield np.asarray(this_sls[jj], dtype=REAL_DTYPE)[:npts] + + return _yield_slines() + + def as_array_sequence(self): + return ArraySequence(self.as_generator(), self.get_buffer_size() // MEGABYTE) diff --git a/cuslines/cuda_python/cu_tractography.py b/cuslines/cuda_python/cu_tractography.py new file mode 100644 index 0000000..9c24cd7 --- /dev/null +++ b/cuslines/cuda_python/cu_tractography.py @@ -0,0 +1,315 @@ +from cuda.bindings import runtime +from cuda.bindings.runtime import cudaMemcpyKind +# TODO: consider cuda core over cuda bindings + +import numpy as np +from tqdm import tqdm +import logging +from math import radians + +from cuslines.cuda_python.cutils import ( + REAL_SIZE, + REAL_DTYPE, + checkCudaErrors, +) +from cuslines.cuda_python.cu_direction_getters import ( + GPUDirectionGetter, + BootDirectionGetter, +) +from cuslines.cuda_python.cu_propagate_seeds import SeedBatchPropagator + +from trx.trx_file_memmap import TrxFile + +from nibabel.streamlines.tractogram import Tractogram +from nibabel.streamlines.array_sequence import ArraySequence, MEGABYTE + +from dipy.io.stateful_tractogram import Space, StatefulTractogram + +logger = logging.getLogger("GPUStreamlines") + +# TODO performance: +# ACT +# SCIL streamline reduction onboard GPU +# Remove small/long streamlines on gpu + + +class GPUTracker: + def __init__( + self, + dg: GPUDirectionGetter, + dataf: np.ndarray, + stop_map: np.ndarray, + stop_theshold: float, + sphere_vertices: np.ndarray, + sphere_edges: np.ndarray, + max_angle: float = radians(60), + step_size: float = 0.5, + relative_peak_thresh: float = 0.25, + min_separation_angle: float = radians(45), + ngpus: int = 1, + rng_seed: int = 0, + rng_offset: int = 0, + chunk_size: int = 100000, + ): + """ + Initialize GPUTracker with necessary data. + + Parameters + ---------- + dg : GPUDirectionGetter + Direction getter to use for tracking from + cuslines.cu_direction_getters + dataf : np.ndarray + 4D numpy array with ODFs for prob/ptt, diffusion data if doing + bootstrapping. + stop_map : np.ndarray + 3D numpy array with stopping metric (e.g., GFA, FA) + stop_theshold : float + Threshold for stopping metric (e.g., 0.2) + sphere_vertices : np.ndarray + Vertices of the sphere used for direction sampling. + sphere_edges : np.ndarray + Edges of the sphere used for direction sampling. + max_angle : float, optional + Maximum angle (in radians) between steps + default: radians(60) + step_size : float, optional + Step size for tracking + default: 0.5 + relative_peak_thresh : float, optional + Relative peak threshold for direction selection + default: 0.25 + min_separation_angle : float, optional + Minimum separation angle (in radians) between peaks + default: radians(45) + ngpus : int, optional + Number of GPUs to use + default: 1 + rng_seed : int, optional + Seed for random number generator + default: 0 + rng_offset : int, optional + Offset for random number generator + default: 0 + """ + self.dataf = np.ascontiguousarray(dataf, dtype=REAL_DTYPE) + self.metric_map = np.ascontiguousarray(stop_map, dtype=REAL_DTYPE) + self.sphere_vertices = np.ascontiguousarray(sphere_vertices, dtype=REAL_DTYPE) + self.sphere_edges = np.ascontiguousarray(sphere_edges, dtype=np.int32) + + self.dimx, self.dimy, self.dimz, self.dimt = dataf.shape + self.nedges = int(sphere_edges.shape[0]) + if isinstance(dg, BootDirectionGetter): + self.samplm_nr = int(dg.sampling_matrix.shape[0]) + else: + self.samplm_nr = self.dimt + self.n32dimt = ((self.dimt + 31) // 32) * 32 + + self.dg = dg + self.max_angle = REAL_DTYPE(max_angle) + self.tc_threshold = REAL_DTYPE(stop_theshold) + self.step_size = REAL_DTYPE(step_size) + self.relative_peak_thresh = REAL_DTYPE(relative_peak_thresh) + self.min_separation_angle = REAL_DTYPE(min_separation_angle) + + self.ngpus = int(ngpus) + self.rng_seed = int(rng_seed) + self.rng_offset = int(rng_offset) + self.chunk_size = int(chunk_size) + + avail = checkCudaErrors(runtime.cudaGetDeviceCount()) + if self.ngpus > avail: + raise RuntimeError( + f"Requested {self.ngpus} GPUs but only {avail} available" + ) + + logger.info("Creating GPUTracker with %d GPUs...", self.ngpus) + + self.dataf_d = [] + self.metric_map_d = [] + self.sphere_vertices_d = [] + self.sphere_edges_d = [] + + self.streams = [] + self.managed_data = [] + + self.seed_propagator = SeedBatchPropagator(gpu_tracker=self) + self._allocated = False + + def __enter__(self): + self._allocate() + return self + + def _allocate(self): + if self._allocated: + return + + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(ii)) + self.streams.append( + checkCudaErrors( + runtime.cudaStreamCreateWithFlags(runtime.cudaStreamNonBlocking) + ) + ) + + for ii in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(ii)) + + # TODO: performance: dataf could be managed or texture memory instead? + self.dataf_d.append( + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.dataf.size)) + ) + self.metric_map_d.append( + checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.metric_map.size)) + ) + self.sphere_vertices_d.append( + checkCudaErrors( + runtime.cudaMalloc(REAL_SIZE * self.sphere_vertices.size) + ) + ) + self.sphere_edges_d.append( + checkCudaErrors( + runtime.cudaMalloc(np.int32().nbytes * self.sphere_edges.size) + ) + ) + + checkCudaErrors( + runtime.cudaMemcpy( + self.dataf_d[ii], + self.dataf.ctypes.data, + REAL_SIZE * self.dataf.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.metric_map_d[ii], + self.metric_map.ctypes.data, + REAL_SIZE * self.metric_map.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.sphere_vertices_d[ii], + self.sphere_vertices.ctypes.data, + REAL_SIZE * self.sphere_vertices.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + checkCudaErrors( + runtime.cudaMemcpy( + self.sphere_edges_d[ii], + self.sphere_edges.ctypes.data, + np.int32().nbytes * self.sphere_edges.size, + cudaMemcpyKind.cudaMemcpyHostToDevice, + ) + ) + self.dg.allocate_on_gpu(ii) + + self._allocated = True + + def __exit__(self, exc_type, exc, tb): + logger.info("Destroying GPUTracker and freeing GPU memory...") + + for n in range(self.ngpus): + checkCudaErrors(runtime.cudaSetDevice(n)) + if self.dataf_d[n]: + checkCudaErrors(runtime.cudaFree(self.dataf_d[n])) + if self.metric_map_d[n]: + checkCudaErrors(runtime.cudaFree(self.metric_map_d[n])) + if self.sphere_vertices_d[n]: + checkCudaErrors(runtime.cudaFree(self.sphere_vertices_d[n])) + if self.sphere_edges_d[n]: + checkCudaErrors(runtime.cudaFree(self.sphere_edges_d[n])) + self.dg.deallocate_on_gpu(n) + + checkCudaErrors(runtime.cudaStreamDestroy(self.streams[n])) + return False + + def _divide_chunks(self, seeds): + global_chunk_sz = self.chunk_size * self.ngpus + nchunks = (seeds.shape[0] + global_chunk_sz - 1) // global_chunk_sz + return global_chunk_sz, nchunks + + def generate_sft(self, seeds, ref_img): + global_chunk_sz, nchunks = self._divide_chunks(seeds) + buffer_size = 0 + generators = [] + + with tqdm(total=seeds.shape[0]) as pbar: + for idx in range(nchunks): + self.seed_propagator.propagate( + seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz] + ) + buffer_size += self.seed_propagator.get_buffer_size() + generators.append(self.seed_propagator.as_generator()) + pbar.update( + seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz].shape[0] + ) + array_sequence = ArraySequence( + (item for gen in generators for item in gen), buffer_size // MEGABYTE + ) + return StatefulTractogram(array_sequence, ref_img, Space.VOX) + + # TODO: performance: consider a way to just output in VOX space directly + def generate_trx(self, seeds, ref_img): + global_chunk_sz, nchunks = self._divide_chunks(seeds) + + # Will resize by a factor of 2 if these are exceeded + sl_len_guess = 100 + sl_per_seed_guess = 3 + n_sls_guess = sl_per_seed_guess * seeds.shape[0] + + # trx files use memory mapping + trx_file = TrxFile( + reference=ref_img, + nb_streamlines=n_sls_guess, + nb_vertices=n_sls_guess * sl_len_guess, + ) + trx_file.streamlines._offsets = trx_file.streamlines._offsets.astype(np.uint64) + offsets_idx = 0 + sls_data_idx = 0 + + with tqdm(total=seeds.shape[0]) as pbar: + for idx in range(int(nchunks)): + self.seed_propagator.propagate( + seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz] + ) + tractogram = Tractogram( + self.seed_propagator.as_array_sequence(), + affine_to_rasmm=ref_img.affine, + ) + tractogram.to_world() + sls = tractogram.streamlines + + new_offsets_idx = offsets_idx + len(sls._offsets) + new_sls_data_idx = sls_data_idx + len(sls._data) + + if ( + new_offsets_idx > trx_file.header["NB_STREAMLINES"] + or new_sls_data_idx > trx_file.header["NB_VERTICES"] + ): + logger.info("TRX resizing...") + trx_file.resize( + nb_streamlines=new_offsets_idx * 2, + nb_vertices=new_sls_data_idx * 2, + ) + + # TRX uses memmaps here + trx_file.streamlines._data[sls_data_idx:new_sls_data_idx] = sls._data + trx_file.streamlines._offsets[offsets_idx:new_offsets_idx] = ( + sls_data_idx + sls._offsets + ) + trx_file.streamlines._lengths[offsets_idx:new_offsets_idx] = ( + sls._lengths + ) + + offsets_idx = new_offsets_idx + sls_data_idx = new_sls_data_idx + pbar.update( + seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz].shape[0] + ) + trx_file.resize() + + return trx_file diff --git a/cuslines/cuda_python/cutils.py b/cuslines/cuda_python/cutils.py new file mode 100644 index 0000000..db4115a --- /dev/null +++ b/cuslines/cuda_python/cutils.py @@ -0,0 +1,64 @@ +from cuda.bindings import driver, nvrtc + +import numpy as np + +from enum import IntEnum + +from cuslines.cuda_python._globals import * + + +class ModelType(IntEnum): + OPDT = 0 + CSA = 1 + PROB = 2 + PTT = 3 + + +REAL3_SIZE = 3 * REAL_SIZE +if REAL_SIZE == 4: + REAL_DTYPE = np.float32 + REAL3_DTYPE = np.dtype( + [("x", np.float32), ("y", np.float32), ("z", np.float32)], align=True + ) + REAL_DTYPE_AS_STR = "float" + REAL3_DTYPE_AS_STR = "float3" +elif REAL_SIZE == 8: + REAL_DTYPE = np.float64 + REAL3_DTYPE = np.dtype( + [("x", np.float64), ("y", np.float64), ("z", np.float64)], align=True + ) + REAL_DTYPE_AS_STR = "double" + REAL3_DTYPE_AS_STR = "double3" +else: + raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h") +BLOCK_Y = THR_X_BL // THR_X_SL +DEV_PTR = object + + +def _cudaGetErrorEnum(error): + if isinstance(error, driver.CUresult): + err, name = driver.cuGetErrorName(error) + return name if err == driver.CUresult.CUDA_SUCCESS else "" + elif isinstance(error, nvrtc.nvrtcResult): + return nvrtc.nvrtcGetErrorString(error)[1] + else: + raise RuntimeError("Unknown error type: {}".format(error)) + + +def checkCudaErrors(result): + if result[0].value: + raise RuntimeError( + "CUDA error code={}({})".format( + result[0].value, _cudaGetErrorEnum(result[0]) + ) + ) + if len(result) == 1: + return None + elif len(result) == 2: + return result[1] + else: + return result[1:] + + +def div_up(a, b): + return (a + b - 1) // b diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp deleted file mode 100644 index 4e8bc30..0000000 --- a/cuslines/cuslines.cpp +++ /dev/null @@ -1,359 +0,0 @@ -/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include - -#include -#include -#include -namespace py = pybind11; - -#include - -// #define USE_NVTX - -#include "globals.h" -#include "cudamacro.h" -#include "generate_streamlines_cuda.h" - -using np_array = py::array_t; -using np_array_int = py::array_t; - -using np_array_cast = py::array_t; -using np_array_int_cast = py::array_t; - -// Handle to cleanup returned host allocations when associated Python object is destroyed -template -py::capsule cleanup(T* ptr) { - return py::capsule(ptr, [](void *f) { - T *g = reinterpret_cast(f); - delete [] g; - }); -} - -class GPUTracker { - public: - GPUTracker(ModelType model_type, - double max_angle, - double min_signal, - double tc_threshold, - double step_size, - double relative_peak_thresh, - double min_separation_angle, - np_array_cast dataf, - np_array_cast H, - np_array_cast R, - np_array_cast delta_b, - np_array_cast delta_q, - np_array_int_cast b0s_mask, - np_array_cast metric_map, - np_array_cast sampling_matrix, - np_array_cast sphere_vertices, - np_array_int_cast sphere_edges, - int ngpus = 1, - int rng_seed = 0, - int rng_offset = 0) { - - // Get info structs from numpy objects - auto dataf_info = dataf.request(); - auto H_info = H.request(); - auto R_info = R.request(); - auto delta_b_info = delta_b.request(); - auto delta_q_info = delta_q.request(); - auto b0s_mask_info = b0s_mask.request(); - auto metric_map_info = metric_map.request(); - auto sampling_matrix_info = sampling_matrix.request(); - auto sphere_vertices_info = sphere_vertices.request(); - auto sphere_edges_info = sphere_edges.request(); - - dimx_ = dataf_info.shape[0]; - dimy_ = dataf_info.shape[1]; - dimz_ = dataf_info.shape[2]; - dimt_ = dataf_info.shape[3]; - nedges_ = sphere_edges_info.shape[0]; - - delta_nr_ = delta_b_info.shape[0]; - samplm_nr_ = sampling_matrix_info.shape[0]; - -// No longer needed -#if 0 - // Error checking for template parameters. - // TODO: Need to make kernel more general. - if (delta_b_info.shape[0] != 28 || - sampling_matrix_info.shape[0] != 181 || - dataf_info.shape[3] > 160) { - std::cout << delta_b_info.shape[0] << " " << sampling_matrix_info.shape[0] << " " << dataf_info.shape[3] << std::endl; - throw std::invalid_argument("Input data dimensions not currently supported."); - } -#endif - - // Get number of GPUs - int ngpus_avail; - CHECK_CUDA(cudaGetDeviceCount(&ngpus_avail)); - if (ngpus > ngpus_avail) { - throw std::runtime_error("Requested to use more GPUs than available on system."); - } - - std::cerr << "Creating GPUTracker with " << ngpus << " GPUs..." << std::endl; - ngpus_ = ngpus; - - model_type_ = model_type; - max_angle_ = max_angle; - min_signal_ = min_signal; - tc_threshold_ = tc_threshold; - step_size_ = step_size; - relative_peak_thresh_ = relative_peak_thresh, - min_separation_angle_ = min_separation_angle, - - // Allocate/copy constant problem data on GPUs - dataf_d.resize(ngpus_, nullptr); - H_d.resize(ngpus_, nullptr); - R_d.resize(ngpus_, nullptr); - delta_b_d.resize(ngpus_, nullptr); - delta_q_d.resize(ngpus_, nullptr); - b0s_mask_d.resize(ngpus_, nullptr); - metric_map_d.resize(ngpus_, nullptr); - sampling_matrix_d.resize(ngpus_, nullptr); - sphere_vertices_d.resize(ngpus_, nullptr); - sphere_edges_d.resize(ngpus_, nullptr); - - //#pragma omp parallel for - for (int n = 0; n < ngpus_; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size)); - CHECK_CUDA(cudaMemAdvise(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n)); - CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size)); - CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size)); - CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size)); - CHECK_CUDA(cudaMalloc(&delta_q_d[n], sizeof(*delta_q_d[n]) * delta_q_info.size)); - CHECK_CUDA(cudaMalloc(&b0s_mask_d[n], sizeof(*b0s_mask_d[n]) * b0s_mask_info.size)); - CHECK_CUDA(cudaMalloc(&metric_map_d[n], sizeof(*metric_map_d[n]) * metric_map_info.size)); - CHECK_CUDA(cudaMalloc(&sampling_matrix_d[n], sizeof(*sampling_matrix_d[n]) * sampling_matrix_info.size)); - CHECK_CUDA(cudaMalloc(&sphere_vertices_d[n], sizeof(*sphere_vertices_d[n]) * sphere_vertices_info.size)); - CHECK_CUDA(cudaMalloc(&sphere_edges_d[n], sizeof(*sphere_edges_d[n]) * sphere_edges_info.size)); - - CHECK_CUDA(cudaMemcpy(dataf_d[n], dataf_info.ptr, sizeof(*dataf_d[n]) * dataf_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(H_d[n], H_info.ptr, sizeof(*H_d[n]) * H_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(R_d[n], R_info.ptr, sizeof(*R_d[n]) * R_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(delta_b_d[n], delta_b_info.ptr, sizeof(*delta_b_d[n]) * delta_b_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(delta_q_d[n], delta_q_info.ptr, sizeof(*delta_q_d[n]) * delta_q_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(b0s_mask_d[n], b0s_mask_info.ptr, sizeof(*b0s_mask_d[n]) * b0s_mask_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(metric_map_d[n], metric_map_info.ptr, sizeof(*metric_map_d[n]) * metric_map_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(sampling_matrix_d[n], sampling_matrix_info.ptr, sizeof(*sampling_matrix_d[n]) * sampling_matrix_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(sphere_vertices_d[n], sphere_vertices_info.ptr, sizeof(*sphere_vertices_d[n]) * sphere_vertices_info.size, cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(sphere_edges_d[n], sphere_edges_info.ptr, sizeof(*sphere_edges_d[n]) * sphere_edges_info.size, cudaMemcpyHostToDevice)); - } - - rng_seed_ = rng_seed; - rng_offset_ = rng_offset; - nSlines_old_.resize(ngpus_, 0); - slines_.resize(ngpus_, nullptr); - slinesLen_.resize(ngpus_, nullptr); - - streams_.resize(ngpus_); - for (int n = 0; n < ngpus_; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaStreamCreateWithFlags(&streams_[n], cudaStreamNonBlocking)); - } - - } - - ~GPUTracker() { - std::cerr << "Destroy GPUTracker..." << std::endl; - for (int n = 0; n < ngpus_; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - if (dataf_d[n]) CHECK_CUDA(cudaFree(dataf_d[n])); - if (H_d[n]) CHECK_CUDA(cudaFree(H_d[n])); - if (R_d[n]) CHECK_CUDA(cudaFree(R_d[n])); - if (delta_b_d[n]) CHECK_CUDA(cudaFree(delta_b_d[n])); - if (delta_q_d[n]) CHECK_CUDA(cudaFree(delta_q_d[n])); - if (b0s_mask_d[n]) CHECK_CUDA(cudaFree(b0s_mask_d[n])); - if (metric_map_d[n]) CHECK_CUDA(cudaFree(metric_map_d[n])); - if (sampling_matrix_d[n]) CHECK_CUDA(cudaFree(sampling_matrix_d[n])); - if (sphere_vertices_d[n]) CHECK_CUDA(cudaFree(sphere_vertices_d[n])); - if (sphere_edges_d[n]) CHECK_CUDA(cudaFree(sphere_edges_d[n])); - - if (slines_[n]) CHECK_CUDA(cudaFreeHost(slines_[n])); - if (slinesLen_[n]) CHECK_CUDA(cudaFreeHost(slinesLen_[n])); - - CHECK_CUDA(cudaStreamDestroy(streams_[n])); - } - } - - std::vector> generate_streamlines(np_array seeds) { - - auto seeds_info = seeds.request(); - int nseeds = seeds_info.shape[0]; - - std::vector seeds_d(ngpus_, nullptr); - int nseeds_per_gpu = (nseeds + ngpus_ - 1) / ngpus_; - - //#pragma omp parallel for - for (int n = 0; n < ngpus_; ++n) { - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaMalloc(&seeds_d[n], sizeof(*seeds_d[n]) * 3 * nseeds_gpu)); - CHECK_CUDA(cudaMemcpy(seeds_d[n], reinterpret_cast(seeds_info.ptr) + 3*n*nseeds_per_gpu, sizeof(*seeds_d[n]) * 3 * nseeds_gpu, cudaMemcpyHostToDevice)); - } - - std::vector nSlines(ngpus_); - - // Call GPU routine - generate_streamlines_cuda_mgpu(model_type_, max_angle_, min_signal_, tc_threshold_, step_size_, - relative_peak_thresh_, min_separation_angle_, - nseeds, seeds_d, - dimx_, dimy_, dimz_, dimt_, - dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d, - sphere_vertices_d, sphere_edges_d, nedges_, - slines_, slinesLen_, nSlines, nSlines_old_, rng_seed_, rng_offset_, ngpus_, - streams_); - - nSlines_old_ = nSlines; //store number of slines for next set of seeds - - // Update rng_offset for next set of seeds - rng_offset_ += nseeds; - - int nSlines_total = 0; - for (int n = 0; n < ngpus_; ++n) { - CHECK_CUDA(cudaFree(seeds_d[n])); - nSlines_total += nSlines[n]; - } - - std::vector> slines_list; - slines_list.reserve(nSlines_total); - for (int n = 0; n < ngpus_; ++n) { - for (int i = 0; i < nSlines[n]; ++i) { - REAL* sl = new REAL[slinesLen_[n][i]*3]; - std::memcpy(sl, slines_[n] + i*3*2*MAX_SLINE_LEN, slinesLen_[n][i]*3*sizeof(*sl)); - auto sl_arr = py::array_t({slinesLen_[n][i], 3}, // shape - {3*sizeof(REAL), sizeof(REAL)}, // strides - sl, - cleanup(sl)); - slines_list.push_back(sl_arr); - } - } - - return slines_list; - - } - - void dump_streamlines(std::string output_prefix, std::string voxel_order, - np_array_int roi_shape, np_array voxel_size, np_array vox_to_ras) { - - auto roi_shape_info = roi_shape.request(); - auto voxel_size_info = voxel_size.request(); - auto vox_to_ras_info = vox_to_ras.request(); - - START_RANGE("filewrite", 0); - - //#pragma omp parallel for - for (int n = 0; n < ngpus_; ++n) { - std::stringstream ss; - ss << output_prefix << "_" << std::to_string(n) << ".trk"; - write_trk(ss.str().c_str(), reinterpret_cast(roi_shape_info.ptr), reinterpret_cast(voxel_size_info.ptr), - voxel_order.c_str(), reinterpret_cast(vox_to_ras_info.ptr), nSlines_old_[n], slinesLen_[n], - reinterpret_cast(slines_[n])); - } - - END_RANGE; - } - - private: - int ngpus_; - int rng_seed_; - int rng_offset_; - int dimx_, dimy_, dimz_, dimt_; - int nedges_; - int delta_nr_, samplm_nr_; - - ModelType model_type_; - double max_angle_; - double tc_threshold_; - double min_signal_; - double step_size_; - double relative_peak_thresh_; - double min_separation_angle_; - - std::vector nSlines_old_; - std::vector slines_; - std::vector slinesLen_; - - std::vector dataf_d; - std::vector H_d; - std::vector R_d; - std::vector delta_b_d; - std::vector delta_q_d; - std::vector b0s_mask_d; - std::vector metric_map_d; - std::vector sampling_matrix_d; - std::vector sphere_vertices_d; - std::vector sphere_edges_d; - - std::vector streams_; - -}; - - -PYBIND11_MODULE(cuslines, m) { - m.attr("MAX_SLINE_LEN") = py::int_(MAX_SLINE_LEN); - m.attr("REAL_SIZE") = py::int_(REAL_SIZE); - - py::enum_(m, "ModelType") - .value("OPDT", OPDT) - .value("CSA", CSA) - .value("PROB", PROB) - .value("PTT", PTT); - - py::class_(m, "GPUTracker") - .def(py::init(), - py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg().noconvert(), py::arg().noconvert(), - py::arg("ngpus") = 1, py::arg("rng_seed") = 0, - py::arg("rng_offset") = 0) - - .def("generate_streamlines", &GPUTracker::generate_streamlines, - "Generates streamline for dipy test case.") - - .def("dump_streamlines", &GPUTracker::dump_streamlines, - "Dump streamlines to file."); -} - diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu deleted file mode 100644 index 374c7c1..0000000 --- a/cuslines/generate_streamlines_cuda.cu +++ /dev/null @@ -1,2420 +0,0 @@ -/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cudamacro.h" /* for time() */ -#include "globals.h" - -#include - -#include "cuwsort.cuh" -#include "ptt.cuh" - -#include "utils.cu" -#include "ptt.cu" - -#define MAX_NUM_DIR (128) - -#define NTHR_GEN (128) - -#define MAX_DIMS (8) -#define MAX_STR_LEN (256) - -using namespace cuwsort; - -//#define USE_FIXED_PERMUTATION -#ifdef USE_FIXED_PERMUTATION -//__device__ const int fixedPerm[] = {44, 47, 53, 0, 3, 3, 39, 9, 19, 21, 50, 36, 23, -// 6, 24, 24, 12, 1, 38, 39, 23, 46, 24, 17, 37, 25, -// 13, 8, 9, 20, 51, 16, 51, 5, 15, 47, 0, 18, 35, -// 24, 49, 51, 29, 19, 19, 14, 39, 32, 1, 9, 32, 31, -// 10, 52, 23}; -__device__ const int fixedPerm[] = { - 47, 117, 67, 103, 9, 21, 36, 87, 70, 88, 140, 58, 39, 87, 88, 81, 25, 77, - 72, 9, 148, 115, 79, 82, 99, 29, 147, 147, 142, 32, 9, 127, 32, 31, 114, 28, - 34, 128, 128, 53, 133, 38, 17, 79, 132, 105, 42, 31, 120, 1, 65, 57, 35, 102, - 119, 11, 82, 91, 128, 142, 99, 53, 140, 121, 84, 68, 6, 47, 127, 131, 100, 78, - 143, 148, 23, 141, 117, 85, 48, 49, 69, 95, 94, 0, 113, 36, 48, 93, 131, 98, - 42, 112, 149, 127, 0, 138, 114, 43, 127, 23, 130, 121, 98, 62, 123, 82, 148, 50, - 14, 41, 58, 36, 10, 86, 43, 104, 11, 2, 51, 80, 32, 128, 38, 19, 42, 115, - 77, 30, 24, 125, 2, 3, 94, 107, 13, 112, 40, 72, 19, 95, 72, 67, 61, 14, - 96, 4, 139, 86, 121, 109}; -#endif - -template -__device__ void ndotp_d(const int N, - const int M, - const VAL_T *__restrict__ srcV, - const VAL_T *__restrict__ srcM, - VAL_T *__restrict__ dstV) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - //#pragma unroll - for(int i = 0; i < N; i++) { - - VAL_T __tmp = 0; - - //#pragma unroll - for(int j = 0; j < M; j += BDIM_X) { - if (j+tidx < M) { - __tmp += srcV[j+tidx]*srcM[i*M + j+tidx]; - } - } - #pragma unroll - for(int j = BDIM_X/2; j; j /= 2) { -#if 0 - __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); -#else - __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); -#endif - } - // values could be held by BDIM_X threads and written - // together every BDIM_X iterations... - - if (tidx == 0) { - dstV[i] = __tmp; - } - } - return; -} - - -template -__device__ void ndotp_log_opdt_d(const int N, - const int M, - const VAL_T *__restrict__ srcV, - const VAL_T *__restrict__ srcM, - VAL_T *__restrict__ dstV) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const VAL_T ONEP5 = static_cast(1.5); - - //#pragma unroll - for(int i = 0; i < N; i++) { - - VAL_T __tmp = 0; - - //#pragma unroll - for(int j = 0; j < M; j += BDIM_X) { - if (j+tidx < M) { - const VAL_T v = srcV[j+tidx]; - __tmp += -LOG(v)*(ONEP5+LOG(v))*v * srcM[i*M + j+tidx]; - } - } - #pragma unroll - for(int j = BDIM_X/2; j; j /= 2) { -#if 0 - __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); -#else - __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); -#endif - } - // values could be held by BDIM_X threads and written - // together every BDIM_X iterations... - - if (tidx == 0) { - dstV[i] = __tmp; - } - } - return; -} - -template -__device__ void ndotp_log_csa_d(const int N, - const int M, - const VAL_T *__restrict__ srcV, - const VAL_T *__restrict__ srcM, - VAL_T *__restrict__ dstV) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - // Clamp values - constexpr VAL_T min = .001; - constexpr VAL_T max = .999; - - //#pragma unroll - for(int i = 0; i < N; i++) { - - VAL_T __tmp = 0; - - //#pragma unroll - for(int j = 0; j < M; j += BDIM_X) { - if (j+tidx < M) { - const VAL_T v = MIN(MAX(srcV[j+tidx], min), max); - __tmp += LOG(-LOG(v)) * srcM[i*M + j+tidx]; - } - } - #pragma unroll - for(int j = BDIM_X/2; j; j /= 2) { -#if 0 - __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X); -#else - __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X); -#endif - } - // values could be held by BDIM_X threads and written - // together every BDIM_X iterations... - - if (tidx == 0) { - dstV[i] = __tmp; - } - } - return; -} - - -template -__device__ void fit_opdt(const int delta_nr, - const int hr_side, - const REAL_T *__restrict__ delta_q, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ __msk_data_sh, - REAL_T *__restrict__ __h_sh, - REAL_T *__restrict__ __r_sh) { - const int tidx = threadIdx.x; - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - ndotp_log_opdt_d(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh); - ndotp_d (delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh); - __syncwarp(WMASK); - #pragma unroll - for(int j = tidx; j < delta_nr; j += BDIM_X) { - __r_sh[j] -= __h_sh[j]; - } - __syncwarp(WMASK); -} - -template -__device__ void fit_csa(const int delta_nr, - const int hr_side, - const REAL_T *__restrict__ fit_matrix, - const REAL_T *__restrict__ __msk_data_sh, - REAL_T *__restrict__ __r_sh) { - const int tidx = threadIdx.x; - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - constexpr REAL _n0_const = 0.28209479177387814; // .5 / sqrt(pi) - ndotp_log_csa_d(delta_nr, hr_side, __msk_data_sh, fit_matrix, __r_sh); - __syncwarp(WMASK); - if (tidx == 0) { - __r_sh[0] = _n0_const; - } - __syncwarp(WMASK); -} - -template -__device__ void fit_model_coef(const int delta_nr, // delta_nr is number of ODF directions - const int hr_side, // hr_side is number of data directions - const REAL_T *__restrict__ delta_q, - const REAL_T *__restrict__ delta_b, // these are fit matrices the model can use, different for each model - const REAL_T *__restrict__ __msk_data_sh, // __msk_data_sh is the part of the data currently being operated on by this block - REAL_T *__restrict__ __h_sh, // these last two are modifications to the coefficients that will be returned - REAL_T *__restrict__ __r_sh) { - switch(MODEL_T) { - case OPDT: - fit_opdt(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh); - break; - case CSA: - fit_csa(delta_nr, hr_side, delta_q, __msk_data_sh, __r_sh); - break; - default: - printf("FATAL: Invalid Model Type.\n"); - break; - } -} - -template -__device__ VAL_T max_mask_transl_d(const int n, - const LEN_T *__restrict__ srcMsk, - const VAL_T *__restrict__ srcVal, - const VAL_T offset, - const VAL_T minVal) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - VAL_T __m = minVal; - - for(int i = tidx; i < n; i += BDIM_X) { - const LEN_T sel = srcMsk[i]; - if (sel > 0) { - __m = MAX(__m, srcVal[i]+offset); - } - } - - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); - __m = MAX(__m, __tmp); - } - - return __m; -} - -template -__device__ VAL_T max_d(const int n, const VAL_T *__restrict__ src, const VAL_T minVal) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - VAL_T __m = minVal; - - for(int i = tidx; i < n; i += BDIM_X) { - __m = MAX(__m, src[i]); - } - - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); - __m = MAX(__m, __tmp); - } - - return __m; -} - -template -__device__ VAL_T min_d(const int n, const VAL_T *__restrict__ src, const VAL_T maxVal) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - VAL_T __m = maxVal; - - for(int i = tidx; i < n; i += BDIM_X) { - __m = MIN(__m, src[i]); - } - - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X); - __m = MIN(__m, __tmp); - } - - return __m; -} - -template -__device__ VAL_T avgMask(const int mskLen, - const int *__restrict__ mask, - const VAL_T *__restrict__ data) { - - const int tidx = threadIdx.x; - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - int __myCnt = 0; - VAL_T __mySum = 0; - - for(int i = tidx; i < mskLen; i += BDIM_X) { - if(mask[i]) { - __myCnt++; - __mySum += data[i]; - } - } - - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - __mySum += __shfl_xor_sync(WMASK, __mySum, i, BDIM_X); - __myCnt += __shfl_xor_sync(WMASK, __myCnt, i, BDIM_X); - } - - return __mySum/__myCnt; - -} - -template -__device__ int peak_directions_d(const REAL_T *__restrict__ odf, - REAL3_T *__restrict__ dirs, - const REAL3_T *__restrict__ sphere_vertices, - const int2 *__restrict__ sphere_edges, - const int num_edges, - int samplm_nr, - int *__restrict__ __shInd, - const REAL_T relative_peak_thres, - const REAL_T min_separation_angle) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const unsigned int lmask = (1 << lid)-1; - -// __shared__ int __shInd[BDIM_Y][SAMPLM_NR]; - - #pragma unroll - for(int j = tidx; j < samplm_nr; j += BDIM_X) { - __shInd[j] = 0; - } - - REAL_T odf_min = min_d(samplm_nr, odf, REAL_MAX); - odf_min = MAX(0, odf_min); - - __syncwarp(WMASK); - - // local_maxima() + _compare_neighbors() - // selecting only the indices corrisponding to maxima Ms - // such that M-odf_min >= relative_peak_thres - //#pragma unroll - for(int j = 0; j < num_edges; j += BDIM_X) { - if (j+tidx < num_edges) { - const int u_ind = sphere_edges[j+tidx].x; - const int v_ind = sphere_edges[j+tidx].y; - - //if (u_ind >= NUM_EDGES || v_ind >= NUM_EDGES) { ERROR; } - - const REAL_T u_val = odf[u_ind]; - const REAL_T v_val = odf[v_ind]; - - //if (u_val != u_val || v_val != v_val) { ERROR_NANs; } - - // only check that they are not equal - //if (u_val != v_val) { - // __shInd[tidy][u_val < v_val ? u_ind : v_ind] = -1; // benign race conditions... - //} - if (u_val < v_val) { - atomicExch(__shInd+u_ind, -1); - atomicOr( __shInd+v_ind, 1); - } else if (v_val < u_val) { - atomicExch(__shInd+v_ind, -1); - atomicOr( __shInd+u_ind, 1); - } - } - } - __syncwarp(WMASK); - - const REAL_T compThres = relative_peak_thres*max_mask_transl_d(samplm_nr, __shInd, odf, -odf_min, REAL_MIN); -#if 1 -/* - if (!tidy && !tidx) { - for(int j = 0; j < SAMPLM_NR; j++) { - printf("local_max[%d]: %d (%f)\n", j, __shInd[tidy][j], odf[j]); - } - printf("maxMax with offset %f: %f\n", -odf_min, compThres); - } - __syncwarp(WMASK); -*/ - // compact indices of positive values to the right - int n = 0; - - for(int j = 0; j < samplm_nr; j += BDIM_X) { - - const int __v = (j+tidx < samplm_nr) ? __shInd[j+tidx] : -1; - const int __keep = (__v > 0) && ((odf[j+tidx]-odf_min) >= compThres); - const int __msk = __ballot_sync(WMASK, __keep); - -//__syncwarp(WMASK); // unnecessary - if (__keep) { - const int myoff = __popc(__msk & lmask); - __shInd[n + myoff] = j+tidx; - } - n += __popc(__msk); -//__syncwarp(WMASK); // should be unnecessary - } - __syncwarp(WMASK); -/* - if (!tidy && !tidx) { - for(int j = 0; j < n; j++) { - printf("local_max_compact[%d]: %d\n", j, __shInd[tidy][j]); - } - } - __syncwarp(WMASK); -*/ - - // sort local maxima indices - if (n < BDIM_X) { - REAL_T k = REAL_MIN; - int v = 0; - if (tidx < n) { - v = __shInd[tidx]; - k = odf[v]; - } - warp_sort<32, BDIM_X, WSORT_DIR_DEC>(&k, &v); - __syncwarp(WMASK); - - if (tidx < n) { - __shInd[tidx] = v; - } - } else { - // ERROR !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - } - __syncwarp(WMASK); - - // __shInd[tidy][] contains the indices in odf correspoding to - // normalized maxima NOT sorted! - if (n != 0) { - // remove_similar_vertices() - // PRELIMINARY INEFFICIENT, SINGLE TH, IMPLEMENTATION - if (tidx == 0) { - const REAL_T cos_similarity = COS(min_separation_angle); - - dirs[0] = sphere_vertices[__shInd[0]]; - - int k = 1; - for(int i = 1; i < n; i++) { - - const REAL3_T abc = sphere_vertices[__shInd[i]]; - - int j = 0; - for(; j < k; j++) { - const REAL_T cos = FABS(abc.x*dirs[j].x+ - abc.y*dirs[j].y+ - abc.z*dirs[j].z); - if (cos > cos_similarity) { - break; - } - } - if (j == k) { - dirs[k++] = abc; - } - } - n = k; - } - n = __shfl_sync(WMASK, n, 0, BDIM_X); - __syncwarp(WMASK); - - } -/* - if (!tidy && !tidx) { - for(int j = 0; j < n; j++) { - printf("local_max_compact_uniq[%d]: %d\n", j, __shInd[tidy][j]); - } - } - __syncwarp(WMASK); -*/ -#else - const int indMax = max_d(__shInd[tidy], -1); - if (indMax != -1) { - __ret = MAKE_REAL3(sphere_vertices[indMax][0], - sphere_vertices[indMax][1], - sphere_vertices[indMax][2]); - } -#endif - return n; -} - -template -__device__ int closest_peak_d(const REAL_T max_angle, - const REAL3_T direction, //dir - const int npeaks, - const REAL3_T *__restrict__ peaks, - REAL3_T *__restrict__ peak) {// dirs, - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - //const REAL_T cos_similarity = COS(MAX_ANGLE_P); - const REAL_T cos_similarity = COS(max_angle); -#if 0 - if (!threadIdx.y && !tidx) { - printf("direction: (%f, %f, %f)\n", - direction.x, direction.y, direction.z); - } - __syncwarp(WMASK); -#endif - REAL_T cpeak_dot = 0; - int cpeak_idx = -1; - for(int j = 0; j < npeaks; j += BDIM_X) { - if (j+tidx < npeaks) { -#if 0 - if (!threadIdx.y && !tidx) { - printf("j+tidx: %d, peaks[j+tidx]: (%f, %f, %f)\n", - j+tidx, peaks[j+tidx].x, peaks[j+tidx].y, peaks[j+tidx].z); - } -#endif - const REAL_T dot = direction.x*peaks[j+tidx].x+ - direction.y*peaks[j+tidx].y+ - direction.z*peaks[j+tidx].z; - - if (FABS(dot) > FABS(cpeak_dot)) { - cpeak_dot = dot; - cpeak_idx = j+tidx; - } - } - } -#if 0 - if (!threadIdx.y && !tidx) { - printf("cpeak_idx: %d, cpeak_dot: %f\n", cpeak_idx, cpeak_dot); - } - __syncwarp(WMASK); -#endif - - #pragma unroll - for(int j = BDIM_X/2; j; j /= 2) { - - const REAL_T dot = __shfl_xor_sync(WMASK, cpeak_dot, j, BDIM_X); - const int idx = __shfl_xor_sync(WMASK, cpeak_idx, j, BDIM_X); - if (FABS(dot) > FABS(cpeak_dot)) { - cpeak_dot = dot; - cpeak_idx = idx; - } - } -#if 0 - if (!threadIdx.y && !tidx) { - printf("cpeak_idx: %d, cpeak_dot: %f, cos_similarity: %f\n", cpeak_idx, cpeak_dot, cos_similarity); - } - __syncwarp(WMASK); -#endif - if (cpeak_idx >= 0) { - if (cpeak_dot >= cos_similarity) { - peak[0] = peaks[cpeak_idx]; - return 1; - } - if (cpeak_dot <= -cos_similarity) { - peak[0] = MAKE_REAL3(-peaks[cpeak_idx].x, - -peaks[cpeak_idx].y, - -peaks[cpeak_idx].z); - return 1; - } - } - return 0; -} - -template -__device__ LEN_T maskGet(const LEN_T n, - const MSK_T *__restrict__ mask, - const VAL_T *__restrict__ plain, - VAL_T *__restrict__ masked) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const int __laneMask = (1 << tidx)-1; - - int woff = 0; - for(int j = 0; j < n; j += BDIM_X) { - - const int __act = (j+tidx < n) ? !mask[j+tidx] : 0; - const int __msk = __ballot_sync(WMASK, __act); - - const int toff = __popc(__msk & __laneMask); - if (__act) { - masked[woff+toff] = plain[j+tidx]; - } - woff += __popc(__msk); - } - return woff; -} - -template -__device__ void maskPut(const LEN_T n, - const MSK_T *__restrict__ mask, - const VAL_T *__restrict__ masked, - VAL_T *__restrict__ plain) { - - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const int __laneMask = (1 << tidx)-1; - - int woff = 0; - for(int j = 0; j < n; j += BDIM_X) { - - const int __act = (j+tidx < n) ? !mask[j+tidx] : 0; - const int __msk = __ballot_sync(WMASK, __act); - - const int toff = __popc(__msk & __laneMask); - if (__act) { - plain[j+tidx] = masked[woff+toff]; - } - woff += __popc(__msk); - } - return; -} - -template -__device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st, - const REAL_T *__restrict__ pmf, - const REAL_T max_angle, - const REAL_T relative_peak_thres, - const REAL_T min_separation_angle, - REAL3_T dir, - const int dimx, - const int dimy, - const int dimz, - const int dimt, - const REAL3_T point, - const REAL3_T *__restrict__ sphere_vertices, - const int2 *__restrict__ sphere_edges, - const int num_edges, - REAL3_T *__restrict__ dirs) { - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const int n32dimt = ((dimt+31)/32)*32; - - extern __shared__ REAL_T __sh[]; - REAL_T *__pmf_data_sh = __sh + tidy*n32dimt; - - // pmf = self.pmf_gen.get_pmf_c(&point[0], pmf) - __syncwarp(WMASK); - const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, -1, pmf, point, __pmf_data_sh); - __syncwarp(WMASK); - if (rv != 0) { - return 0; - } - - // for i in range(_len): - // if pmf[i] > max_pmf: - // max_pmf = pmf[i] - // absolute_pmf_threshold = pmf_threshold * max_pmf - const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d(dimt, __pmf_data_sh, REAL_MIN); - __syncwarp(WMASK); - - // for i in range(_len): - // if pmf[i] < absolute_pmf_threshold: - // pmf[i] = 0.0 - #pragma unroll - for(int i = tidx; i < dimt; i += BDIM_X) { - if (__pmf_data_sh[i] < absolpmf_thresh) { - __pmf_data_sh[i] = 0.0; - } - } - __syncwarp(WMASK); - - if (IS_START) { - int *__shInd = reinterpret_cast(__sh + BDIM_Y*n32dimt) + tidy*n32dimt; - return peak_directions_d(__pmf_data_sh, - dirs, - sphere_vertices, - sphere_edges, - num_edges, - dimt, - __shInd, - relative_peak_thres, - min_separation_angle); - } else { - REAL_T __tmp; - #ifdef DEBUG - __syncwarp(WMASK); - if (tidx == 0) { - printArray("__pmf_data_sh initial", 8, dimt, __pmf_data_sh); - printf("absolpmf_thresh %10.8f\n", absolpmf_thresh); - printf("---> dir %10.8f, %10.8f, %10.8f\n", dir.x, dir.y, dir.z); - printf("---> point %10.8f, %10.8f, %10.8f\n", point.x, point.y, point.z); - } - __syncwarp(WMASK); - if (tidx == 15) { - printf("absolpmf_thresh %10.8f l15\n", absolpmf_thresh); - printf("---> dir %10.8f, %10.8f, %10.8f l15\n", dir.x, dir.y, dir.z); - printf("---> point %10.8f, %10.8f, %10.8f l15\n", point.x, point.y, point.z); - } - __syncwarp(WMASK); - if (tidx == 31) { - printArray("__pmf_data_sh initial l31", 8, dimt, __pmf_data_sh); - printf("absolpmf_thresh %10.8f l31\n", absolpmf_thresh); - printf("---> dir %10.8f, %10.8f, %10.8f l31\n", dir.x, dir.y, dir.z); - printf("---> point %10.8f, %10.8f, %10.8f l31\n", point.x, point.y, point.z); - } - __syncwarp(WMASK); - #endif - - // // These should not be relevant - // if norm(&direction[0]) == 0: - // return 1 - // normalize(&direction[0]) - - // for i in range(_len): - // cos_sim = self.vertices[i][0] * direction[0] \ - // + self.vertices[i][1] * direction[1] \ - // + self.vertices[i][2] * direction[2] - // if cos_sim < 0: - // cos_sim = cos_sim * -1 - // if cos_sim < self.cos_similarity: - // pmf[i] = 0 - const REAL_T cos_similarity = COS(max_angle); - - #pragma unroll - for(int i = tidx; i < dimt; i += BDIM_X) { - const REAL_T dot = dir.x*sphere_vertices[i].x+ - dir.y*sphere_vertices[i].y+ - dir.z*sphere_vertices[i].z; - - if (FABS(dot) < cos_similarity) { - __pmf_data_sh[i] = 0.0; - } - } - __syncwarp(WMASK); - - #ifdef DEBUG - __syncwarp(WMASK); - if (tidx == 0) { - printArray("__pmf_data_sh after filtering", 8, dimt, __pmf_data_sh); - } - __syncwarp(WMASK); - #endif - - // cumsum(pmf, pmf, _len) - prefix_sum_sh_d(__pmf_data_sh, dimt); - - #ifdef DEBUG - __syncwarp(WMASK); - if (tidx == 0) { - printArray("__pmf_data_sh after cumsum", 8, dimt, __pmf_data_sh); - } - __syncwarp(WMASK); - #endif - - // last_cdf = pmf[_len - 1] - // if last_cdf == 0: - // return 1 - REAL_T last_cdf = __pmf_data_sh[dimt - 1]; - if (last_cdf == 0) { - return 0; - } - - // idx = where_to_insert(pmf, random() * last_cdf, _len) - if (tidx == 0) { - __tmp = curand_uniform(st) * last_cdf; - } - REAL_T selected_cdf = __shfl_sync(WMASK, __tmp, 0, BDIM_X); -// Both these implementations work -#if 1 - int low = 0; - int high = dimt - 1; - while ((high - low) >= BDIM_X) { - const int mid = (low + high) / 2; - if (__pmf_data_sh[mid] < selected_cdf) { - low = mid; - } else { - high = mid; - } - } - const bool __ballot = (low+tidx <= high) ? (selected_cdf < __pmf_data_sh[low+tidx]) : 0; - const int __msk = __ballot_sync(WMASK, __ballot); - const int indProb = low + __ffs(__msk) - 1; -#else - int indProb = dimt - 1; - for (int ii = 0; ii < dimt; ii+=BDIM_X) { - int __is_greater = 0; - if (ii+tidx < dimt) { - __is_greater = selected_cdf < __pmf_data_sh[ii+tidx]; - } - const int __msk = __ballot_sync(WMASK, __is_greater); - if (__msk != 0) { - indProb = ii + __ffs(__msk) - 1; - break; - } - } -#endif - - #ifdef DEBUG - __syncwarp(WMASK); - if (tidx == 0) { - printf("last_cdf %10.8f\n", last_cdf); - printf("selected_cdf %10.8f\n", selected_cdf); - printf("indProb %i out of %i\n", indProb, dimt); - } - __syncwarp(WMASK); - #endif - - // newdir = self.vertices[idx] - // if (direction[0] * newdir[0] - // + direction[1] * newdir[1] - // + direction[2] * newdir[2] > 0): - // copy_point(&newdir[0], &direction[0]) - // else: - // newdir[0] = newdir[0] * -1 - // newdir[1] = newdir[1] * -1 - // newdir[2] = newdir[2] * -1 - // copy_point(&newdir[0], &direction[0]) - if (tidx == 0) { - if ((dir.x * sphere_vertices[indProb].x + - dir.y * sphere_vertices[indProb].y + - dir.z * sphere_vertices[indProb].z) > 0) { - *dirs = MAKE_REAL3(sphere_vertices[indProb].x, - sphere_vertices[indProb].y, - sphere_vertices[indProb].z); - } else { - *dirs = MAKE_REAL3(-sphere_vertices[indProb].x, - -sphere_vertices[indProb].y, - -sphere_vertices[indProb].z); - } - // printf("direction addr write %p, slid %i\n", dirs, blockIdx.x*blockDim.y+threadIdx.y); - } - - #ifdef DEBUG - __syncwarp(WMASK); - if (tidx == 0) { - printf("last_cdf %10.8f\n", last_cdf); - printf("selected_cdf %10.8f\n", selected_cdf); - printf("indProb %i out of %i\n", indProb, dimt); - } - __syncwarp(WMASK); - if (tidx == 15) { - printf("last_cdf %10.8f l15\n", last_cdf); - printf("selected_cdf %10.8f l15\n", selected_cdf); - printf("indProb %i out of %i l15\n", indProb, dimt); - } - __syncwarp(WMASK); - if (tidx == 31) { - printf("last_cdf %10.8f l31\n", last_cdf); - printf("selected_cdf %10.8f l31\n", selected_cdf); - printf("indProb %i out of %i l31\n", indProb, dimt); - } - __syncwarp(WMASK); - #endif - return 1; - } -} - -template -__device__ int get_direction_boot_d( - curandStatePhilox4_32_10_t *st, - const REAL_T max_angle, - const REAL_T min_signal, - const REAL_T relative_peak_thres, - const REAL_T min_separation_angle, - REAL3_T dir, - const int dimx, - const int dimy, - const int dimz, - const int dimt, - const REAL_T *__restrict__ dataf, - const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask) - // but not clear if it will never be needed so - // we'll keep it here for now... - const REAL3_T point, - const REAL_T *__restrict__ H, - const REAL_T *__restrict__ R, - // model unused - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - const int delta_nr, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ delta_q, // fit_matrix - const int samplm_nr, - const REAL_T *__restrict__ sampling_matrix, - const REAL3_T *__restrict__ sphere_vertices, - const int2 *__restrict__ sphere_edges, - const int num_edges, - REAL3_T *__restrict__ dirs) { - - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - const int n32dimt = ((dimt+31)/32)*32; - - extern REAL_T __shared__ __sh[]; - - REAL_T *__vox_data_sh = reinterpret_cast(__sh); - REAL_T *__msk_data_sh = __vox_data_sh + BDIM_Y*n32dimt; - - REAL_T *__r_sh = __msk_data_sh + BDIM_Y*n32dimt; - REAL_T *__h_sh = __r_sh + BDIM_Y*MAX(n32dimt, samplm_nr); - - __vox_data_sh += tidy*n32dimt; - __msk_data_sh += tidy*n32dimt; - - __r_sh += tidy*MAX(n32dimt, samplm_nr); - __h_sh += tidy*MAX(n32dimt, samplm_nr); - - // compute hr_side (may be passed from python) - int hr_side = 0; - for(int j = tidx; j < dimt; j += BDIM_X) { - hr_side += !b0s_mask[j] ? 1 : 0; - } - #pragma unroll - for(int i = BDIM_X/2; i; i /= 2) { - hr_side += __shfl_xor_sync(WMASK, hr_side, i, BDIM_X); - } - - #pragma unroll - for(int i = 0; i < NATTEMPTS; i++) { - - const int rv = trilinear_interp_d(dimx, dimy, dimz, dimt, -1, dataf, point, __vox_data_sh); - - const int nmsk = maskGet(dimt, b0s_mask, __vox_data_sh, __msk_data_sh); - - //if (!tidx && !threadIdx.y && !blockIdx.x) { - // - // printf("interp of %f, %f, %f\n", point.x, point.y, point.z); - // printf("hr_side: %d\n", hr_side); - // printArray("vox_data", 6, dimt, __vox_data_sh[tidy]); - // printArray("msk_data", 6, nmsk, __msk_data_sh[tidy]); - //} - //break; - - __syncwarp(WMASK); - - if (rv == 0) { - - ndotp_d(hr_side, hr_side, __msk_data_sh, R, __r_sh); - //__syncwarp(); - //printArray("__r", 5, hr_side*hr_side, R); - //printArray("__r_sh", 6, hr_side, __r_sh[tidy]); - - ndotp_d(hr_side, hr_side, __msk_data_sh, H, __h_sh); - //__syncwarp(); - //printArray("__h_sh", 6, hr_side, __h_sh[tidy]); - - __syncwarp(WMASK); - - for(int j = 0; j < hr_side; j += BDIM_X) { - if (j+tidx < hr_side) { -#ifdef USE_FIXED_PERMUTATION - const int srcPermInd = fixedPerm[j+tidx]; -#else - const int srcPermInd = curand(st) % hr_side; -// if (srcPermInd < 0 || srcPermInd >= hr_side) { -// printf("srcPermInd: %d\n", srcPermInd); -// } -#endif - __h_sh[j+tidx] += __r_sh[srcPermInd]; - //__h_sh[j+tidx] += __r_sh[j+tidx]; - } - } - __syncwarp(WMASK); - - //printArray("h+perm(r):", 6, hr_side, __h_sh[tidy]); - //__syncwarp(); - - // vox_data[dwi_mask] = masked_data - maskPut(dimt, b0s_mask, __h_sh, __vox_data_sh); - __syncwarp(WMASK); - - //printArray("vox_data[dwi_mask]:", 6, dimt, __vox_data_sh[tidy]); - //__syncwarp(); - - for(int j = tidx; j < dimt; j += BDIM_X) { - //__vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]); - __vox_data_sh[j] = MAX(min_signal, __vox_data_sh[j]); - } - __syncwarp(WMASK); - - const REAL_T denom = avgMask(dimt, b0s_mask, __vox_data_sh); - - for(int j = tidx; j < dimt; j += BDIM_X) { - __vox_data_sh[j] /= denom; - } - __syncwarp(); - - //if (!tidx && !threadIdx.y && !blockIdx.x) { - // printf("denom: %f\n", denom); - //} - ////break; - //if (!tidx && !threadIdx.y && !blockIdx.x) { - // - // printf("__vox_data_sh:\n"); - // printArray("vox_data", 6, dimt, __vox_data_sh[tidy]); - //} - //break; - - maskGet(dimt, b0s_mask, __vox_data_sh, __msk_data_sh); - __syncwarp(WMASK); - - fit_model_coef(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh); - - // __r_sh[tidy] <- python 'coef' - - ndotp_d(samplm_nr, delta_nr, __r_sh, sampling_matrix, __h_sh); - - // __h_sh[tidy] <- python 'pmf' - } else { - #pragma unroll - for(int j = tidx; j < samplm_nr; j += BDIM_X) { - __h_sh[j] = 0; - } - // __h_sh[tidy] <- python 'pmf' - } - __syncwarp(WMASK); -#if 0 - if (!threadIdx.y && threadIdx.x == 0) { - for(int j = 0; j < samplm_nr; j++) { - printf("pmf[%d]: %f\n", j, __h_sh[tidy][j]); - } - } - //return; -#endif - const REAL_T abs_pmf_thr = PMF_THRESHOLD_P*max_d(samplm_nr, __h_sh, REAL_MIN); - __syncwarp(WMASK); - - #pragma unroll - for(int j = tidx; j < samplm_nr; j += BDIM_X) { - const REAL_T __v = __h_sh[j]; - if (__v < abs_pmf_thr) { - __h_sh[j] = 0; - } - } - __syncwarp(WMASK); -#if 0 - if (!threadIdx.y && threadIdx.x == 0) { - printf("abs_pmf_thr: %f\n", abs_pmf_thr); - for(int j = 0; j < samplm_nr; j++) { - printf("pmfNORM[%d]: %f\n", j, __h_sh[tidy][j]); - } - } - //return; -#endif -#if 0 - if init: - directions = peak_directions(pmf, sphere)[0] - return directions - else: - peaks = peak_directions(pmf, sphere)[0] - if (len(peaks) > 0): - return closest_peak(directions, peaks, cos_similarity) -#endif - const int ndir = peak_directions_d(__h_sh, dirs, - sphere_vertices, - sphere_edges, - num_edges, - samplm_nr, - reinterpret_cast(__r_sh), // reuse __r_sh as shInd in func which is large enough - relative_peak_thres, - min_separation_angle); - if (NATTEMPTS == 1) { // init=True... - return ndir; // and dirs; - } else { // init=False... - if (ndir > 0) { - /* - if (!threadIdx.y && threadIdx.x == 0 && ndir > 1) { - printf("NATTEMPTS=5 and ndir: %d!!!\n", ndir); - } - */ - REAL3_T peak; - const int foundPeak = closest_peak_d(max_angle, dir, ndir, dirs, &peak); - __syncwarp(WMASK); - if (foundPeak) { - if (tidx == 0) { - dirs[0] = peak; - } - return 1; - } - } - } - } - return 0; -} - -enum {OUTSIDEIMAGE, INVALIDPOINT, TRACKPOINT, ENDPOINT}; - -template -__device__ int check_point_d(const REAL_T tc_threshold, - const REAL3_T point, - const int dimx, - const int dimy, - const int dimz, - const REAL_T *__restrict__ metric_map) { - - const int tidy = threadIdx.y; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - __shared__ REAL_T __shInterpOut[BDIM_Y]; - - const int rv = trilinear_interp_d(dimx, dimy, dimz, 1, 0, metric_map, point, __shInterpOut+tidy); - __syncwarp(WMASK); -#if 0 - if (threadIdx.y == 1 && threadIdx.x == 0) { - printf("__shInterpOut[tidy]: %f, TC_THRESHOLD_P: %f\n", __shInterpOut[tidy], TC_THRESHOLD_P); - } -#endif - if (rv != 0) { - return OUTSIDEIMAGE; - } - //return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT; - return (__shInterpOut[tidy] > tc_threshold) ? TRACKPOINT : ENDPOINT; -} - -template -__device__ int tracker_d(curandStatePhilox4_32_10_t *st, - const REAL_T max_angle, - const REAL_T min_signal, - const REAL_T tc_threshold, - const REAL_T step_size, - const REAL_T relative_peak_thres, - const REAL_T min_separation_angle, - REAL3_T seed, - REAL3_T first_step, - REAL_T* ptt_frame, - REAL3_T voxel_size, - const int dimx, - const int dimy, - const int dimz, - const int dimt, - const REAL_T *__restrict__ dataf, - const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask) - const REAL_T *__restrict__ H, - const REAL_T *__restrict__ R, - // model unused - // step_size from global defines - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - // tc_threshold from global defines - // pmf_threashold from global defines - const REAL_T *__restrict__ metric_map, - const int delta_nr, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ delta_q, // fit_matrix - const int samplm_nr, - const REAL_T *__restrict__ sampling_matrix, - const REAL3_T *__restrict__ sphere_vertices, - const int2 *__restrict__ sphere_edges, - const int num_edges, - int *__restrict__ nsteps, - REAL3_T *__restrict__ streamline) { - - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - int tissue_class = TRACKPOINT; - - REAL3_T point = seed; - REAL3_T direction = first_step; - __shared__ REAL3_T __sh_new_dir[BDIM_Y]; - - if (tidx == 0) { - streamline[0] = point; -#if 0 - if (threadIdx.y == 1) { - printf("streamline[0]: %f, %f, %f\n", point.x, point.y, point.z); - } -#endif - } - __syncwarp(WMASK); - - int step_frac; - if (MODEL_T == PTT) { - step_frac = STEP_FRAC; - } else { - step_frac = 1; // STEP_FRAC could be useful in other models - } - - int i; - for(i = 1; i < MAX_SLINE_LEN*step_frac; i++) { - int ndir; - if (MODEL_T == PROB) { - ndir = get_direction_prob_d( - st, - dataf, - max_angle, - relative_peak_thres, - min_separation_angle, - direction, - dimx, dimy, dimz, dimt, - point, - sphere_vertices, - sphere_edges, - num_edges, - __sh_new_dir + tidy); - } else if (MODEL_T == PTT) { - ndir = get_direction_ptt_d( - st, - dataf, - max_angle, - step_size, - direction, - ptt_frame, - dimx, dimy, dimz, dimt, - point, - sphere_vertices, - __sh_new_dir + tidy); - } else { - // call get_direction_boot_d() with NATTEMPTS=5 - ndir = get_direction_boot_d( - st, - max_angle, - min_signal, - relative_peak_thres, - min_separation_angle, - direction, - dimx, dimy, dimz, dimt, dataf, - b0s_mask /* !dwi_mask */, - point, - H, R, - // model unused - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - delta_nr, - delta_b, delta_q, // fit_matrix - samplm_nr, - sampling_matrix, - sphere_vertices, - sphere_edges, - num_edges, - __sh_new_dir + tidy); - } - __syncwarp(WMASK); - direction = __sh_new_dir[tidy]; - __syncwarp(WMASK); - - if (ndir == 0) { - break; - } -#if 0 - if (threadIdx.y == 1 && threadIdx.x == 0) { - printf("tracker: i: %d, direction: (%f, %f, %f)\n", i, direction.x, direction.y, direction.z); - } - //return; -#endif - //point.x += (direction.x / voxel_size.x) * STEP_SIZE_P; - //point.y += (direction.y / voxel_size.y) * STEP_SIZE_P; - //point.z += (direction.z / voxel_size.z) * STEP_SIZE_P; - point.x += (direction.x / voxel_size.x) * (step_size / step_frac); - point.y += (direction.y / voxel_size.y) * (step_size / step_frac); - point.z += (direction.z / voxel_size.z) * (step_size / step_frac); - - if ((tidx == 0) && ((i % step_frac) == 0)){ - streamline[i/step_frac] = point; -#if 0 - if (threadIdx.y == 1) { - printf("streamline[%d]: %f, %f, %f\n", i, point.x, point.y, point.z); - } -#endif - } - __syncwarp(WMASK); - - tissue_class = check_point_d(tc_threshold, point, dimx, dimy, dimz, metric_map); - -#if 0 - __syncwarp(WMASK); - if (tidx == 0) { - printf("step_size %f\n", step_size); - printf("direction %f, %f, %f\n", direction.x, direction.y, direction.z); - printf("direction addr read %p, slid %i\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y); - printf("voxel_size %f, %f, %f\n", voxel_size.x, voxel_size.y, voxel_size.z); - printf("point %f, %f, %f\n", point.x, point.y, point.z); - printf("tc %i\n", tissue_class); - } - __syncwarp(WMASK); - if (tidx == 15) { - printf("step_size %f l15\n", step_size); - printf("direction %f, %f, %f l15\n", direction.x, direction.y, direction.z); - printf("direction addr read %p, slid %i l15\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y); - printf("voxel_size %f, %f, %f l15\n", voxel_size.x, voxel_size.y, voxel_size.z); - printf("point %f, %f, %f l15\n", point.x, point.y, point.z); - printf("tc %i l15\n", tissue_class); - } - __syncwarp(WMASK); - if (tidx == 31) { - printf("step_size %f l31\n", step_size); - printf("direction %f, %f, %f l31\n", direction.x, direction.y, direction.z); - printf("direction addr read %p, slid %i l31\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y); - printf("voxel_size %f, %f, %f l31\n", voxel_size.x, voxel_size.y, voxel_size.z); - printf("point %f, %f, %f l31\n", point.x, point.y, point.z); - printf("tc %i l31\n", tissue_class); - } - __syncwarp(WMASK); -#endif - - if (tissue_class == ENDPOINT || - tissue_class == INVALIDPOINT || - tissue_class == OUTSIDEIMAGE) { - break; - } - } - nsteps[0] = i/step_frac; - if (((i % step_frac) != 0) && i < step_frac*(MAX_SLINE_LEN - 1)){ - nsteps[0]++; - if (tidx == 0) { - streamline[nsteps[0]] = point; - } - } - - return tissue_class; -} - -template -__global__ void getNumStreamlinesBoot_k( - const ModelType model_type, - const REAL_T max_angle, - const REAL_T min_signal, - const REAL_T relative_peak_thres, - const REAL_T min_separation_angle, - const long long rndSeed, - const int nseed, - const REAL3_T *__restrict__ seeds, - const int dimx, - const int dimy, - const int dimz, - const int dimt, - const REAL_T *__restrict__ dataf, - const REAL_T *__restrict__ H, - const REAL_T *__restrict__ R, - const int delta_nr, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ delta_q, - const int *__restrict__ b0s_mask, // change to int - const int samplm_nr, - const REAL_T *__restrict__ sampling_matrix, - const REAL3_T *__restrict__ sphere_vertices, - const int2 *__restrict__ sphere_edges, - const int num_edges, - REAL3_T *__restrict__ shDir0, - int *slineOutOff) { - - const int tidx = threadIdx.x; - const int slid = blockIdx.x*blockDim.y + threadIdx.y; - const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x; - - if (slid >= nseed) { - return; - } - - REAL3_T seed = seeds[slid]; - // seed = lin_mat*seed + offset - - REAL3_T *__restrict__ __shDir = shDir0+slid*samplm_nr; - - // const int hr_side = dimt-1; - - curandStatePhilox4_32_10_t st; - //curand_init(rndSeed, slid + rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(hr_side/BDIM_X) - curand_init(rndSeed, gid, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X) - // elements of the same sequence - // python: - //directions = get_direction(None, dataf, dwi_mask, sphere, s, H, R, model, max_angle, - // pmf_threshold, b0s_mask, min_signal, fit_matrix, - // sampling_matrix, init=True) - - //if (!tidx && !threadIdx.y && !blockIdx.x) { - // printf("seed: %f, %f, %f\n", seed.x, seed.y, seed.z); - //} - - int ndir; - switch(model_type) { - case OPDT: - ndir = get_direction_boot_d( - &st, - max_angle, - min_signal, - relative_peak_thres, - min_separation_angle, - MAKE_REAL3(0,0,0), - dimx, dimy, dimz, dimt, dataf, - b0s_mask /* !dwi_mask */, - seed, - H, R, - // model unused - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - delta_nr, - delta_b, delta_q, // fit_matrix - samplm_nr, - sampling_matrix, - sphere_vertices, - sphere_edges, - num_edges, - __shDir); - break; - case CSA: - ndir = get_direction_boot_d( - &st, - max_angle, - min_signal, - relative_peak_thres, - min_separation_angle, - MAKE_REAL3(0,0,0), - dimx, dimy, dimz, dimt, dataf, - b0s_mask /* !dwi_mask */, - seed, - H, R, - // model unused - // max_angle, pmf_threshold from global defines - // b0s_mask already passed - // min_signal from global defines - delta_nr, - delta_b, delta_q, // fit_matrix - samplm_nr, - sampling_matrix, - sphere_vertices, - sphere_edges, - num_edges, - __shDir); - break; - default: - printf("FATAL: Invalid Model Type.\n"); - break; - } - - if (tidx == 0) { - slineOutOff[slid] = ndir; - } - - return; -} - -template -__global__ void getNumStreamlinesProb_k(const REAL_T max_angle, - const REAL_T relative_peak_thres, - const REAL_T min_separation_angle, - const long long rndSeed, - const int nseed, - const REAL3_T *__restrict__ seeds, - const int dimx, - const int dimy, - const int dimz, - const int dimt, - const REAL_T *__restrict__ dataf, - const REAL3_T *__restrict__ sphere_vertices, - const int2 *__restrict__ sphere_edges, - const int num_edges, - REAL3_T *__restrict__ shDir0, - int *slineOutOff) { - - const int tidx = threadIdx.x; - const int slid = blockIdx.x*blockDim.y + threadIdx.y; - const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x; - - if (slid >= nseed) { - return; - } - - REAL3_T *__restrict__ __shDir = shDir0+slid*dimt; - curandStatePhilox4_32_10_t st; - curand_init(rndSeed, gid, 0, &st); - - int ndir = get_direction_prob_d( - &st, - dataf, - max_angle, - relative_peak_thres, - min_separation_angle, - MAKE_REAL3(0,0,0), - dimx, dimy, dimz, dimt, - seeds[slid], - sphere_vertices, - sphere_edges, - num_edges, - __shDir); - if (tidx == 0) { - slineOutOff[slid] = ndir; - } - - return; -} - -template -__global__ void genStreamlinesMerge_k( - const REAL_T max_angle, - const REAL_T min_signal, - const REAL_T tc_threshold, - const REAL_T step_size, - const REAL_T relative_peak_thres, - const REAL_T min_separation_angle, - const long long rndSeed, - const int rndOffset, - const int nseed, - const REAL3_T *__restrict__ seeds, - const int dimx, - const int dimy, - const int dimz, - const int dimt, - const REAL_T *__restrict__ dataf, - const REAL_T *__restrict__ H, - const REAL_T *__restrict__ R, - const int delta_nr, - const REAL_T *__restrict__ delta_b, - const REAL_T *__restrict__ delta_q, - const int *__restrict__ b0s_mask, // change to int - const REAL_T *__restrict__ metric_map, - const int samplm_nr, - const REAL_T *__restrict__ sampling_matrix, - const REAL3_T *__restrict__ sphere_vertices, - const int2 *__restrict__ sphere_edges, - const int num_edges, - const int *__restrict__ slineOutOff, - REAL3_T *__restrict__ shDir0, - int *__restrict__ slineSeed, - int *__restrict__ slineLen, - REAL3_T *__restrict__ sline) { - - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - - const int slid = blockIdx.x*blockDim.y + threadIdx.y; - - const int lid = (tidy*BDIM_X + tidx) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - - __shared__ REAL_T frame_sh[((MODEL_T == PTT) ? BDIM_Y*18 : 1)]; // Only used by PTT, TODO: way to remove this in other cases - REAL_T* __ptt_frame = frame_sh + tidy*18; - // const int hr_side = dimt-1; - - curandStatePhilox4_32_10_t st; - // const int gbid = blockIdx.y*gridDim.x + blockIdx.x; - const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x; - //curand_init(rndSeed, slid+rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(HR_SIDE/BDIM_X) - curand_init(rndSeed, gid+1, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X) - // elements of the same sequence - if (slid >= nseed) { - return; - } - - REAL3_T seed = seeds[slid]; - - int ndir = slineOutOff[slid+1]-slineOutOff[slid]; -#if 0 - if (threadIdx.y == 0 && threadIdx.x == 0) { - printf("%s: ndir: %d\n", __func__, ndir); - for(int i = 0; i < ndir; i++) { - printf("__shDir[%d][%d]: (%f, %f, %f)\n", - tidy, i, __shDir[tidy][i].x, __shDir[tidy][i].y, __shDir[tidy][i].z); - } - } -#endif - __syncwarp(WMASK); - - int slineOff = slineOutOff[slid]; - - for(int i = 0; i < ndir; i++) { - REAL3_T first_step = shDir0[slid*samplm_nr + i]; - - REAL3_T *__restrict__ currSline = sline + slineOff*MAX_SLINE_LEN*2; - - if (tidx == 0) { - slineSeed[slineOff] = slid; - } -#if 0 - if (threadIdx.y == 0 && threadIdx.x == 0) { - printf("calling trackerF from: (%f, %f, %f)\n", first_step.x, first_step.y, first_step.z); - } -#endif - - if (MODEL_T == PTT) { - if (!init_frame_ptt_d( - &st, - dataf, - max_angle, - step_size, - first_step, - dimx, dimy, dimz, dimt, - seed, - sphere_vertices, - __ptt_frame - )) { // this fails rarely - if (tidx == 0) { - slineLen[slineOff] = 1; - currSline[0] = seed; - } - __syncwarp(WMASK); - slineOff += 1; - continue; - } - } - - - int stepsB; - const int tissue_classB = tracker_d(&st, - max_angle, - min_signal, - tc_threshold, - step_size, - relative_peak_thres, - min_separation_angle, - seed, - MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z), - __ptt_frame, - MAKE_REAL3(1, 1, 1), - dimx, dimy, dimz, dimt, dataf, - b0s_mask, - H, R, - metric_map, - delta_nr, - delta_b, delta_q, //fit_matrix - samplm_nr, - sampling_matrix, - sphere_vertices, - sphere_edges, - num_edges, - &stepsB, - currSline); - //if (tidx == 0) { - // slineLenB[slineOff] = stepsB; - //} - - // reverse backward sline - for(int j = 0; j < stepsB/2; j += BDIM_X) { - if (j+tidx < stepsB/2) { - const REAL3_T __p = currSline[j+tidx]; - currSline[j+tidx] = currSline[stepsB-1 - (j+tidx)]; - currSline[stepsB-1 - (j+tidx)] = __p; - } - } - - int stepsF; - const int tissue_classF = tracker_d(&st, - max_angle, - min_signal, - tc_threshold, - step_size, - relative_peak_thres, - min_separation_angle, - seed, - first_step, - __ptt_frame + 9, - MAKE_REAL3(1, 1, 1), - dimx, dimy, dimz, dimt, dataf, - b0s_mask, - H, R, - metric_map, - delta_nr, - delta_b, delta_q, //fit_matrix - samplm_nr, - sampling_matrix, - sphere_vertices, - sphere_edges, - num_edges, - &stepsF, - currSline + stepsB-1); - if (tidx == 0) { - slineLen[slineOff] = stepsB-1+stepsF; - } - - slineOff += 1; -#if 0 - if (threadIdx.y == 0 && threadIdx.x == 0) { - printf("%s: stepsF: %d, tissue_classF: %d\n", __func__, stepsF, tissue_classF); - } - __syncwarp(WMASK); -#endif - //if (/* !return_all || */0 && - // tissue_classF != ENDPOINT && - // tissue_classF != OUTSIDEIMAGE) { - // continue; - //} - //if (/* !return_all || */ 0 && - // tissue_classB != ENDPOINT && - // tissue_classB != OUTSIDEIMAGE) { - // continue; - //} - } - return; -} - -void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size, - const REAL relative_peak_thresh, const REAL min_separation_angle, - const int nseeds, const std::vector &seeds_d, - const int dimx, const int dimy, const int dimz, const int dimt, - const std::vector &dataf_d, const std::vector &H_d, const std::vector &R_d, - const int delta_nr, - const std::vector &delta_b_d, const std::vector &delta_q_d, - const std::vector &b0s_mask_d, const std::vector &metric_map_d, - const int samplm_nr, - const std::vector &sampling_matrix_d, - const std::vector &sphere_vertices_d, const std::vector &sphere_edges_d, const int nedges, - std::vector &slines_h, std::vector &slinesLen_h, std::vector &nSlines_h, - const std::vector nSlines_old_h, const int rng_seed, const int rng_offset, - const int ngpus, const std::vector &streams) { - - int nseeds_per_gpu = (nseeds + ngpus - 1) / ngpus; - - std::vector slinesOffs_d(ngpus, nullptr); - std::vector shDirTemp0_d(ngpus, nullptr); - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - dim3 block(THR_X_SL, THR_X_BL/THR_X_SL); - dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL)); - - CHECK_CUDA(cudaMalloc(&slinesOffs_d[n], sizeof(*slinesOffs_d[n])*(nseeds_gpu+1))); - CHECK_CUDA(cudaMalloc(&shDirTemp0_d[n], sizeof(*shDirTemp0_d[n])*samplm_nr*grid.x*block.y)); - } - - int n32dimt = ((dimt+31)/32)*32; - - size_t shSizeGNS; - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - if (nseeds_gpu == 0) continue; - dim3 block(THR_X_SL, THR_X_BL/THR_X_SL); - dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL)); - - // Precompute number of streamlines before allocating memory - if (!((model_type == PTT) || (model_type == PROB))) { - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d - sizeof(int)*samplm_nr; // for peak_directions_d - getNumStreamlinesBoot_k - <<>>( - model_type, - max_angle, - min_signal, - relative_peak_thresh, - min_separation_angle, - rng_seed, - nseeds_gpu, - reinterpret_cast(seeds_d[n]), - dimx, - dimy, - dimz, - dimt, - dataf_d[n], - H_d[n], - R_d[n], - delta_nr, - delta_b_d[n], - delta_q_d[n], - b0s_mask_d[n], - samplm_nr, - sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), - reinterpret_cast(sphere_edges_d[n]), - nedges, - shDirTemp0_d[n], - slinesOffs_d[n]); - } else { - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt; - getNumStreamlinesProb_k - <<>>( - max_angle, - relative_peak_thresh, - min_separation_angle, - rng_seed, - nseeds_gpu, - reinterpret_cast(seeds_d[n]), - dimx, - dimy, - dimz, - dimt, - dataf_d[n], - reinterpret_cast(sphere_vertices_d[n]), - reinterpret_cast(sphere_edges_d[n]), - nedges, - shDirTemp0_d[n], - slinesOffs_d[n]); - } - } - - std::vector slinesOffs_h; - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - //std::vector slinesOffs_h; - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - if (nseeds_gpu == 0) { - nSlines_h[n] = 0; - continue; - } - slinesOffs_h.resize(nseeds_gpu+1); - CHECK_CUDA(cudaMemcpy(slinesOffs_h.data(), slinesOffs_d[n], sizeof(*slinesOffs_h.data())*(nseeds_gpu+1), cudaMemcpyDeviceToHost)); - - int __pval = slinesOffs_h[0]; - slinesOffs_h[0] = 0; - for(int i = 1; i < nseeds_gpu+1; i++) { - const int __cval = slinesOffs_h[i]; - slinesOffs_h[i] = slinesOffs_h[i-1] + __pval; - __pval = __cval; - } - nSlines_h[n] = slinesOffs_h[nseeds_gpu]; - CHECK_CUDA(cudaMemcpy(slinesOffs_d[n], slinesOffs_h.data(), sizeof(*slinesOffs_d[n])*(nseeds_gpu+1), cudaMemcpyHostToDevice)); - } - - std::vector slineSeed_d(ngpus, nullptr); - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - - CHECK_CUDA(cudaMalloc(&slineSeed_d[n], sizeof(*slineSeed_d[n])*nSlines_h[n])); - CHECK_CUDA(cudaMemset(slineSeed_d[n], -1, sizeof(*slineSeed_d[n])*nSlines_h[n])); - - // Allocate/reallocate output arrays if necessary - if (nSlines_h[n] > EXCESS_ALLOC_FACT*nSlines_old_h[n]) { - if(slines_h[n]) cudaFreeHost(slines_h[n]); - if(slinesLen_h[n]) cudaFreeHost(slinesLen_h[n]); - slines_h[n] = nullptr; - slinesLen_h[n] = nullptr; - } - -#ifdef DEBUG - printf("buffer size %zu\n", sizeof(*slines_h[n])*EXCESS_ALLOC_FACT*2*3*MAX_SLINE_LEN*nSlines_h[n]); -#endif - - if (!slines_h[n]) CHECK_CUDA(cudaMallocHost(&slines_h[n], sizeof(*slines_h[n])*EXCESS_ALLOC_FACT*2*3*MAX_SLINE_LEN*nSlines_h[n])); - if (!slinesLen_h[n]) CHECK_CUDA(cudaMallocHost(&slinesLen_h[n], sizeof(*slinesLen_h[n])*EXCESS_ALLOC_FACT*nSlines_h[n])); - } - - //if (nSlines_h) { - - std::vector slineLen_d(ngpus, nullptr); - std::vector sline_d(ngpus, nullptr); - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaMalloc(&slineLen_d[n], sizeof(*slineLen_d[n])*nSlines_h[n])); - - CHECK_CUDA(cudaMalloc(&sline_d[n], sizeof(*sline_d[n])*2*MAX_SLINE_LEN*nSlines_h[n])); - -#if 0 - size_t free_mem, total_mem; - CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem)); - std::cerr << "GPU " << n << ": "; - std::cerr << "GPU Memory Usage before genStreamlinesMerge_k: "; - std::cerr << (total_mem-free_mem)/(1024*1024) << " MiB used, "; - std::cerr << total_mem/(1024*1024) << " MiB total "; - std::cerr << std::endl; -#endif - } - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu)); - if (nseeds_gpu == 0) continue; - dim3 block(THR_X_SL, THR_X_BL/THR_X_SL); - dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL)); -#if 0 - std::cerr << "GPU " << n << ": "; - std::cerr << "Generating " << nSlines_h[n] << " streamlines (from " << nseeds_gpu << " seeds)" << std::endl; -#endif - - //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y); - switch(model_type) { - case OPDT: - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - - case CSA: - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - - case PROB: - // Shared memory requirements are smaller for probabilistic for main run - // than for preliminary run - shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt; - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - - case PTT: - shSizeGNS = 0; // PTT uses exclusively static shared memory - genStreamlinesMerge_k <<>>( - max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle, - rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast(seeds_d[n]), - dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n], - b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n], - reinterpret_cast(sphere_vertices_d[n]), reinterpret_cast(sphere_edges_d[n]), - nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]); - break; - - default: - printf("FATAL: Invalid Model Type.\n"); - break; - } - - CHECK_ERROR("genStreamlinesMerge_k"); - } - - //CHECK_CUDA(cudaDeviceSynchronize()); - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaMemcpyAsync(slines_h[n], - reinterpret_cast(sline_d[n]), - sizeof(*slines_h[n])*2*MAX_SLINE_LEN*nSlines_h[n]*3, - cudaMemcpyDeviceToHost, streams[n])); - CHECK_CUDA(cudaMemcpyAsync(slinesLen_h[n], - slineLen_d[n], - sizeof(*slinesLen_h[n])*nSlines_h[n], - cudaMemcpyDeviceToHost, streams[n])); - - } - //}; - - //#pragma omp parallel for - for (int n = 0; n < ngpus; ++n) { - CHECK_CUDA(cudaSetDevice(n)); - CHECK_CUDA(cudaStreamSynchronize(streams[n])); - CHECK_CUDA(cudaFree(slineSeed_d[n])); - CHECK_CUDA(cudaFree(slinesOffs_d[n])); - CHECK_CUDA(cudaFree(shDirTemp0_d[n])); - CHECK_CUDA(cudaFree(slineLen_d[n])); - CHECK_CUDA(cudaFree(sline_d[n])); - } - -} - -#if 1 -void write_trk(const char *fname, - const /*short*/ int *dims, - const REAL *voxel_size, - const char *voxel_order, - const REAL *vox_to_ras, - const int nsline, - const int *slineLen, - const REAL3 *sline) { - - FILE *fp = fopen(fname, "w"); - if (!fp) { - fprintf(stderr, "Cannot open file %s for writing...\n", fname); - exit(EXIT_FAILURE); - } - - const char ID_STRING[6] = "TRACK"; - short DIM[3] = {1, 1, 1}; - float VOXEL_SIZE[3] = {1.0f, 1.0f, 1.0f}; - float VOX_TO_RAS[4][4] = {{1.0f, 0.0f, 0.0, 0.0f}, - {0.0f, 1.0f, 0.0, 0.0f}, - {0.0f, 0.0f, 1.0, 0.0f}, - {0.0f, 0.0f, 0.0, 1.0f}}; - //const char VOXEL_ORDER[2][4] = {"RAS", "LAS"}; - const float ORIGIN[3] = {0.0f, 0.0f, 0.0f}; - const float IMAGE_ORIENTATION_PATIENT[6] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - const int VERSION = 2; - const int HDR_SIZE = 1000; - - // write header - unsigned char header[1000]; - memset(&header[0], 0, sizeof(header)); - - long long int off = 0; - - memcpy(header, ID_STRING, sizeof(ID_STRING)); - off += sizeof(ID_STRING); - - if (dims) { - DIM[0] = dims[0]; - DIM[1] = dims[1]; - DIM[2] = dims[2]; - } - memcpy(header+off, DIM, sizeof(DIM)); - off += sizeof(DIM); - - if (voxel_size) { - VOXEL_SIZE[0] = (float)voxel_size[0]; - VOXEL_SIZE[1] = (float)voxel_size[1]; - VOXEL_SIZE[2] = (float)voxel_size[2]; - } - memcpy(header+off, VOXEL_SIZE, sizeof(VOXEL_SIZE)); - off += sizeof(VOXEL_SIZE); - - memcpy(header+off, ORIGIN, sizeof(ORIGIN)); - off += sizeof(ORIGIN); - - // skip n_scalaer(2b) + scalar_name(200b) + - // n_properties(2b) + property_name(200b) - off += 404; - - if (vox_to_ras) { - for(int i = 0; i < 4; i++) { - for(int j = 0; j < 4; j++) { - VOX_TO_RAS[i][j] = (float)vox_to_ras[i*4+j]; - } - } - } - memcpy(header+off, VOX_TO_RAS, sizeof(VOX_TO_RAS)); - off += sizeof(VOX_TO_RAS); - - // skip reserved(444b) - off += 444; - - if (voxel_order) { - memcpy(header+off, voxel_order, 4); - } else { - memcpy(header+off, "LAS", 4); - } - off += 4; //sizeof(VOXEL_ORDER[voxel_order]); - - // skip pad2(4b) - off += 4; - - memcpy(header+off, IMAGE_ORIENTATION_PATIENT, sizeof(IMAGE_ORIENTATION_PATIENT)); - off += sizeof(IMAGE_ORIENTATION_PATIENT); - - // skip pad1(2b) - off += 2; - - // skip invert_x(1b), invert_y(1b), invert_x(1b), swap_xy(1b), swap_yz(1b), swap_zx(1b) - off += 6; - - memcpy(header+off, &nsline, sizeof(int)); - off += sizeof(int); - - memcpy(header+off, &VERSION, sizeof(VERSION)); - off += sizeof(VERSION); - - memcpy(header+off, &HDR_SIZE, sizeof(HDR_SIZE)); - off += sizeof(HDR_SIZE); - - //assert(off == 1000); - if (off != 1000) { - fprintf(stderr, "%s:%s:%d: heder size = %lld, (!= 1000)!\n", __FILE__, __func__, __LINE__, off); - exit(EXIT_FAILURE); - } - - size_t nw = fwrite(header, sizeof(header), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } -#if 0 - // write body - long long maxSlineLen = slineLen[0]; - for(long long i = 1; i < nsline; i++) { - maxSlineLen = MAX(maxSlineLen, slineLen[i]); - } - - float *slineData = (float *)Malloc((1+3*maxSlineLen)*sizeof(*slineData)); -#else - float slineData[1 + 3*(2*MAX_SLINE_LEN)]; -#endif - for(int i = 0; i < nsline; i++) { - reinterpret_cast(slineData)[0] = slineLen[i]; - for(int j = 0; j < slineLen[i]; j++) { - slineData[1+3*j+0] = (float)((sline[i*2*MAX_SLINE_LEN + j].x+0.5)*VOXEL_SIZE[0]); - slineData[1+3*j+1] = (float)((sline[i*2*MAX_SLINE_LEN + j].y+0.5)*VOXEL_SIZE[1]); - slineData[1+3*j+2] = (float)((sline[i*2*MAX_SLINE_LEN + j].z+0.5)*VOXEL_SIZE[2]); - } - nw = fwrite(slineData, (1+3*slineLen[i])*sizeof(*slineData), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } - } -#if 0 - free(slineData); -#endif - fclose(fp); - - return; -} -#else -void write_trk(const int num_threads, - const char *fname, - const /*short*/ int *dims, - const REAL *voxel_size, - const char *voxel_order, - const REAL *vox_to_ras, - const int nsline, - const int *slineLen, - const REAL3 *sline) { - - FILE *fp = fopen(fname, "w"); - if (!fp) { - fprintf(stderr, "Cannot open file %s for writing...\n", fname); - exit(EXIT_FAILURE); - } - - const char ID_STRING[6] = "TRACK"; - short DIM[3] = {1, 1, 1}; - float VOXEL_SIZE[3] = {1.0f, 1.0f, 1.0f}; - float VOX_TO_RAS[4][4] = {{1.0f, 0.0f, 0.0, 0.0f}, - {0.0f, 1.0f, 0.0, 0.0f}, - {0.0f, 0.0f, 1.0, 0.0f}, - {0.0f, 0.0f, 0.0, 1.0f}}; - //const char VOXEL_ORDER[2][4] = {"RAS", "LAS"}; - const float ORIGIN[3] = {0.0f, 0.0f, 0.0f}; - const float IMAGE_ORIENTATION_PATIENT[6] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - const int VERSION = 2; - const int HDR_SIZE = 1000; - - // write header - unsigned char header[1000]; - memset(&header[0], 0, sizeof(header)); - - long long int off = 0; - - memcpy(header, ID_STRING, sizeof(ID_STRING)); - off += sizeof(ID_STRING); - - if (dims) { - DIM[0] = dims[0]; - DIM[1] = dims[1]; - DIM[2] = dims[2]; - } - memcpy(header+off, DIM, sizeof(DIM)); - off += sizeof(DIM); - - if (voxel_size) { - VOXEL_SIZE[0] = (float)voxel_size[0]; - VOXEL_SIZE[1] = (float)voxel_size[1]; - VOXEL_SIZE[2] = (float)voxel_size[2]; - } - memcpy(header+off, VOXEL_SIZE, sizeof(VOXEL_SIZE)); - off += sizeof(VOXEL_SIZE); - - memcpy(header+off, ORIGIN, sizeof(ORIGIN)); - off += sizeof(ORIGIN); - - // skip n_scalaer(2b) + scalar_name(200b) + - // n_properties(2b) + property_name(200b) - off += 404; - - if (vox_to_ras) { - for(int i = 0; i < 4; i++) { - for(int j = 0; j < 4; j++) { - VOX_TO_RAS[i][j] = (float)vox_to_ras[i*4+j]; - } - } - } - memcpy(header+off, VOX_TO_RAS, sizeof(VOX_TO_RAS)); - off += sizeof(VOX_TO_RAS); - - // skip reserved(444b) - off += 444; - - if (voxel_order) { - memcpy(header+off, voxel_order, 4); - } else { - memcpy(header+off, "LAS", 4); - } - off += 4; //sizeof(VOXEL_ORDER[voxel_order]); - - // skip pad2(4b) - off += 4; - - memcpy(header+off, IMAGE_ORIENTATION_PATIENT, sizeof(IMAGE_ORIENTATION_PATIENT)); - off += sizeof(IMAGE_ORIENTATION_PATIENT); - - // skip pad1(2b) - off += 2; - - // skip invert_x(1b), invert_y(1b), invert_x(1b), swap_xy(1b), swap_yz(1b), swap_zx(1b) - off += 6; - - memcpy(header+off, &nsline, sizeof(int)); - off += sizeof(int); - - memcpy(header+off, &VERSION, sizeof(VERSION)); - off += sizeof(VERSION); - - memcpy(header+off, &HDR_SIZE, sizeof(HDR_SIZE)); - off += sizeof(HDR_SIZE); - - //assert(off == 1000); - if (off != 1000) { - fprintf(stderr, "%s:%s:%d: heder size = %lld, (!= 1000)!\n", __FILE__, __func__, __LINE__, off); - exit(EXIT_FAILURE); - } - - size_t nw = fwrite(header, sizeof(header), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } - - // write body - long long maxSlineLen = slineLen[0]; - for(long long i = 1; i < nsline; i++) { - maxSlineLen = MAX(maxSlineLen, slineLen[i]); - } - - //omp_set_dynamic(0); - const int NTHREADS = num_threads > 0 ? num_threads : 1; - omp_set_num_threads(NTHREADS); - - const int NFLTS_PER_TH = 1 + 2*(3*MAX_SLINE_LEN); - float *slineData = (float *)Malloc(NFLTS_PER_TH*NTHREADS*sizeof(*slineData)); - - #pragma omp parallel - { - const int tid = omp_get_thread_num(); - float *__mySlineData = slineData+tid*NFLTS_PER_TH; -#if 1 - //#pragma omp for schedule(static) - for(int i = 0; i < nsline; i += NTHREADS) { - if (i+tid < nsline) { - reinterpret_cast(__mySlineData)[0] = slineLen[i+tid]; - for(int j = 0; j < slineLen[i+tid]; j++) { - __mySlineData[1+3*j+0] = (float)((sline[(i+tid)*2*MAX_SLINE_LEN + j].x+0.5)*VOXEL_SIZE[0]); - __mySlineData[1+3*j+1] = (float)((sline[(i+tid)*2*MAX_SLINE_LEN + j].y+0.5)*VOXEL_SIZE[1]); - __mySlineData[1+3*j+2] = (float)((sline[(i+tid)*2*MAX_SLINE_LEN + j].z+0.5)*VOXEL_SIZE[2]); - } - } - #pragma omp barrier - if (tid == 0) { - for(int j = 0; j < NTHREADS; j++) { - if (i+j >= nsline) { - break; - } - nw = fwrite(slineData+j*NFLTS_PER_TH, (1+3*slineLen[i+j])*sizeof(*slineData), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } - } - } - #pragma omp barrier - } -#else - // streamlines are not required to be in any specific order inside the trk file... - #pragma omp for - for(int i = 0; i < nsline; i++) { - reinterpret_cast(__mySlineData)[0] = slineLen[i]; - for(int j = 0; j < slineLen[i]; j++) { - __mySlineData[1+3*j+0] = (float)((sline[i*2*MAX_SLINE_LEN + j].x+0.5)*VOXEL_SIZE[0]); - __mySlineData[1+3*j+1] = (float)((sline[i*2*MAX_SLINE_LEN + j].y+0.5)*VOXEL_SIZE[1]); - __mySlineData[1+3*j+2] = (float)((sline[i*2*MAX_SLINE_LEN + j].z+0.5)*VOXEL_SIZE[2]); - } - nw = fwrite(__mySlineData, (1+3*slineLen[i])*sizeof(*__mySlineData), 1, fp); - if (nw != 1) { - fprintf(stderr, "Error while writing to file!\n"); - exit(EXIT_FAILURE); - } - } -#endif - } - free(slineData); - fclose(fp); - - return; -} -#endif diff --git a/cuslines/generate_streamlines_cuda.h b/cuslines/generate_streamlines_cuda.h deleted file mode 100644 index 14105ce..0000000 --- a/cuslines/generate_streamlines_cuda.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __GENERATE_STREAMLINES_CUDA_H__ -#define __GENERATE_STREAMLINES_CUDA_H__ - -#include - -#include "globals.h" - -void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size, - const REAL relative_peak_thresh, const REAL min_separation_angle, - const int nseeds, const std::vector &seeds_d, - const int dimx, const int dimy, const int dimz, const int dimt, - const std::vector &dataf_d, const std::vector &H_d, const std::vector &R_d, - const int delta_nr, - const std::vector &delta_b_d, const std::vector &delta_q_d, - const std::vector &b0s_mask_d, const std::vector &metric_map_d, - const int samplm_nr, - const std::vector &sampling_matrix_d, - const std::vector &sphere_vertices_d, const std::vector &sphere_edges_d, const int nedges, - std::vector &slines_h, std::vector &slinesLen_h, std::vector &nSlines_h, - const std::vector nSlines_old_h, const int rng_seed, const int rng_offset, - const int ngpus, const std::vector &streams); -#if 1 -void write_trk(const char *fname, - const /*short*/ int *dims, - const REAL *voxel_size, - const char *voxel_order, - const REAL *vox_to_ras, - const int nsline, - const int *slineLen, - const REAL3 *sline); -#else -void write_trk(const int num_threads, - const char *fname, - const /*short*/ int *dims, - const REAL *voxel_size, - const char *voxel_order, - const REAL *vox_to_ras, - const int nsline, - const int *slineLen, - const REAL3 *sline); -#endif -#endif diff --git a/cuslines/utils.cu b/cuslines/utils.cu deleted file mode 100644 index c7fe47f..0000000 --- a/cuslines/utils.cu +++ /dev/null @@ -1,143 +0,0 @@ - -template -__device__ void prefix_sum_sh_d(REAL_T *num_sh, int __len) { - const int tidx = threadIdx.x; - - const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32; - const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1))); - -#if 0 - // for debugging - __syncwarp(WMASK); - if (tidx == 0) { - for (int j = 1; j < __len; j++) { - num_sh[j] += num_sh[j-1]; - } - } - __syncwarp(WMASK); -#else - for (int j = 0; j < __len; j += BDIM_X) { - if ((tidx == 0) && (j != 0)) { - num_sh[j] += num_sh[j-1]; - } - __syncwarp(WMASK); - - REAL_T __t_pmf; - if (j+tidx < __len) { - __t_pmf = num_sh[j+tidx]; - } - for (int i = 1; i < BDIM_X; i*=2) { - REAL_T __tmp = __shfl_up_sync(WMASK, __t_pmf, i, BDIM_X); - if ((tidx >= i) && (j+tidx < __len)) { - __t_pmf += __tmp; - } - } - if (j+tidx < __len) { - num_sh[j+tidx] = __t_pmf; - } - __syncwarp(WMASK); - } -#endif -} - -template -__device__ void printArrayAlways(const char *name, int ncol, int n, REAL_T *arr) { - printf("%s:\n", name); - - for(int j = 0; j < n; j++) { - if ((j%ncol)==0) printf("\n"); - printf("%10.8f ", arr[j]); - } - printf("\n"); -} - -template -__device__ void printArray(const char *name, int ncol, int n, REAL_T *arr) { - if (!threadIdx.x && !threadIdx.y && !blockIdx.x) { - printArrayAlways(name, ncol, n, arr); - } -} - -template -__device__ REAL_T interpolation_helper_d(const REAL_T* dataf, const REAL_T wgh[3][2], const long long coo[3][2], int dimy, int dimz, int dimt, int t) { - REAL_T __tmp = 0; - #pragma unroll - for (int i = 0; i < 2; i++) { - #pragma unroll - for (int j = 0; j < 2; j++) { - #pragma unroll - for (int k = 0; k < 2; k++) { - __tmp += wgh[0][i] * wgh[1][j] * wgh[2][k] * - dataf[coo[0][i] * dimy * dimz * dimt + - coo[1][j] * dimz * dimt + - coo[2][k] * dimt + - t]; - } - } - } - return __tmp; -} - -template -__device__ int trilinear_interp_d(const int dimx, - const int dimy, - const int dimz, - const int dimt, - int dimt_idx, // If -1, get all - const REAL_T *__restrict__ dataf, - const REAL3_T point, - REAL_T *__restrict__ __vox_data) { - const REAL_T HALF = static_cast(0.5); - - // all thr compute the same here - if (point.x < -HALF || point.x+HALF >= dimx || - point.y < -HALF || point.y+HALF >= dimy || - point.z < -HALF || point.z+HALF >= dimz) { - return -1; - } - - long long coo[3][2]; - REAL_T wgh[3][2]; // could use just one... - - const REAL_T ONE = static_cast(1.0); - - const REAL3_T fl = MAKE_REAL3(FLOOR(point.x), - FLOOR(point.y), - FLOOR(point.z)); - - wgh[0][1] = point.x - fl.x; - wgh[0][0] = ONE-wgh[0][1]; - coo[0][0] = MAX(0, fl.x); - coo[0][1] = MIN(dimx-1, coo[0][0]+1); - - wgh[1][1] = point.y - fl.y; - wgh[1][0] = ONE-wgh[1][1]; - coo[1][0] = MAX(0, fl.y); - coo[1][1] = MIN(dimy-1, coo[1][0]+1); - - wgh[2][1] = point.z - fl.z; - wgh[2][0] = ONE-wgh[2][1]; - coo[2][0] = MAX(0, fl.z); - coo[2][1] = MIN(dimz-1, coo[2][0]+1); - - if (dimt_idx == -1) { - for (int t = threadIdx.x; t < dimt; t += BDIM_X) { - __vox_data[t] = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, t); - } - } else { - *__vox_data = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, dimt_idx); - } - - /* - __syncwarp(WMASK); - if (tidx == 0 && threadIdx.y == 0) { - printf("point: %f, %f, %f\n", point.x, point.y, point.z); - for(int i = 0; i < dimt; i++) { - printf("__vox_data[%d]: %f\n", i, __vox_data[i]); - } - } - */ - return 0; -} diff --git a/merge_trk.sh b/merge_trk.sh deleted file mode 100755 index c47412f..0000000 --- a/merge_trk.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -usage() { - echo "$(basename $0) [ -v ] -o trk_outfile ..." -} - -if [ $# -eq 0 ]; then - usage - exit 1 -fi - -OUT_FILE="" -VERBOSE="" - -OPTS=$(getopt -o "vho:" -- "$@") -eval set -- "$OPTS" - -while true; do - case "$1" in - -o) - OUT_FILE=$2 - shift - shift - ;; - -v) - VERBOSE="1" - shift - ;; - -h) - usage - exit 1 - ;; - --) - shift - break - ;; - esac -done - -if [ -z $OUT_FILE ]; then - echo "Please provide an output file name with the -o option!" - exit 1 -fi - -# necessary when running via docker to expand again -# the parameter list turning spaces into separators -set -- $* - -TRK_FILES=("$@") -NTRKF=$(($#)) - -#echo $TRK_FILES -#echo $NTRKF - -if [ $VERBOSE ]; then - echo "Merging $NTRKF files into $OUT_FILE..." -fi - -head -c1000 ${TRK_FILES[0]} > $OUT_FILE - -NTRACK=0 -for((i=0; i<$NTRKF; i++)); do - if [ $VERBOSE ]; then - printf "%8d/%8d\r" $i $NTRKF - fi - NTRACK=$(($NTRACK + $(od -A none -t dI -j 988 -N4 ${TRK_FILES[$i]}))); - tail -c+1001 ${TRK_FILES[$i]} >> $OUT_FILE -done - -NTRACK=$(printf "%08X" $NTRACK) - -printf "\x${NTRACK:6:2}\x${NTRACK:4:2}\x${NTRACK:2:2}\x${NTRACK:0:2}" | dd of=$OUT_FILE bs=1 seek=988 count=4 conv=notrunc &> /dev/null diff --git a/pyproject.toml b/pyproject.toml index 7ad8645..67877c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -requires = ["scikit-build-core", "pybind11"] -build-backend = "scikit_build_core.build" +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" [project] name = "cuslines" @@ -10,8 +10,15 @@ readme = "README.md" requires-python = ">=3.7" dependencies = [ "numpy", - "pybind11" + "nibabel", + "tqdm", + "dipy", + "trx-python", + "cuda-python", + "cuda-core", + "cuda-cccl" ] -[tool.scikit-build] -cmake.build-type = "Release" +[tool.setuptools.packages.find] +where = ["."] +include = ["cuslines*"] diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py index e627978..0d6c447 100644 --- a/run_gpu_streamlines.py +++ b/run_gpu_streamlines.py @@ -30,34 +30,38 @@ import argparse import random import time -import zipfile import numpy as np -import numpy.linalg as npl import dipy.reconst.dti as dti from dipy.io import read_bvals_bvecs -from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram +from dipy.io.stateful_tractogram import Space, StatefulTractogram from dipy.io.streamline import save_tractogram from dipy.tracking import utils from dipy.core.gradients import gradient_table, unique_bvals_magnitude from dipy.data import default_sphere -from dipy.direction import (BootDirectionGetter, ProbabilisticDirectionGetter, PTTDirectionGetter) +from dipy.direction import ( + BootDirectionGetter as cpu_BootDirectionGetter, + ProbabilisticDirectionGetter as cpu_ProbDirectionGetter, + PTTDirectionGetter as cpu_PTTDirectionGetter) from dipy.reconst.shm import OpdtModel, CsaOdfModel from dipy.reconst.csdeconv import ConstrainedSphericalDeconvModel, auto_response_ssst from dipy.tracking.local_tracking import LocalTracking from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion -from dipy.reconst import shm from dipy.data import get_fnames from dipy.data import read_stanford_pve_maps import nibabel as nib from nibabel.orientations import aff2axcodes -from trx.trx_file_memmap import TrxFile, zip_from_folder +from trx.io import save as save_trx -# Import custom module -import cuslines +from cuslines import ( + BootDirectionGetter, + GPUTracker, + ProbDirectionGetter, + PttDirectionGetter, +) t0 = time.time() @@ -68,7 +72,7 @@ #Get Gradient values def get_gtab(fbval, fbvec): bvals, bvecs = read_bvals_bvecs(fbval, fbvec) - gtab = gradient_table(bvals, bvecs) + gtab = gradient_table(bvals=bvals, bvecs=bvecs) return gtab def get_img(ep2_seq): @@ -87,7 +91,7 @@ def get_img(ep2_seq): parser.add_argument("--chunk-size", type=int, default=100000, help="how many seeds to process per sweep, per GPU") parser.add_argument("--nseeds", type=int, default=100000, help="how many seeds to process in total") parser.add_argument("--ngpus", type=int, default=1, help="number of GPUs to use if using gpu") -parser.add_argument("--write-method", type=str, default="fast", help="Can be trx, fast, or standard") +parser.add_argument("--write-method", type=str, default="trk", help="Can be trx or trk") parser.add_argument("--max-angle", type=float, default=60, help="max angle (in degrees)") parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0") parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5") @@ -101,9 +105,9 @@ def get_img(ep2_seq): args = parser.parse_args() -if args.device == "cpu" and args.write_method != "standard": - print("WARNING: only standard write method is implemented for cpu testing.") - write_method = "standard" +if args.device == "cpu" and args.write_method != "trk": + print("WARNING: only trk write method is implemented for cpu testing.") + write_method = "trk" else: write_method = args.write_method @@ -111,7 +115,8 @@ def get_img(ep2_seq): if not all(arg == 'hardi' for arg in [args.nifti_file, args.bvals, args.bvecs, args.mask_nifti, args.roi_nifti]): raise ValueError("If any of the arguments is 'hardi', all must be 'hardi'") # Get Stanford HARDI data - hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames('stanford_hardi') + hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames( + name='stanford_hardi') csf, gm, wm = read_stanford_pve_maps() wm_data = wm.get_fdata() @@ -135,18 +140,14 @@ def get_img(ep2_seq): tenmodel = dti.TensorModel(gtab, fit_method='WLS') print('Fitting Tensor') -tenfit = tenmodel.fit(data, mask) +tenfit = tenmodel.fit(data, mask=mask) print('Computing anisotropy measures (FA,MD,RGB)') FA = tenfit.fa -FA[np.isnan(FA)] = 0 # Setup tissue_classifier args tissue_classifier = ThresholdStoppingCriterion(FA, args.fa_threshold) -metric_map = np.asarray(FA, 'float64') # Create seeds for ROI -# seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4)) -# seed_mask = seed_mask[0:args.nseeds] seed_mask = np.asarray(utils.random_seeds_from_mask( roi_data, seeds_count=args.nseeds, seed_count_per_voxel=False, @@ -155,20 +156,27 @@ def get_img(ep2_seq): # Setup model sphere = default_sphere if args.model == "opdt": - model_type = cuslines.ModelType.OPDT - print("Running OPDT model...") - model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal) - fit_matrix = model._fit_matrix - delta_b, delta_q = fit_matrix + if args.device == "cpu": + model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal) + dg = cpu_BootDirectionGetter + else: + dg = BootDirectionGetter.from_dipy_opdt( + gtab, + sphere, + sh_order_max=args.sh_order, + sh_lambda=args.sm_lambda, + min_signal=args.min_signal) elif args.model == "csa": - model_type = cuslines.ModelType.CSA - print("Running CSA model...") - model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal) - fit_matrix = model._fit_matrix - # Unlike OPDT, CSA has a single matrix used for fit_matrix. Populating delta_b and delta_q with necessary values for - # now. - delta_b = fit_matrix - delta_q = fit_matrix + if args.device == "cpu": + model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal) + dg = cpu_BootDirectionGetter + else: + dg = BootDirectionGetter.from_dipy_csa( + gtab, + sphere, + sh_order_max=args.sh_order, + sh_lambda=args.sm_lambda, + min_signal=args.min_signal) else: print("Running CSD model...") unique_bvals = unique_bvals_magnitude(gtab.bvals) @@ -186,158 +194,67 @@ def get_img(ep2_seq): roi_radii=10, fa_thr=0.7) model = ConstrainedSphericalDeconvModel(gtab, response, sh_order=args.sh_order) - # TODO: we shouldnt have to do this, also for CSA, but we populate delta_b, delta_q. - # we need to name change delta_b/delta_q and make it possible for them to be None, or something like this - delta_b = model._X - delta_q = model.B_reg - -if args.dg != "boot": - if args.dg == "prob": - model_type = cuslines.ModelType.PROB - dg = ProbabilisticDirectionGetter - else: - model_type = cuslines.ModelType.PTT - dg = PTTDirectionGetter - fit = model.fit(data, mask=(metric_map >= args.fa_threshold)) + fit = model.fit(data, mask=(FA >= args.fa_threshold)) data = fit.odf(sphere).clip(min=0) -else: - dg = BootDirectionGetter - -global_chunk_size = args.chunk_size + if args.dg == "ptt": + if args.device == "cpu": + dg = cpu_PTTDirectionGetter() + else: + # Set FOD to 0 outside mask for probing + data[FA < args.fa_threshold, :] = 0 + dg = PttDirectionGetter() + elif args.dg == "prob": + if args.device == "cpu": + dg = cpu_ProbDirectionGetter() + else: + dg = ProbDirectionGetter() + else: + raise ValueError("Unknown direction getter type: {}".format(args.dg)) # Setup direction getter args if args.device == "cpu": if args.dg != "boot": dg = dg.from_pmf(data, max_angle=args.max_angle, sphere=sphere, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle) else: - dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle) -else: - # Setup direction getter args - b0s_mask = gtab.b0s_mask - dwi_mask = ~b0s_mask - - # setup sampling matrix - theta = sphere.theta - phi = sphere.phi - sampling_matrix, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi) - - ## from BootPmfGen __init__ - # setup H and R matrices - # TODO: figure out how to get H, R matrices from direction getter object - x, y, z = model.gtab.gradients[dwi_mask].T - r, theta, phi = shm.cart2sphere(x, y, z) - B, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi) - H = shm.hat(B) - R = shm.lcr_matrix(H) - - # create floating point copy of data - dataf = np.asarray(data, dtype=np.float64) - - gpu_tracker = cuslines.GPUTracker(model_type, - args.max_angle * np.pi/180, - args.min_signal, - args.fa_threshold, - args.step_size, - args.relative_peak_threshold, - args.min_separation_angle * np.pi/180, - dataf.astype(np.float64), H.astype(np.float64), R.astype(np.float64), delta_b.astype(np.float64), delta_q.astype(np.float64), - b0s_mask.astype(np.int32), metric_map.astype(np.float64), sampling_matrix.astype(np.float64), - sphere.vertices.astype(np.float64), sphere.edges.astype(np.int32), - ngpus=args.ngpus, rng_seed=0) - -print('streamline gen') -nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size - -t1 = time.time() -streamline_time = 0 -io_time = 0 - -if args.output_prefix and write_method == "trx": - # Will resize by a factor of 2 if these are exceeded - sl_len_guess = 100 - sl_per_seed_guess = 3 - n_sls_guess = sl_per_seed_guess*len(seed_mask) - - # trx files use memory mapping - trx_file = TrxFile( - reference=hardi_nifti_fname, - nb_streamlines=n_sls_guess, - nb_vertices=n_sls_guess*sl_len_guess) - offsets_idx = 0 - sls_data_idx = 0 - -for idx in range(int(nchunks)): - # Main streamline computation - ts = time.time() - if args.device == "cpu": - streamline_generator = LocalTracking(dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=args.step_size) - streamlines = [s for s in streamline_generator] - else: - streamlines = gpu_tracker.generate_streamlines(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size]) - te = time.time() - streamline_time += (te-ts) - print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines), - seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0], - te-ts)) + dg = dg.from_data(data, model, max_angle=args.max_angle, sphere=sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle) - # Save tracklines file - if args.output_prefix: ts = time.time() - if write_method == "standard": - fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks) - sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX) - save_tractogram(sft, fname) - te = time.time() - print("Saved streamlines to {}, time {} s".format(fname, te-ts)) - elif write_method == "trx": - tractogram = nib.streamlines.Tractogram(streamlines, affine_to_rasmm=img.affine) - tractogram.to_world() - sls = tractogram.streamlines - - new_offsets_idx = offsets_idx + len(sls._offsets) - new_sls_data_idx = sls_data_idx + len(sls._data) - - if new_offsets_idx > trx_file.header["NB_STREAMLINES"]\ - or new_sls_data_idx > trx_file.header["NB_VERTICES"]: - print("TRX resizing...") - trx_file.resize(nb_streamlines=new_offsets_idx*2, nb_vertices=new_sls_data_idx*2) - - # TRX uses memmaps here - trx_file.streamlines._data[sls_data_idx:new_sls_data_idx] = sls._data - trx_file.streamlines._offsets[offsets_idx:new_offsets_idx] = offsets_idx + sls._offsets - trx_file.streamlines._lengths[offsets_idx:new_offsets_idx] = sls._lengths - - offsets_idx = new_offsets_idx - sls_data_idx = new_sls_data_idx - - te = time.time() - print("Streamlines to TRX format, time {} s".format(te-ts)) - else: - fname = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks) - gpu_tracker.dump_streamlines(fname, voxel_order, wm.shape, wm.header.get_zooms(), img.affine) - te = time.time() - print("Saved streamlines to {}, time {} s".format(fname, te-ts)) - - io_time += (te-ts) - -if args.output_prefix and write_method == "trx": - ts = time.time() - fname = "{}.trx".format(args.output_prefix) - trx_file.resize() - zip_from_folder( - trx_file._uncompressed_folder_handle.name, - fname, - zipfile.ZIP_STORED) - trx_file.close() - te = time.time() - print("Saved streamlines to {}, time {} s".format(fname, te-ts)) - io_time += (te-ts) - -t2 = time.time() + streamline_generator = LocalTracking(dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=args.step_size) + sft = StatefulTractogram(streamline_generator, img, Space.VOX) + n_sls = len(sft.streamlines) + te = time.time() +else: + with GPUTracker( + dg, + data, + FA, + args.fa_threshold, + sphere.vertices, + sphere.edges, + max_angle=args.max_angle * np.pi/180, + step_size=args.step_size, + relative_peak_thresh=args.relative_peak_threshold, + min_separation_angle=args.min_separation_angle * np.pi/180, + ngpus=args.ngpus, + rng_seed=0, + chunk_size=args.chunk_size + ) as gpu_tracker: + ts = time.time() + if args.output_prefix and write_method == "trx": + trx_file = gpu_tracker.generate_trx(seed_mask, img) + n_sls = len(trx_file.streamlines) + else: + sft = gpu_tracker.generate_sft(seed_mask, img) + n_sls = len(sft.streamlines) + te = time.time() +print("Generated {} streamlines from {} seeds, time: {} s".format(n_sls, + seed_mask.shape[0], + te-ts)) -print("Completed processing {} seeds.".format(seed_mask.shape[0])) -print("Initialization time: {} sec".format(t1-t0)) -print("Streamline generation total time: {} sec".format(t2-t1)) -print("\tStreamline processing: {} sec".format(streamline_time)) if args.output_prefix: - print("\tFile writing: {} sec".format(io_time)) + if write_method == "trx": + fname = "{}.trx".format(args.output_prefix) + save_trx(trx_file, fname) + else: + fname = "{}.trk".format(args.output_prefix) + save_tractogram(sft, fname) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..ae15d6a --- /dev/null +++ b/setup.py @@ -0,0 +1,46 @@ +from setuptools import setup +from setuptools.command.build_py import build_py +from pathlib import Path +import re + + +def defines_to_python(src, dst): + src = Path(src) + dst = Path(dst) + + INT_DEFINE = re.compile( + r"#define\s+(\w+)\s+\(?\s*([0-9]+)\s*\)?" + ) + + REAL_CAST_DEFINE = re.compile( + r"#define\s+(\w+)\s+\(\(REAL\)\s*([0-9eE\.\+\-]+)\s*\)" + ) + + defines = {} + + for line in src.read_text().splitlines(): + if m := INT_DEFINE.match(line): + defines[m.group(1)] = int(m.group(2)) + elif m := REAL_CAST_DEFINE.match(line): + defines[m.group(1)] = float(m.group(2)) + + dst.parent.mkdir(parents=True, exist_ok=True) + + with dst.open("w") as f: + f.write("# AUTO-GENERATED FROM globals.h — DO NOT EDIT\n\n") + for k, v in sorted(defines.items()): + f.write(f"{k} = {v}\n") + +class build_py_with_cuda(build_py): + def run(self): + root = Path(__file__).parent + + globals_src = str(root / "cuslines" / "globals.h") + globals_dst = str(root / "cuslines" / "cuda_python" / "_globals.py") + defines_to_python(globals_src, globals_dst) + + super().run() + +setup( + cmdclass={"build_py": build_py_with_cuda}, +)