diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..78bb5e2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+# Python bytecode
+**/*.pyc
+**/__pycache__/
+*.pyo
+*.pyd
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index f27d490..0000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-cmake_minimum_required(VERSION 3.24)
-
-project(cuslines LANGUAGES CUDA CXX VERSION 1.0)
-
-# Build settings
-set(CMAKE_CXX_STANDARD 11)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -Werror=reorder")
-set(CMAKE_CXX_FLAGS_RELEASE "-O3")
-
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "Release")
-endif()
-
-if (CMAKE_BUILD_TYPE STREQUAL "Debug" )
-  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}")
-else()
-  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}")
-endif()
-
-# CUDA
-find_package(CUDAToolkit REQUIRED)
-
-# Set default CUDA compute capabilities if unset
-if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-  include(FindCUDA/select_compute_arch.cmake)
-  cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS Auto)
-  set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_FLAGS})
-endif()
-message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-
-# OpenMP
-find_package(OpenMP)
-if(OPENMP_FOUND)
-    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
-    
-    # Set OMP runtime based on compiler
-    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-      set(OMP_RUNTIME "INTEL")
-    elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-      set(OMP_RUNTIME "GNU")
-    elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
-      set(OMP_RUNTIME "INTEL")
-    endif()
-    message(STATUS "OpenMP runtime used: ${OMP_RUNTIME}")
-endif()
-
-# Find pybind11
-execute_process(COMMAND python -c "import pybind11; print(pybind11.get_cmake_dir())"
-                OUTPUT_VARIABLE pybind11_DIR
-                OUTPUT_STRIP_TRAILING_WHITESPACE)
-list(APPEND CMAKE_PREFIX_PATH ${pybind11_DIR})
-find_package(pybind11 REQUIRED)
-
-# Build library and pybind11 module
-add_library(cuslines_kernels)
-target_sources(cuslines_kernels
-	       PRIVATE
-	       ${CMAKE_SOURCE_DIR}/cuslines/generate_streamlines_cuda.cu)
-set_target_properties(cuslines_kernels PROPERTIES OUTPUT_NAME cuslines_kernels
-                                                  POSITION_INDEPENDENT_CODE TRUE)
-
-pybind11_add_module(cuslines ${CMAKE_SOURCE_DIR}/cuslines/cuslines.cpp)
-target_include_directories(cuslines PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" "${CUDAToolkit_INCLUDE_DIRS}")
-target_link_libraries(cuslines PRIVATE cuslines_kernels CUDA::cudart_static)
-
-# Install
-install(TARGETS cuslines cuslines_kernels LIBRARY DESTINATION .)
diff --git a/Dockerfile b/Dockerfile
index 06e9de9..889371d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -16,17 +16,20 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.0/cmake-3.24.0
     && mkdir /opt/cmake \
     && /tmp/cmake-install.sh --skip-license --prefix=/opt/cmake \
     && rm /tmp/cmake-install.sh
-ENV PATH /opt/cmake/bin:${PATH}
+ENV PATH=/opt/cmake/bin:${PATH}
 
 RUN curl -L "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" \
     -o "/tmp/Miniconda3.sh"
 RUN bash /tmp/Miniconda3.sh -b -p /opt/anaconda
 RUN rm -rf /tmp/Miniconda3.sh
 RUN cd /opt && eval "$(/opt/anaconda/bin/conda shell.bash hook)"
-ENV PATH /opt/anaconda/bin:${PATH}
-ENV LD_LIBRARY_PATH /opt/anaconda/lib:${LD_LIBRARY_PATH}
+ENV PATH=/opt/anaconda/bin:${PATH}
+ENV LD_LIBRARY_PATH=/opt/anaconda/lib:${LD_LIBRARY_PATH}
 
 # python prereqs
+RUN conda tos accept --override-channels --channel conda-forge
+RUN conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main
+RUN conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
 RUN conda install -c conda-forge git
 RUN pip install numpy>=2.0.0
 RUN pip install scipy>=1.13.0 cython nibabel dipy tqdm
diff --git a/README.md b/README.md
index 5cda7f5..9e0275d 100644
--- a/README.md
+++ b/README.md
@@ -48,63 +48,7 @@ Destroy GPUTracker...
 
 Note that if you experience memory errors, you can adjust the `--chunk-size` flag.
 
-To run on more seeds, we suggest enabling the `--use-fast-write` flag in the GPU script to not get bottlenecked by writing files. Here is a comparison running on 500K seeds on 1 GPU with and without this flag:
-
-Without `--use-fast-write`:
-```
-$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1
-parsing arguments
-Fitting Tensor
-Computing anisotropy measures (FA,MD,RGB)
-slowadcodf
-Bootstrap direction getter
-streamline gen
-Creating GPUTracker with 1 GPUs...
-Generated 143891 streamlines from 100000 seeds, time: 7.978902339935303 s
-Saved streamlines to small.1_5.trk, time 11.439777851104736 s
-Generated 151932 streamlines from 100000 seeds, time: 10.155118703842163 s
-Saved streamlines to small.2_5.trk, time 12.438884019851685 s
-Generated 146971 streamlines from 100000 seeds, time: 9.822870016098022 s
-Saved streamlines to small.3_5.trk, time 12.377111673355103 s
-Generated 153824 streamlines from 100000 seeds, time: 11.133368968963623 s
-Saved streamlines to small.4_5.trk, time 13.317519187927246 s
-Generated 162004 streamlines from 100000 seeds, time: 13.19784665107727 s
-Saved streamlines to small.5_5.trk, time 14.21276593208313 s
-Completed processing 500000 seeds.
-Initialization time: 14.789637088775635 sec
-Streamline generation total time: 116.0746865272522 sec
-        Streamline processing: 52.28810667991638 sec
-        File writing: 63.7860586643219 sec
-Destroy GPUTracker...
-```
-
-With `--use-fast-write`:
-```
-$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1 --use-fast-write
-parsing arguments
-Fitting Tensor
-Computing anisotropy measures (FA,MD,RGB)
-slowadcodf
-Bootstrap direction getter
-streamline gen
-Creating GPUTracker with 1 GPUs...
-Generated 143891 streamlines from 100000 seeds, time: 7.962322473526001 s
-Saved streamlines to small.1_5_*.trk, time 0.1053612232208252 s
-Generated 151932 streamlines from 100000 seeds, time: 10.148677825927734 s
-Saved streamlines to small.2_5_*.trk, time 0.1606450080871582 s
-Generated 146971 streamlines from 100000 seeds, time: 9.811130285263062 s
-Saved streamlines to small.3_5_*.trk, time 0.571892499923706 s
-Generated 153824 streamlines from 100000 seeds, time: 11.186563968658447 s
-Saved streamlines to small.4_5_*.trk, time 0.3091111183166504 s
-Generated 162004 streamlines from 100000 seeds, time: 13.282683610916138 s
-Saved streamlines to small.5_5_*.trk, time 0.7107999324798584 s
-Completed processing 500000 seeds.
-Initialization time: 14.705361366271973 sec
-Streamline generation total time: 54.24975609779358 sec
-        Streamline processing: 52.39137816429138 sec
-        File writing: 1.8578097820281982 sec
-Destroy GPUTracker...
-```
+To run on more seeds, we suggest setting the `--write-method trx` flag in the GPU script to not get bottlenecked by writing files.
 
 ## Running on AWS with Docker
 First, set up an AWS instance with GPU and ssh into it (we recommend a P3 instance with at least 1 V100 16 GB GPU and a Deep Learning AMI Ubuntu 18.04 v 33.0.). Then do the following:
diff --git a/build_cuslines.sh b/build_cuslines.sh
deleted file mode 100755
index 8375223..0000000
--- a/build_cuslines.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-build_dir=$(pwd)/build
-install_dir=$(pwd)/install
-
-# set up build dir
-mkdir -p ${build_dir}
-cd ${build_dir}
-
-# configure
-cmake -DCMAKE_INSTALL_PREFIX=${install_dir} \
-      -DCMAKE_BUILD_TYPE=Release \
-      -DCMAKE_C_COMPILER=gcc \
-      -DCMAKE_CXX_COMPILER=g++ \
-      -DPYTHON_EXECUTABLE=$(which python) \
-      ..
-
-# compile
-make && make install
diff --git a/cuslines/Makefile b/cuslines/Makefile
deleted file mode 100644
index 1061a16..0000000
--- a/cuslines/Makefile
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-CUDA_HOME=/usr/local/cuda
-CUDACC=$(CUDA_HOME)/bin/nvcc # -G -g -dopt=on
-CXX=g++
-LD=g++
-
-CXXFLAGS= -c -O3 -std=c++11 -fopenmp -fPIC `python3 -m pybind11 --includes` -I$(CUDA_HOME)/include
-
-SMS ?= 70
-CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM))
-LASTSM := $(lastword $(sort $(SMS)))
-CUDA_ARCH += -gencode arch=compute_$(LASTSM),code=compute_$(LASTSM)
-CUDACFLAGS=-c -O3 -lineinfo -Xptxas=-v -std=c++11 -Xcompiler -fPIC -Xcompiler=-fopenmp $(CUDA_ARCH)
-LDFLAGS= -shared -fopenmp -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt
-
-all: cuslines
-
-cuslines: generate_streamlines_cuda.o cuslines.o
-	$(LD) cuslines.o generate_streamlines_cuda.o -o cuslines`python3-config --extension-suffix` $(LDFLAGS)
-
-%.o : %.cu
-	$(CUDACC) $(CUDACFLAGS) $< -o $@
-
-%.o: %.cpp
-	$(CXX) $(CXXFLAGS) $< -o $@
-
-clean:
-	rm *.o cuslines`python3-config --extension-suffix` __pycache__/*.pyc
diff --git a/cuslines/__init__.py b/cuslines/__init__.py
new file mode 100644
index 0000000..b96cca1
--- /dev/null
+++ b/cuslines/__init__.py
@@ -0,0 +1,13 @@
+from .cuda_python import (
+    GPUTracker,
+    ProbDirectionGetter,
+    PttDirectionGetter,
+    BootDirectionGetter
+)
+
+__all__ = [
+    "GPUTracker",
+    "ProbDirectionGetter",
+    "PttDirectionGetter",
+    "BootDirectionGetter"
+]
diff --git a/cuslines/cuda_c/boot.cu b/cuslines/cuda_c/boot.cu
new file mode 100644
index 0000000..133c43d
--- /dev/null
+++ b/cuslines/cuda_c/boot.cu
@@ -0,0 +1,1066 @@
+//#define USE_FIXED_PERMUTATION
+#ifdef USE_FIXED_PERMUTATION
+//__device__ const int fixedPerm[] = {44, 47, 53,  0,  3,  3, 39,  9, 19, 21, 50, 36, 23,
+//                                     6, 24, 24, 12,  1, 38, 39, 23, 46, 24, 17, 37, 25, 
+//                                    13,  8,  9, 20, 51, 16, 51,  5, 15, 47,  0, 18, 35, 
+//                                    24, 49, 51, 29, 19, 19, 14, 39, 32,  1,  9, 32, 31,
+//                                    10, 52, 23};
+__device__ const int fixedPerm[] = {
+  47, 117,  67, 103,   9,  21,  36,  87,  70,  88, 140,  58,  39,  87,  88,  81,  25,  77,
+  72,   9, 148, 115,  79,  82,  99,  29, 147, 147, 142,  32,   9, 127,  32,  31, 114,  28,
+  34, 128, 128,  53, 133,  38,  17,  79, 132, 105,  42,  31, 120,   1,  65,  57,  35, 102,
+ 119,  11,  82,  91, 128, 142,  99,  53, 140, 121,  84,  68,   6,  47, 127, 131, 100,  78,
+ 143, 148,  23, 141, 117,  85,  48,  49,  69,  95,  94,   0, 113,  36,  48,  93, 131,  98,
+  42, 112, 149, 127,   0, 138, 114,  43, 127,  23, 130, 121,  98,  62, 123,  82, 148,  50,
+  14,  41,  58,  36,  10,  86,  43, 104,  11,   2,  51,  80,  32, 128,  38,  19,  42, 115,
+  77,  30,  24, 125,   2,   3,  94, 107,  13, 112,  40,  72,  19,  95,  72,  67,  61,  14,
+  96,   4, 139,  86, 121, 109};
+#endif
+
+template<int BDIM_X,
+         typename VAL_T>
+__device__ VAL_T avgMask(const int mskLen,
+			 const int *__restrict__ mask,
+			 const VAL_T *__restrict__ data) {
+        
+	const int tidx = threadIdx.x;
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        int   __myCnt = 0;
+        VAL_T __mySum = 0;
+
+        for(int i = tidx; i < mskLen; i += BDIM_X) {
+		if(mask[i]) {
+			__myCnt++;
+			__mySum += data[i];
+		}
+        }
+
+        #pragma unroll
+        for(int i = BDIM_X/2; i; i /= 2) {
+                __mySum += __shfl_xor_sync(WMASK, __mySum, i, BDIM_X);
+                __myCnt += __shfl_xor_sync(WMASK, __myCnt, i, BDIM_X);
+        }
+
+        return __mySum/__myCnt;
+
+}
+
+template<
+    int BDIM_X,
+    typename LEN_T,
+    typename MSK_T,
+    typename VAL_T>
+__device__ LEN_T maskGet(const LEN_T n, 
+			 const MSK_T *__restrict__ mask,
+			 const VAL_T *__restrict__ plain,
+			       VAL_T *__restrict__ masked) {
+
+	const int tidx = threadIdx.x;
+	
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+	const int __laneMask = (1 << tidx)-1;
+
+	int woff = 0;
+	for(int j = 0; j < n; j += BDIM_X) {
+
+		const int __act = (j+tidx < n) ? !mask[j+tidx] : 0;
+		const int __msk = __ballot_sync(WMASK, __act);
+
+		const int toff = __popc(__msk & __laneMask);
+		if (__act) {
+			masked[woff+toff] = plain[j+tidx];
+		}
+		woff += __popc(__msk);
+	}
+	return woff;
+}
+
+template<
+    int BDIM_X,
+    typename LEN_T,
+    typename MSK_T,
+    typename VAL_T>
+__device__ void maskPut(const LEN_T n, 
+			const MSK_T *__restrict__ mask,
+			const VAL_T *__restrict__ masked,
+			      VAL_T *__restrict__ plain) {
+
+	const int tidx = threadIdx.x;
+	
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+	const int __laneMask = (1 << tidx)-1;
+
+	int woff = 0;
+	for(int j = 0; j < n; j += BDIM_X) {
+
+		const int __act = (j+tidx < n) ? !mask[j+tidx] : 0;
+		const int __msk = __ballot_sync(WMASK, __act);
+
+		const int toff = __popc(__msk & __laneMask);
+		if (__act) {
+			plain[j+tidx] = masked[woff+toff];
+		}
+		woff += __popc(__msk);
+	}
+	return;
+}
+
+template<int BDIM_X,
+         int BDIM_Y,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ int closest_peak_d(const REAL_T max_angle,
+			      const REAL3_T  direction, //dir
+                              const int npeaks,
+                              const REAL3_T *__restrict__ peaks,
+                                    REAL3_T *__restrict__ peak) {// dirs,
+
+        const int tidx = threadIdx.x;
+
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        //const REAL_T cos_similarity = COS(MAX_ANGLE_P);
+        const REAL_T cos_similarity = COS(max_angle);
+#if 0
+        if (!threadIdx.y && !tidx) {
+                printf("direction: (%f, %f, %f)\n",
+                        direction.x, direction.y, direction.z);
+        }
+        __syncwarp(WMASK);
+#endif
+        REAL_T cpeak_dot = 0;
+        int    cpeak_idx = -1;
+        for(int j = 0; j < npeaks; j += BDIM_X) {
+                if (j+tidx < npeaks) {
+#if 0
+                        if (!threadIdx.y && !tidx) {
+                                printf("j+tidx: %d, peaks[j+tidx]: (%f, %f, %f)\n",
+                                        j+tidx, peaks[j+tidx].x, peaks[j+tidx].y, peaks[j+tidx].z);
+                        }
+#endif
+                        const REAL_T dot = direction.x*peaks[j+tidx].x+
+                                           direction.y*peaks[j+tidx].y+
+                                           direction.z*peaks[j+tidx].z;
+
+                        if (FABS(dot) > FABS(cpeak_dot)) {
+                                cpeak_dot = dot;
+                                cpeak_idx = j+tidx;
+                        }
+                }
+        }
+#if 0
+        if (!threadIdx.y && !tidx) {
+                printf("cpeak_idx: %d, cpeak_dot: %f\n", cpeak_idx, cpeak_dot);
+        }
+        __syncwarp(WMASK);
+#endif
+
+        #pragma unroll
+        for(int j = BDIM_X/2; j; j /= 2) {
+
+                const REAL_T dot = __shfl_xor_sync(WMASK, cpeak_dot, j, BDIM_X);
+                const int    idx = __shfl_xor_sync(WMASK, cpeak_idx, j, BDIM_X);
+                if (FABS(dot) > FABS(cpeak_dot)) {
+                        cpeak_dot = dot;
+                        cpeak_idx = idx;
+                }
+        }
+#if 0
+        if (!threadIdx.y && !tidx) {
+                printf("cpeak_idx: %d, cpeak_dot: %f, cos_similarity: %f\n", cpeak_idx, cpeak_dot, cos_similarity);
+        }
+        __syncwarp(WMASK);
+#endif
+        if (cpeak_idx >= 0) {
+                if (cpeak_dot >= cos_similarity) {
+                        peak[0] = peaks[cpeak_idx];
+                        return 1;
+                }
+                if (cpeak_dot <= -cos_similarity) {
+                        peak[0] = MAKE_REAL3(-peaks[cpeak_idx].x,
+                                             -peaks[cpeak_idx].y,
+                                             -peaks[cpeak_idx].z);
+                        return 1;
+                }
+        }
+        return 0;
+}
+
+template<int BDIM_X,
+         typename VAL_T>
+__device__ void ndotp_d(const int N,
+			const int M,
+			const VAL_T *__restrict__ srcV,
+                        const VAL_T *__restrict__ srcM,
+                              VAL_T *__restrict__ dstV) {
+
+        const int tidx = threadIdx.x;
+
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        //#pragma unroll
+        for(int i = 0; i < N; i++) {
+
+                VAL_T __tmp = 0;
+
+                //#pragma unroll
+                for(int j = 0; j < M; j += BDIM_X) {
+                        if (j+tidx < M) {
+                                __tmp += srcV[j+tidx]*srcM[i*M + j+tidx];
+                        }
+                }
+                #pragma unroll
+                for(int j = BDIM_X/2; j; j /= 2) {
+#if 0
+                        __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X);
+#else
+                        __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X);
+#endif
+                }
+                // values could be held by BDIM_X threads and written
+                // together every BDIM_X iterations...
+
+                if (tidx == 0) {
+                        dstV[i] = __tmp;
+                }
+        }
+        return;
+}
+
+
+template<int BDIM_X,
+         typename VAL_T>
+__device__ void ndotp_log_opdt_d(const int N,
+			    const int M,
+			    const VAL_T *__restrict__ srcV,
+                            const VAL_T *__restrict__ srcM,
+                                  VAL_T *__restrict__ dstV) {
+
+        const int tidx = threadIdx.x;
+
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+         const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        const VAL_T ONEP5 = static_cast<VAL_T>(1.5);
+
+        //#pragma unroll
+        for(int i = 0; i < N; i++) {
+
+                VAL_T __tmp = 0;
+
+                //#pragma unroll
+                for(int j = 0; j < M; j += BDIM_X) {
+                        if (j+tidx < M) {
+                                const VAL_T v = srcV[j+tidx];
+                                __tmp += -LOG(v)*(ONEP5+LOG(v))*v * srcM[i*M + j+tidx];
+                        }
+                }
+                #pragma unroll
+                for(int j = BDIM_X/2; j; j /= 2) {
+#if 0
+                        __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X);
+#else
+                        __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X);
+#endif
+                }
+                // values could be held by BDIM_X threads and written
+                // together every BDIM_X iterations...
+
+                if (tidx == 0) {
+                        dstV[i] = __tmp;
+                }
+        }
+        return;
+}
+
+template<int BDIM_X,
+	 typename VAL_T>
+__device__ void ndotp_log_csa_d(const int N,
+				const int M,
+				const VAL_T *__restrict__ srcV,
+				const VAL_T *__restrict__ srcM,
+				VAL_T *__restrict__ dstV) {
+
+	const int tidx = threadIdx.x;
+
+	const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+	const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+	// Clamp values
+	constexpr VAL_T min = .001;
+	constexpr VAL_T max = .999;
+
+	//#pragma unroll
+	for(int i = 0; i < N; i++) {
+
+		VAL_T __tmp = 0;
+
+		//#pragma unroll
+		for(int j = 0; j < M; j += BDIM_X) {
+			if (j+tidx < M) {
+				const VAL_T v = MIN(MAX(srcV[j+tidx], min), max);
+				__tmp += LOG(-LOG(v)) * srcM[i*M + j+tidx];
+			}
+		}
+		#pragma unroll
+		for(int j = BDIM_X/2; j; j /= 2) {
+#if 0
+			__tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X);
+#else
+			__tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X);
+#endif
+		}
+		// values could be held by BDIM_X threads and written
+		// together every BDIM_X iterations...
+
+		if (tidx == 0) {
+			dstV[i] = __tmp;
+		}
+	}
+	return;
+}
+
+
+template<int BDIM_X,
+         typename REAL_T>
+__device__ void fit_opdt(const int delta_nr,
+                         const int hr_side,
+                         const REAL_T *__restrict__ delta_q,
+                         const REAL_T *__restrict__ delta_b,
+                         const REAL_T *__restrict__ __msk_data_sh,
+                         REAL_T *__restrict__ __h_sh,
+                         REAL_T *__restrict__ __r_sh) {
+        const int tidx = threadIdx.x;
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        ndotp_log_opdt_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh);
+        ndotp_d         <BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh);
+        __syncwarp(WMASK);
+        #pragma unroll
+        for(int j = tidx; j < delta_nr; j += BDIM_X) {
+                __r_sh[j] -= __h_sh[j];
+        }
+        __syncwarp(WMASK);
+}
+
+template<int BDIM_X, typename REAL_T>
+__device__ void fit_csa(const int delta_nr,
+                        const int hr_side,
+                        const REAL_T *__restrict__ fit_matrix,
+                        const REAL_T *__restrict__ __msk_data_sh,
+                        REAL_T *__restrict__ __r_sh) {
+        const int tidx = threadIdx.x;
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        constexpr REAL _n0_const = 0.28209479177387814; // .5 / sqrt(pi)
+        ndotp_log_csa_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, fit_matrix, __r_sh);
+        __syncwarp(WMASK);
+        if (tidx == 0) {
+                __r_sh[0] = _n0_const;
+        }
+        __syncwarp(WMASK);
+}
+
+template<int BDIM_X, ModelType MODEL_T, typename REAL_T>
+__device__ void fit_model_coef(const int delta_nr, // delta_nr is number of ODF directions
+                               const int hr_side, // hr_side is number of data directions
+                               const REAL_T *__restrict__ delta_q,
+                               const REAL_T *__restrict__ delta_b, // these are fit matrices the model can use, different for each model
+                               const REAL_T *__restrict__ __msk_data_sh, // __msk_data_sh is the part of the data currently being operated on by this block
+                               REAL_T *__restrict__ __h_sh, // these last two are modifications to the coefficients that will be returned
+                               REAL_T *__restrict__ __r_sh) {
+        switch(MODEL_T) {
+                case OPDT:
+                        fit_opdt<BDIM_X>(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh);
+                        break;
+                case CSA:
+                        fit_csa<BDIM_X>(delta_nr, hr_side, delta_q, __msk_data_sh, __r_sh);
+                        break;
+                default:
+                        printf("FATAL: Invalid Model Type.\n");
+                        break;
+        }
+}
+
+template<int BDIM_X,
+         int BDIM_Y,
+         int NATTEMPTS,
+         ModelType MODEL_T,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ int get_direction_boot_d(
+                                curandStatePhilox4_32_10_t *st,
+                                const REAL_T max_angle,
+                                const REAL_T min_signal,
+                                const REAL_T relative_peak_thres,
+                                const REAL_T min_separation_angle,
+                                REAL3_T dir,
+                                const int dimx,
+                                const int dimy,
+                                const int dimz,
+                                const int dimt,
+                                const REAL_T *__restrict__ dataf,
+                                const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask)
+                                                                  // but not clear if it will never be needed so
+                                                                  // we'll keep it here for now...
+                                const REAL3_T point,
+                                const REAL_T *__restrict__ H, 
+                                const REAL_T *__restrict__ R,
+                                // model unused
+                                // max_angle, pmf_threshold from global defines
+                                // b0s_mask already passed
+                                // min_signal from global defines
+                                const int delta_nr,
+                                const REAL_T *__restrict__ delta_b,
+                                const REAL_T *__restrict__ delta_q, // fit_matrix
+                                const int samplm_nr,
+                                const REAL_T *__restrict__ sampling_matrix,
+                                const REAL3_T *__restrict__ sphere_vertices,
+                                const int2 *__restrict__ sphere_edges,
+                                const int num_edges,
+                                REAL3_T *__restrict__ dirs) {
+
+        const int tidx = threadIdx.x;
+        const int tidy = threadIdx.y;
+	
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+	const int n32dimt = ((dimt+31)/32)*32;
+
+	extern REAL_T __shared__ __sh[];
+
+	REAL_T *__vox_data_sh = reinterpret_cast<REAL_T *>(__sh);
+	REAL_T *__msk_data_sh = __vox_data_sh + BDIM_Y*n32dimt;
+
+	REAL_T *__r_sh = __msk_data_sh + BDIM_Y*n32dimt;
+	REAL_T *__h_sh = __r_sh + BDIM_Y*MAX(n32dimt, samplm_nr);
+
+	__vox_data_sh += tidy*n32dimt;
+	__msk_data_sh += tidy*n32dimt;
+
+	__r_sh += tidy*MAX(n32dimt, samplm_nr);
+	__h_sh += tidy*MAX(n32dimt, samplm_nr);
+	
+	// compute hr_side (may be passed from python)
+	int hr_side = 0;
+	for(int j = tidx; j < dimt; j += BDIM_X) {
+		hr_side += !b0s_mask[j] ? 1 : 0;
+	}
+        #pragma unroll
+        for(int i = BDIM_X/2; i; i /= 2) {
+                hr_side += __shfl_xor_sync(WMASK, hr_side, i, BDIM_X);
+        }
+
+        #pragma unroll
+        for(int i = 0; i < NATTEMPTS; i++) {
+
+                const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, -1, dataf, point, __vox_data_sh);
+
+		const int nmsk = maskGet<BDIM_X>(dimt, b0s_mask, __vox_data_sh, __msk_data_sh);
+
+		//if (!tidx && !threadIdx.y && !blockIdx.x) {
+		//
+		//	printf("interp of %f, %f, %f\n", point.x, point.y, point.z);
+		//	printf("hr_side: %d\n", hr_side);
+		//	printArray("vox_data", 6, dimt, __vox_data_sh[tidy]);
+		//	printArray("msk_data", 6, nmsk, __msk_data_sh[tidy]);
+		//}
+		//break;
+
+                __syncwarp(WMASK);
+
+                if (rv == 0) {
+
+                        ndotp_d<BDIM_X>(hr_side, hr_side, __msk_data_sh, R, __r_sh);
+			//__syncwarp();
+			//printArray("__r", 5, hr_side*hr_side, R);
+			//printArray("__r_sh", 6, hr_side, __r_sh[tidy]);
+
+                        ndotp_d<BDIM_X>(hr_side, hr_side, __msk_data_sh, H, __h_sh);
+			//__syncwarp();
+			//printArray("__h_sh", 6, hr_side, __h_sh[tidy]);
+
+                        __syncwarp(WMASK);
+
+                        for(int j = 0; j < hr_side; j += BDIM_X) {
+                                if (j+tidx < hr_side) {
+#ifdef USE_FIXED_PERMUTATION
+                                        const int srcPermInd = fixedPerm[j+tidx];
+#else
+                                        const int srcPermInd = curand(st) % hr_side;
+//                                        if (srcPermInd < 0 || srcPermInd >= hr_side) {
+//                                                printf("srcPermInd: %d\n", srcPermInd);
+//                                        }
+#endif
+					__h_sh[j+tidx] += __r_sh[srcPermInd];
+					//__h_sh[j+tidx] += __r_sh[j+tidx];
+                                }
+                        }
+			__syncwarp(WMASK);
+
+			//printArray("h+perm(r):", 6, hr_side, __h_sh[tidy]);
+			//__syncwarp();
+		
+			// vox_data[dwi_mask] = masked_data
+			maskPut<BDIM_X>(dimt, b0s_mask, __h_sh, __vox_data_sh);
+			__syncwarp(WMASK);
+
+			//printArray("vox_data[dwi_mask]:", 6, dimt, __vox_data_sh[tidy]);
+			//__syncwarp();
+
+			for(int j = tidx; j < dimt; j += BDIM_X) {
+				//__vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]);
+				__vox_data_sh[j] = MAX(min_signal, __vox_data_sh[j]);
+			}
+			__syncwarp(WMASK);
+
+			const REAL_T denom = avgMask<BDIM_X>(dimt, b0s_mask, __vox_data_sh);
+
+			for(int j = tidx; j < dimt; j += BDIM_X) {
+				__vox_data_sh[j] /= denom;
+			}
+			__syncwarp();
+
+			//if (!tidx && !threadIdx.y && !blockIdx.x) {
+			//	printf("denom: %f\n", denom);
+			//}
+			////break;
+			//if (!tidx && !threadIdx.y && !blockIdx.x) {
+			//
+			//	printf("__vox_data_sh:\n");
+			//	printArray("vox_data", 6, dimt, __vox_data_sh[tidy]);
+			//}
+			//break;
+
+			maskGet<BDIM_X>(dimt, b0s_mask, __vox_data_sh, __msk_data_sh);
+			__syncwarp(WMASK);
+
+                        fit_model_coef<BDIM_X, MODEL_T>(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh);
+
+                        // __r_sh[tidy] <- python 'coef'
+
+                        ndotp_d<BDIM_X>(samplm_nr, delta_nr, __r_sh, sampling_matrix, __h_sh);
+
+                        // __h_sh[tidy] <- python 'pmf'
+                } else {
+                        #pragma unroll
+                        for(int j = tidx; j < samplm_nr; j += BDIM_X) {
+				__h_sh[j] = 0;
+                        }
+                        // __h_sh[tidy] <- python 'pmf'
+                }
+                __syncwarp(WMASK);
+#if 0
+                if (!threadIdx.y && threadIdx.x == 0) {
+                        for(int j = 0; j < samplm_nr; j++) {
+                                printf("pmf[%d]: %f\n", j, __h_sh[tidy][j]);
+                        }
+                }
+                //return;
+#endif
+                const REAL_T abs_pmf_thr = PMF_THRESHOLD_P*max_d<BDIM_X>(samplm_nr, __h_sh, REAL_MIN);
+                __syncwarp(WMASK);
+
+                #pragma unroll
+                for(int j = tidx; j < samplm_nr; j += BDIM_X) {
+			const REAL_T __v = __h_sh[j];
+			if (__v < abs_pmf_thr) {
+				__h_sh[j] = 0;
+			}
+                }
+                __syncwarp(WMASK);
+#if 0
+                if (!threadIdx.y && threadIdx.x == 0) {
+                        printf("abs_pmf_thr: %f\n", abs_pmf_thr);
+                        for(int j = 0; j < samplm_nr; j++) {
+                                printf("pmfNORM[%d]: %f\n", j, __h_sh[tidy][j]);
+                        }
+                }
+                //return;
+#endif
+#if 0
+                if init:
+                        directions = peak_directions(pmf, sphere)[0]
+                        return directions
+                else:
+                        peaks = peak_directions(pmf, sphere)[0]
+                        if (len(peaks) > 0):
+                                return closest_peak(directions, peaks, cos_similarity)
+#endif
+                const int ndir = peak_directions_d<BDIM_X,
+                                                   BDIM_Y>(__h_sh, dirs,
+                                                           sphere_vertices,
+                                                           sphere_edges,
+                                                           num_edges,
+							   samplm_nr,
+							   reinterpret_cast<int *>(__r_sh), // reuse __r_sh as shInd in func which is large enough
+							   relative_peak_thres,
+							   min_separation_angle);
+                if (NATTEMPTS == 1) { // init=True...
+                        return ndir; // and dirs;
+                } else { // init=False...
+                        if (ndir > 0) {
+                                /*
+                                if (!threadIdx.y && threadIdx.x == 0 && ndir > 1) {
+                                        printf("NATTEMPTS=5 and ndir: %d!!!\n", ndir);
+                                }
+                                */
+                                REAL3_T peak;
+                                const int foundPeak = closest_peak_d<BDIM_X, BDIM_Y, REAL_T, REAL3_T>(max_angle, dir, ndir, dirs, &peak);
+                                __syncwarp(WMASK);
+                                if (foundPeak) {
+                                        if (tidx == 0) {
+                                                dirs[0] = peak;
+                                        }
+                                        return 1;
+                                }
+                        }
+                }
+        }
+        return 0;
+}
+
+template<int BDIM_X,
+         int BDIM_Y,
+         typename REAL_T,
+         typename REAL3_T>
+__global__ void getNumStreamlinesBoot_k(
+                                    const ModelType model_type,
+                                    const REAL_T max_angle,
+		                    const REAL_T min_signal,
+		                    const REAL_T relative_peak_thres,
+	                            const REAL_T min_separation_angle,
+		                    const long long rndSeed,
+                                    const int nseed,
+                                    const REAL3_T *__restrict__ seeds,
+                                    const int dimx,
+                                    const int dimy,
+                                    const int dimz,
+                                    const int dimt,
+                                    const REAL_T *__restrict__ dataf,
+                                    const REAL_T *__restrict__ H,
+                                    const REAL_T *__restrict__ R,
+		                    const int delta_nr,
+                                    const REAL_T *__restrict__ delta_b,
+                                    const REAL_T *__restrict__ delta_q,
+                                    const int  *__restrict__ b0s_mask, // change to int
+		                    const int samplm_nr,
+                                    const REAL_T *__restrict__ sampling_matrix,
+                                    const REAL3_T *__restrict__ sphere_vertices,
+                                    const int2 *__restrict__ sphere_edges,
+                                    const int num_edges,
+                                    REAL3_T *__restrict__ shDir0,
+                                    int *slineOutOff) {
+
+        const int tidx = threadIdx.x;
+        const int slid = blockIdx.x*blockDim.y + threadIdx.y;
+        const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
+
+        if (slid >= nseed) {
+                return;
+        }
+
+        REAL3_T seed = seeds[slid]; 
+        // seed = lin_mat*seed + offset
+
+        REAL3_T *__restrict__ __shDir = shDir0+slid*samplm_nr;
+
+	// const int hr_side = dimt-1;
+
+        curandStatePhilox4_32_10_t st;
+        //curand_init(rndSeed, slid + rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
+        curand_init(rndSeed, gid, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
+                                                                                   // elements of the same sequence
+        // python:
+        //directions = get_direction(None, dataf, dwi_mask, sphere, s, H, R, model, max_angle,
+        //                pmf_threshold, b0s_mask, min_signal, fit_matrix,
+        //                sampling_matrix, init=True)
+
+	//if (!tidx && !threadIdx.y && !blockIdx.x) {
+	//	printf("seed: %f, %f, %f\n", seed.x, seed.y, seed.z);
+	//}
+
+        int ndir;
+        switch(model_type) {
+            case OPDT:
+                ndir = get_direction_boot_d<BDIM_X,
+                                            BDIM_Y,
+                                            1,
+                                            OPDT>(
+                                                &st,
+                                                max_angle,
+                                                min_signal,
+                                                relative_peak_thres,
+                                                min_separation_angle,
+                                                MAKE_REAL3(0,0,0),
+                                                dimx, dimy, dimz, dimt, dataf,
+                                                b0s_mask /* !dwi_mask */,
+                                                seed,
+                                                H, R,
+                                                // model unused
+                                                // max_angle, pmf_threshold from global defines
+                                                // b0s_mask already passed
+                                                // min_signal from global defines
+                                                delta_nr,
+                                                delta_b, delta_q, // fit_matrix
+                                                samplm_nr,
+                                                sampling_matrix,
+                                                sphere_vertices,
+                                                sphere_edges,
+                                                num_edges,
+                                                __shDir);
+                break;
+            case CSA:
+                ndir = get_direction_boot_d<BDIM_X,
+                                            BDIM_Y,
+                                            1,
+                                            CSA>(
+                                                &st,
+                                                max_angle,
+                                                min_signal,
+                                                relative_peak_thres,
+                                                min_separation_angle,
+                                                MAKE_REAL3(0,0,0),
+                                                dimx, dimy, dimz, dimt, dataf,
+                                                b0s_mask /* !dwi_mask */,
+                                                seed,
+                                                H, R,
+                                                // model unused
+                                                // max_angle, pmf_threshold from global defines
+                                                // b0s_mask already passed
+                                                // min_signal from global defines
+                                                delta_nr,
+                                                delta_b, delta_q, // fit_matrix
+                                                samplm_nr,
+                                                sampling_matrix,
+                                                sphere_vertices,
+                                                sphere_edges,
+                                                num_edges,
+                                                __shDir);
+                break;
+            default:
+                printf("FATAL: Invalid Model Type.\n");
+                break;
+        }
+
+        if (tidx == 0) {
+                slineOutOff[slid] = ndir;
+        }
+
+        return;
+}
+
+template<int BDIM_X,
+         int BDIM_Y,
+         ModelType MODEL_T,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ int tracker_boot_d(
+                        curandStatePhilox4_32_10_t *st,
+			            const REAL_T max_angle,
+			            const REAL_T tc_threshold,
+			            const REAL_T step_size,
+			            const REAL_T relative_peak_thres,
+			            const REAL_T min_separation_angle,
+                         REAL3_T seed,
+                         REAL3_T first_step,
+                         REAL3_T voxel_size,
+                         const int dimx,
+                         const int dimy,
+                         const int dimz,
+                         const int dimt,
+                         const REAL_T *__restrict__ dataf,
+                         const REAL_T *__restrict__ metric_map,
+		                 const int samplm_nr,
+                         const REAL3_T *__restrict__ sphere_vertices,
+                         const int2 *__restrict__ sphere_edges,
+                         const int num_edges,
+                        /*BOOT specific params*/
+                        const REAL_T min_signal,
+                        const int delta_nr,
+                        const REAL_T *__restrict__ H,
+                        const REAL_T *__restrict__ R,
+                        const REAL_T *__restrict__ delta_b,
+                        const REAL_T *__restrict__ delta_q,
+                        const REAL_T *__restrict__ sampling_matrix,
+                        const int    *__restrict__ b0s_mask,
+                        /*BOOT specific params*/
+                         int *__restrict__ nsteps,
+                         REAL3_T *__restrict__ streamline) {
+
+        const int tidx = threadIdx.x;
+        const int tidy = threadIdx.y;
+
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        int tissue_class = TRACKPOINT;
+
+        REAL3_T point = seed;
+        REAL3_T direction = first_step;
+        __shared__ REAL3_T __sh_new_dir[BDIM_Y];
+
+        if (tidx == 0) {
+                streamline[0] = point;
+#if 0
+                if (threadIdx.y == 1) {
+                        printf("streamline[0]: %f, %f, %f\n", point.x, point.y, point.z);
+                }
+#endif
+        }
+        __syncwarp(WMASK);
+
+        int step_frac = 1;
+
+        int i;
+        for(i = 1; i < MAX_SLINE_LEN*step_frac; i++) {
+                int ndir = get_direction_boot_d<BDIM_X,
+                                                BDIM_Y,
+                                                5,
+                                                MODEL_T>(
+                                                        st,
+                                                        max_angle,
+                                                        min_signal,
+                                                        relative_peak_thres,
+                                                        min_separation_angle,
+                                                        direction,
+                                                        dimx, dimy, dimz, dimt, dataf,
+                                                        b0s_mask /* !dwi_mask */,
+                                                        point,
+                                                        H, R,
+                                                        delta_nr,
+                                                        delta_b, delta_q, // fit_matrix
+                                                        samplm_nr,
+                                                        sampling_matrix,
+                                                        sphere_vertices,
+                                                        sphere_edges,
+                                                        num_edges,
+                                                        __sh_new_dir + tidy);
+                __syncwarp(WMASK);
+                direction = __sh_new_dir[tidy];
+                __syncwarp(WMASK);
+
+                if (ndir == 0) {
+                        break;
+                }
+
+                point.x += (direction.x / voxel_size.x) * (step_size / step_frac);
+                point.y += (direction.y / voxel_size.y) * (step_size / step_frac);
+                point.z += (direction.z / voxel_size.z) * (step_size / step_frac);
+
+                if ((tidx == 0) && ((i % step_frac) == 0)){
+                        streamline[i/step_frac] = point;
+                }
+                __syncwarp(WMASK);
+
+                tissue_class = check_point_d<BDIM_X, BDIM_Y>(tc_threshold, point, dimx, dimy, dimz, metric_map);
+
+                if (tissue_class == ENDPOINT ||
+                    tissue_class == INVALIDPOINT ||
+                    tissue_class == OUTSIDEIMAGE) {
+                        break;
+                }
+        }
+        nsteps[0] = i/step_frac;
+        if (((i % step_frac) != 0) && i < step_frac*(MAX_SLINE_LEN - 1)){
+                nsteps[0]++;
+                if (tidx == 0) {
+                        streamline[nsteps[0]] = point;
+                }
+        }
+
+        return tissue_class;
+}
+
+template<int BDIM_X,
+         int BDIM_Y,
+         ModelType MODEL_T,
+         typename REAL_T,
+         typename REAL3_T>
+__global__ void genStreamlinesMergeBoot_k(
+				      const REAL_T max_angle,
+				      const REAL_T tc_threshold,
+				      const REAL_T step_size,
+				      const REAL_T relative_peak_thres,
+				      const REAL_T min_separation_angle,
+				      const long long rndSeed,
+                      const int rndOffset,
+                      const int nseed,
+                      const REAL3_T *__restrict__ seeds,
+                      const int dimx,
+                      const int dimy,
+                      const int dimz,
+                      const int dimt,
+                      const REAL_T *__restrict__ dataf,
+                      const REAL_T *__restrict__ metric_map,
+				      const int samplm_nr,
+                      const REAL3_T *__restrict__ sphere_vertices,
+                      const int2 *__restrict__ sphere_edges,
+                      const int num_edges,
+                      /*BOOT specific params*/
+				      const REAL_T min_signal,
+				      const int delta_nr,
+                      const REAL_T *__restrict__ H,
+                      const REAL_T *__restrict__ R,
+                      const REAL_T *__restrict__ delta_b,
+                      const REAL_T *__restrict__ delta_q,
+                      const REAL_T *__restrict__ sampling_matrix,
+                      const int    *__restrict__ b0s_mask,
+                      /*BOOT specific params*/
+                      const int    *__restrict__ slineOutOff,
+                      REAL3_T *__restrict__ shDir0,
+                      int     *__restrict__ slineSeed,
+                      int     *__restrict__ slineLen,
+                      REAL3_T *__restrict__ sline) {
+
+        const int tidx = threadIdx.x;
+        const int tidy = threadIdx.y;
+
+        const int slid = blockIdx.x*blockDim.y + threadIdx.y;
+
+        const int lid = (tidy*BDIM_X + tidx) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        curandStatePhilox4_32_10_t st;
+        // const int gbid = blockIdx.y*gridDim.x + blockIdx.x;
+        const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
+        //curand_init(rndSeed, slid+rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(HR_SIDE/BDIM_X)
+        curand_init(rndSeed, gid+1, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
+                                                                                 // elements of the same sequence
+        if (slid >= nseed) {
+                return;
+        }
+
+        REAL3_T seed = seeds[slid]; 
+
+        int ndir = slineOutOff[slid+1]-slineOutOff[slid];
+#if 0
+        if (threadIdx.y == 0 && threadIdx.x == 0) {
+                printf("%s: ndir: %d\n", __func__, ndir);
+                for(int i = 0; i < ndir; i++) {
+                        printf("__shDir[%d][%d]: (%f, %f, %f)\n",
+                                tidy, i, __shDir[tidy][i].x, __shDir[tidy][i].y, __shDir[tidy][i].z);
+                }
+        }
+#endif
+        __syncwarp(WMASK);
+
+        int slineOff = slineOutOff[slid];
+
+        for(int i = 0; i < ndir; i++) {
+                REAL3_T first_step = shDir0[slid*samplm_nr + i];
+
+		REAL3_T *__restrict__ currSline = sline + slineOff*MAX_SLINE_LEN*2;
+
+                if (tidx == 0) {
+                        slineSeed[slineOff] = slid;
+                }
+#if 0
+                if (threadIdx.y == 0 && threadIdx.x == 0) {
+                        printf("calling trackerF from: (%f, %f, %f)\n", first_step.x, first_step.y, first_step.z);
+                }
+#endif
+
+                int stepsB;
+                const int tissue_classB = tracker_boot_d<BDIM_X,
+                                                         BDIM_Y,
+                                                         MODEL_T>(
+                                                        &st,
+		                		                        max_angle,
+			        			                        tc_threshold,
+                                                        step_size,
+                                                        relative_peak_thres,
+                                                        min_separation_angle,
+                                                        seed,
+                                                        MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
+                                                        MAKE_REAL3(1, 1, 1),
+                                                        dimx, dimy, dimz, dimt, dataf,
+                                                        metric_map,
+                                                        samplm_nr,
+                                                        sphere_vertices,
+                                                        sphere_edges,
+                                                        num_edges,
+                                                        min_signal,
+                                                        delta_nr,
+                                                        H,
+                                                        R,
+                                                        delta_b,
+                                                        delta_q,
+                                                        sampling_matrix,
+                                                        b0s_mask,
+                                                        &stepsB,
+                                                        currSline);
+
+                // reverse backward sline
+                for(int j = 0; j < stepsB/2; j += BDIM_X) {
+                        if (j+tidx < stepsB/2) {
+                                const REAL3_T __p = currSline[j+tidx];
+                                currSline[j+tidx] = currSline[stepsB-1 - (j+tidx)];
+                                currSline[stepsB-1 - (j+tidx)] = __p;
+                        }
+                }
+
+                int stepsF;
+                const int tissue_classF = tracker_boot_d<BDIM_X,
+                                                         BDIM_Y,
+                                                         MODEL_T>(
+                                                        &st,
+     	                    			                max_angle,
+		        				                        tc_threshold,
+	                				                    step_size,
+			    				                        relative_peak_thres,
+			            			                    min_separation_angle,
+                                                        seed,
+                                                        first_step,
+                                                        MAKE_REAL3(1, 1, 1),
+                                                        dimx, dimy, dimz, dimt, dataf,
+                                                        metric_map,
+			        			                        samplm_nr,
+                                                        sphere_vertices,
+                                                        sphere_edges,
+                                                        num_edges,
+                                                        min_signal,
+                                                        delta_nr,
+                                                        H,
+                                                        R,
+                                                        delta_b,
+                                                        delta_q,
+                                                        sampling_matrix,
+                                                        b0s_mask,
+                                                        &stepsF,
+                                                        currSline + stepsB-1);
+                if (tidx == 0) {
+                        slineLen[slineOff] = stepsB-1+stepsF;
+                }
+                
+                slineOff += 1;
+#if 0
+                if (threadIdx.y == 0 && threadIdx.x == 0) {
+                        printf("%s: stepsF: %d, tissue_classF: %d\n", __func__, stepsF, tissue_classF);
+                }
+                __syncwarp(WMASK);
+#endif
+                //if (/* !return_all || */0 &&
+                //    tissue_classF != ENDPOINT &&
+                //    tissue_classF != OUTSIDEIMAGE) {
+                //        continue;
+                //}
+                //if (/* !return_all || */ 0 &&
+                //    tissue_classB != ENDPOINT &&
+                //    tissue_classB != OUTSIDEIMAGE) {
+                //        continue;
+                //}
+        }
+        return;
+}
diff --git a/cuslines/cudamacro.h b/cuslines/cuda_c/cudamacro.h
similarity index 86%
rename from cuslines/cudamacro.h
rename to cuslines/cuda_c/cudamacro.h
index 49ac24c..7f03c6c 100644
--- a/cuslines/cudamacro.h
+++ b/cuslines/cuda_c/cudamacro.h
@@ -45,6 +45,18 @@
         exit(EXIT_FAILURE);                                                  \
     }}
 
+#if CUDART_VERSION >= 13000
+#define CUDA_MEM_ADVISE(devPtr, count, advice, device)            \
+    cudaMemLocation loc;                                          \
+    loc.type = cudaMemLocationTypeDevice;                         \
+    loc.id   = (device);                                          \
+    CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), loc));
+#else
+#define CUDA_MEM_ADVISE(devPtr, count, advice, device)            \
+    CHECK_CUDA(cudaMemAdvise((devPtr), (count), (advice), (device)))
+#endif
+
+
 #ifdef USE_NVTX
 #include "nvToolsExt.h"
 
diff --git a/cuslines/cuwsort.cuh b/cuslines/cuda_c/cuwsort.cuh
similarity index 94%
rename from cuslines/cuwsort.cuh
rename to cuslines/cuda_c/cuwsort.cuh
index 18858f0..aac70ac 100644
--- a/cuslines/cuwsort.cuh
+++ b/cuslines/cuda_c/cuwsort.cuh
@@ -79,12 +79,15 @@ int swap4[3][4] = {{ 2,  3,  0,  1},
 __device__ __constant__
 int swap2[1][2] = {{ 1,  0}};
 
-__device__ __constant__ const int *__swaps[] = {NULL,
-						reinterpret_cast<const int *>(&swap2[0][0]),
-						reinterpret_cast<const int *>(&swap4[0][0]),
-						reinterpret_cast<const int *>(&swap8[0][0]),
-						reinterpret_cast<const int *>(&swap16[0][0]),
-						reinterpret_cast<const int *>(&swap32[0][0])};
+template<int GSIZE>
+__device__ __forceinline__ const int* get_swap_ptr() {
+    if constexpr (GSIZE == 2)  return (const int*)swap2;
+    else if constexpr (GSIZE == 4)  return (const int*)swap4;
+    else if constexpr (GSIZE == 8)  return (const int*)swap8;
+    else if constexpr (GSIZE == 16) return (const int*)swap16;
+    else if constexpr (GSIZE == 32) return (const int*)swap32;
+    else return nullptr;
+}
 
 template<int X>
 struct STATIC_LOG2 {
@@ -113,7 +116,7 @@ __device__  KEY_T warp_sort(KEY_T v) {
 
 	const int gid = lid % GSIZE;
 
-	const int (*swap)[GSIZE] = reinterpret_cast<const int (*)[GSIZE]>(__swaps[LOG2_GSIZE]);
+	const int (*swap)[GSIZE] = reinterpret_cast<const int (*)[GSIZE]>(get_swap_ptr<GSIZE>());
 
         #pragma unroll
         for(int i = 0; i < NSWAP; i++) {
@@ -140,7 +143,7 @@ __device__  void warp_sort(KEY_T *__restrict__ k, VAL_T *__restrict__ v) {
 	
 	const int gid = lid % GSIZE;
 
-	const int (*swap)[GSIZE] = reinterpret_cast<const int (*)[GSIZE]>(__swaps[LOG2_GSIZE]);
+	const int (*swap)[GSIZE] = reinterpret_cast<const int (*)[GSIZE]>(get_swap_ptr<GSIZE>());
 
         #pragma unroll
         for(int i = 0; i < NSWAP; i++) {
diff --git a/cuslines/disc.h b/cuslines/cuda_c/disc.h
similarity index 100%
rename from cuslines/disc.h
rename to cuslines/cuda_c/disc.h
diff --git a/cuslines/cuda_c/generate_streamlines_cuda.cu b/cuslines/cuda_c/generate_streamlines_cuda.cu
new file mode 100644
index 0000000..f5629e0
--- /dev/null
+++ b/cuslines/cuda_c/generate_streamlines_cuda.cu
@@ -0,0 +1,695 @@
+/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <cuda_runtime.h>
+#include <curand_kernel.h>
+
+#include "globals.h"
+#include "cuwsort.cuh"
+#include "ptt.cuh"
+
+#include "utils.cu"
+#include "tracking_helpers.cu"
+#include "boot.cu"
+#include "ptt.cu"
+
+#define MAX_NUM_DIR (128)
+
+#define NTHR_GEN (128)
+
+#define MAX_DIMS        (8)
+#define MAX_STR_LEN     (256)
+
+template<int BDIM_X,
+         int BDIM_Y,
+         bool IS_START,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
+                                    const REAL_T *__restrict__ pmf,
+                                    const REAL_T max_angle,
+                                    const REAL_T relative_peak_thres,
+                                    const REAL_T min_separation_angle,
+                                    REAL3_T dir,
+                                    const int dimx,
+                                    const int dimy,
+                                    const int dimz,
+                                    const int dimt,
+                                    const REAL3_T point,
+                                    const REAL3_T *__restrict__ sphere_vertices,
+                                    const int2 *__restrict__ sphere_edges,
+                                    const int num_edges,
+                                    REAL3_T *__restrict__ dirs) {
+        const int tidx = threadIdx.x;
+        const int tidy = threadIdx.y;
+	
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+	const int n32dimt = ((dimt+31)/32)*32;
+
+	extern __shared__ REAL_T __sh[];
+        REAL_T *__pmf_data_sh = __sh + tidy*n32dimt;
+
+        // pmf = self.pmf_gen.get_pmf_c(&point[0], pmf)
+        __syncwarp(WMASK);
+        const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, -1, pmf, point, __pmf_data_sh);
+        __syncwarp(WMASK);
+        if (rv != 0) {
+                return 0;
+        }
+
+        // for i in range(_len):
+        //     if pmf[i] > max_pmf:
+        //         max_pmf = pmf[i]
+        // absolute_pmf_threshold = pmf_threshold * max_pmf
+        const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d<BDIM_X>(dimt, __pmf_data_sh, REAL_MIN);
+        __syncwarp(WMASK);
+
+        // for i in range(_len):
+        //     if pmf[i] < absolute_pmf_threshold:
+        //         pmf[i] = 0.0
+        #pragma unroll
+        for(int i = tidx; i < dimt; i += BDIM_X) {
+                if (__pmf_data_sh[i] < absolpmf_thresh) {
+                        __pmf_data_sh[i] = 0.0;
+                }
+        }
+        __syncwarp(WMASK);
+
+        if (IS_START) {
+                int *__shInd = reinterpret_cast<int *>(__sh + BDIM_Y*n32dimt) + tidy*n32dimt;
+                return peak_directions_d<BDIM_X,
+                                         BDIM_Y>(__pmf_data_sh,
+                                                 dirs,
+                                                 sphere_vertices,
+                                                 sphere_edges,
+                                                 num_edges,
+                                                 dimt,
+                                                 __shInd,
+                                                 relative_peak_thres,
+                                                 min_separation_angle);
+        } else {
+                REAL_T __tmp;
+                #ifdef DEBUG
+                        __syncwarp(WMASK);
+                        if (tidx == 0) {
+                                printArray("__pmf_data_sh initial", 8, dimt, __pmf_data_sh);
+                                printf("absolpmf_thresh %10.8f\n", absolpmf_thresh);
+                                printf("--->            dir %10.8f, %10.8f, %10.8f\n", dir.x, dir.y, dir.z);
+                                printf("--->            point %10.8f, %10.8f, %10.8f\n", point.x, point.y, point.z);
+                        }
+                        __syncwarp(WMASK);
+                        if (tidx == 15) {
+                                printf("absolpmf_thresh %10.8f l15\n", absolpmf_thresh);
+                                printf("--->            dir %10.8f, %10.8f, %10.8f l15\n", dir.x, dir.y, dir.z);
+                                printf("--->            point %10.8f, %10.8f, %10.8f l15\n", point.x, point.y, point.z);
+                        }
+                        __syncwarp(WMASK);
+                        if (tidx == 31) {
+                                printArray("__pmf_data_sh initial l31", 8, dimt, __pmf_data_sh);
+                                printf("absolpmf_thresh %10.8f l31\n", absolpmf_thresh);
+                                printf("--->            dir %10.8f, %10.8f, %10.8f l31\n", dir.x, dir.y, dir.z);
+                                printf("--->            point %10.8f, %10.8f, %10.8f l31\n", point.x, point.y, point.z);
+                        }
+                        __syncwarp(WMASK);
+                #endif
+
+                // // These should not be relevant
+                // if norm(&direction[0]) == 0:
+                //     return 1
+                // normalize(&direction[0])
+
+                // for i in range(_len):
+                //         cos_sim = self.vertices[i][0] * direction[0] \
+                //                 + self.vertices[i][1] * direction[1] \
+                //                 + self.vertices[i][2] * direction[2]
+                //         if cos_sim < 0:
+                //                 cos_sim = cos_sim * -1
+                //         if cos_sim < self.cos_similarity:
+                //                 pmf[i] = 0
+                const REAL_T cos_similarity = COS(max_angle);
+
+                #pragma unroll
+                for(int i = tidx; i < dimt; i += BDIM_X) {
+                        const REAL_T dot = dir.x*sphere_vertices[i].x+
+                                           dir.y*sphere_vertices[i].y+
+                                           dir.z*sphere_vertices[i].z;
+
+                        if (FABS(dot) < cos_similarity) {
+                                __pmf_data_sh[i] = 0.0;
+                        }
+                }
+                __syncwarp(WMASK);
+
+                #ifdef DEBUG
+                        __syncwarp(WMASK);
+                        if (tidx == 0) {
+                                printArray("__pmf_data_sh after filtering", 8, dimt, __pmf_data_sh);
+                        }
+                        __syncwarp(WMASK);
+                #endif
+
+                // cumsum(pmf, pmf, _len)
+                prefix_sum_sh_d<BDIM_X>(__pmf_data_sh, dimt);
+
+                #ifdef DEBUG
+                        __syncwarp(WMASK);
+                        if (tidx == 0) {
+                                printArray("__pmf_data_sh after cumsum", 8, dimt, __pmf_data_sh);
+                        }
+                        __syncwarp(WMASK);
+                #endif
+
+                // last_cdf = pmf[_len - 1]
+                // if last_cdf == 0:
+                //         return 1
+                REAL_T last_cdf = __pmf_data_sh[dimt - 1];
+                if (last_cdf == 0) {
+                        return 0;
+                }
+
+                // idx = where_to_insert(pmf, random() * last_cdf, _len)
+                if (tidx == 0) {
+                        __tmp = curand_uniform(st) * last_cdf;
+                }
+                REAL_T selected_cdf = __shfl_sync(WMASK, __tmp, 0, BDIM_X);
+// Both these implementations work
+#if 1
+                int low = 0;
+                int high = dimt - 1;
+                while ((high - low) >= BDIM_X) {
+                        const int mid = (low + high) / 2;
+                        if (__pmf_data_sh[mid] < selected_cdf) {
+                                low = mid;
+                        } else {
+                                high = mid;
+                        }
+                }
+                const bool __ballot = (low+tidx <= high) ? (selected_cdf < __pmf_data_sh[low+tidx]) : 0;
+                const int __msk = __ballot_sync(WMASK, __ballot);
+                const int indProb = low + __ffs(__msk) - 1;
+#else
+                int indProb = dimt - 1;
+                for (int ii = 0; ii < dimt; ii+=BDIM_X) {
+                        int __is_greater = 0;
+                        if (ii+tidx < dimt) {
+                                __is_greater = selected_cdf < __pmf_data_sh[ii+tidx];
+                        }
+                        const int __msk = __ballot_sync(WMASK, __is_greater);
+                        if (__msk != 0) {
+                                indProb = ii + __ffs(__msk) - 1;
+                                break;
+                        }
+                }
+#endif
+
+                #ifdef DEBUG
+                        __syncwarp(WMASK);
+                        if (tidx == 0) {
+                                printf("last_cdf %10.8f\n", last_cdf);
+                                printf("selected_cdf %10.8f\n", selected_cdf);
+                                printf("indProb %i out of %i\n", indProb, dimt);
+                        }
+                        __syncwarp(WMASK);
+                #endif
+
+                // newdir = self.vertices[idx]
+                // if (direction[0] * newdir[0]
+                //     + direction[1] * newdir[1]
+                //     + direction[2] * newdir[2] > 0):
+                //     copy_point(&newdir[0], &direction[0])
+                // else:
+                //     newdir[0] = newdir[0] * -1
+                //     newdir[1] = newdir[1] * -1
+                //     newdir[2] = newdir[2] * -1
+                //     copy_point(&newdir[0], &direction[0])
+                if (tidx == 0) {
+                        if ((dir.x * sphere_vertices[indProb].x +
+                             dir.y * sphere_vertices[indProb].y +
+                             dir.z * sphere_vertices[indProb].z) > 0) {
+                                *dirs = MAKE_REAL3(sphere_vertices[indProb].x,
+                                                   sphere_vertices[indProb].y,
+                                                   sphere_vertices[indProb].z);
+                        } else {
+                                *dirs = MAKE_REAL3(-sphere_vertices[indProb].x,
+                                                   -sphere_vertices[indProb].y,
+                                                   -sphere_vertices[indProb].z);
+                        }
+                        // printf("direction addr write %p, slid %i\n", dirs, blockIdx.x*blockDim.y+threadIdx.y);
+                }
+
+                #ifdef DEBUG
+                        __syncwarp(WMASK);
+                        if (tidx == 0) {
+                                printf("last_cdf %10.8f\n", last_cdf);
+                                printf("selected_cdf %10.8f\n", selected_cdf);
+                                printf("indProb %i out of %i\n", indProb, dimt);
+                        }
+                        __syncwarp(WMASK);
+                        if (tidx == 15) {
+                                printf("last_cdf %10.8f l15\n", last_cdf);
+                                printf("selected_cdf %10.8f l15\n", selected_cdf);
+                                printf("indProb %i out of %i l15\n", indProb, dimt);
+                        }
+                        __syncwarp(WMASK);
+                        if (tidx == 31) {
+                                printf("last_cdf %10.8f l31\n", last_cdf);
+                                printf("selected_cdf %10.8f l31\n", selected_cdf);
+                                printf("indProb %i out of %i l31\n", indProb, dimt);
+                        }
+                        __syncwarp(WMASK);
+                #endif
+                return 1;
+        }
+}
+
+template<int BDIM_X,
+         int BDIM_Y,
+         ModelType MODEL_T,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ int tracker_d(curandStatePhilox4_32_10_t *st,
+			 const REAL_T max_angle,
+			 const REAL_T tc_threshold,
+			 const REAL_T step_size,
+			 const REAL_T relative_peak_thres,
+			 const REAL_T min_separation_angle,
+                         REAL3_T seed,
+                         REAL3_T first_step,
+                         REAL_T* ptt_frame,
+                         REAL3_T voxel_size,
+                         const int dimx,
+                         const int dimy,
+                         const int dimz,
+                         const int dimt,
+                         const REAL_T *__restrict__ dataf,
+                         const REAL_T *__restrict__ metric_map,
+		         const int samplm_nr,
+                         const REAL3_T *__restrict__ sphere_vertices,
+                         const int2 *__restrict__ sphere_edges,
+                         const int num_edges,
+                         int *__restrict__ nsteps,
+                         REAL3_T *__restrict__ streamline) {
+
+        const int tidx = threadIdx.x;
+        const int tidy = threadIdx.y;
+
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        int tissue_class = TRACKPOINT;
+
+        REAL3_T point = seed;
+        REAL3_T direction = first_step;
+        __shared__ REAL3_T __sh_new_dir[BDIM_Y];
+
+        if (tidx == 0) {
+                streamline[0] = point;
+        }
+        __syncwarp(WMASK);
+
+        int step_frac;
+        if (MODEL_T == PTT) {
+                step_frac = STEP_FRAC; 
+        } else {
+                step_frac = 1; // STEP_FRAC could be useful in other models
+        }
+
+        int i;
+        for(i = 1; i < MAX_SLINE_LEN*step_frac; i++) {
+                int ndir;
+                if constexpr (MODEL_T == PROB) {
+                        ndir = get_direction_prob_d<BDIM_X,
+                                                    BDIM_Y,
+                                                    0>(
+                                                        st,
+                                                        dataf,
+                                                        max_angle,
+                                                        relative_peak_thres,
+                                                        min_separation_angle,
+                                                        direction,
+                                                        dimx, dimy, dimz, dimt,
+                                                        point,
+                                                        sphere_vertices,
+                                                        sphere_edges,
+                                                        num_edges,
+                                                        __sh_new_dir + tidy);
+                } else if constexpr (MODEL_T == PTT) {
+                        ndir = get_direction_ptt_d<BDIM_X,
+                                                   BDIM_Y,
+                                                   0>(
+                                                        st,
+                                                        dataf,
+                                                        max_angle,
+                                                        step_size,
+                                                        direction,
+                                                        ptt_frame,
+                                                        dimx, dimy, dimz, dimt,
+                                                        point,
+                                                        sphere_vertices,
+                                                        __sh_new_dir + tidy);
+                }
+                __syncwarp(WMASK);
+                direction = __sh_new_dir[tidy];
+                __syncwarp(WMASK);
+
+                if (ndir == 0) {
+                        break;
+                }
+#if 0
+                if (threadIdx.y == 1 && threadIdx.x == 0) {
+                        printf("tracker: i: %d, direction: (%f, %f, %f)\n", i, direction.x, direction.y, direction.z);
+                }
+                //return;
+#endif
+
+                point.x += (direction.x / voxel_size.x) * (step_size / step_frac);
+                point.y += (direction.y / voxel_size.y) * (step_size / step_frac);
+                point.z += (direction.z / voxel_size.z) * (step_size / step_frac);
+
+                if ((tidx == 0) && ((i % step_frac) == 0)){
+                        streamline[i/step_frac] = point;
+#if 0
+                        if (threadIdx.y == 1) {
+                                printf("streamline[%d]: %f, %f, %f\n", i, point.x, point.y, point.z);
+                        }
+#endif
+                }
+                __syncwarp(WMASK);
+
+                tissue_class = check_point_d<BDIM_X, BDIM_Y>(tc_threshold, point, dimx, dimy, dimz, metric_map);
+
+#if 0
+                __syncwarp(WMASK);
+                if (tidx == 0) {
+                        printf("step_size %f\n", step_size);
+                        printf("direction %f, %f, %f\n", direction.x, direction.y, direction.z);
+                        printf("direction addr read %p, slid %i\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y);
+                        printf("voxel_size %f, %f, %f\n", voxel_size.x, voxel_size.y, voxel_size.z);
+                        printf("point %f, %f, %f\n", point.x, point.y, point.z);
+                        printf("tc %i\n", tissue_class);
+                }
+                __syncwarp(WMASK);
+                if (tidx == 15) {
+                        printf("step_size %f l15\n", step_size);
+                        printf("direction %f, %f, %f l15\n", direction.x, direction.y, direction.z);
+                        printf("direction addr read %p, slid %i l15\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y);
+                        printf("voxel_size %f, %f, %f l15\n", voxel_size.x, voxel_size.y, voxel_size.z);
+                        printf("point %f, %f, %f l15\n", point.x, point.y, point.z);
+                        printf("tc %i l15\n", tissue_class);
+                }
+                __syncwarp(WMASK);
+                if (tidx == 31) {
+                        printf("step_size %f l31\n", step_size);
+                        printf("direction %f, %f, %f l31\n", direction.x, direction.y, direction.z);
+                        printf("direction addr read %p, slid %i l31\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y);
+                        printf("voxel_size %f, %f, %f l31\n", voxel_size.x, voxel_size.y, voxel_size.z);
+                        printf("point %f, %f, %f l31\n", point.x, point.y, point.z);
+                        printf("tc %i l31\n", tissue_class);
+                }
+                __syncwarp(WMASK);
+#endif
+
+                if (tissue_class == ENDPOINT ||
+                    tissue_class == INVALIDPOINT ||
+                    tissue_class == OUTSIDEIMAGE) {
+                        break;
+                }
+        }
+        nsteps[0] = i/step_frac;
+        if (((i % step_frac) != 0) && i < step_frac*(MAX_SLINE_LEN - 1)){
+                nsteps[0]++;
+                if (tidx == 0) {
+                        streamline[nsteps[0]] = point;
+                }
+        }
+
+        return tissue_class;
+}
+
+template<int BDIM_X,
+         int BDIM_Y,
+         typename REAL_T,
+         typename REAL3_T>
+__global__ void getNumStreamlinesProb_k(const REAL_T max_angle,
+				        const REAL_T relative_peak_thres,
+				        const REAL_T min_separation_angle,
+				        const long long rndSeed,
+                                        const int nseed,
+                                        const REAL3_T *__restrict__ seeds,
+                                        const int dimx,
+                                        const int dimy,
+                                        const int dimz,
+                                        const int dimt,
+                                        const REAL_T *__restrict__ dataf,
+                                        const REAL3_T *__restrict__ sphere_vertices,
+                                        const int2 *__restrict__ sphere_edges,
+                                        const int num_edges,
+                                        REAL3_T *__restrict__ shDir0,
+                                        int *slineOutOff) {
+
+        const int tidx = threadIdx.x;
+        const int slid = blockIdx.x*blockDim.y + threadIdx.y;
+        const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
+
+        if (slid >= nseed) {
+                return;
+        }
+
+        REAL3_T *__restrict__ __shDir = shDir0+slid*dimt;
+        curandStatePhilox4_32_10_t st;
+        curand_init(rndSeed, gid, 0, &st);
+
+        int ndir = get_direction_prob_d<BDIM_X,
+                                        BDIM_Y,
+                                        1>(
+                                                &st,
+                                                dataf,
+                                                max_angle,
+                                                relative_peak_thres,
+                                                min_separation_angle,
+                                                MAKE_REAL3(0,0,0),
+                                                dimx, dimy, dimz, dimt,
+                                                seeds[slid],
+                                                sphere_vertices,
+                                                sphere_edges,
+                                                num_edges,
+                                                __shDir);
+        if (tidx == 0) {
+                slineOutOff[slid] = ndir;
+        }
+
+        return;
+}
+
+template<int BDIM_X,
+         int BDIM_Y,
+         ModelType MODEL_T,
+         typename REAL_T,
+         typename REAL3_T>
+__global__ void genStreamlinesMergeProb_k(
+				      const REAL_T max_angle,
+				      const REAL_T tc_threshold,
+				      const REAL_T step_size,
+				      const REAL_T relative_peak_thres,
+				      const REAL_T min_separation_angle,
+				      const long long rndSeed,
+                                      const int rndOffset,
+                                      const int nseed,
+                                      const REAL3_T *__restrict__ seeds,
+                                      const int dimx,
+                                      const int dimy,
+                                      const int dimz,
+                                      const int dimt,
+                                      const REAL_T *__restrict__ dataf,
+                                      const REAL_T *__restrict__ metric_map,
+				      const int samplm_nr,
+                                      const REAL3_T *__restrict__ sphere_vertices,
+                                      const int2 *__restrict__ sphere_edges,
+                                      const int num_edges,
+                                      const int    *__restrict__ slineOutOff,
+                                            REAL3_T *__restrict__ shDir0,
+                                            int     *__restrict__ slineSeed,
+                                            int     *__restrict__ slineLen,
+                                            REAL3_T *__restrict__ sline) {
+
+        const int tidx = threadIdx.x;
+        const int tidy = threadIdx.y;
+
+        const int slid = blockIdx.x*blockDim.y + threadIdx.y;
+
+        const int lid = (tidy*BDIM_X + tidx) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        __shared__ REAL_T frame_sh[((MODEL_T == PTT) ? BDIM_Y*18 : 1)]; // Only used by PTT, TODO: way to remove this in other cases
+        REAL_T* __ptt_frame = frame_sh + tidy*18;
+	// const int hr_side = dimt-1;
+
+        curandStatePhilox4_32_10_t st;
+        // const int gbid = blockIdx.y*gridDim.x + blockIdx.x;
+        const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
+        //curand_init(rndSeed, slid+rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(HR_SIDE/BDIM_X)
+        curand_init(rndSeed, gid+1, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
+                                                                                 // elements of the same sequence
+        if (slid >= nseed) {
+                return;
+        }
+
+        REAL3_T seed = seeds[slid]; 
+
+        int ndir = slineOutOff[slid+1]-slineOutOff[slid];
+#if 0
+        if (threadIdx.y == 0 && threadIdx.x == 0) {
+                printf("%s: ndir: %d\n", __func__, ndir);
+                for(int i = 0; i < ndir; i++) {
+                        printf("__shDir[%d][%d]: (%f, %f, %f)\n",
+                                tidy, i, __shDir[tidy][i].x, __shDir[tidy][i].y, __shDir[tidy][i].z);
+                }
+        }
+#endif
+        __syncwarp(WMASK);
+
+        int slineOff = slineOutOff[slid];
+
+        for(int i = 0; i < ndir; i++) {
+                REAL3_T first_step = shDir0[slid*samplm_nr + i];
+
+		REAL3_T *__restrict__ currSline = sline + slineOff*MAX_SLINE_LEN*2;
+
+                if (tidx == 0) {
+                        slineSeed[slineOff] = slid;
+                }
+#if 0
+                if (threadIdx.y == 0 && threadIdx.x == 0) {
+                        printf("calling trackerF from: (%f, %f, %f)\n", first_step.x, first_step.y, first_step.z);
+                }
+#endif
+
+                if (MODEL_T == PTT) {
+                        if (!init_frame_ptt_d<BDIM_X, BDIM_Y>(
+                                &st,
+                                dataf,
+                                max_angle,
+                                step_size,
+                                first_step,
+                                dimx, dimy, dimz, dimt,
+                                seed,
+                                sphere_vertices,
+                                __ptt_frame
+                        )) { // this fails rarely
+                                if (tidx == 0) {
+                                        slineLen[slineOff] = 1;
+                                        currSline[0] = seed;
+                                }
+                                __syncwarp(WMASK);
+                                slineOff += 1;
+                                continue;
+                        }
+                }
+
+
+                int stepsB;
+                const int tissue_classB = tracker_d<BDIM_X,
+                                                    BDIM_Y,
+                                                    MODEL_T>(&st,
+		                		             max_angle,
+			        			     tc_threshold,
+	                        			     step_size,
+	                        			     relative_peak_thres,
+	                        			     min_separation_angle,
+                                                             seed,
+                                                             MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
+                                                             __ptt_frame,
+                                                             MAKE_REAL3(1, 1, 1),
+                                                             dimx, dimy, dimz, dimt, dataf,
+                                                             metric_map,
+		                			     samplm_nr,
+                                                             sphere_vertices,
+                                                             sphere_edges,
+                                                             num_edges,
+                                                             &stepsB,
+                                                             currSline);
+                //if (tidx == 0) {
+                //        slineLenB[slineOff] = stepsB;
+                //}
+
+                // reverse backward sline
+                for(int j = 0; j < stepsB/2; j += BDIM_X) {
+                        if (j+tidx < stepsB/2) {
+                                const REAL3_T __p = currSline[j+tidx];
+                                currSline[j+tidx] = currSline[stepsB-1 - (j+tidx)];
+                                currSline[stepsB-1 - (j+tidx)] = __p;
+                        }
+                }
+
+                int stepsF;
+                const int tissue_classF = tracker_d<BDIM_X,
+                                                    BDIM_Y,
+                                                    MODEL_T>(&st,
+     	                    			             max_angle,
+		        				     tc_threshold,
+	                				     step_size,
+			    				     relative_peak_thres,
+			            			     min_separation_angle,
+                                                             seed,
+                                                             first_step,
+                                                             __ptt_frame + 9,
+                                                             MAKE_REAL3(1, 1, 1),
+                                                             dimx, dimy, dimz, dimt, dataf,
+                                                             metric_map,
+			        			     samplm_nr,
+                                                             sphere_vertices,
+                                                             sphere_edges,
+                                                             num_edges,
+                                                             &stepsF,
+                                                             currSline + stepsB-1);
+                if (tidx == 0) {
+                        slineLen[slineOff] = stepsB-1+stepsF;
+                }
+                
+                slineOff += 1;
+#if 0
+                if (threadIdx.y == 0 && threadIdx.x == 0) {
+                        printf("%s: stepsF: %d, tissue_classF: %d\n", __func__, stepsF, tissue_classF);
+                }
+                __syncwarp(WMASK);
+#endif
+                //if (/* !return_all || */0 &&
+                //    tissue_classF != ENDPOINT &&
+                //    tissue_classF != OUTSIDEIMAGE) {
+                //        continue;
+                //}
+                //if (/* !return_all || */ 0 &&
+                //    tissue_classB != ENDPOINT &&
+                //    tissue_classB != OUTSIDEIMAGE) {
+                //        continue;
+                //}
+        }
+        return;
+}
diff --git a/cuslines/globals.h b/cuslines/cuda_c/globals.h
similarity index 90%
rename from cuslines/globals.h
rename to cuslines/cuda_c/globals.h
index 25c2f29..71bcd73 100644
--- a/cuslines/globals.h
+++ b/cuslines/cuda_c/globals.h
@@ -29,7 +29,7 @@
 #ifndef __GLOBALS_H__
 #define __GLOBALS_H__
 
-#define REAL_SIZE 8
+#define REAL_SIZE 4
 
 #if REAL_SIZE == 4
 
@@ -40,7 +40,7 @@
 #define FLOOR		floorf
 #define LOG		__logf
 #define EXP		__expf
-#define REAL_MAX	(FLT_MAX)
+#define REAL_MAX	__int_as_float(0x7f7fffffU)
 #define REAL_MIN	(-REAL_MAX)
 #define COS		__cosf
 #define SIN		__sinf
@@ -58,7 +58,7 @@
 #define FLOOR		floor
 #define LOG		log
 #define EXP		exp
-#define REAL_MAX	(DBL_MAX)
+#define REAL_MAX	__longlong_as_double(0x7fefffffffffffffLL)
 #define REAL_MIN	(-REAL_MAX)
 #define COS		cos
 #define SIN		sin
@@ -68,9 +68,9 @@
 #define ACOS		acos
 
 #endif
-
+// TODO: half this in when WMGMI seeding
 #define MAX_SLINE_LEN	(501)
-#define PMF_THRESHOLD_P	((REAL)0.1)
+#define PMF_THRESHOLD_P	((REAL)0.05)
 
 #define THR_X_BL (64)
 #define THR_X_SL (32)
@@ -85,6 +85,8 @@
 
 #define EXCESS_ALLOC_FACT 2
 
+#define NORM_EPS ((REAL)1e-8)
+
 #if 0
   #define DEBUG
 #endif
@@ -96,4 +98,6 @@ enum ModelType {
   PTT = 3,
 };
 
+enum {OUTSIDEIMAGE, INVALIDPOINT, TRACKPOINT, ENDPOINT};
+
 #endif
diff --git a/cuslines/ptt.cu b/cuslines/cuda_c/ptt.cu
similarity index 52%
rename from cuslines/ptt.cu
rename to cuslines/cuda_c/ptt.cu
index b36e747..5684272 100644
--- a/cuslines/ptt.cu
+++ b/cuslines/cuda_c/ptt.cu
@@ -1,18 +1,19 @@
 template<typename REAL_T>
-__device__ void norm3_d(REAL_T *num, int fail_ind) {
+__device__ __forceinline__ void norm3_d(REAL_T *num, int fail_ind) {
     const REAL_T scale = SQRT(num[0] * num[0] + num[1] * num[1] + num[2] * num[2]);
 
-    if (scale != 0) {
+    if (scale > NORM_EPS) {
         num[0] /= scale;
         num[1] /= scale;
         num[2] /= scale;
     } else {
+        num[0] = num[1] = num[2] = 0;
         num[fail_ind] = 1.0; // this can happen randomly during propogation, though is exceedingly rare
     }
 }
 
 template<typename REAL_T>
-__device__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, const REAL_T *src2, int fail_ind) {
+__device__ __forceinline__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, const REAL_T *src2, int fail_ind) {
     dest[0] = src1[1] * src2[2] - src1[2] * src2[1];
     dest[1] = src1[2] * src2[0] - src1[0] * src2[2];
     dest[2] = src1[0] * src2[1] - src1[1] * src2[0];
@@ -20,13 +21,20 @@ __device__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, const REAL_T *src
     norm3_d(dest, fail_ind);
 }
 
-template<typename REAL_T, typename REAL3_T>
+template<int BDIM_X, typename REAL_T, typename REAL3_T>
 __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T *__restrict__ pmf,
                             const int dimx, const int dimy, const int dimz, const int dimt,
                             const REAL3_T *__restrict__ odf_sphere_vertices) {
+    const int tidx = threadIdx.x;
+
+    const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+    const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
     int closest_odf_idx = 0;
-    REAL_T __max_cos = 0;
-    for (int ii = 0; ii < dimt; ii++) {
+    REAL_T __max_cos = REAL_T(0);
+
+    #pragma unroll
+    for (int ii = tidx; ii < dimt; ii+= BDIM_X) {  // TODO: I need to think about better ways of parallelizing this
         REAL_T cos_sim = FABS(
             odf_sphere_vertices[ii].x * frame[0] \
             + odf_sphere_vertices[ii].y * frame[1] \
@@ -36,15 +44,31 @@ __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T
             closest_odf_idx = ii;
         }
     }
+    __syncwarp(WMASK);
 
-    const int rv = trilinear_interp_d<THR_X_SL>(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, pos, &__max_cos);
+    #pragma unroll
+    for(int i = BDIM_X/2; i; i /= 2) {
+        const REAL_T __tmp = __shfl_xor_sync(WMASK, __max_cos, i, BDIM_X);
+        const int __tmp_idx = __shfl_xor_sync(WMASK, closest_odf_idx, i, BDIM_X);
+        if (__tmp > __max_cos ||
+           (__tmp == __max_cos && __tmp_idx < closest_odf_idx)) {
+            __max_cos = __tmp;
+            closest_odf_idx = __tmp_idx;
+        }
+    }
+    __syncwarp(WMASK);
 
 #if 0
-    printf("inerpolated %f at %f, %f, %f, %i\n", __max_cos, pos.x, pos.y, pos.z, closest_odf_idx);
+    if (closest_odf_idx >= dimt || closest_odf_idx < 0) { 
+        printf("Error: closest_odf_idx out of bounds: %d (dimt: %d)\n", closest_odf_idx, dimt);
+    }
 #endif
 
+    // TODO: maybe this should be texture memory, I am not so sure
+    const int rv = trilinear_interp_d<THR_X_SL>(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, pos, &__max_cos);
+
     if (rv != 0) {
-        return -1;
+        return 0;  // No support
     } else {
         return __max_cos;
     }
@@ -87,24 +111,57 @@ __device__ void prepare_propagator_d(REAL_T k1, REAL_T k2, REAL_T arclength,
     }
 }
 
+template<typename REAL_T>
+__device__ void random_normal(curandStatePhilox4_32_10_t *st, REAL_T* probing_frame) {
+    probing_frame[3] = curand_normal(st);
+    probing_frame[4] = curand_normal(st);
+    probing_frame[5] = curand_normal(st);
+    REAL_T dot = probing_frame[3]*probing_frame[0]
+        + probing_frame[4]*probing_frame[1]
+        + probing_frame[5]*probing_frame[2];
+
+    probing_frame[3] -= dot*probing_frame[0];
+    probing_frame[4] -= dot*probing_frame[1];
+    probing_frame[5] -= dot*probing_frame[2];
+    REAL_T n2 = probing_frame[3]*probing_frame[3]
+        + probing_frame[4]*probing_frame[4]
+        + probing_frame[5]*probing_frame[5];
+
+    if (n2 < NORM_EPS) {
+        REAL_T abs_x = FABS(probing_frame[0]);
+        REAL_T abs_y = FABS(probing_frame[1]);
+        REAL_T abs_z = FABS(probing_frame[2]);
+
+        if (abs_x <= abs_y && abs_x <= abs_z) {
+            probing_frame[3] = 0.0;
+            probing_frame[4] = probing_frame[2];
+            probing_frame[5] = -probing_frame[1];
+        } 
+        else if (abs_y <= abs_z) {
+            probing_frame[3] = -probing_frame[2];
+            probing_frame[4] = 0.0;
+            probing_frame[5] = probing_frame[0];
+        } 
+        else {
+            probing_frame[3] = probing_frame[1];
+            probing_frame[4] = -probing_frame[0];
+            probing_frame[5] = 0.0;
+        }
+    }
+}
+
 template<bool IS_INIT, typename REAL_T>
 __device__ void get_probing_frame_d(const REAL_T* frame, curandStatePhilox4_32_10_t *st, REAL_T* probing_frame) {
     if (IS_INIT) {
         for (int ii = 0; ii < 3; ii++) { // tangent
             probing_frame[ii] = frame[ii];
         }
-        if ((probing_frame[0] != 0) && (probing_frame[1] != 0)) { // norm
-            probing_frame[3] = -probing_frame[1];
-            probing_frame[4] = probing_frame[0];
-            probing_frame[5] = 0;
-        } else {
-            probing_frame[3] = 0;
-            probing_frame[4] = -probing_frame[2];
-            probing_frame[5] = 0;
-        }
+        norm3_d(probing_frame, 0);
 
-        norm3_d(probing_frame, 0); // tangent
+        random_normal(st, probing_frame);
         norm3_d(probing_frame + 3, 1); // norm
+
+        // calculate binorm
         crossnorm3_d(probing_frame + 2*3, probing_frame, probing_frame + 3, 2); // binorm
     } else {
         for (int ii = 0; ii < 9; ii++) {
@@ -114,7 +171,7 @@ __device__ void get_probing_frame_d(const REAL_T* frame, curandStatePhilox4_32_1
 }
 
 template<typename REAL_T>
-__device__ void propogate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* direc) {
+__device__ void propagate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* direc) {
     REAL_T __tmp[3];
 
     for (int ii = 0; ii < 3; ii++) {
@@ -123,50 +180,61 @@ __device__ void propogate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* dir
         frame[2*3 + ii] = propagator[6]*frame[ii] + propagator[7]*frame[3+ii] + propagator[8]*frame[6+ii];
     }
 
-#if 1
     norm3_d(__tmp, 0); // normalize tangent
     crossnorm3_d(frame + 3, frame + 2*3, __tmp, 1); // calc normal
     crossnorm3_d(frame + 2*3, __tmp, frame + 3, 2); // calculate binorm from tangent, norm
-#else
-    norm3_d(__tmp, 0); // normalize tangent
-    norm3_d(frame + 2*3, 2); // normalize binorm
-    crossnorm3_d(frame + 3, frame + 2*3, __tmp, 1); // calculate normal from binorm, tangent
-#endif
 
     for (int ii = 0; ii < 3; ii++) {
         frame[ii] = __tmp[ii];
     }
 }
 
-template<typename REAL_T, typename REAL3_T>
+template<int BDIM_X, typename REAL_T, typename REAL3_T>
 __device__ REAL_T calculate_data_support_d(REAL_T support,
                                            const REAL3_T pos, const REAL_T *__restrict__ pmf,
                                            const int dimx, const int dimy, const int dimz, const int dimt,
                                            const REAL_T probe_step_size,
+                                           const REAL_T absolpmf_thresh,
                                            const REAL3_T *__restrict__ odf_sphere_vertices,
-                                           REAL_T k1, REAL_T k2,
-                                           REAL_T* probing_frame) {
-    REAL_T probing_prop[9];
-    REAL_T direc[3];
-    REAL3_T probing_pos;
-    REAL_T fod_amp;
-
-    prepare_propagator_d(k1, k2, probe_step_size, probing_prop);
-    probing_pos.x = pos.x;
-    probing_pos.y = pos.y;
-    probing_pos.z = pos.z;
+                                           REAL_T* probing_prop_sh,
+                                           REAL_T* direc_sh, 
+                                           REAL3_T* probing_pos_sh,
+                                           REAL_T* k1_sh, REAL_T* k2_sh,
+                                           REAL_T* probing_frame_sh) {
+    const int tidx = threadIdx.x;
+
+    const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+    const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+    if (tidx == 0) {
+        prepare_propagator_d(
+            *k1_sh, *k2_sh,
+            probe_step_size, probing_prop_sh);
+        probing_pos_sh->x = pos.x;
+        probing_pos_sh->y = pos.y;
+        probing_pos_sh->z = pos.z;
+    }
+    __syncwarp(WMASK);
 
     for (int ii = 0; ii < PROBE_QUALITY; ii++) { // we spend about 2/3 of our time in this loop when doing PTT
-        propogate_frame_d(probing_prop, probing_frame, direc);
+        if (tidx == 0) {
+            propagate_frame_d(
+                probing_prop_sh,
+                probing_frame_sh,
+                direc_sh);
+
+            probing_pos_sh->x += direc_sh[0];
+            probing_pos_sh->y += direc_sh[1];
+            probing_pos_sh->z += direc_sh[2];
+        }
+        __syncwarp(WMASK);
 
-        probing_pos.x += direc[0];
-        probing_pos.y += direc[1];
-        probing_pos.z += direc[2];
+        const REAL_T fod_amp = interp4_d<BDIM_X>( // This is the most expensive call
+            *probing_pos_sh, probing_frame_sh, pmf,
+            dimx, dimy, dimz, dimt,
+            odf_sphere_vertices);
 
-        fod_amp = interp4_d(probing_pos, probing_frame, pmf,
-                            dimx, dimy, dimz, dimt,
-                            odf_sphere_vertices);
-        if (!ALLOW_WEAK_LINK && (fod_amp < PMF_THRESHOLD_P)) {
+        if (!ALLOW_WEAK_LINK && (fod_amp < absolpmf_thresh)) {
             return 0;
         }
         support += fod_amp;
@@ -204,16 +272,37 @@ __device__ int get_direction_ptt_d(
     const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
     const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
 
-	REAL_T __shared__ face_cdf_sh[BDIM_Y*DISC_FACE_CNT];
-    REAL_T __shared__ vert_pdf_sh[BDIM_Y*DISC_VERT_CNT];
-    REAL_T __shared__ first_val_sh[BDIM_Y];
+	__shared__ REAL_T face_cdf_sh[BDIM_Y*DISC_FACE_CNT];
+    __shared__ REAL_T vert_pdf_sh[BDIM_Y*DISC_VERT_CNT];
+
+    __shared__ REAL_T probing_frame_sh[BDIM_Y*9];
+    __shared__ REAL_T k1_probe_sh[BDIM_Y];
+    __shared__ REAL_T k2_probe_sh[BDIM_Y];
+
+    __shared__ REAL_T probing_prop_sh[BDIM_Y*9];
+    __shared__ REAL_T direc_sh[BDIM_Y*3];
+    __shared__ REAL3_T probing_pos_sh[BDIM_Y];
 
     REAL_T *__face_cdf_sh = face_cdf_sh + tidy*DISC_FACE_CNT;
     REAL_T *__vert_pdf_sh = vert_pdf_sh + tidy*DISC_VERT_CNT;
-    REAL_T *__first_val_sh = first_val_sh + tidy;
 
-    const REAL_T max_curvature = SIN(max_angle / 2) / step_size; // bigger numbers means wiggle more
-    const REAL_T probe_step_size = ((step_size / 2) / (PROBE_QUALITY - 1));
+    REAL_T *__probing_frame_sh = probing_frame_sh + tidy*9;
+    REAL_T *__k1_probe_sh = k1_probe_sh + tidy;
+    REAL_T *__k2_probe_sh = k2_probe_sh + tidy;
+
+    REAL_T *__probing_prop_sh = probing_prop_sh + tidy*9;
+    REAL_T *__direc_sh = direc_sh + tidy*3;
+    REAL3_T *__probing_pos_sh = probing_pos_sh + tidy;
+
+    const REAL_T probe_step_size = ((step_size / PROBE_FRAC) / (PROBE_QUALITY - 1));
+    const REAL_T max_curvature = 2.0 * SIN(max_angle / 2.0) / (step_size / PROBE_FRAC); // This seems to work well
+    const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d<BDIM_X>(dimt, pmf, REAL_MIN);
+
+#if 0
+        printf("absolpmf_thresh: %f, max_curvature: %f, probe_step_size: %f\n", absolpmf_thresh, max_curvature, probe_step_size);
+        printf("max_angle: %f\n", max_angle);
+        printf("step_size: %f\n", step_size);
+#endif
 
     REAL_T __tmp;
 
@@ -225,30 +314,32 @@ __device__ int get_direction_ptt_d(
             __frame_sh[2] = dir.z;
         }
     }
-    if (tidx==0) {
-        *__first_val_sh = interp4_d(pos, __frame_sh, pmf,
-                                    dimx, dimy, dimz, dimt,
-                                    odf_sphere_vertices);
-    }
+
+    const REAL_T first_val = interp4_d<BDIM_X>(
+        pos, __frame_sh, pmf,
+        dimx, dimy, dimz, dimt,
+        odf_sphere_vertices);
     __syncwarp(WMASK);
 
     // Calculate __vert_pdf_sh
-    REAL_T probing_frame[9];
-    REAL_T k1_probe, k2_probe;
-    bool support_found = 0;
-    for (int ii = tidx; ii < DISC_VERT_CNT; ii += BDIM_X) {
-        k1_probe = DISC_VERT[ii*2] * max_curvature;
-        k2_probe = DISC_VERT[ii*2+1] * max_curvature;
-
-        get_probing_frame_d<IS_INIT>(__frame_sh, st, probing_frame);
+    bool support_found = false;
+    for (int ii = 0; ii < DISC_VERT_CNT; ii++) {
+        if (tidx == 0) {
+            *__k1_probe_sh = DISC_VERT[ii*2] * max_curvature;
+            *__k2_probe_sh = DISC_VERT[ii*2+1] * max_curvature;
+            get_probing_frame_d<IS_INIT>(__frame_sh, st, __probing_frame_sh);
+        }
+        __syncwarp(WMASK);
 
-        const REAL_T this_support = calculate_data_support_d(
-            *__first_val_sh,
+        const REAL_T this_support = calculate_data_support_d<BDIM_X>(
+            first_val,
             pos, pmf, dimx, dimy, dimz, dimt,
             probe_step_size,
+            absolpmf_thresh,
             odf_sphere_vertices,
-            k1_probe, k2_probe,
-            probing_frame);
+            __probing_prop_sh, __direc_sh, __probing_pos_sh,
+            __k1_probe_sh, __k2_probe_sh,
+            __probing_frame_sh);
 
 #if 0
         if (threadIdx.y == 1 && ii == 0) { 
@@ -256,15 +347,18 @@ __device__ int get_direction_ptt_d(
         }
 #endif
 
-        if (this_support < NORM_MIN_SUPPORT) {
-            __vert_pdf_sh[ii] = 0;
+        if (this_support < PROBE_QUALITY * absolpmf_thresh) {
+            if (tidx == 0) {
+                __vert_pdf_sh[ii] = 0;
+            }
         } else {
-            __vert_pdf_sh[ii] = this_support;
+            if (tidx == 0) {
+                __vert_pdf_sh[ii] = this_support;
+            }
             support_found = 1;
         }
     }
-    const int __msk = __ballot_sync(WMASK, support_found);
-    if (__msk == 0) {
+    if (support_found == 0) {
         return 0;
     }
 
@@ -323,82 +417,71 @@ __device__ int get_direction_ptt_d(
 #endif
 
     // Sample random valid faces randomly
-    REAL_T r1, r2;
-    for (int ii = 0; ii < TRIES_PER_REJECTION_SAMPLING / BDIM_X; ii++) {
-        r1 = curand_uniform(st);
-        r2 = curand_uniform(st);
-		if (r1 + r2 > 1) {
-			r1 = 1 - r1;
-			r2 = 1 - r2;
-		}
-
-        __tmp = curand_uniform(st) * last_cdf;
-		int jj;
-		for (jj = 0; jj < DISC_FACE_CNT; jj++) {
-			if (__face_cdf_sh[jj] >= __tmp)
-				break;
-		}
-
-        const REAL_T vx0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2];
-        const REAL_T vx1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2];
-        const REAL_T vx2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2];
-
-        const REAL_T vy0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2 + 1];
-        const REAL_T vy1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2 + 1];
-        const REAL_T vy2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2 + 1];
-
-        k1_probe = vx0 + r1 * (vx1 - vx0) + r2 * (vx2 - vx0);
-        k2_probe = vy0 + r1 * (vy1 - vy0) + r2 * (vy2 - vy0);
-
-        get_probing_frame_d<IS_INIT>(__frame_sh, st, probing_frame);
-
-        const REAL_T this_support = calculate_data_support_d(*__first_val_sh,
-                                                             pos, pmf, dimx, dimy, dimz, dimt,
-                                                             probe_step_size,
-                                                             odf_sphere_vertices,
-                                                             k1_probe, k2_probe,
-                                                             probing_frame);
+    for (int ii = 0; ii < TRIES_PER_REJECTION_SAMPLING; ii++) {
+        if (tidx == 0) {
+            REAL_T r1 = curand_uniform(st);
+            REAL_T r2 = curand_uniform(st);
+            if (r1 + r2 > 1) {
+                r1 = 1 - r1;
+                r2 = 1 - r2;
+            }
+
+            __tmp = curand_uniform(st) * last_cdf;
+            int jj;
+            for (jj = 0; jj < DISC_FACE_CNT; jj++) { // TODO: parallelize this
+                if (__face_cdf_sh[jj] >= __tmp)
+                    break;
+            }
+
+            const REAL_T vx0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2];
+            const REAL_T vx1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2];
+            const REAL_T vx2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2];
 
+            const REAL_T vy0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2 + 1];
+            const REAL_T vy1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2 + 1];
+            const REAL_T vy2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2 + 1];
 
+            *__k1_probe_sh = vx0 + r1 * (vx1 - vx0) + r2 * (vx2 - vx0);
+            *__k2_probe_sh = vy0 + r1 * (vy1 - vy0) + r2 * (vy2 - vy0);
+            get_probing_frame_d<IS_INIT>(__frame_sh, st, __probing_frame_sh);
+        }
+        __syncwarp(WMASK);
+
+        const REAL_T this_support = calculate_data_support_d<BDIM_X>(
+            first_val,
+            pos, pmf, dimx, dimy, dimz, dimt,
+            probe_step_size,
+            absolpmf_thresh,
+            odf_sphere_vertices,
+            __probing_prop_sh, __direc_sh, __probing_pos_sh,
+            __k1_probe_sh, __k2_probe_sh,
+            __probing_frame_sh);
         __syncwarp(WMASK);
-        int winning_lane = -1; // -1 indicates nobody won
-        int __msk = __ballot_sync(WMASK, this_support >= NORM_MIN_SUPPORT);
-        if (__msk != 0) {
-            REAL_T group_max_support = this_support;
-            #pragma unroll
-            for(int j = 1; j < PROBABILISTIC_GROUP_SZ; j *= 2) {
-                __tmp = __shfl_xor_sync(WMASK, group_max_support, j, BDIM_X);
-                group_max_support = MAX(group_max_support, __tmp);
-            }
 
-            __msk &= __ballot_sync(WMASK, this_support == group_max_support);
-            winning_lane = __ffs(__msk) - 1;
+        if (this_support < PROBE_QUALITY * absolpmf_thresh) {
+            continue;
         }
-        if (winning_lane != -1) {
-            if (tidx == winning_lane) {
-#if 0
-                if (threadIdx.y == 1) {
-                    printf("winning k1 %f, k2 %f, cdf %f, cdf_idx %i", k1_probe, k2_probe, __tmp, jj);
-                }
-#endif
-                if (IS_INIT) {
-                    dirs[0] = dir;
-                } else {
-                    REAL_T __prop[9];
-                    REAL_T __dir[3];
-                    prepare_propagator_d(k1_probe, k2_probe, step_size/STEP_FRAC, __prop);
-                    propogate_frame_d(__prop, probing_frame, __dir);
-                    norm3_d(__dir, 0); // this will be scaled by the generic stepping code
-                    dirs[0] = (REAL3_T) {__dir[0], __dir[1], __dir[2]};
-                }
 
-                for (int jj = 0; jj < 9; jj++) {
-                    __frame_sh[jj] = probing_frame[jj];
-                }
+        if (tidx == 0) {
+            if (IS_INIT) {
+                dirs[0] = dir;
+            } else {
+                // propagate, but only 1/STEP_FRAC of a step
+                prepare_propagator_d(
+                    *__k1_probe_sh, *__k2_probe_sh,
+                    step_size/STEP_FRAC, __probing_prop_sh);
+                get_probing_frame_d<0>(__frame_sh, st, __probing_frame_sh);
+                propagate_frame_d(__probing_prop_sh, __probing_frame_sh, __direc_sh);
+                norm3_d(__direc_sh, 0); // this will be scaled by the generic stepping code
+                dirs[0] = MAKE_REAL3(__direc_sh[0], __direc_sh[1], __direc_sh[2]);
             }
-            __syncwarp(WMASK);
-            return 1;
         }
+
+        if (tidx < 9) {
+            __frame_sh[tidx] = __probing_frame_sh[tidx];
+        }
+        __syncwarp(WMASK);
+        return 1;
     }
     return 0;
 }
diff --git a/cuslines/ptt.cuh b/cuslines/cuda_c/ptt.cuh
similarity index 71%
rename from cuslines/ptt.cuh
rename to cuslines/cuda_c/ptt.cuh
index d8986b5..9126250 100644
--- a/cuslines/ptt.cuh
+++ b/cuslines/cuda_c/ptt.cuh
@@ -4,18 +4,13 @@
 #include "disc.h"
 #include "globals.h"
 
-#define STEP_FRAC 20 // divides output step size (usually 0.5) into this many internal steps
-#define PROBE_FRAC 2 // divides output step size (usually 0.5) to find probe length
-#define PROBE_QUALITY 4
-#define SAMPLING_QUALITY 4 // can be 2-7
-#define PROBABILISTIC_BIAS 1 // 1 looks good. can be 0-log_2(N_WARPS) (typically 0-5). 0 is fully probabilistic, 4 is close to deterministic.
-#define ALLOW_WEAK_LINK 1
-#define TRIES_PER_REJECTION_SAMPLING 1024
-#define DEFAULT_PTT_MINDATASUPPORT 0.05
-#define K_SMALL 0.0001
-
-#define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT * PROBE_QUALITY)
-#define PROBABILISTIC_GROUP_SZ POW2(PROBABILISTIC_BIAS)
+#define STEP_FRAC (20) // divides output step size (usually 0.5) into this many internal steps
+#define PROBE_FRAC (2) // divides output step size (usually 0.5) to find probe length
+#define PROBE_QUALITY (4) // Number of probing steps
+#define SAMPLING_QUALITY (2) // can be 2-7
+#define ALLOW_WEAK_LINK (0)
+#define TRIES_PER_REJECTION_SAMPLING (1024)
+#define K_SMALL ((REAL) 0.0001)
 
 #if SAMPLING_QUALITY == 2
 #define DISC_VERT_CNT DISC_2_VERT_CNT
diff --git a/cuslines/cuda_c/tracking_helpers.cu b/cuslines/cuda_c/tracking_helpers.cu
new file mode 100644
index 0000000..21d5f67
--- /dev/null
+++ b/cuslines/cuda_c/tracking_helpers.cu
@@ -0,0 +1,290 @@
+
+using namespace cuwsort;
+
+template<typename REAL_T>
+__device__ REAL_T interpolation_helper_d(const REAL_T*__restrict__ dataf, const REAL_T wgh[3][2], const long long coo[3][2], int dimy, int dimz, int dimt, int t) {
+    REAL_T __tmp = 0;
+    #pragma unroll
+    for (int i = 0; i < 2; i++) {
+        #pragma unroll
+        for (int j = 0; j < 2; j++) {
+            #pragma unroll
+            for (int k = 0; k < 2; k++) {
+                __tmp += wgh[0][i] * wgh[1][j] * wgh[2][k] *
+                         dataf[coo[0][i] * dimy * dimz * dimt +
+                               coo[1][j] * dimz * dimt +
+                               coo[2][k] * dimt +
+                               t];
+            }
+        }
+    }
+    return __tmp;
+}
+
+template<int BDIM_X,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ int trilinear_interp_d(const int dimx,
+                                  const int dimy,
+                                  const int dimz,
+                                  const int dimt,
+                                  int dimt_idx, // If -1, get all
+                                  const REAL_T *__restrict__ dataf,
+                                  const REAL3_T point,
+                                  REAL_T *__restrict__ __vox_data) {
+        const REAL_T HALF = static_cast<REAL_T>(0.5);
+
+        // all thr compute the same here
+        if (point.x < -HALF || point.x+HALF >= dimx ||
+            point.y < -HALF || point.y+HALF >= dimy ||
+               point.z < -HALF || point.z+HALF >= dimz) {
+                return -1;
+        }
+
+        long long  coo[3][2];
+        REAL_T wgh[3][2]; // could use just one...
+
+        const REAL_T ONE  = static_cast<REAL_T>(1.0);
+
+        const REAL3_T fl = MAKE_REAL3(FLOOR(point.x),
+                                      FLOOR(point.y),
+                                      FLOOR(point.z));
+
+        wgh[0][1] = point.x - fl.x; 
+        wgh[0][0] = ONE-wgh[0][1]; 
+        coo[0][0] = MAX(0, fl.x);
+        coo[0][1] = MIN(dimx-1, coo[0][0]+1);
+
+        wgh[1][1] = point.y - fl.y; 
+        wgh[1][0] = ONE-wgh[1][1]; 
+        coo[1][0] = MAX(0, fl.y);
+        coo[1][1] = MIN(dimy-1, coo[1][0]+1);
+
+        wgh[2][1] = point.z - fl.z; 
+        wgh[2][0] = ONE-wgh[2][1]; 
+        coo[2][0] = MAX(0, fl.z);
+        coo[2][1] = MIN(dimz-1, coo[2][0]+1);
+
+        if (dimt_idx == -1) {
+                for (int t = threadIdx.x; t < dimt; t += BDIM_X) {
+                        __vox_data[t] = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, t);
+                }
+        } else {
+                *__vox_data = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, dimt_idx);
+        }
+
+        // if (threadIdx.x == 0) {
+        //         printf("point: %f, %f, %f\n", point.x, point.y, point.z);
+        //         printf("dimt_idx: %d\n", dimt_idx);
+        //         // for(int i = 0; i < dimt; i++) {
+        //         //         printf("__vox_data[%d]: %f\n", i, __vox_data[i]);
+        //         // }
+        // }
+        return 0;
+}
+
+template<int BDIM_X,
+         int BDIM_Y,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ int check_point_d(const REAL_T tc_threshold,
+			     const REAL3_T point,
+                             const int dimx,
+                             const int dimy,
+                             const int dimz,
+                             const REAL_T *__restrict__ metric_map) {
+
+        const int tidy = threadIdx.y;
+
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        __shared__ REAL_T __shInterpOut[BDIM_Y];
+
+        const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, 1, 0, metric_map, point, __shInterpOut+tidy);
+        __syncwarp(WMASK);
+#if 0
+        if (threadIdx.y == 1 && threadIdx.x == 0) {
+                printf("__shInterpOut[tidy]: %f, TC_THRESHOLD_P: %f\n", __shInterpOut[tidy], TC_THRESHOLD_P);
+        }
+#endif
+        if (rv != 0) {
+                return OUTSIDEIMAGE;
+        }
+        //return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT;
+        return (__shInterpOut[tidy] > tc_threshold) ? TRACKPOINT : ENDPOINT;
+}
+
+template<int BDIM_X,
+         int BDIM_Y,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ int peak_directions_d(const REAL_T  *__restrict__ odf,
+                                       REAL3_T *__restrict__ dirs,
+                                 const REAL3_T *__restrict__ sphere_vertices,
+                                 const int2 *__restrict__ sphere_edges,
+                                 const int num_edges,
+				 int samplm_nr,
+				 int *__restrict__ __shInd,
+				 const REAL_T relative_peak_thres,
+				 const REAL_T min_separation_angle) {
+
+        const int tidx = threadIdx.x;
+
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        const unsigned int lmask = (1 << lid)-1;
+
+//        __shared__ int __shInd[BDIM_Y][SAMPLM_NR];
+
+        #pragma unroll
+        for(int j = tidx; j < samplm_nr; j += BDIM_X) {
+		__shInd[j] = 0;
+        }
+
+        REAL_T odf_min = min_d<BDIM_X>(samplm_nr, odf, REAL_MAX);
+        odf_min = MAX(0, odf_min);
+
+        __syncwarp(WMASK);
+
+        // local_maxima() + _compare_neighbors()
+        // selecting only the indices corrisponding to maxima Ms
+        // such that M-odf_min >= relative_peak_thres
+        //#pragma unroll
+        for(int j = 0; j < num_edges; j += BDIM_X) {
+                if (j+tidx < num_edges) {
+                        const int u_ind = sphere_edges[j+tidx].x;
+                        const int v_ind = sphere_edges[j+tidx].y;
+
+                        //if (u_ind >= NUM_EDGES || v_ind >= NUM_EDGES) { ERROR; }
+
+                        const REAL_T u_val = odf[u_ind];
+                        const REAL_T v_val = odf[v_ind];
+
+                        //if (u_val != u_val || v_val != v_val) { ERROR_NANs; }
+
+                        // only check that they are not equal
+                        //if (u_val != v_val) {
+                        //        __shInd[tidy][u_val < v_val ? u_ind : v_ind] = -1; // benign race conditions...
+                        //}
+                        if (u_val < v_val) {
+                                atomicExch(__shInd+u_ind, -1);
+                                atomicOr(  __shInd+v_ind,  1);
+                        } else if (v_val < u_val) {
+                                atomicExch(__shInd+v_ind, -1);
+                                atomicOr(  __shInd+u_ind,  1);
+                        }
+                }
+        }
+        __syncwarp(WMASK);
+
+        const REAL_T compThres = relative_peak_thres*max_mask_transl_d<BDIM_X>(samplm_nr, __shInd, odf, -odf_min, REAL_MIN);
+#if 1
+/*
+        if (!tidy && !tidx) {
+                for(int j = 0; j < SAMPLM_NR; j++) {
+                        printf("local_max[%d]: %d (%f)\n", j, __shInd[tidy][j], odf[j]);
+                }
+                printf("maxMax with offset %f: %f\n", -odf_min, compThres);
+        }
+        __syncwarp(WMASK);
+*/
+        // compact indices of positive values to the right
+        int n = 0;
+
+        for(int j = 0; j < samplm_nr; j += BDIM_X) {
+
+                const int __v = (j+tidx < samplm_nr) ? __shInd[j+tidx] : -1;
+                const int __keep = (__v > 0) && ((odf[j+tidx]-odf_min) >= compThres);
+                const int __msk = __ballot_sync(WMASK, __keep);
+
+//__syncwarp(WMASK); // unnecessary
+                if (__keep) {
+                        const int myoff = __popc(__msk & lmask);
+                        __shInd[n + myoff] = j+tidx;
+                }
+                n += __popc(__msk);
+//__syncwarp(WMASK); // should be unnecessary
+        }
+        __syncwarp(WMASK);
+/*
+        if (!tidy && !tidx) {
+                for(int j = 0; j < n; j++) {
+                        printf("local_max_compact[%d]: %d\n", j, __shInd[tidy][j]);
+                }
+        }
+        __syncwarp(WMASK);
+*/
+
+        // sort local maxima indices
+        if (n < BDIM_X) {
+                REAL_T k = REAL_MIN;
+                int    v = 0;
+                if (tidx < n) {
+                        v = __shInd[tidx];
+                        k = odf[v];
+                }
+                warp_sort<32, BDIM_X, WSORT_DIR_DEC>(&k, &v);
+                __syncwarp(WMASK);
+
+                if (tidx < n) {
+                        __shInd[tidx] = v;
+                }
+        } else {
+                // ERROR !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+        }
+        __syncwarp(WMASK);
+
+        // __shInd[tidy][] contains the indices in odf correspoding to
+        // normalized maxima NOT sorted!
+        if (n != 0) {
+                // remove_similar_vertices()
+                // PRELIMINARY INEFFICIENT, SINGLE TH, IMPLEMENTATION
+                if (tidx == 0) {
+                        const REAL_T cos_similarity = COS(min_separation_angle);
+
+                        dirs[0] = sphere_vertices[__shInd[0]];
+
+                        int k = 1;
+                        for(int i = 1; i < n; i++) {
+
+                                const REAL3_T abc = sphere_vertices[__shInd[i]];
+
+                                int j = 0;
+                                for(; j < k; j++) {
+                                        const REAL_T cos = FABS(abc.x*dirs[j].x+
+                                                                abc.y*dirs[j].y+
+                                                                abc.z*dirs[j].z);
+                                        if (cos > cos_similarity) {
+                                                break;
+                                        }
+                                }
+                                if (j == k) {
+                                        dirs[k++] = abc;
+                                }
+                        }
+                        n = k;
+                }
+                n = __shfl_sync(WMASK, n, 0, BDIM_X);
+                __syncwarp(WMASK);
+
+        }
+/*
+        if (!tidy && !tidx) {
+                for(int j = 0; j < n; j++) {
+                        printf("local_max_compact_uniq[%d]: %d\n", j, __shInd[tidy][j]);
+                }
+        }
+        __syncwarp(WMASK);
+*/
+#else
+        const int indMax = max_d<BDIM_X, SAMPLM_NR>(__shInd[tidy], -1);
+        if (indMax != -1) {
+                __ret = MAKE_REAL3(sphere_vertices[indMax][0],
+                                   sphere_vertices[indMax][1],
+                                   sphere_vertices[indMax][2]);
+        }
+#endif
+        return n;
+}
diff --git a/cuslines/cuda_c/utils.cu b/cuslines/cuda_c/utils.cu
new file mode 100644
index 0000000..8c5afe1
--- /dev/null
+++ b/cuslines/cuda_c/utils.cu
@@ -0,0 +1,138 @@
+template<int BDIM_X,
+         typename VAL_T>
+__device__ VAL_T max_d(const int n, const VAL_T *__restrict__ src, const VAL_T minVal) {
+
+        const int tidx = threadIdx.x;
+
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        VAL_T __m = minVal;
+
+        for(int i = tidx; i < n; i += BDIM_X) {
+		__m = MAX(__m, src[i]);
+        }
+
+        #pragma unroll
+        for(int i = BDIM_X/2; i; i /= 2) {
+                const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X);
+                __m = MAX(__m, __tmp);
+        }
+
+        return __m;
+}
+
+template<int BDIM_X,
+         typename LEN_T,
+         typename VAL_T>
+__device__ VAL_T max_mask_transl_d(const int n,
+				   const LEN_T *__restrict__ srcMsk,
+                                   const VAL_T *__restrict__ srcVal,
+                                   const VAL_T offset,
+                                   const VAL_T minVal) {
+
+        const int tidx = threadIdx.x;
+
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        VAL_T __m = minVal;
+
+        for(int i = tidx; i < n; i += BDIM_X) {
+		const LEN_T sel = srcMsk[i];
+		if (sel > 0) {
+			__m = MAX(__m, srcVal[i]+offset);
+		}
+        }
+
+        #pragma unroll
+        for(int i = BDIM_X/2; i; i /= 2) {
+                const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X);
+                __m = MAX(__m, __tmp);
+        }
+
+        return __m;
+}
+
+template<int BDIM_X,
+         typename VAL_T>
+__device__ VAL_T min_d(const int n, const VAL_T *__restrict__ src, const VAL_T maxVal) {
+
+        const int tidx = threadIdx.x;
+
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        VAL_T __m = maxVal;
+
+        for(int i = tidx; i < n; i += BDIM_X) {
+		__m = MIN(__m, src[i]);
+        }
+
+        #pragma unroll
+        for(int i = BDIM_X/2; i; i /= 2) {
+                const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X);
+                __m = MIN(__m, __tmp);
+        }
+
+        return __m;
+}
+
+template<int BDIM_X, typename REAL_T>
+__device__ void prefix_sum_sh_d(REAL_T *num_sh, int __len) {
+    const int tidx = threadIdx.x;
+
+    const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+    const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+#if 0
+    // for debugging
+    __syncwarp(WMASK);
+    if (tidx == 0) {
+        for (int j = 1; j < __len; j++) {
+            num_sh[j] += num_sh[j-1];
+        }
+    }
+    __syncwarp(WMASK);
+#else
+    for (int j = 0; j < __len; j += BDIM_X) {
+        if ((tidx == 0) && (j != 0)) {
+            num_sh[j] += num_sh[j-1];
+        }
+        __syncwarp(WMASK);
+
+        REAL_T __t_pmf;
+        if (j+tidx < __len) {
+            __t_pmf = num_sh[j+tidx];
+        }
+        for (int i = 1; i < BDIM_X; i*=2) {
+            REAL_T __tmp = __shfl_up_sync(WMASK, __t_pmf, i, BDIM_X);
+            if ((tidx >= i) && (j+tidx < __len)) {
+                    __t_pmf += __tmp;
+            }
+        }
+        if (j+tidx < __len) {
+            num_sh[j+tidx] = __t_pmf;
+        }
+        __syncwarp(WMASK);
+    }
+#endif
+}
+
+template<typename REAL_T>
+__device__ void printArrayAlways(const char *name, int ncol, int n, REAL_T *arr) {
+        printf("%s:\n", name);
+
+        for(int j = 0; j < n; j++) {
+                if ((j%ncol)==0) printf("\n");
+                printf("%10.8f ", arr[j]);
+        }
+        printf("\n");
+}
+
+template<typename REAL_T>
+__device__ void printArray(const char *name, int ncol, int n, REAL_T *arr) {
+	if (!threadIdx.x && !threadIdx.y && !blockIdx.x) {
+		printArrayAlways(name, ncol, n, arr);
+	}
+}
diff --git a/cuslines/cuda_python/__init__.py b/cuslines/cuda_python/__init__.py
new file mode 100644
index 0000000..fd05c1e
--- /dev/null
+++ b/cuslines/cuda_python/__init__.py
@@ -0,0 +1,13 @@
+from .cu_tractography import GPUTracker
+from .cu_direction_getters import (
+    ProbDirectionGetter,
+    PttDirectionGetter,
+    BootDirectionGetter,
+)
+
+__all__ = [
+    "GPUTracker",
+    "ProbDirectionGetter",
+    "PttDirectionGetter",
+    "BootDirectionGetter",
+]
diff --git a/cuslines/cuda_python/_globals.py b/cuslines/cuda_python/_globals.py
new file mode 100644
index 0000000..c19368e
--- /dev/null
+++ b/cuslines/cuda_python/_globals.py
@@ -0,0 +1,10 @@
+# AUTO-GENERATED FROM globals.h — DO NOT EDIT
+
+EXCESS_ALLOC_FACT = 2
+MAX_SLINES_PER_SEED = 10
+MAX_SLINE_LEN = 501
+NORM_EPS = 1e-08
+PMF_THRESHOLD_P = 0.05
+REAL_SIZE = 4
+THR_X_BL = 64
+THR_X_SL = 32
diff --git a/cuslines/cuda_python/cu_direction_getters.py b/cuslines/cuda_python/cu_direction_getters.py
new file mode 100644
index 0000000..617f893
--- /dev/null
+++ b/cuslines/cuda_python/cu_direction_getters.py
@@ -0,0 +1,472 @@
+import numpy as np
+from abc import ABC, abstractmethod
+import logging
+from importlib.resources import files
+from time import time
+
+from dipy.reconst import shm
+
+from cuda.core import Device, LaunchConfig, Program, launch, ProgramOptions
+from cuda.pathfinder import find_nvidia_header_directory
+from cuda.cccl import get_include_paths
+from cuda.bindings import runtime, driver
+from cuda.bindings.runtime import cudaMemcpyKind
+
+from cuslines.cuda_python.cutils import (
+    REAL_SIZE,
+    REAL_DTYPE,
+    REAL_DTYPE_AS_STR,
+    REAL3_DTYPE_AS_STR,
+    checkCudaErrors,
+    ModelType,
+    THR_X_SL,
+    BLOCK_Y,
+)
+
+logger = logging.getLogger("GPUStreamlines")
+
+
+class GPUDirectionGetter(ABC):
+    @abstractmethod
+    def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp):
+        pass
+
+    @abstractmethod
+    def generateStreamlines(self, n, nseeds_gpu, block, grid, sp):
+        pass
+
+    def allocate_on_gpu(self, n):
+        pass
+
+    def deallocate_on_gpu(self, n):
+        pass
+
+    def compile_program(self, debug: bool = False):
+        start_time = time()
+        logger.info("Compiling GPUStreamlines")
+
+        cuslines_cuda = files("cuslines").joinpath("cuda_c")
+
+        if debug:
+            program_opts = {
+                "ptxas_options": ["-O0", "-v"],
+                "device_code_optimize": True,
+                "debug": True,
+                "lineinfo": True,
+            }
+        else:
+            program_opts = {"ptxas_options": ["-O3"]}
+
+        program_options = ProgramOptions(
+            name="cuslines",
+            use_fast_math=True,
+            std="c++17",
+            define_macro="__NVRTC__",
+            include_path=[
+                str(cuslines_cuda),
+                find_nvidia_header_directory("cudart"),
+                find_nvidia_header_directory("curand"),
+                get_include_paths().libcudacxx,
+            ],
+            **program_opts,
+        )
+
+        # Here we assume all devices are the same,
+        # so we compile once for any current device.
+        # I think this is reasonable
+        dev = Device()
+        dev.set_current()
+        cuda_path = cuslines_cuda.joinpath("generate_streamlines_cuda.cu")
+        with open(cuda_path, "r") as f:
+            prog = Program(f.read(), code_type="c++", options=program_options)
+        self.module = prog.compile(
+            "cubin",
+            name_expressions=(
+                self.getnum_kernel_name,
+                self.genstreamlines_kernel_name,
+            ),
+        )
+        logger.info(
+            "GPUStreamlines compiled successfully in %.2f seconds", time() - start_time
+        )
+
+
+class BootDirectionGetter(GPUDirectionGetter):
+    def __init__(
+        self,
+        model_type: str,
+        min_signal: float,
+        H: np.ndarray,
+        R: np.ndarray,
+        delta_b: np.ndarray,
+        delta_q: np.ndarray,
+        sampling_matrix: np.ndarray,
+        b0s_mask: np.ndarray,
+    ):
+        if model_type.upper() == "OPDT":
+            self.model_type = int(ModelType.OPDT)
+        elif model_type.upper() == "CSA":
+            self.model_type = int(ModelType.CSA)
+        else:
+            raise ValueError(
+                f"Invalid model_type {model_type}, must be one of 'OPDT', 'CSA'"
+            )
+
+        checkCudaErrors(driver.cuInit(0))
+
+        self.H = np.ascontiguousarray(H, dtype=REAL_DTYPE)
+        self.R = np.ascontiguousarray(R, dtype=REAL_DTYPE)
+        self.delta_b = np.ascontiguousarray(delta_b, dtype=REAL_DTYPE)
+        self.delta_q = np.ascontiguousarray(delta_q, dtype=REAL_DTYPE)
+        self.delta_nr = int(delta_b.shape[0])
+        self.min_signal = REAL_DTYPE(min_signal)
+        self.sampling_matrix = np.ascontiguousarray(sampling_matrix, dtype=REAL_DTYPE)
+        self.b0s_mask = np.ascontiguousarray(b0s_mask, dtype=np.int32)
+
+        self.H_d = []
+        self.R_d = []
+        self.delta_b_d = []
+        self.delta_q_d = []
+        self.b0s_mask_d = []
+        self.sampling_matrix_d = []
+
+        self.getnum_kernel_name = f"getNumStreamlinesBoot_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
+        self.genstreamlines_kernel_name = f"genStreamlinesMergeBoot_k<{THR_X_SL},{BLOCK_Y},{model_type.upper()},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
+        self.compile_program()
+
+    @classmethod
+    def from_dipy_opdt(
+        cls,
+        gtab,
+        sphere,
+        sh_order_max=6,
+        full_basis=False,
+        sh_lambda=0.006,
+        min_signal=1,
+    ):
+        sampling_matrix, _, _ = shm.real_sh_descoteaux(
+            sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=False
+        )
+
+        model = shm.OpdtModel(
+            gtab, sh_order_max=sh_order_max, smooth=sh_lambda, min_signal=min_signal
+        )
+        fit_matrix = model._fit_matrix
+        delta_b, delta_q = fit_matrix
+
+        b0s_mask = gtab.b0s_mask
+        dwi_mask = ~b0s_mask
+        x, y, z = model.gtab.gradients[dwi_mask].T
+        _, theta, phi = shm.cart2sphere(x, y, z)
+        B, _, _ = shm.real_sym_sh_basis(sh_order_max, theta, phi)
+        H = shm.hat(B)
+        R = shm.lcr_matrix(H)
+
+        return cls(
+            model_type="OPDT",
+            min_signal=min_signal,
+            H=H,
+            R=R,
+            delta_b=delta_b,
+            delta_q=delta_q,
+            sampling_matrix=sampling_matrix,
+            b0s_mask=gtab.b0s_mask,
+        )
+
+    @classmethod
+    def from_dipy_csa(
+        cls,
+        gtab,
+        sphere,
+        sh_order_max=6,
+        full_basis=False,
+        sh_lambda=0.006,
+        min_signal=1,
+    ):
+        sampling_matrix, _, _ = shm.real_sh_descoteaux(
+            sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=False
+        )
+
+        model = shm.CsaOdfModel(
+            gtab, sh_order_max=sh_order_max, smooth=sh_lambda, min_signal=min_signal
+        )
+        fit_matrix = model._fit_matrix
+        delta_b = fit_matrix
+        delta_q = fit_matrix
+
+        b0s_mask = gtab.b0s_mask
+        dwi_mask = ~b0s_mask
+        x, y, z = model.gtab.gradients[dwi_mask].T
+        _, theta, phi = shm.cart2sphere(x, y, z)
+        B, _, _ = shm.real_sym_sh_basis(sh_order_max, theta, phi)
+        H = shm.hat(B)
+        R = shm.lcr_matrix(H)
+
+        return cls(
+            model_type="CSA",
+            min_signal=min_signal,
+            H=H,
+            R=R,
+            delta_b=delta_b,
+            delta_q=delta_q,
+            sampling_matrix=sampling_matrix,
+            b0s_mask=gtab.b0s_mask,
+        )
+
+    def allocate_on_gpu(self, n):
+        self.H_d.append(checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.H.size)))
+        self.R_d.append(checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.R.size)))
+        self.delta_b_d.append(
+            checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.delta_b.size))
+        )
+        self.delta_q_d.append(
+            checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.delta_q.size))
+        )
+        self.b0s_mask_d.append(
+            checkCudaErrors(runtime.cudaMalloc(np.int32().nbytes * self.b0s_mask.size))
+        )
+        self.sampling_matrix_d.append(
+            checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.sampling_matrix.size))
+        )
+
+        checkCudaErrors(
+            runtime.cudaMemcpy(
+                self.H_d[n],
+                self.H.ctypes.data,
+                REAL_SIZE * self.H.size,
+                cudaMemcpyKind.cudaMemcpyHostToDevice,
+            )
+        )
+        checkCudaErrors(
+            runtime.cudaMemcpy(
+                self.R_d[n],
+                self.R.ctypes.data,
+                REAL_SIZE * self.R.size,
+                cudaMemcpyKind.cudaMemcpyHostToDevice,
+            )
+        )
+        checkCudaErrors(
+            runtime.cudaMemcpy(
+                self.delta_b_d[n],
+                self.delta_b.ctypes.data,
+                REAL_SIZE * self.delta_b.size,
+                cudaMemcpyKind.cudaMemcpyHostToDevice,
+            )
+        )
+        checkCudaErrors(
+            runtime.cudaMemcpy(
+                self.delta_q_d[n],
+                self.delta_q.ctypes.data,
+                REAL_SIZE * self.delta_q.size,
+                cudaMemcpyKind.cudaMemcpyHostToDevice,
+            )
+        )
+        checkCudaErrors(
+            runtime.cudaMemcpy(
+                self.b0s_mask_d[n],
+                self.b0s_mask.ctypes.data,
+                np.int32().nbytes * self.b0s_mask.size,
+                cudaMemcpyKind.cudaMemcpyHostToDevice,
+            )
+        )
+        checkCudaErrors(
+            runtime.cudaMemcpy(
+                self.sampling_matrix_d[n],
+                self.sampling_matrix.ctypes.data,
+                REAL_SIZE * self.sampling_matrix.size,
+                cudaMemcpyKind.cudaMemcpyHostToDevice,
+            )
+        )
+
+    def deallocate_on_gpu(self, n):
+        if self.H_d[n]:
+            checkCudaErrors(runtime.cudaFree(self.H_d[n]))
+        if self.R_d[n]:
+            checkCudaErrors(runtime.cudaFree(self.R_d[n]))
+        if self.delta_b_d[n]:
+            checkCudaErrors(runtime.cudaFree(self.delta_b_d[n]))
+        if self.delta_q_d[n]:
+            checkCudaErrors(runtime.cudaFree(self.delta_q_d[n]))
+        if self.b0s_mask_d[n]:
+            checkCudaErrors(runtime.cudaFree(self.b0s_mask_d[n]))
+        if self.sampling_matrix_d[n]:
+            checkCudaErrors(runtime.cudaFree(self.sampling_matrix_d[n]))
+
+    def _shared_mem_bytes(self, sp):
+        return (
+            REAL_SIZE
+            * BLOCK_Y
+            * 2
+            * (
+                sp.gpu_tracker.n32dimt
+                + max(sp.gpu_tracker.n32dimt, sp.gpu_tracker.samplm_nr)
+            )
+            + np.int32().nbytes * BLOCK_Y * sp.gpu_tracker.samplm_nr
+        )
+
+    def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp):
+        ker = self.module.get_kernel(self.getnum_kernel_name)
+        shared_memory = self._shared_mem_bytes(sp)
+        config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory)
+
+        launch(
+            sp.gpu_tracker.streams[n],
+            config,
+            ker,
+            self.model_type,
+            sp.gpu_tracker.max_angle,
+            self.min_signal,
+            sp.gpu_tracker.relative_peak_thresh,
+            sp.gpu_tracker.min_separation_angle,
+            sp.gpu_tracker.rng_seed,
+            nseeds_gpu,
+            sp.seeds_d[n],
+            sp.gpu_tracker.dimx,
+            sp.gpu_tracker.dimy,
+            sp.gpu_tracker.dimz,
+            sp.gpu_tracker.dimt,
+            sp.gpu_tracker.dataf_d[n],
+            self.H_d[n],
+            self.R_d[n],
+            self.delta_nr,
+            self.delta_b_d[n],
+            self.delta_q_d[n],
+            self.b0s_mask_d[n],
+            sp.gpu_tracker.samplm_nr,
+            self.sampling_matrix_d[n],
+            sp.gpu_tracker.sphere_vertices_d[n],
+            sp.gpu_tracker.sphere_edges_d[n],
+            sp.gpu_tracker.nedges,
+            sp.shDirTemp0_d[n],
+            sp.slinesOffs_d[n],
+        )
+
+    def generateStreamlines(self, n, nseeds_gpu, block, grid, sp):
+        ker = self.module.get_kernel(self.genstreamlines_kernel_name)
+        shared_memory = self._shared_mem_bytes(sp)
+        config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory)
+
+        launch(
+            sp.gpu_tracker.streams[n],
+            config,
+            ker,
+            sp.gpu_tracker.max_angle,
+            sp.gpu_tracker.tc_threshold,
+            sp.gpu_tracker.step_size,
+            sp.gpu_tracker.relative_peak_thresh,
+            sp.gpu_tracker.min_separation_angle,
+            sp.gpu_tracker.rng_seed,
+            sp.gpu_tracker.rng_offset + n * nseeds_gpu,
+            nseeds_gpu,
+            sp.seeds_d[n],
+            sp.gpu_tracker.dimx,
+            sp.gpu_tracker.dimy,
+            sp.gpu_tracker.dimz,
+            sp.gpu_tracker.dimt,
+            sp.gpu_tracker.dataf_d[n],
+            sp.gpu_tracker.metric_map_d[n],
+            sp.gpu_tracker.samplm_nr,
+            sp.gpu_tracker.sphere_vertices_d[n],
+            sp.gpu_tracker.sphere_edges_d[n],
+            sp.gpu_tracker.nedges,
+            self.min_signal,
+            self.delta_nr,
+            self.H_d[n],
+            self.R_d[n],
+            self.delta_b_d[n],
+            self.delta_q_d[n],
+            self.sampling_matrix_d[n],
+            self.b0s_mask_d[n],
+            sp.slinesOffs_d[n],
+            sp.shDirTemp0_d[n],
+            sp.slineSeed_d[n],
+            sp.slineLen_d[n],
+            sp.sline_d[n],
+        )
+
+
+class ProbDirectionGetter(GPUDirectionGetter):
+    def __init__(self):
+        checkCudaErrors(driver.cuInit(0))
+        self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
+        self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PROB,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
+        self.compile_program()
+
+    def getNumStreamlines(self, n, nseeds_gpu, block, grid, sp):
+        ker = self.module.get_kernel(self.getnum_kernel_name)
+        shared_memory = (
+            REAL_SIZE * BLOCK_Y * sp.gpu_tracker.n32dimt
+            + np.int32().nbytes * BLOCK_Y * sp.gpu_tracker.n32dimt
+        )
+        config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory)
+
+        launch(
+            sp.gpu_tracker.streams[n],
+            config,
+            ker,
+            sp.gpu_tracker.max_angle,
+            sp.gpu_tracker.relative_peak_thresh,
+            sp.gpu_tracker.min_separation_angle,
+            sp.gpu_tracker.rng_seed,
+            nseeds_gpu,
+            sp.seeds_d[n],
+            sp.gpu_tracker.dimx,
+            sp.gpu_tracker.dimy,
+            sp.gpu_tracker.dimz,
+            sp.gpu_tracker.dimt,
+            sp.gpu_tracker.dataf_d[n],
+            sp.gpu_tracker.sphere_vertices_d[n],
+            sp.gpu_tracker.sphere_edges_d[n],
+            sp.gpu_tracker.nedges,
+            sp.shDirTemp0_d[n],
+            sp.slinesOffs_d[n],
+        )
+
+    def _shared_mem_bytes(self, sp):
+        return REAL_SIZE * BLOCK_Y * sp.gpu_tracker.n32dimt
+
+    def generateStreamlines(self, n, nseeds_gpu, block, grid, sp):
+        ker = self.module.get_kernel(self.genstreamlines_kernel_name)
+        shared_memory = self._shared_mem_bytes(sp)
+        config = LaunchConfig(block=block, grid=grid, shmem_size=shared_memory)
+
+        launch(
+            sp.gpu_tracker.streams[n],
+            config,
+            ker,
+            sp.gpu_tracker.max_angle,
+            sp.gpu_tracker.tc_threshold,
+            sp.gpu_tracker.step_size,
+            sp.gpu_tracker.relative_peak_thresh,
+            sp.gpu_tracker.min_separation_angle,
+            sp.gpu_tracker.rng_seed,
+            sp.gpu_tracker.rng_offset + n * nseeds_gpu,
+            nseeds_gpu,
+            sp.seeds_d[n],
+            sp.gpu_tracker.dimx,
+            sp.gpu_tracker.dimy,
+            sp.gpu_tracker.dimz,
+            sp.gpu_tracker.dimt,
+            sp.gpu_tracker.dataf_d[n],
+            sp.gpu_tracker.metric_map_d[n],
+            sp.gpu_tracker.samplm_nr,
+            sp.gpu_tracker.sphere_vertices_d[n],
+            sp.gpu_tracker.sphere_edges_d[n],
+            sp.gpu_tracker.nedges,
+            sp.slinesOffs_d[n],
+            sp.shDirTemp0_d[n],
+            sp.slineSeed_d[n],
+            sp.slineLen_d[n],
+            sp.sline_d[n],
+        )
+
+
+class PttDirectionGetter(ProbDirectionGetter):
+    def __init__(self):
+        checkCudaErrors(driver.cuInit(0))
+        self.getnum_kernel_name = f"getNumStreamlinesProb_k<{THR_X_SL},{BLOCK_Y},{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
+        self.genstreamlines_kernel_name = f"genStreamlinesMergeProb_k<{THR_X_SL},{BLOCK_Y},PTT,{REAL_DTYPE_AS_STR},{REAL3_DTYPE_AS_STR}>"
+        self.compile_program()
+
+    def _shared_mem_bytes(self, sp):
+        return 0
diff --git a/cuslines/cuda_python/cu_propagate_seeds.py b/cuslines/cuda_python/cu_propagate_seeds.py
new file mode 100644
index 0000000..b8991e5
--- /dev/null
+++ b/cuslines/cuda_python/cu_propagate_seeds.py
@@ -0,0 +1,259 @@
+import numpy as np
+import gc
+from cuda.bindings import runtime
+from cuda.bindings.runtime import cudaMemcpyKind
+
+from nibabel.streamlines.array_sequence import ArraySequence, MEGABYTE
+import logging
+
+from cuslines.cuda_python.cutils import (
+    REAL_SIZE,
+    REAL_DTYPE,
+    REAL3_DTYPE,
+    MAX_SLINE_LEN,
+    EXCESS_ALLOC_FACT,
+    THR_X_SL,
+    THR_X_BL,
+    DEV_PTR,
+    div_up,
+    checkCudaErrors,
+)
+
+
+logger = logging.getLogger("GPUStreamlines")
+
+
+class SeedBatchPropagator:
+    def __init__(self, gpu_tracker):
+        self.gpu_tracker = gpu_tracker
+        self.ngpus = gpu_tracker.ngpus
+
+        self.nSlines_old = np.zeros(self.ngpus, dtype=np.int32)
+        self.nSlines = np.zeros(self.ngpus, dtype=np.int32)
+        self.slines = np.zeros(self.ngpus, dtype=np.ndarray)
+        self.sline_lens = np.zeros(self.ngpus, dtype=np.ndarray)
+
+        self.seeds_d = np.empty(self.ngpus, dtype=DEV_PTR)
+        self.slineSeed_d = np.empty(self.ngpus, dtype=DEV_PTR)
+        self.slinesOffs_d = np.empty(self.ngpus, dtype=DEV_PTR)
+        self.shDirTemp0_d = np.empty(self.ngpus, dtype=DEV_PTR)
+        self.slineLen_d = np.empty(self.ngpus, dtype=DEV_PTR)
+        self.sline_d = np.empty(self.ngpus, dtype=DEV_PTR)
+
+    def _switch_device(self, n):
+        checkCudaErrors(runtime.cudaSetDevice(n))
+
+        nseeds_gpu = min(
+            self.nseeds_per_gpu, max(0, self.nseeds - n * self.nseeds_per_gpu)
+        )
+        block = (THR_X_SL, THR_X_BL // THR_X_SL, 1)
+        grid = (div_up(nseeds_gpu, THR_X_BL // THR_X_SL), 1, 1)
+
+        return nseeds_gpu, block, grid
+
+    def _get_sl_buffer_size(self, n):
+        return REAL_SIZE * 2 * 3 * MAX_SLINE_LEN * self.nSlines[n].astype(np.int64)
+
+    def _allocate_seed_memory(self, seeds):
+        # Move seeds to GPU
+        for ii in range(self.ngpus):
+            nseeds_gpu, _, _ = self._switch_device(ii)
+            self.seeds_d[ii] = checkCudaErrors(
+                runtime.cudaMalloc(REAL_SIZE * 3 * nseeds_gpu)
+            )
+            seeds_host = np.ascontiguousarray(
+                seeds[ii * self.nseeds_per_gpu : ii * self.nseeds_per_gpu + nseeds_gpu],
+                dtype=REAL_DTYPE,
+            )
+            checkCudaErrors(
+                runtime.cudaMemcpy(
+                    self.seeds_d[ii],
+                    seeds_host.ctypes.data,
+                    REAL_SIZE * 3 * nseeds_gpu,
+                    cudaMemcpyKind.cudaMemcpyHostToDevice,
+                )
+            )
+
+        for ii in range(self.ngpus):
+            nseeds_gpu, block, grid = self._switch_device(ii)
+            # Streamline offsets
+            self.slinesOffs_d[ii] = checkCudaErrors(
+                runtime.cudaMalloc(np.int32().nbytes * (nseeds_gpu + 1))
+            )
+            # Initial directions from each seed
+            self.shDirTemp0_d[ii] = checkCudaErrors(
+                runtime.cudaMalloc(
+                    REAL3_DTYPE.itemsize
+                    * self.gpu_tracker.samplm_nr
+                    * grid[0]
+                    * block[1]
+                )
+            )
+
+    def _cumsum_offsets(
+        self,
+    ):  # TODO: performance: do this on device? not crucial for performance now
+        for ii in range(self.ngpus):
+            nseeds_gpu, _, _ = self._switch_device(ii)
+            if nseeds_gpu == 0:
+                self.nSlines[ii] = 0
+                continue
+
+            slinesOffs_h = np.empty(nseeds_gpu + 1, dtype=np.int32)
+            checkCudaErrors(
+                runtime.cudaMemcpy(
+                    slinesOffs_h.ctypes.data,
+                    self.slinesOffs_d[ii],
+                    slinesOffs_h.nbytes,
+                    cudaMemcpyKind.cudaMemcpyDeviceToHost,
+                )
+            )
+
+            __pval = slinesOffs_h[0]
+            slinesOffs_h[0] = 0
+            for jj in range(1, nseeds_gpu + 1):
+                __cval = slinesOffs_h[jj]
+                slinesOffs_h[jj] = slinesOffs_h[jj - 1] + __pval
+                __pval = __cval
+            self.nSlines[ii] = int(slinesOffs_h[nseeds_gpu])
+
+            checkCudaErrors(
+                runtime.cudaMemcpy(
+                    self.slinesOffs_d[ii],
+                    slinesOffs_h.ctypes.data,
+                    slinesOffs_h.nbytes,
+                    cudaMemcpyKind.cudaMemcpyHostToDevice,
+                )
+            )
+
+    def _allocate_tracking_memory(self):
+        for ii in range(self.ngpus):
+            self._switch_device(ii)
+
+            self.slineSeed_d[ii] = checkCudaErrors(
+                runtime.cudaMalloc(self.nSlines[ii] * np.int32().nbytes)
+            )
+            checkCudaErrors(
+                runtime.cudaMemset(
+                    self.slineSeed_d[ii], -1, self.nSlines[ii] * np.int32().nbytes
+                )
+            )
+
+            if self.nSlines[ii] > EXCESS_ALLOC_FACT * self.nSlines_old[ii]:
+                self.slines[ii] = 0
+                self.sline_lens[ii] = 0
+                gc.collect()
+
+            buffer_size = self._get_sl_buffer_size(ii)
+            logger.debug(f"Streamline buffer size: {buffer_size}")
+
+            if not self.slines[ii]:
+                self.slines[ii] = np.empty(
+                    (EXCESS_ALLOC_FACT * self.nSlines[ii], MAX_SLINE_LEN * 2, 3),
+                    dtype=REAL_DTYPE,
+                )
+            if not self.sline_lens[ii]:
+                self.sline_lens[ii] = np.empty(
+                    EXCESS_ALLOC_FACT * self.nSlines[ii], dtype=np.int32
+                )
+
+        for ii in range(self.ngpus):
+            self._switch_device(ii)
+            buffer_size = self._get_sl_buffer_size(ii)
+
+            self.slineLen_d[ii] = checkCudaErrors(
+                runtime.cudaMalloc(np.int32().nbytes * self.nSlines[ii])
+            )
+            self.sline_d[ii] = checkCudaErrors(runtime.cudaMalloc(buffer_size))
+
+    def _cleanup(self):
+        for ii in range(self.ngpus):
+            self._switch_device(ii)
+            checkCudaErrors(
+                runtime.cudaMemcpyAsync(
+                    self.slines[ii],
+                    self.sline_d[ii],
+                    self._get_sl_buffer_size(ii),
+                    cudaMemcpyKind.cudaMemcpyDeviceToHost,
+                    self.gpu_tracker.streams[ii],
+                )
+            )
+            checkCudaErrors(
+                runtime.cudaMemcpyAsync(
+                    self.sline_lens[ii],
+                    self.slineLen_d[ii],
+                    np.int32().nbytes * self.nSlines[ii],
+                    cudaMemcpyKind.cudaMemcpyDeviceToHost,
+                    self.gpu_tracker.streams[ii],
+                )
+            )
+
+        for ii in range(self.ngpus):
+            self._switch_device(ii)
+            checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii]))
+            checkCudaErrors(runtime.cudaFree(self.seeds_d[ii]))
+            checkCudaErrors(runtime.cudaFree(self.slineSeed_d[ii]))
+            checkCudaErrors(runtime.cudaFree(self.slinesOffs_d[ii]))
+            checkCudaErrors(runtime.cudaFree(self.shDirTemp0_d[ii]))
+            checkCudaErrors(runtime.cudaFree(self.slineLen_d[ii]))
+            checkCudaErrors(runtime.cudaFree(self.sline_d[ii]))
+
+        self.nSlines_old = self.nSlines
+        self.gpu_tracker.rng_offset += self.nseeds
+
+    # TODO: performance: better queuing/batching of seeds,
+    # if more performance needed,
+    # given exponential nature of streamlines
+    # May be better to do in cuda code directly
+    def propagate(self, seeds):
+        self.nseeds = len(seeds)
+        self.nseeds_per_gpu = (
+            self.nseeds + self.gpu_tracker.ngpus - 1
+        ) // self.gpu_tracker.ngpus
+
+        self._allocate_seed_memory(seeds)
+
+        for ii in range(self.ngpus):
+            nseeds_gpu, block, grid = self._switch_device(ii)
+            if nseeds_gpu == 0:
+                continue
+            self.gpu_tracker.dg.getNumStreamlines(ii, nseeds_gpu, block, grid, self)
+        for ii in range(self.ngpus):
+            checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii]))
+
+        self._cumsum_offsets()
+        self._allocate_tracking_memory()
+
+        for ii in range(self.ngpus):
+            nseeds_gpu, block, grid = self._switch_device(ii)
+            if nseeds_gpu == 0:
+                continue
+            self.gpu_tracker.dg.generateStreamlines(ii, nseeds_gpu, block, grid, self)
+        for ii in range(self.ngpus):
+            checkCudaErrors(runtime.cudaStreamSynchronize(self.gpu_tracker.streams[ii]))
+
+        self._cleanup()
+
+    def get_buffer_size(self):
+        buffer_size = 0
+        for ii in range(self.ngpus):
+            lens = self.sline_lens[ii]
+            for jj in range(self.nSlines[ii]):
+                buffer_size += lens[jj] * 3 * REAL_SIZE
+        return buffer_size
+
+    def as_generator(self):
+        def _yield_slines():
+            for ii in range(self.ngpus):
+                this_sls = self.slines[ii]
+                this_len = self.sline_lens[ii]
+
+                for jj in range(self.nSlines[ii]):
+                    npts = this_len[jj]
+
+                    yield np.asarray(this_sls[jj], dtype=REAL_DTYPE)[:npts]
+
+        return _yield_slines()
+
+    def as_array_sequence(self):
+        return ArraySequence(self.as_generator(), self.get_buffer_size() // MEGABYTE)
diff --git a/cuslines/cuda_python/cu_tractography.py b/cuslines/cuda_python/cu_tractography.py
new file mode 100644
index 0000000..9c24cd7
--- /dev/null
+++ b/cuslines/cuda_python/cu_tractography.py
@@ -0,0 +1,315 @@
+from cuda.bindings import runtime
+from cuda.bindings.runtime import cudaMemcpyKind
+# TODO: consider cuda core over cuda bindings
+
+import numpy as np
+from tqdm import tqdm
+import logging
+from math import radians
+
+from cuslines.cuda_python.cutils import (
+    REAL_SIZE,
+    REAL_DTYPE,
+    checkCudaErrors,
+)
+from cuslines.cuda_python.cu_direction_getters import (
+    GPUDirectionGetter,
+    BootDirectionGetter,
+)
+from cuslines.cuda_python.cu_propagate_seeds import SeedBatchPropagator
+
+from trx.trx_file_memmap import TrxFile
+
+from nibabel.streamlines.tractogram import Tractogram
+from nibabel.streamlines.array_sequence import ArraySequence, MEGABYTE
+
+from dipy.io.stateful_tractogram import Space, StatefulTractogram
+
+logger = logging.getLogger("GPUStreamlines")
+
+# TODO performance:
+# ACT
+# SCIL streamline reduction onboard GPU
+# Remove small/long streamlines on gpu
+
+
+class GPUTracker:
+    def __init__(
+        self,
+        dg: GPUDirectionGetter,
+        dataf: np.ndarray,
+        stop_map: np.ndarray,
+        stop_theshold: float,
+        sphere_vertices: np.ndarray,
+        sphere_edges: np.ndarray,
+        max_angle: float = radians(60),
+        step_size: float = 0.5,
+        relative_peak_thresh: float = 0.25,
+        min_separation_angle: float = radians(45),
+        ngpus: int = 1,
+        rng_seed: int = 0,
+        rng_offset: int = 0,
+        chunk_size: int = 100000,
+    ):
+        """
+        Initialize GPUTracker with necessary data.
+
+        Parameters
+        ----------
+        dg : GPUDirectionGetter
+            Direction getter to use for tracking from
+            cuslines.cu_direction_getters
+        dataf : np.ndarray
+            4D numpy array with ODFs for prob/ptt, diffusion data if doing
+            bootstrapping.
+        stop_map : np.ndarray
+            3D numpy array with stopping metric (e.g., GFA, FA)
+        stop_theshold : float
+            Threshold for stopping metric (e.g., 0.2)
+        sphere_vertices : np.ndarray
+            Vertices of the sphere used for direction sampling.
+        sphere_edges : np.ndarray
+            Edges of the sphere used for direction sampling.
+        max_angle : float, optional
+            Maximum angle (in radians) between steps
+            default: radians(60)
+        step_size : float, optional
+            Step size for tracking
+            default: 0.5
+        relative_peak_thresh : float, optional
+            Relative peak threshold for direction selection
+            default: 0.25
+        min_separation_angle : float, optional
+            Minimum separation angle (in radians) between peaks
+            default: radians(45)
+        ngpus : int, optional
+            Number of GPUs to use
+            default: 1
+        rng_seed : int, optional
+            Seed for random number generator
+            default: 0
+        rng_offset : int, optional
+            Offset for random number generator
+            default: 0
+        """
+        self.dataf = np.ascontiguousarray(dataf, dtype=REAL_DTYPE)
+        self.metric_map = np.ascontiguousarray(stop_map, dtype=REAL_DTYPE)
+        self.sphere_vertices = np.ascontiguousarray(sphere_vertices, dtype=REAL_DTYPE)
+        self.sphere_edges = np.ascontiguousarray(sphere_edges, dtype=np.int32)
+
+        self.dimx, self.dimy, self.dimz, self.dimt = dataf.shape
+        self.nedges = int(sphere_edges.shape[0])
+        if isinstance(dg, BootDirectionGetter):
+            self.samplm_nr = int(dg.sampling_matrix.shape[0])
+        else:
+            self.samplm_nr = self.dimt
+        self.n32dimt = ((self.dimt + 31) // 32) * 32
+
+        self.dg = dg
+        self.max_angle = REAL_DTYPE(max_angle)
+        self.tc_threshold = REAL_DTYPE(stop_theshold)
+        self.step_size = REAL_DTYPE(step_size)
+        self.relative_peak_thresh = REAL_DTYPE(relative_peak_thresh)
+        self.min_separation_angle = REAL_DTYPE(min_separation_angle)
+
+        self.ngpus = int(ngpus)
+        self.rng_seed = int(rng_seed)
+        self.rng_offset = int(rng_offset)
+        self.chunk_size = int(chunk_size)
+
+        avail = checkCudaErrors(runtime.cudaGetDeviceCount())
+        if self.ngpus > avail:
+            raise RuntimeError(
+                f"Requested {self.ngpus} GPUs but only {avail} available"
+            )
+
+        logger.info("Creating GPUTracker with %d GPUs...", self.ngpus)
+
+        self.dataf_d = []
+        self.metric_map_d = []
+        self.sphere_vertices_d = []
+        self.sphere_edges_d = []
+
+        self.streams = []
+        self.managed_data = []
+
+        self.seed_propagator = SeedBatchPropagator(gpu_tracker=self)
+        self._allocated = False
+
+    def __enter__(self):
+        self._allocate()
+        return self
+
+    def _allocate(self):
+        if self._allocated:
+            return
+
+        for ii in range(self.ngpus):
+            checkCudaErrors(runtime.cudaSetDevice(ii))
+            self.streams.append(
+                checkCudaErrors(
+                    runtime.cudaStreamCreateWithFlags(runtime.cudaStreamNonBlocking)
+                )
+            )
+
+        for ii in range(self.ngpus):
+            checkCudaErrors(runtime.cudaSetDevice(ii))
+
+            # TODO: performance: dataf could be managed or texture memory instead?
+            self.dataf_d.append(
+                checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.dataf.size))
+            )
+            self.metric_map_d.append(
+                checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.metric_map.size))
+            )
+            self.sphere_vertices_d.append(
+                checkCudaErrors(
+                    runtime.cudaMalloc(REAL_SIZE * self.sphere_vertices.size)
+                )
+            )
+            self.sphere_edges_d.append(
+                checkCudaErrors(
+                    runtime.cudaMalloc(np.int32().nbytes * self.sphere_edges.size)
+                )
+            )
+
+            checkCudaErrors(
+                runtime.cudaMemcpy(
+                    self.dataf_d[ii],
+                    self.dataf.ctypes.data,
+                    REAL_SIZE * self.dataf.size,
+                    cudaMemcpyKind.cudaMemcpyHostToDevice,
+                )
+            )
+            checkCudaErrors(
+                runtime.cudaMemcpy(
+                    self.metric_map_d[ii],
+                    self.metric_map.ctypes.data,
+                    REAL_SIZE * self.metric_map.size,
+                    cudaMemcpyKind.cudaMemcpyHostToDevice,
+                )
+            )
+            checkCudaErrors(
+                runtime.cudaMemcpy(
+                    self.sphere_vertices_d[ii],
+                    self.sphere_vertices.ctypes.data,
+                    REAL_SIZE * self.sphere_vertices.size,
+                    cudaMemcpyKind.cudaMemcpyHostToDevice,
+                )
+            )
+            checkCudaErrors(
+                runtime.cudaMemcpy(
+                    self.sphere_edges_d[ii],
+                    self.sphere_edges.ctypes.data,
+                    np.int32().nbytes * self.sphere_edges.size,
+                    cudaMemcpyKind.cudaMemcpyHostToDevice,
+                )
+            )
+            self.dg.allocate_on_gpu(ii)
+
+        self._allocated = True
+
+    def __exit__(self, exc_type, exc, tb):
+        logger.info("Destroying GPUTracker and freeing GPU memory...")
+
+        for n in range(self.ngpus):
+            checkCudaErrors(runtime.cudaSetDevice(n))
+            if self.dataf_d[n]:
+                checkCudaErrors(runtime.cudaFree(self.dataf_d[n]))
+            if self.metric_map_d[n]:
+                checkCudaErrors(runtime.cudaFree(self.metric_map_d[n]))
+            if self.sphere_vertices_d[n]:
+                checkCudaErrors(runtime.cudaFree(self.sphere_vertices_d[n]))
+            if self.sphere_edges_d[n]:
+                checkCudaErrors(runtime.cudaFree(self.sphere_edges_d[n]))
+            self.dg.deallocate_on_gpu(n)
+
+            checkCudaErrors(runtime.cudaStreamDestroy(self.streams[n]))
+        return False
+
+    def _divide_chunks(self, seeds):
+        global_chunk_sz = self.chunk_size * self.ngpus
+        nchunks = (seeds.shape[0] + global_chunk_sz - 1) // global_chunk_sz
+        return global_chunk_sz, nchunks
+
+    def generate_sft(self, seeds, ref_img):
+        global_chunk_sz, nchunks = self._divide_chunks(seeds)
+        buffer_size = 0
+        generators = []
+
+        with tqdm(total=seeds.shape[0]) as pbar:
+            for idx in range(nchunks):
+                self.seed_propagator.propagate(
+                    seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz]
+                )
+                buffer_size += self.seed_propagator.get_buffer_size()
+                generators.append(self.seed_propagator.as_generator())
+                pbar.update(
+                    seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz].shape[0]
+                )
+        array_sequence = ArraySequence(
+            (item for gen in generators for item in gen), buffer_size // MEGABYTE
+        )
+        return StatefulTractogram(array_sequence, ref_img, Space.VOX)
+
+    # TODO: performance: consider a way to just output in VOX space directly
+    def generate_trx(self, seeds, ref_img):
+        global_chunk_sz, nchunks = self._divide_chunks(seeds)
+
+        # Will resize by a factor of 2 if these are exceeded
+        sl_len_guess = 100
+        sl_per_seed_guess = 3
+        n_sls_guess = sl_per_seed_guess * seeds.shape[0]
+
+        # trx files use memory mapping
+        trx_file = TrxFile(
+            reference=ref_img,
+            nb_streamlines=n_sls_guess,
+            nb_vertices=n_sls_guess * sl_len_guess,
+        )
+        trx_file.streamlines._offsets = trx_file.streamlines._offsets.astype(np.uint64)
+        offsets_idx = 0
+        sls_data_idx = 0
+
+        with tqdm(total=seeds.shape[0]) as pbar:
+            for idx in range(int(nchunks)):
+                self.seed_propagator.propagate(
+                    seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz]
+                )
+                tractogram = Tractogram(
+                    self.seed_propagator.as_array_sequence(),
+                    affine_to_rasmm=ref_img.affine,
+                )
+                tractogram.to_world()
+                sls = tractogram.streamlines
+
+                new_offsets_idx = offsets_idx + len(sls._offsets)
+                new_sls_data_idx = sls_data_idx + len(sls._data)
+
+                if (
+                    new_offsets_idx > trx_file.header["NB_STREAMLINES"]
+                    or new_sls_data_idx > trx_file.header["NB_VERTICES"]
+                ):
+                    logger.info("TRX resizing...")
+                    trx_file.resize(
+                        nb_streamlines=new_offsets_idx * 2,
+                        nb_vertices=new_sls_data_idx * 2,
+                    )
+
+                # TRX uses memmaps here
+                trx_file.streamlines._data[sls_data_idx:new_sls_data_idx] = sls._data
+                trx_file.streamlines._offsets[offsets_idx:new_offsets_idx] = (
+                    sls_data_idx + sls._offsets
+                )
+                trx_file.streamlines._lengths[offsets_idx:new_offsets_idx] = (
+                    sls._lengths
+                )
+
+                offsets_idx = new_offsets_idx
+                sls_data_idx = new_sls_data_idx
+                pbar.update(
+                    seeds[idx * global_chunk_sz : (idx + 1) * global_chunk_sz].shape[0]
+                )
+        trx_file.resize()
+
+        return trx_file
diff --git a/cuslines/cuda_python/cutils.py b/cuslines/cuda_python/cutils.py
new file mode 100644
index 0000000..db4115a
--- /dev/null
+++ b/cuslines/cuda_python/cutils.py
@@ -0,0 +1,64 @@
+from cuda.bindings import driver, nvrtc
+
+import numpy as np
+
+from enum import IntEnum
+
+from cuslines.cuda_python._globals import *
+
+
+class ModelType(IntEnum):
+    OPDT = 0
+    CSA = 1
+    PROB = 2
+    PTT = 3
+
+
+REAL3_SIZE = 3 * REAL_SIZE
+if REAL_SIZE == 4:
+    REAL_DTYPE = np.float32
+    REAL3_DTYPE = np.dtype(
+        [("x", np.float32), ("y", np.float32), ("z", np.float32)], align=True
+    )
+    REAL_DTYPE_AS_STR = "float"
+    REAL3_DTYPE_AS_STR = "float3"
+elif REAL_SIZE == 8:
+    REAL_DTYPE = np.float64
+    REAL3_DTYPE = np.dtype(
+        [("x", np.float64), ("y", np.float64), ("z", np.float64)], align=True
+    )
+    REAL_DTYPE_AS_STR = "double"
+    REAL3_DTYPE_AS_STR = "double3"
+else:
+    raise NotImplementedError(f"Unsupported REAL_SIZE={REAL_SIZE} in globals.h")
+BLOCK_Y = THR_X_BL // THR_X_SL
+DEV_PTR = object
+
+
+def _cudaGetErrorEnum(error):
+    if isinstance(error, driver.CUresult):
+        err, name = driver.cuGetErrorName(error)
+        return name if err == driver.CUresult.CUDA_SUCCESS else "<unknown>"
+    elif isinstance(error, nvrtc.nvrtcResult):
+        return nvrtc.nvrtcGetErrorString(error)[1]
+    else:
+        raise RuntimeError("Unknown error type: {}".format(error))
+
+
+def checkCudaErrors(result):
+    if result[0].value:
+        raise RuntimeError(
+            "CUDA error code={}({})".format(
+                result[0].value, _cudaGetErrorEnum(result[0])
+            )
+        )
+    if len(result) == 1:
+        return None
+    elif len(result) == 2:
+        return result[1]
+    else:
+        return result[1:]
+
+
+def div_up(a, b):
+    return (a + b - 1) // b
diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp
deleted file mode 100644
index 4e8bc30..0000000
--- a/cuslines/cuslines.cpp
+++ /dev/null
@@ -1,359 +0,0 @@
-/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <cstring>
-#include <iostream>
-#include <stdexcept>
-
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
-namespace py = pybind11;
-
-#include <cuda_runtime.h>
-
-// #define USE_NVTX
-
-#include "globals.h"
-#include "cudamacro.h"
-#include "generate_streamlines_cuda.h"
-
-using np_array = py::array_t<REAL>;
-using np_array_int = py::array_t<int>;
-
-using np_array_cast = py::array_t<REAL, py::array::c_style | py::array::forcecast>;
-using np_array_int_cast = py::array_t<int, py::array::c_style | py::array::forcecast>;
-
-// Handle to cleanup returned host allocations when associated Python object is destroyed
-template <typename T>
-py::capsule cleanup(T* ptr) {
-  return py::capsule(ptr, [](void *f) {
-           T *g = reinterpret_cast<T *>(f);
-           delete [] g;
-         });
-}
-
-class GPUTracker {
-  public:
-    GPUTracker(ModelType model_type,
-               double max_angle,
-               double min_signal,
-               double tc_threshold,
-               double step_size,
-               double relative_peak_thresh,
-               double min_separation_angle,
-               np_array_cast dataf,
-               np_array_cast H,
-               np_array_cast R,
-               np_array_cast delta_b,
-               np_array_cast delta_q,
-               np_array_int_cast b0s_mask,
-               np_array_cast metric_map,
-               np_array_cast sampling_matrix,
-               np_array_cast sphere_vertices,
-               np_array_int_cast sphere_edges,
-               int ngpus = 1,
-               int rng_seed = 0,
-               int rng_offset = 0) {
-
-      // Get info structs from numpy objects
-      auto dataf_info = dataf.request();
-      auto H_info = H.request();
-      auto R_info = R.request();
-      auto delta_b_info = delta_b.request();
-      auto delta_q_info = delta_q.request();
-      auto b0s_mask_info = b0s_mask.request();
-      auto metric_map_info = metric_map.request();
-      auto sampling_matrix_info = sampling_matrix.request();
-      auto sphere_vertices_info = sphere_vertices.request();
-      auto sphere_edges_info = sphere_edges.request();
-
-      dimx_ = dataf_info.shape[0];
-      dimy_ = dataf_info.shape[1];
-      dimz_ = dataf_info.shape[2];
-      dimt_ = dataf_info.shape[3];
-      nedges_ = sphere_edges_info.shape[0];
-
-      delta_nr_ = delta_b_info.shape[0];
-      samplm_nr_ = sampling_matrix_info.shape[0];
-
-// No longer needed
-#if 0
-      // Error checking for template parameters.
-      // TODO: Need to make kernel more general.
-      if (delta_b_info.shape[0] != 28 ||
-          sampling_matrix_info.shape[0] != 181 ||
-          dataf_info.shape[3] > 160) {
-          std::cout << delta_b_info.shape[0] << " " << sampling_matrix_info.shape[0] << " " << dataf_info.shape[3] << std::endl;
-          throw std::invalid_argument("Input data dimensions not currently supported.");
-      }
-#endif
-
-      // Get number of GPUs
-      int ngpus_avail;
-      CHECK_CUDA(cudaGetDeviceCount(&ngpus_avail));
-      if (ngpus > ngpus_avail) {
-        throw std::runtime_error("Requested to use more GPUs than available on system.");
-      }
-
-      std::cerr << "Creating GPUTracker with " << ngpus << " GPUs..." << std::endl;
-      ngpus_ = ngpus;
-
-      model_type_ = model_type;
-      max_angle_ = max_angle;
-      min_signal_ = min_signal;
-      tc_threshold_ = tc_threshold;
-      step_size_ = step_size;
-      relative_peak_thresh_ = relative_peak_thresh,
-      min_separation_angle_ = min_separation_angle,
-
-      // Allocate/copy constant problem data on GPUs
-      dataf_d.resize(ngpus_, nullptr);
-      H_d.resize(ngpus_, nullptr);
-      R_d.resize(ngpus_, nullptr);
-      delta_b_d.resize(ngpus_, nullptr);
-      delta_q_d.resize(ngpus_, nullptr);
-      b0s_mask_d.resize(ngpus_, nullptr);
-      metric_map_d.resize(ngpus_, nullptr);
-      sampling_matrix_d.resize(ngpus_, nullptr);
-      sphere_vertices_d.resize(ngpus_, nullptr);
-      sphere_edges_d.resize(ngpus_, nullptr);
-
-      //#pragma omp parallel for
-      for (int n = 0; n < ngpus_; ++n) {
-        CHECK_CUDA(cudaSetDevice(n));
-        CHECK_CUDA(cudaMallocManaged(&dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size));
-        CHECK_CUDA(cudaMemAdvise(dataf_d[n], sizeof(*dataf_d[n]) * dataf_info.size, cudaMemAdviseSetPreferredLocation, n));
-        CHECK_CUDA(cudaMalloc(&H_d[n], sizeof(*H_d[n]) * H_info.size));
-        CHECK_CUDA(cudaMalloc(&R_d[n], sizeof(*R_d[n]) * R_info.size));
-        CHECK_CUDA(cudaMalloc(&delta_b_d[n], sizeof(*delta_b_d[n]) * delta_b_info.size));
-        CHECK_CUDA(cudaMalloc(&delta_q_d[n], sizeof(*delta_q_d[n]) * delta_q_info.size));
-        CHECK_CUDA(cudaMalloc(&b0s_mask_d[n], sizeof(*b0s_mask_d[n]) * b0s_mask_info.size));
-        CHECK_CUDA(cudaMalloc(&metric_map_d[n], sizeof(*metric_map_d[n]) * metric_map_info.size));
-        CHECK_CUDA(cudaMalloc(&sampling_matrix_d[n], sizeof(*sampling_matrix_d[n]) * sampling_matrix_info.size));
-        CHECK_CUDA(cudaMalloc(&sphere_vertices_d[n], sizeof(*sphere_vertices_d[n]) * sphere_vertices_info.size));
-        CHECK_CUDA(cudaMalloc(&sphere_edges_d[n], sizeof(*sphere_edges_d[n]) * sphere_edges_info.size));
-
-        CHECK_CUDA(cudaMemcpy(dataf_d[n], dataf_info.ptr, sizeof(*dataf_d[n]) * dataf_info.size, cudaMemcpyHostToDevice));
-        CHECK_CUDA(cudaMemcpy(H_d[n], H_info.ptr, sizeof(*H_d[n]) * H_info.size, cudaMemcpyHostToDevice));
-        CHECK_CUDA(cudaMemcpy(R_d[n], R_info.ptr, sizeof(*R_d[n]) * R_info.size, cudaMemcpyHostToDevice));
-        CHECK_CUDA(cudaMemcpy(delta_b_d[n], delta_b_info.ptr, sizeof(*delta_b_d[n]) * delta_b_info.size, cudaMemcpyHostToDevice));
-        CHECK_CUDA(cudaMemcpy(delta_q_d[n], delta_q_info.ptr, sizeof(*delta_q_d[n]) * delta_q_info.size, cudaMemcpyHostToDevice));
-        CHECK_CUDA(cudaMemcpy(b0s_mask_d[n], b0s_mask_info.ptr, sizeof(*b0s_mask_d[n]) * b0s_mask_info.size, cudaMemcpyHostToDevice));
-        CHECK_CUDA(cudaMemcpy(metric_map_d[n], metric_map_info.ptr, sizeof(*metric_map_d[n]) * metric_map_info.size, cudaMemcpyHostToDevice));
-        CHECK_CUDA(cudaMemcpy(sampling_matrix_d[n], sampling_matrix_info.ptr, sizeof(*sampling_matrix_d[n]) * sampling_matrix_info.size, cudaMemcpyHostToDevice));
-        CHECK_CUDA(cudaMemcpy(sphere_vertices_d[n], sphere_vertices_info.ptr, sizeof(*sphere_vertices_d[n]) * sphere_vertices_info.size, cudaMemcpyHostToDevice));
-        CHECK_CUDA(cudaMemcpy(sphere_edges_d[n], sphere_edges_info.ptr, sizeof(*sphere_edges_d[n]) * sphere_edges_info.size, cudaMemcpyHostToDevice));
-      }
-
-      rng_seed_ = rng_seed;
-      rng_offset_ = rng_offset;
-      nSlines_old_.resize(ngpus_, 0);
-      slines_.resize(ngpus_, nullptr);
-      slinesLen_.resize(ngpus_, nullptr);
-
-      streams_.resize(ngpus_);
-      for (int n = 0; n < ngpus_; ++n) {
-        CHECK_CUDA(cudaSetDevice(n));
-        CHECK_CUDA(cudaStreamCreateWithFlags(&streams_[n], cudaStreamNonBlocking));
-      }
-
-    }
-
-    ~GPUTracker() {
-      std::cerr << "Destroy GPUTracker..." << std::endl;
-      for (int n = 0; n < ngpus_; ++n) {
-        CHECK_CUDA(cudaSetDevice(n));
-        if (dataf_d[n]) CHECK_CUDA(cudaFree(dataf_d[n]));
-        if (H_d[n]) CHECK_CUDA(cudaFree(H_d[n]));
-        if (R_d[n]) CHECK_CUDA(cudaFree(R_d[n]));
-        if (delta_b_d[n]) CHECK_CUDA(cudaFree(delta_b_d[n]));
-        if (delta_q_d[n]) CHECK_CUDA(cudaFree(delta_q_d[n]));
-        if (b0s_mask_d[n]) CHECK_CUDA(cudaFree(b0s_mask_d[n]));
-        if (metric_map_d[n]) CHECK_CUDA(cudaFree(metric_map_d[n]));
-        if (sampling_matrix_d[n]) CHECK_CUDA(cudaFree(sampling_matrix_d[n]));
-        if (sphere_vertices_d[n]) CHECK_CUDA(cudaFree(sphere_vertices_d[n]));
-        if (sphere_edges_d[n]) CHECK_CUDA(cudaFree(sphere_edges_d[n]));
-
-        if (slines_[n]) CHECK_CUDA(cudaFreeHost(slines_[n]));
-        if (slinesLen_[n]) CHECK_CUDA(cudaFreeHost(slinesLen_[n]));
-
-        CHECK_CUDA(cudaStreamDestroy(streams_[n]));
-      }
-    }
-
-    std::vector<py::array_t<REAL>> generate_streamlines(np_array seeds) {
-
-      auto seeds_info = seeds.request();
-      int nseeds = seeds_info.shape[0];
-
-      std::vector<REAL*> seeds_d(ngpus_, nullptr);
-      int nseeds_per_gpu = (nseeds + ngpus_ - 1) / ngpus_;
-
-      //#pragma omp parallel for
-      for (int n = 0; n < ngpus_; ++n) {
-        int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu));
-        CHECK_CUDA(cudaSetDevice(n));
-        CHECK_CUDA(cudaMalloc(&seeds_d[n], sizeof(*seeds_d[n]) * 3 * nseeds_gpu));
-        CHECK_CUDA(cudaMemcpy(seeds_d[n], reinterpret_cast<REAL *>(seeds_info.ptr) + 3*n*nseeds_per_gpu, sizeof(*seeds_d[n]) * 3 * nseeds_gpu, cudaMemcpyHostToDevice));
-      }
-
-      std::vector<int> nSlines(ngpus_);
-
-      // Call GPU routine
-      generate_streamlines_cuda_mgpu(model_type_, max_angle_, min_signal_, tc_threshold_, step_size_,
-                                     relative_peak_thresh_, min_separation_angle_,
-                                     nseeds, seeds_d,
-                                     dimx_, dimy_, dimz_, dimt_,
-                                     dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d,
-                                     sphere_vertices_d, sphere_edges_d, nedges_,
-                                     slines_, slinesLen_, nSlines, nSlines_old_, rng_seed_, rng_offset_, ngpus_,
-                                     streams_);
-
-      nSlines_old_ = nSlines;  //store number of slines for next set of seeds
-
-      // Update rng_offset for next set of seeds
-      rng_offset_ += nseeds;
-
-      int nSlines_total = 0;
-      for (int n = 0; n < ngpus_; ++n) {
-        CHECK_CUDA(cudaFree(seeds_d[n]));
-        nSlines_total += nSlines[n];
-      }
-
-      std::vector<py::array_t<REAL>> slines_list;
-      slines_list.reserve(nSlines_total);
-      for (int n = 0; n < ngpus_; ++n) {
-        for (int i = 0; i < nSlines[n]; ++i) {
-          REAL* sl = new REAL[slinesLen_[n][i]*3];
-          std::memcpy(sl, slines_[n] + i*3*2*MAX_SLINE_LEN, slinesLen_[n][i]*3*sizeof(*sl));
-          auto sl_arr = py::array_t<REAL>({slinesLen_[n][i], 3}, // shape
-                                          {3*sizeof(REAL), sizeof(REAL)}, // strides
-                                          sl,
-                                          cleanup(sl));
-          slines_list.push_back(sl_arr);
-        }
-      }
-
-      return slines_list;
-
-    }
-
-    void dump_streamlines(std::string output_prefix, std::string voxel_order,
-                          np_array_int roi_shape, np_array voxel_size, np_array vox_to_ras) {
-
-      auto roi_shape_info = roi_shape.request();
-      auto voxel_size_info = voxel_size.request();
-      auto vox_to_ras_info = vox_to_ras.request();
-
-      START_RANGE("filewrite", 0);
-
-      //#pragma omp parallel for
-      for (int n = 0; n < ngpus_; ++n) {
-        std::stringstream ss;
-        ss << output_prefix << "_" << std::to_string(n) <<  ".trk";
-        write_trk(ss.str().c_str(), reinterpret_cast<int *>(roi_shape_info.ptr), reinterpret_cast<REAL *>(voxel_size_info.ptr),
-                  voxel_order.c_str(), reinterpret_cast<REAL *>(vox_to_ras_info.ptr), nSlines_old_[n], slinesLen_[n],
-                  reinterpret_cast<REAL3 *>(slines_[n]));
-      }
-
-      END_RANGE;
-    }
-
-  private:
-    int ngpus_;
-    int rng_seed_;
-    int rng_offset_;
-    int dimx_, dimy_, dimz_, dimt_;
-    int nedges_;
-    int delta_nr_, samplm_nr_;
-
-    ModelType model_type_;
-    double max_angle_;
-    double tc_threshold_;
-    double min_signal_;
-    double step_size_;
-    double relative_peak_thresh_;
-    double min_separation_angle_;
-
-    std::vector<int> nSlines_old_;
-    std::vector<REAL*> slines_;
-    std::vector<int*> slinesLen_;
-
-    std::vector<REAL*> dataf_d;
-    std::vector<REAL*> H_d;
-    std::vector<REAL*> R_d;
-    std::vector<REAL*> delta_b_d;
-    std::vector<REAL*> delta_q_d;
-    std::vector<int*> b0s_mask_d;
-    std::vector<REAL*> metric_map_d;
-    std::vector<REAL*> sampling_matrix_d;
-    std::vector<REAL*> sphere_vertices_d;
-    std::vector<int*> sphere_edges_d;
-
-    std::vector<cudaStream_t> streams_;
-
-};
-
-
-PYBIND11_MODULE(cuslines, m) {
-  m.attr("MAX_SLINE_LEN") = py::int_(MAX_SLINE_LEN);
-  m.attr("REAL_SIZE") = py::int_(REAL_SIZE);
-
-  py::enum_<ModelType>(m, "ModelType")
-    .value("OPDT", OPDT)
-    .value("CSA", CSA)
-    .value("PROB", PROB)
-    .value("PTT", PTT);
-
-  py::class_<GPUTracker>(m, "GPUTracker")
-    .def(py::init<ModelType, double, double, double, double,
-                  double, double,
-		              np_array_cast, np_array_cast,
-                  np_array_cast, np_array_cast,
-                  np_array_cast, np_array_int_cast,
-                  np_array_cast, np_array_cast,
-                  np_array_cast, np_array_int_cast,
-                  int, int, int>(),
-                  py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(),
-                  py::arg().noconvert(), py::arg().noconvert(),
-                  py::arg().noconvert(), py::arg().noconvert(),
-                  py::arg().noconvert(), py::arg().noconvert(),
-                  py::arg().noconvert(), py::arg().noconvert(),
-                  py::arg().noconvert(), py::arg().noconvert(),
-                  py::arg().noconvert(), py::arg().noconvert(),
-                  py::arg("ngpus") = 1, py::arg("rng_seed") = 0,
-                  py::arg("rng_offset") = 0)
-
-    .def("generate_streamlines", &GPUTracker::generate_streamlines,
-         "Generates streamline for dipy test case.")
-
-    .def("dump_streamlines", &GPUTracker::dump_streamlines,
-         "Dump streamlines to file.");
-}
-
diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
deleted file mode 100644
index 374c7c1..0000000
--- a/cuslines/generate_streamlines_cuda.cu
+++ /dev/null
@@ -1,2420 +0,0 @@
-/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <getopt.h>
-#include <cuda_runtime.h>
-#include <curand_kernel.h>
-#include <cfloat>
-#include <omp.h>
-#include <vector>
-#include <cmath>
-
-#include "cudamacro.h" /* for time() */
-#include "globals.h"
-
-#include <iostream>
-
-#include "cuwsort.cuh"
-#include "ptt.cuh"
-
-#include "utils.cu"
-#include "ptt.cu"
-
-#define MAX_NUM_DIR (128)
-
-#define NTHR_GEN (128)
-
-#define MAX_DIMS        (8)
-#define MAX_STR_LEN     (256)
-
-using namespace cuwsort;
-
-//#define USE_FIXED_PERMUTATION
-#ifdef USE_FIXED_PERMUTATION
-//__device__ const int fixedPerm[] = {44, 47, 53,  0,  3,  3, 39,  9, 19, 21, 50, 36, 23,
-//                                     6, 24, 24, 12,  1, 38, 39, 23, 46, 24, 17, 37, 25, 
-//                                    13,  8,  9, 20, 51, 16, 51,  5, 15, 47,  0, 18, 35, 
-//                                    24, 49, 51, 29, 19, 19, 14, 39, 32,  1,  9, 32, 31,
-//                                    10, 52, 23};
-__device__ const int fixedPerm[] = {
-  47, 117,  67, 103,   9,  21,  36,  87,  70,  88, 140,  58,  39,  87,  88,  81,  25,  77,
-  72,   9, 148, 115,  79,  82,  99,  29, 147, 147, 142,  32,   9, 127,  32,  31, 114,  28,
-  34, 128, 128,  53, 133,  38,  17,  79, 132, 105,  42,  31, 120,   1,  65,  57,  35, 102,
- 119,  11,  82,  91, 128, 142,  99,  53, 140, 121,  84,  68,   6,  47, 127, 131, 100,  78,
- 143, 148,  23, 141, 117,  85,  48,  49,  69,  95,  94,   0, 113,  36,  48,  93, 131,  98,
-  42, 112, 149, 127,   0, 138, 114,  43, 127,  23, 130, 121,  98,  62, 123,  82, 148,  50,
-  14,  41,  58,  36,  10,  86,  43, 104,  11,   2,  51,  80,  32, 128,  38,  19,  42, 115,
-  77,  30,  24, 125,   2,   3,  94, 107,  13, 112,  40,  72,  19,  95,  72,  67,  61,  14,
-  96,   4, 139,  86, 121, 109};
-#endif
-
-template<int BDIM_X,
-         typename VAL_T>
-__device__ void ndotp_d(const int N,
-			const int M,
-			const VAL_T *__restrict__ srcV,
-                        const VAL_T *__restrict__ srcM,
-                              VAL_T *__restrict__ dstV) {
-
-        const int tidx = threadIdx.x;
-
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        //#pragma unroll
-        for(int i = 0; i < N; i++) {
-
-                VAL_T __tmp = 0;
-
-                //#pragma unroll
-                for(int j = 0; j < M; j += BDIM_X) {
-                        if (j+tidx < M) {
-                                __tmp += srcV[j+tidx]*srcM[i*M + j+tidx];
-                        }
-                }
-                #pragma unroll
-                for(int j = BDIM_X/2; j; j /= 2) {
-#if 0
-                        __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X);
-#else
-                        __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X);
-#endif
-                }
-                // values could be held by BDIM_X threads and written
-                // together every BDIM_X iterations...
-
-                if (tidx == 0) {
-                        dstV[i] = __tmp;
-                }
-        }
-        return;
-}
-
-
-template<int BDIM_X,
-         typename VAL_T>
-__device__ void ndotp_log_opdt_d(const int N,
-			    const int M,
-			    const VAL_T *__restrict__ srcV,
-                            const VAL_T *__restrict__ srcM,
-                                  VAL_T *__restrict__ dstV) {
-
-        const int tidx = threadIdx.x;
-
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-         const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        const VAL_T ONEP5 = static_cast<VAL_T>(1.5);
-
-        //#pragma unroll
-        for(int i = 0; i < N; i++) {
-
-                VAL_T __tmp = 0;
-
-                //#pragma unroll
-                for(int j = 0; j < M; j += BDIM_X) {
-                        if (j+tidx < M) {
-                                const VAL_T v = srcV[j+tidx];
-                                __tmp += -LOG(v)*(ONEP5+LOG(v))*v * srcM[i*M + j+tidx];
-                        }
-                }
-                #pragma unroll
-                for(int j = BDIM_X/2; j; j /= 2) {
-#if 0
-                        __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X);
-#else
-                        __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X);
-#endif
-                }
-                // values could be held by BDIM_X threads and written
-                // together every BDIM_X iterations...
-
-                if (tidx == 0) {
-                        dstV[i] = __tmp;
-                }
-        }
-        return;
-}
-
-template<int BDIM_X,
-	 typename VAL_T>
-__device__ void ndotp_log_csa_d(const int N,
-				const int M,
-				const VAL_T *__restrict__ srcV,
-				const VAL_T *__restrict__ srcM,
-				VAL_T *__restrict__ dstV) {
-
-	const int tidx = threadIdx.x;
-
-	const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-	const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-	// Clamp values
-	constexpr VAL_T min = .001;
-	constexpr VAL_T max = .999;
-
-	//#pragma unroll
-	for(int i = 0; i < N; i++) {
-
-		VAL_T __tmp = 0;
-
-		//#pragma unroll
-		for(int j = 0; j < M; j += BDIM_X) {
-			if (j+tidx < M) {
-				const VAL_T v = MIN(MAX(srcV[j+tidx], min), max);
-				__tmp += LOG(-LOG(v)) * srcM[i*M + j+tidx];
-			}
-		}
-		#pragma unroll
-		for(int j = BDIM_X/2; j; j /= 2) {
-#if 0
-			__tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X);
-#else
-			__tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X);
-#endif
-		}
-		// values could be held by BDIM_X threads and written
-		// together every BDIM_X iterations...
-
-		if (tidx == 0) {
-			dstV[i] = __tmp;
-		}
-	}
-	return;
-}
-
-
-template<int BDIM_X,
-         typename REAL_T>
-__device__ void fit_opdt(const int delta_nr,
-                         const int hr_side,
-                         const REAL_T *__restrict__ delta_q,
-                         const REAL_T *__restrict__ delta_b,
-                         const REAL_T *__restrict__ __msk_data_sh,
-                         REAL_T *__restrict__ __h_sh,
-                         REAL_T *__restrict__ __r_sh) {
-        const int tidx = threadIdx.x;
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        ndotp_log_opdt_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh);
-        ndotp_d         <BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh);
-        __syncwarp(WMASK);
-        #pragma unroll
-        for(int j = tidx; j < delta_nr; j += BDIM_X) {
-                __r_sh[j] -= __h_sh[j];
-        }
-        __syncwarp(WMASK);
-}
-
-template<int BDIM_X, typename REAL_T>
-__device__ void fit_csa(const int delta_nr,
-                        const int hr_side,
-                        const REAL_T *__restrict__ fit_matrix,
-                        const REAL_T *__restrict__ __msk_data_sh,
-                        REAL_T *__restrict__ __r_sh) {
-        const int tidx = threadIdx.x;
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        constexpr REAL _n0_const = 0.28209479177387814; // .5 / sqrt(pi)
-        ndotp_log_csa_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, fit_matrix, __r_sh);
-        __syncwarp(WMASK);
-        if (tidx == 0) {
-                __r_sh[0] = _n0_const;
-        }
-        __syncwarp(WMASK);
-}
-
-template<int BDIM_X, ModelType MODEL_T, typename REAL_T>
-__device__ void fit_model_coef(const int delta_nr, // delta_nr is number of ODF directions
-                               const int hr_side, // hr_side is number of data directions
-                               const REAL_T *__restrict__ delta_q,
-                               const REAL_T *__restrict__ delta_b, // these are fit matrices the model can use, different for each model
-                               const REAL_T *__restrict__ __msk_data_sh, // __msk_data_sh is the part of the data currently being operated on by this block
-                               REAL_T *__restrict__ __h_sh, // these last two are modifications to the coefficients that will be returned
-                               REAL_T *__restrict__ __r_sh) {
-        switch(MODEL_T) {
-                case OPDT:
-                        fit_opdt<BDIM_X>(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh);
-                        break;
-                case CSA:
-                        fit_csa<BDIM_X>(delta_nr, hr_side, delta_q, __msk_data_sh, __r_sh);
-                        break;
-                default:
-                        printf("FATAL: Invalid Model Type.\n");
-                        break;
-        }
-}
-
-template<int BDIM_X,
-         typename LEN_T,
-         typename VAL_T>
-__device__ VAL_T max_mask_transl_d(const int n,
-				   const LEN_T *__restrict__ srcMsk,
-                                   const VAL_T *__restrict__ srcVal,
-                                   const VAL_T offset,
-                                   const VAL_T minVal) {
-
-        const int tidx = threadIdx.x;
-
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        VAL_T __m = minVal;
-
-        for(int i = tidx; i < n; i += BDIM_X) {
-		const LEN_T sel = srcMsk[i];
-		if (sel > 0) {
-			__m = MAX(__m, srcVal[i]+offset);
-		}
-        }
-
-        #pragma unroll
-        for(int i = BDIM_X/2; i; i /= 2) {
-                const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X);
-                __m = MAX(__m, __tmp);
-        }
-
-        return __m;
-}
-
-template<int BDIM_X,
-         typename VAL_T>
-__device__ VAL_T max_d(const int n, const VAL_T *__restrict__ src, const VAL_T minVal) {
-
-        const int tidx = threadIdx.x;
-
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        VAL_T __m = minVal;
-
-        for(int i = tidx; i < n; i += BDIM_X) {
-		__m = MAX(__m, src[i]);
-        }
-
-        #pragma unroll
-        for(int i = BDIM_X/2; i; i /= 2) {
-                const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X);
-                __m = MAX(__m, __tmp);
-        }
-
-        return __m;
-}
-
-template<int BDIM_X,
-         typename VAL_T>
-__device__ VAL_T min_d(const int n, const VAL_T *__restrict__ src, const VAL_T maxVal) {
-
-        const int tidx = threadIdx.x;
-
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        VAL_T __m = maxVal;
-
-        for(int i = tidx; i < n; i += BDIM_X) {
-		__m = MIN(__m, src[i]);
-        }
-
-        #pragma unroll
-        for(int i = BDIM_X/2; i; i /= 2) {
-                const VAL_T __tmp = __shfl_xor_sync(WMASK, __m, i, BDIM_X);
-                __m = MIN(__m, __tmp);
-        }
-
-        return __m;
-}
-			
-template<int BDIM_X,
-         typename VAL_T>
-__device__ VAL_T avgMask(const int mskLen,
-			 const int *__restrict__ mask,
-			 const VAL_T *__restrict__ data) {
-        
-	const int tidx = threadIdx.x;
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        int   __myCnt = 0;
-        VAL_T __mySum = 0;
-
-        for(int i = tidx; i < mskLen; i += BDIM_X) {
-		if(mask[i]) {
-			__myCnt++;
-			__mySum += data[i];
-		}
-        }
-
-        #pragma unroll
-        for(int i = BDIM_X/2; i; i /= 2) {
-                __mySum += __shfl_xor_sync(WMASK, __mySum, i, BDIM_X);
-                __myCnt += __shfl_xor_sync(WMASK, __myCnt, i, BDIM_X);
-        }
-
-        return __mySum/__myCnt;
-
-}
-
-template<int BDIM_X,
-         int BDIM_Y,
-         typename REAL_T,
-         typename REAL3_T>
-__device__ int peak_directions_d(const REAL_T  *__restrict__ odf,
-                                       REAL3_T *__restrict__ dirs,
-                                 const REAL3_T *__restrict__ sphere_vertices,
-                                 const int2 *__restrict__ sphere_edges,
-                                 const int num_edges,
-				 int samplm_nr,
-				 int *__restrict__ __shInd,
-				 const REAL_T relative_peak_thres,
-				 const REAL_T min_separation_angle) {
-
-        const int tidx = threadIdx.x;
-
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        const unsigned int lmask = (1 << lid)-1;
-
-//        __shared__ int __shInd[BDIM_Y][SAMPLM_NR];
-
-        #pragma unroll
-        for(int j = tidx; j < samplm_nr; j += BDIM_X) {
-		__shInd[j] = 0;
-        }
-
-        REAL_T odf_min = min_d<BDIM_X>(samplm_nr, odf, REAL_MAX);
-        odf_min = MAX(0, odf_min);
-
-        __syncwarp(WMASK);
-
-        // local_maxima() + _compare_neighbors()
-        // selecting only the indices corrisponding to maxima Ms
-        // such that M-odf_min >= relative_peak_thres
-        //#pragma unroll
-        for(int j = 0; j < num_edges; j += BDIM_X) {
-                if (j+tidx < num_edges) {
-                        const int u_ind = sphere_edges[j+tidx].x;
-                        const int v_ind = sphere_edges[j+tidx].y;
-
-                        //if (u_ind >= NUM_EDGES || v_ind >= NUM_EDGES) { ERROR; }
-
-                        const REAL_T u_val = odf[u_ind];
-                        const REAL_T v_val = odf[v_ind];
-
-                        //if (u_val != u_val || v_val != v_val) { ERROR_NANs; }
-
-                        // only check that they are not equal
-                        //if (u_val != v_val) {
-                        //        __shInd[tidy][u_val < v_val ? u_ind : v_ind] = -1; // benign race conditions...
-                        //}
-                        if (u_val < v_val) {
-                                atomicExch(__shInd+u_ind, -1);
-                                atomicOr(  __shInd+v_ind,  1);
-                        } else if (v_val < u_val) {
-                                atomicExch(__shInd+v_ind, -1);
-                                atomicOr(  __shInd+u_ind,  1);
-                        }
-                }
-        }
-        __syncwarp(WMASK);
-
-        const REAL_T compThres = relative_peak_thres*max_mask_transl_d<BDIM_X>(samplm_nr, __shInd, odf, -odf_min, REAL_MIN);
-#if 1
-/*
-        if (!tidy && !tidx) {
-                for(int j = 0; j < SAMPLM_NR; j++) {
-                        printf("local_max[%d]: %d (%f)\n", j, __shInd[tidy][j], odf[j]);
-                }
-                printf("maxMax with offset %f: %f\n", -odf_min, compThres);
-        }
-        __syncwarp(WMASK);
-*/
-        // compact indices of positive values to the right
-        int n = 0;
-
-        for(int j = 0; j < samplm_nr; j += BDIM_X) {
-
-                const int __v = (j+tidx < samplm_nr) ? __shInd[j+tidx] : -1;
-                const int __keep = (__v > 0) && ((odf[j+tidx]-odf_min) >= compThres);
-                const int __msk = __ballot_sync(WMASK, __keep);
-
-//__syncwarp(WMASK); // unnecessary
-                if (__keep) {
-                        const int myoff = __popc(__msk & lmask);
-                        __shInd[n + myoff] = j+tidx;
-                }
-                n += __popc(__msk);
-//__syncwarp(WMASK); // should be unnecessary
-        }
-        __syncwarp(WMASK);
-/*
-        if (!tidy && !tidx) {
-                for(int j = 0; j < n; j++) {
-                        printf("local_max_compact[%d]: %d\n", j, __shInd[tidy][j]);
-                }
-        }
-        __syncwarp(WMASK);
-*/
-
-        // sort local maxima indices
-        if (n < BDIM_X) {
-                REAL_T k = REAL_MIN;
-                int    v = 0;
-                if (tidx < n) {
-                        v = __shInd[tidx];
-                        k = odf[v];
-                }
-                warp_sort<32, BDIM_X, WSORT_DIR_DEC>(&k, &v);
-                __syncwarp(WMASK);
-
-                if (tidx < n) {
-                        __shInd[tidx] = v;
-                }
-        } else {
-                // ERROR !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-        }
-        __syncwarp(WMASK);
-
-        // __shInd[tidy][] contains the indices in odf correspoding to
-        // normalized maxima NOT sorted!
-        if (n != 0) {
-                // remove_similar_vertices()
-                // PRELIMINARY INEFFICIENT, SINGLE TH, IMPLEMENTATION
-                if (tidx == 0) {
-                        const REAL_T cos_similarity = COS(min_separation_angle);
-
-                        dirs[0] = sphere_vertices[__shInd[0]];
-
-                        int k = 1;
-                        for(int i = 1; i < n; i++) {
-
-                                const REAL3_T abc = sphere_vertices[__shInd[i]];
-
-                                int j = 0;
-                                for(; j < k; j++) {
-                                        const REAL_T cos = FABS(abc.x*dirs[j].x+
-                                                                abc.y*dirs[j].y+
-                                                                abc.z*dirs[j].z);
-                                        if (cos > cos_similarity) {
-                                                break;
-                                        }
-                                }
-                                if (j == k) {
-                                        dirs[k++] = abc;
-                                }
-                        }
-                        n = k;
-                }
-                n = __shfl_sync(WMASK, n, 0, BDIM_X);
-                __syncwarp(WMASK);
-
-        }
-/*
-        if (!tidy && !tidx) {
-                for(int j = 0; j < n; j++) {
-                        printf("local_max_compact_uniq[%d]: %d\n", j, __shInd[tidy][j]);
-                }
-        }
-        __syncwarp(WMASK);
-*/
-#else
-        const int indMax = max_d<BDIM_X, SAMPLM_NR>(__shInd[tidy], -1);
-        if (indMax != -1) {
-                __ret = MAKE_REAL3(sphere_vertices[indMax][0],
-                                   sphere_vertices[indMax][1],
-                                   sphere_vertices[indMax][2]);
-        }
-#endif
-        return n;
-}
-
-template<int BDIM_X,
-         int BDIM_Y,
-         typename REAL_T,
-         typename REAL3_T>
-__device__ int closest_peak_d(const REAL_T max_angle,
-			      const REAL3_T  direction, //dir
-                              const int npeaks,
-                              const REAL3_T *__restrict__ peaks,
-                                    REAL3_T *__restrict__ peak) {// dirs,
-
-        const int tidx = threadIdx.x;
-
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        //const REAL_T cos_similarity = COS(MAX_ANGLE_P);
-        const REAL_T cos_similarity = COS(max_angle);
-#if 0
-        if (!threadIdx.y && !tidx) {
-                printf("direction: (%f, %f, %f)\n",
-                        direction.x, direction.y, direction.z);
-        }
-        __syncwarp(WMASK);
-#endif
-        REAL_T cpeak_dot = 0;
-        int    cpeak_idx = -1;
-        for(int j = 0; j < npeaks; j += BDIM_X) {
-                if (j+tidx < npeaks) {
-#if 0
-                        if (!threadIdx.y && !tidx) {
-                                printf("j+tidx: %d, peaks[j+tidx]: (%f, %f, %f)\n",
-                                        j+tidx, peaks[j+tidx].x, peaks[j+tidx].y, peaks[j+tidx].z);
-                        }
-#endif
-                        const REAL_T dot = direction.x*peaks[j+tidx].x+
-                                           direction.y*peaks[j+tidx].y+
-                                           direction.z*peaks[j+tidx].z;
-
-                        if (FABS(dot) > FABS(cpeak_dot)) {
-                                cpeak_dot = dot;
-                                cpeak_idx = j+tidx;
-                        }
-                }
-        }
-#if 0
-        if (!threadIdx.y && !tidx) {
-                printf("cpeak_idx: %d, cpeak_dot: %f\n", cpeak_idx, cpeak_dot);
-        }
-        __syncwarp(WMASK);
-#endif
-
-        #pragma unroll
-        for(int j = BDIM_X/2; j; j /= 2) {
-
-                const REAL_T dot = __shfl_xor_sync(WMASK, cpeak_dot, j, BDIM_X);
-                const int    idx = __shfl_xor_sync(WMASK, cpeak_idx, j, BDIM_X);
-                if (FABS(dot) > FABS(cpeak_dot)) {
-                        cpeak_dot = dot;
-                        cpeak_idx = idx;
-                }
-        }
-#if 0
-        if (!threadIdx.y && !tidx) {
-                printf("cpeak_idx: %d, cpeak_dot: %f, cos_similarity: %f\n", cpeak_idx, cpeak_dot, cos_similarity);
-        }
-        __syncwarp(WMASK);
-#endif
-        if (cpeak_idx >= 0) {
-                if (cpeak_dot >= cos_similarity) {
-                        peak[0] = peaks[cpeak_idx];
-                        return 1;
-                }
-                if (cpeak_dot <= -cos_similarity) {
-                        peak[0] = MAKE_REAL3(-peaks[cpeak_idx].x,
-                                             -peaks[cpeak_idx].y,
-                                             -peaks[cpeak_idx].z);
-                        return 1;
-                }
-        }
-        return 0;
-}
-
-template<int BDIM_X,
-	 typename LEN_T,
-	 typename MSK_T,
-	 typename VAL_T>
-__device__ LEN_T maskGet(const LEN_T n, 
-			 const MSK_T *__restrict__ mask,
-			 const VAL_T *__restrict__ plain,
-			       VAL_T *__restrict__ masked) {
-
-	const int tidx = threadIdx.x;
-	
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-	const int __laneMask = (1 << tidx)-1;
-
-	int woff = 0;
-	for(int j = 0; j < n; j += BDIM_X) {
-
-		const int __act = (j+tidx < n) ? !mask[j+tidx] : 0;
-		const int __msk = __ballot_sync(WMASK, __act);
-
-		const int toff = __popc(__msk & __laneMask);
-		if (__act) {
-			masked[woff+toff] = plain[j+tidx];
-		}
-		woff += __popc(__msk);
-	}
-	return woff;
-}
-
-template<int BDIM_X,
-	 typename LEN_T,
-	 typename MSK_T,
-	 typename VAL_T>
-__device__ void maskPut(const LEN_T n, 
-			const MSK_T *__restrict__ mask,
-			const VAL_T *__restrict__ masked,
-			      VAL_T *__restrict__ plain) {
-
-	const int tidx = threadIdx.x;
-	
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-	const int __laneMask = (1 << tidx)-1;
-
-	int woff = 0;
-	for(int j = 0; j < n; j += BDIM_X) {
-
-		const int __act = (j+tidx < n) ? !mask[j+tidx] : 0;
-		const int __msk = __ballot_sync(WMASK, __act);
-
-		const int toff = __popc(__msk & __laneMask);
-		if (__act) {
-			plain[j+tidx] = masked[woff+toff];
-		}
-		woff += __popc(__msk);
-	}
-	return;
-}
-
-template<int BDIM_X,
-         int BDIM_Y,
-         bool IS_START,
-         typename REAL_T,
-         typename REAL3_T>
-__device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
-                                    const REAL_T *__restrict__ pmf,
-                                    const REAL_T max_angle,
-                                    const REAL_T relative_peak_thres,
-                                    const REAL_T min_separation_angle,
-                                    REAL3_T dir,
-                                    const int dimx,
-                                    const int dimy,
-                                    const int dimz,
-                                    const int dimt,
-                                    const REAL3_T point,
-                                    const REAL3_T *__restrict__ sphere_vertices,
-                                    const int2 *__restrict__ sphere_edges,
-                                    const int num_edges,
-                                    REAL3_T *__restrict__ dirs) {
-        const int tidx = threadIdx.x;
-        const int tidy = threadIdx.y;
-	
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-	const int n32dimt = ((dimt+31)/32)*32;
-
-	extern __shared__ REAL_T __sh[];
-        REAL_T *__pmf_data_sh = __sh + tidy*n32dimt;
-
-        // pmf = self.pmf_gen.get_pmf_c(&point[0], pmf)
-        __syncwarp(WMASK);
-        const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, -1, pmf, point, __pmf_data_sh);
-        __syncwarp(WMASK);
-        if (rv != 0) {
-                return 0;
-        }
-
-        // for i in range(_len):
-        //     if pmf[i] > max_pmf:
-        //         max_pmf = pmf[i]
-        // absolute_pmf_threshold = pmf_threshold * max_pmf
-        const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d<BDIM_X>(dimt, __pmf_data_sh, REAL_MIN);
-        __syncwarp(WMASK);
-
-        // for i in range(_len):
-        //     if pmf[i] < absolute_pmf_threshold:
-        //         pmf[i] = 0.0
-        #pragma unroll
-        for(int i = tidx; i < dimt; i += BDIM_X) {
-                if (__pmf_data_sh[i] < absolpmf_thresh) {
-                        __pmf_data_sh[i] = 0.0;
-                }
-        }
-        __syncwarp(WMASK);
-
-        if (IS_START) {
-                int *__shInd = reinterpret_cast<int *>(__sh + BDIM_Y*n32dimt) + tidy*n32dimt;
-                return peak_directions_d<BDIM_X,
-                                         BDIM_Y>(__pmf_data_sh,
-                                                 dirs,
-                                                 sphere_vertices,
-                                                 sphere_edges,
-                                                 num_edges,
-                                                 dimt,
-                                                 __shInd,
-                                                 relative_peak_thres,
-                                                 min_separation_angle);
-        } else {
-                REAL_T __tmp;
-                #ifdef DEBUG
-                        __syncwarp(WMASK);
-                        if (tidx == 0) {
-                                printArray("__pmf_data_sh initial", 8, dimt, __pmf_data_sh);
-                                printf("absolpmf_thresh %10.8f\n", absolpmf_thresh);
-                                printf("--->            dir %10.8f, %10.8f, %10.8f\n", dir.x, dir.y, dir.z);
-                                printf("--->            point %10.8f, %10.8f, %10.8f\n", point.x, point.y, point.z);
-                        }
-                        __syncwarp(WMASK);
-                        if (tidx == 15) {
-                                printf("absolpmf_thresh %10.8f l15\n", absolpmf_thresh);
-                                printf("--->            dir %10.8f, %10.8f, %10.8f l15\n", dir.x, dir.y, dir.z);
-                                printf("--->            point %10.8f, %10.8f, %10.8f l15\n", point.x, point.y, point.z);
-                        }
-                        __syncwarp(WMASK);
-                        if (tidx == 31) {
-                                printArray("__pmf_data_sh initial l31", 8, dimt, __pmf_data_sh);
-                                printf("absolpmf_thresh %10.8f l31\n", absolpmf_thresh);
-                                printf("--->            dir %10.8f, %10.8f, %10.8f l31\n", dir.x, dir.y, dir.z);
-                                printf("--->            point %10.8f, %10.8f, %10.8f l31\n", point.x, point.y, point.z);
-                        }
-                        __syncwarp(WMASK);
-                #endif
-
-                // // These should not be relevant
-                // if norm(&direction[0]) == 0:
-                //     return 1
-                // normalize(&direction[0])
-
-                // for i in range(_len):
-                //         cos_sim = self.vertices[i][0] * direction[0] \
-                //                 + self.vertices[i][1] * direction[1] \
-                //                 + self.vertices[i][2] * direction[2]
-                //         if cos_sim < 0:
-                //                 cos_sim = cos_sim * -1
-                //         if cos_sim < self.cos_similarity:
-                //                 pmf[i] = 0
-                const REAL_T cos_similarity = COS(max_angle);
-
-                #pragma unroll
-                for(int i = tidx; i < dimt; i += BDIM_X) {
-                        const REAL_T dot = dir.x*sphere_vertices[i].x+
-                                           dir.y*sphere_vertices[i].y+
-                                           dir.z*sphere_vertices[i].z;
-
-                        if (FABS(dot) < cos_similarity) {
-                                __pmf_data_sh[i] = 0.0;
-                        }
-                }
-                __syncwarp(WMASK);
-
-                #ifdef DEBUG
-                        __syncwarp(WMASK);
-                        if (tidx == 0) {
-                                printArray("__pmf_data_sh after filtering", 8, dimt, __pmf_data_sh);
-                        }
-                        __syncwarp(WMASK);
-                #endif
-
-                // cumsum(pmf, pmf, _len)
-                prefix_sum_sh_d<BDIM_X>(__pmf_data_sh, dimt);
-
-                #ifdef DEBUG
-                        __syncwarp(WMASK);
-                        if (tidx == 0) {
-                                printArray("__pmf_data_sh after cumsum", 8, dimt, __pmf_data_sh);
-                        }
-                        __syncwarp(WMASK);
-                #endif
-
-                // last_cdf = pmf[_len - 1]
-                // if last_cdf == 0:
-                //         return 1
-                REAL_T last_cdf = __pmf_data_sh[dimt - 1];
-                if (last_cdf == 0) {
-                        return 0;
-                }
-
-                // idx = where_to_insert(pmf, random() * last_cdf, _len)
-                if (tidx == 0) {
-                        __tmp = curand_uniform(st) * last_cdf;
-                }
-                REAL_T selected_cdf = __shfl_sync(WMASK, __tmp, 0, BDIM_X);
-// Both these implementations work
-#if 1
-                int low = 0;
-                int high = dimt - 1;
-                while ((high - low) >= BDIM_X) {
-                        const int mid = (low + high) / 2;
-                        if (__pmf_data_sh[mid] < selected_cdf) {
-                                low = mid;
-                        } else {
-                                high = mid;
-                        }
-                }
-                const bool __ballot = (low+tidx <= high) ? (selected_cdf < __pmf_data_sh[low+tidx]) : 0;
-                const int __msk = __ballot_sync(WMASK, __ballot);
-                const int indProb = low + __ffs(__msk) - 1;
-#else
-                int indProb = dimt - 1;
-                for (int ii = 0; ii < dimt; ii+=BDIM_X) {
-                        int __is_greater = 0;
-                        if (ii+tidx < dimt) {
-                                __is_greater = selected_cdf < __pmf_data_sh[ii+tidx];
-                        }
-                        const int __msk = __ballot_sync(WMASK, __is_greater);
-                        if (__msk != 0) {
-                                indProb = ii + __ffs(__msk) - 1;
-                                break;
-                        }
-                }
-#endif
-
-                #ifdef DEBUG
-                        __syncwarp(WMASK);
-                        if (tidx == 0) {
-                                printf("last_cdf %10.8f\n", last_cdf);
-                                printf("selected_cdf %10.8f\n", selected_cdf);
-                                printf("indProb %i out of %i\n", indProb, dimt);
-                        }
-                        __syncwarp(WMASK);
-                #endif
-
-                // newdir = self.vertices[idx]
-                // if (direction[0] * newdir[0]
-                //     + direction[1] * newdir[1]
-                //     + direction[2] * newdir[2] > 0):
-                //     copy_point(&newdir[0], &direction[0])
-                // else:
-                //     newdir[0] = newdir[0] * -1
-                //     newdir[1] = newdir[1] * -1
-                //     newdir[2] = newdir[2] * -1
-                //     copy_point(&newdir[0], &direction[0])
-                if (tidx == 0) {
-                        if ((dir.x * sphere_vertices[indProb].x +
-                             dir.y * sphere_vertices[indProb].y +
-                             dir.z * sphere_vertices[indProb].z) > 0) {
-                                *dirs = MAKE_REAL3(sphere_vertices[indProb].x,
-                                                   sphere_vertices[indProb].y,
-                                                   sphere_vertices[indProb].z);
-                        } else {
-                                *dirs = MAKE_REAL3(-sphere_vertices[indProb].x,
-                                                   -sphere_vertices[indProb].y,
-                                                   -sphere_vertices[indProb].z);
-                        }
-                        // printf("direction addr write %p, slid %i\n", dirs, blockIdx.x*blockDim.y+threadIdx.y);
-                }
-
-                #ifdef DEBUG
-                        __syncwarp(WMASK);
-                        if (tidx == 0) {
-                                printf("last_cdf %10.8f\n", last_cdf);
-                                printf("selected_cdf %10.8f\n", selected_cdf);
-                                printf("indProb %i out of %i\n", indProb, dimt);
-                        }
-                        __syncwarp(WMASK);
-                        if (tidx == 15) {
-                                printf("last_cdf %10.8f l15\n", last_cdf);
-                                printf("selected_cdf %10.8f l15\n", selected_cdf);
-                                printf("indProb %i out of %i l15\n", indProb, dimt);
-                        }
-                        __syncwarp(WMASK);
-                        if (tidx == 31) {
-                                printf("last_cdf %10.8f l31\n", last_cdf);
-                                printf("selected_cdf %10.8f l31\n", selected_cdf);
-                                printf("indProb %i out of %i l31\n", indProb, dimt);
-                        }
-                        __syncwarp(WMASK);
-                #endif
-                return 1;
-        }
-}
-
-template<int BDIM_X,
-         int BDIM_Y,
-         int NATTEMPTS,
-         ModelType MODEL_T,
-         typename REAL_T,
-         typename REAL3_T>
-__device__ int get_direction_boot_d(
-                                curandStatePhilox4_32_10_t *st,
-                                const REAL_T max_angle,
-                                const REAL_T min_signal,
-                                const REAL_T relative_peak_thres,
-                                const REAL_T min_separation_angle,
-                                REAL3_T dir,
-                                const int dimx,
-                                const int dimy,
-                                const int dimz,
-                                const int dimt,
-                                const REAL_T *__restrict__ dataf,
-                                const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask)
-                                                                  // but not clear if it will never be needed so
-                                                                  // we'll keep it here for now...
-                                const REAL3_T point,
-                                const REAL_T *__restrict__ H, 
-                                const REAL_T *__restrict__ R,
-                                // model unused
-                                // max_angle, pmf_threshold from global defines
-                                // b0s_mask already passed
-                                // min_signal from global defines
-                                const int delta_nr,
-                                const REAL_T *__restrict__ delta_b,
-                                const REAL_T *__restrict__ delta_q, // fit_matrix
-                                const int samplm_nr,
-                                const REAL_T *__restrict__ sampling_matrix,
-                                const REAL3_T *__restrict__ sphere_vertices,
-                                const int2 *__restrict__ sphere_edges,
-                                const int num_edges,
-                                REAL3_T *__restrict__ dirs) {
-
-        const int tidx = threadIdx.x;
-        const int tidy = threadIdx.y;
-	
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-	const int n32dimt = ((dimt+31)/32)*32;
-
-	extern REAL_T __shared__ __sh[];
-
-	REAL_T *__vox_data_sh = reinterpret_cast<REAL_T *>(__sh);
-	REAL_T *__msk_data_sh = __vox_data_sh + BDIM_Y*n32dimt;
-
-	REAL_T *__r_sh = __msk_data_sh + BDIM_Y*n32dimt;
-	REAL_T *__h_sh = __r_sh + BDIM_Y*MAX(n32dimt, samplm_nr);
-
-	__vox_data_sh += tidy*n32dimt;
-	__msk_data_sh += tidy*n32dimt;
-
-	__r_sh += tidy*MAX(n32dimt, samplm_nr);
-	__h_sh += tidy*MAX(n32dimt, samplm_nr);
-	
-	// compute hr_side (may be passed from python)
-	int hr_side = 0;
-	for(int j = tidx; j < dimt; j += BDIM_X) {
-		hr_side += !b0s_mask[j] ? 1 : 0;
-	}
-        #pragma unroll
-        for(int i = BDIM_X/2; i; i /= 2) {
-                hr_side += __shfl_xor_sync(WMASK, hr_side, i, BDIM_X);
-        }
-
-        #pragma unroll
-        for(int i = 0; i < NATTEMPTS; i++) {
-
-                const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, -1, dataf, point, __vox_data_sh);
-
-		const int nmsk = maskGet<BDIM_X>(dimt, b0s_mask, __vox_data_sh, __msk_data_sh);
-
-		//if (!tidx && !threadIdx.y && !blockIdx.x) {
-		//
-		//	printf("interp of %f, %f, %f\n", point.x, point.y, point.z);
-		//	printf("hr_side: %d\n", hr_side);
-		//	printArray("vox_data", 6, dimt, __vox_data_sh[tidy]);
-		//	printArray("msk_data", 6, nmsk, __msk_data_sh[tidy]);
-		//}
-		//break;
-
-                __syncwarp(WMASK);
-
-                if (rv == 0) {
-
-                        ndotp_d<BDIM_X>(hr_side, hr_side, __msk_data_sh, R, __r_sh);
-			//__syncwarp();
-			//printArray("__r", 5, hr_side*hr_side, R);
-			//printArray("__r_sh", 6, hr_side, __r_sh[tidy]);
-
-                        ndotp_d<BDIM_X>(hr_side, hr_side, __msk_data_sh, H, __h_sh);
-			//__syncwarp();
-			//printArray("__h_sh", 6, hr_side, __h_sh[tidy]);
-
-                        __syncwarp(WMASK);
-
-                        for(int j = 0; j < hr_side; j += BDIM_X) {
-                                if (j+tidx < hr_side) {
-#ifdef USE_FIXED_PERMUTATION
-                                        const int srcPermInd = fixedPerm[j+tidx];
-#else
-                                        const int srcPermInd = curand(st) % hr_side;
-//                                        if (srcPermInd < 0 || srcPermInd >= hr_side) {
-//                                                printf("srcPermInd: %d\n", srcPermInd);
-//                                        }
-#endif
-					__h_sh[j+tidx] += __r_sh[srcPermInd];
-					//__h_sh[j+tidx] += __r_sh[j+tidx];
-                                }
-                        }
-			__syncwarp(WMASK);
-
-			//printArray("h+perm(r):", 6, hr_side, __h_sh[tidy]);
-			//__syncwarp();
-		
-			// vox_data[dwi_mask] = masked_data
-			maskPut<BDIM_X>(dimt, b0s_mask, __h_sh, __vox_data_sh);
-			__syncwarp(WMASK);
-
-			//printArray("vox_data[dwi_mask]:", 6, dimt, __vox_data_sh[tidy]);
-			//__syncwarp();
-
-			for(int j = tidx; j < dimt; j += BDIM_X) {
-				//__vox_data_sh[j] = MAX(MIN_SIGNAL_P, __vox_data_sh[j]);
-				__vox_data_sh[j] = MAX(min_signal, __vox_data_sh[j]);
-			}
-			__syncwarp(WMASK);
-
-			const REAL_T denom = avgMask<BDIM_X>(dimt, b0s_mask, __vox_data_sh);
-
-			for(int j = tidx; j < dimt; j += BDIM_X) {
-				__vox_data_sh[j] /= denom;
-			}
-			__syncwarp();
-
-			//if (!tidx && !threadIdx.y && !blockIdx.x) {
-			//	printf("denom: %f\n", denom);
-			//}
-			////break;
-			//if (!tidx && !threadIdx.y && !blockIdx.x) {
-			//
-			//	printf("__vox_data_sh:\n");
-			//	printArray("vox_data", 6, dimt, __vox_data_sh[tidy]);
-			//}
-			//break;
-
-			maskGet<BDIM_X>(dimt, b0s_mask, __vox_data_sh, __msk_data_sh);
-			__syncwarp(WMASK);
-
-                        fit_model_coef<BDIM_X, MODEL_T>(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh);
-
-                        // __r_sh[tidy] <- python 'coef'
-
-                        ndotp_d<BDIM_X>(samplm_nr, delta_nr, __r_sh, sampling_matrix, __h_sh);
-
-                        // __h_sh[tidy] <- python 'pmf'
-                } else {
-                        #pragma unroll
-                        for(int j = tidx; j < samplm_nr; j += BDIM_X) {
-				__h_sh[j] = 0;
-                        }
-                        // __h_sh[tidy] <- python 'pmf'
-                }
-                __syncwarp(WMASK);
-#if 0
-                if (!threadIdx.y && threadIdx.x == 0) {
-                        for(int j = 0; j < samplm_nr; j++) {
-                                printf("pmf[%d]: %f\n", j, __h_sh[tidy][j]);
-                        }
-                }
-                //return;
-#endif
-                const REAL_T abs_pmf_thr = PMF_THRESHOLD_P*max_d<BDIM_X>(samplm_nr, __h_sh, REAL_MIN);
-                __syncwarp(WMASK);
-
-                #pragma unroll
-                for(int j = tidx; j < samplm_nr; j += BDIM_X) {
-			const REAL_T __v = __h_sh[j];
-			if (__v < abs_pmf_thr) {
-				__h_sh[j] = 0;
-			}
-                }
-                __syncwarp(WMASK);
-#if 0
-                if (!threadIdx.y && threadIdx.x == 0) {
-                        printf("abs_pmf_thr: %f\n", abs_pmf_thr);
-                        for(int j = 0; j < samplm_nr; j++) {
-                                printf("pmfNORM[%d]: %f\n", j, __h_sh[tidy][j]);
-                        }
-                }
-                //return;
-#endif
-#if 0
-                if init:
-                        directions = peak_directions(pmf, sphere)[0]
-                        return directions
-                else:
-                        peaks = peak_directions(pmf, sphere)[0]
-                        if (len(peaks) > 0):
-                                return closest_peak(directions, peaks, cos_similarity)
-#endif
-                const int ndir = peak_directions_d<BDIM_X,
-                                                   BDIM_Y>(__h_sh, dirs,
-                                                           sphere_vertices,
-                                                           sphere_edges,
-                                                           num_edges,
-							   samplm_nr,
-							   reinterpret_cast<int *>(__r_sh), // reuse __r_sh as shInd in func which is large enough
-							   relative_peak_thres,
-							   min_separation_angle);
-                if (NATTEMPTS == 1) { // init=True...
-                        return ndir; // and dirs;
-                } else { // init=False...
-                        if (ndir > 0) {
-                                /*
-                                if (!threadIdx.y && threadIdx.x == 0 && ndir > 1) {
-                                        printf("NATTEMPTS=5 and ndir: %d!!!\n", ndir);
-                                }
-                                */
-                                REAL3_T peak;
-                                const int foundPeak = closest_peak_d<BDIM_X, BDIM_Y, REAL_T, REAL3_T>(max_angle, dir, ndir, dirs, &peak);
-                                __syncwarp(WMASK);
-                                if (foundPeak) {
-                                        if (tidx == 0) {
-                                                dirs[0] = peak;
-                                        }
-                                        return 1;
-                                }
-                        }
-                }
-        }
-        return 0;
-}
-
-enum {OUTSIDEIMAGE, INVALIDPOINT, TRACKPOINT, ENDPOINT};
-
-template<int BDIM_X,
-         int BDIM_Y,
-         typename REAL_T,
-         typename REAL3_T>
-__device__ int check_point_d(const REAL_T tc_threshold,
-			     const REAL3_T point,
-                             const int dimx,
-                             const int dimy,
-                             const int dimz,
-                             const REAL_T *__restrict__ metric_map) {
-
-        const int tidy = threadIdx.y;
-
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        __shared__ REAL_T __shInterpOut[BDIM_Y];
-
-        const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, 1, 0, metric_map, point, __shInterpOut+tidy);
-        __syncwarp(WMASK);
-#if 0
-        if (threadIdx.y == 1 && threadIdx.x == 0) {
-                printf("__shInterpOut[tidy]: %f, TC_THRESHOLD_P: %f\n", __shInterpOut[tidy], TC_THRESHOLD_P);
-        }
-#endif
-        if (rv != 0) {
-                return OUTSIDEIMAGE;
-        }
-        //return (__shInterpOut[tidy] > TC_THRESHOLD_P) ? TRACKPOINT : ENDPOINT;
-        return (__shInterpOut[tidy] > tc_threshold) ? TRACKPOINT : ENDPOINT;
-}
-
-template<int BDIM_X,
-         int BDIM_Y,
-         ModelType MODEL_T,
-         typename REAL_T,
-         typename REAL3_T>
-__device__ int tracker_d(curandStatePhilox4_32_10_t *st,
-			 const REAL_T max_angle,
-			 const REAL_T min_signal,
-			 const REAL_T tc_threshold,
-			 const REAL_T step_size,
-			 const REAL_T relative_peak_thres,
-			 const REAL_T min_separation_angle,
-                         REAL3_T seed,
-                         REAL3_T first_step,
-                         REAL_T* ptt_frame,
-                         REAL3_T voxel_size,
-                         const int dimx,
-                         const int dimy,
-                         const int dimz,
-                         const int dimt,
-                         const REAL_T *__restrict__ dataf,
-                         const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask)
-                         const REAL_T *__restrict__ H, 
-                         const REAL_T *__restrict__ R,
-                         // model unused
-                         // step_size from global defines
-                         // max_angle, pmf_threshold from global defines
-                         // b0s_mask already passed
-                         // min_signal from global defines
-                         // tc_threshold from global defines
-                         // pmf_threashold from global defines
-                         const REAL_T *__restrict__ metric_map,
-			 const int delta_nr,
-                         const REAL_T *__restrict__ delta_b,
-                         const REAL_T *__restrict__ delta_q, // fit_matrix
-			 const int samplm_nr,
-                         const REAL_T *__restrict__ sampling_matrix,
-                         const REAL3_T *__restrict__ sphere_vertices,
-                         const int2 *__restrict__ sphere_edges,
-                         const int num_edges,
-                         int *__restrict__ nsteps,
-                         REAL3_T *__restrict__ streamline) {
-
-        const int tidx = threadIdx.x;
-        const int tidy = threadIdx.y;
-
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        int tissue_class = TRACKPOINT;
-
-        REAL3_T point = seed;
-        REAL3_T direction = first_step;
-        __shared__ REAL3_T __sh_new_dir[BDIM_Y];
-
-        if (tidx == 0) {
-                streamline[0] = point;
-#if 0
-                if (threadIdx.y == 1) {
-                        printf("streamline[0]: %f, %f, %f\n", point.x, point.y, point.z);
-                }
-#endif
-        }
-        __syncwarp(WMASK);
-
-        int step_frac;
-        if (MODEL_T == PTT) {
-                step_frac = STEP_FRAC; 
-        } else {
-                step_frac = 1; // STEP_FRAC could be useful in other models
-        }
-
-        int i;
-        for(i = 1; i < MAX_SLINE_LEN*step_frac; i++) {
-                int ndir;
-                if (MODEL_T == PROB) {
-                        ndir = get_direction_prob_d<BDIM_X,
-                                                    BDIM_Y,
-                                                    0>(
-                                                        st,
-                                                        dataf,
-                                                        max_angle,
-                                                        relative_peak_thres,
-                                                        min_separation_angle,
-                                                        direction,
-                                                        dimx, dimy, dimz, dimt,
-                                                        point,
-                                                        sphere_vertices,
-                                                        sphere_edges,
-                                                        num_edges,
-                                                        __sh_new_dir + tidy);
-                } else if (MODEL_T == PTT) {
-                        ndir = get_direction_ptt_d<BDIM_X,
-                                                   BDIM_Y,
-                                                   0>(
-                                                        st,
-                                                        dataf,
-                                                        max_angle,
-                                                        step_size,
-                                                        direction,
-                                                        ptt_frame,
-                                                        dimx, dimy, dimz, dimt,
-                                                        point,
-                                                        sphere_vertices,
-                                                        __sh_new_dir + tidy);
-                } else {
-                        // call get_direction_boot_d() with NATTEMPTS=5
-                        ndir = get_direction_boot_d<BDIM_X,
-                                                    BDIM_Y,
-                                                    5,
-                                                    MODEL_T>(
-                                                        st,
-                                                        max_angle,
-                                                        min_signal,
-                                                        relative_peak_thres,
-                                                        min_separation_angle,
-                                                        direction,
-                                                        dimx, dimy, dimz, dimt, dataf,
-                                                        b0s_mask /* !dwi_mask */,
-                                                        point,
-                                                        H, R,
-                                                        // model unused
-                                                        // max_angle, pmf_threshold from global defines
-                                                        // b0s_mask already passed
-                                                        // min_signal from global defines
-                                                        delta_nr,
-                                                        delta_b, delta_q, // fit_matrix
-                                                        samplm_nr,
-                                                        sampling_matrix,
-                                                        sphere_vertices,
-                                                        sphere_edges,
-                                                        num_edges,
-                                                        __sh_new_dir + tidy);
-                }
-                __syncwarp(WMASK);
-                direction = __sh_new_dir[tidy];
-                __syncwarp(WMASK);
-
-                if (ndir == 0) {
-                        break;
-                }
-#if 0
-                if (threadIdx.y == 1 && threadIdx.x == 0) {
-                        printf("tracker: i: %d, direction: (%f, %f, %f)\n", i, direction.x, direction.y, direction.z);
-                }
-                //return;
-#endif
-                //point.x += (direction.x / voxel_size.x) * STEP_SIZE_P;
-                //point.y += (direction.y / voxel_size.y) * STEP_SIZE_P;
-                //point.z += (direction.z / voxel_size.z) * STEP_SIZE_P;
-                point.x += (direction.x / voxel_size.x) * (step_size / step_frac);
-                point.y += (direction.y / voxel_size.y) * (step_size / step_frac);
-                point.z += (direction.z / voxel_size.z) * (step_size / step_frac);
-
-                if ((tidx == 0) && ((i % step_frac) == 0)){
-                        streamline[i/step_frac] = point;
-#if 0
-                        if (threadIdx.y == 1) {
-                                printf("streamline[%d]: %f, %f, %f\n", i, point.x, point.y, point.z);
-                        }
-#endif
-                }
-                __syncwarp(WMASK);
-
-                tissue_class = check_point_d<BDIM_X, BDIM_Y>(tc_threshold, point, dimx, dimy, dimz, metric_map);
-
-#if 0
-                __syncwarp(WMASK);
-                if (tidx == 0) {
-                        printf("step_size %f\n", step_size);
-                        printf("direction %f, %f, %f\n", direction.x, direction.y, direction.z);
-                        printf("direction addr read %p, slid %i\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y);
-                        printf("voxel_size %f, %f, %f\n", voxel_size.x, voxel_size.y, voxel_size.z);
-                        printf("point %f, %f, %f\n", point.x, point.y, point.z);
-                        printf("tc %i\n", tissue_class);
-                }
-                __syncwarp(WMASK);
-                if (tidx == 15) {
-                        printf("step_size %f l15\n", step_size);
-                        printf("direction %f, %f, %f l15\n", direction.x, direction.y, direction.z);
-                        printf("direction addr read %p, slid %i l15\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y);
-                        printf("voxel_size %f, %f, %f l15\n", voxel_size.x, voxel_size.y, voxel_size.z);
-                        printf("point %f, %f, %f l15\n", point.x, point.y, point.z);
-                        printf("tc %i l15\n", tissue_class);
-                }
-                __syncwarp(WMASK);
-                if (tidx == 31) {
-                        printf("step_size %f l31\n", step_size);
-                        printf("direction %f, %f, %f l31\n", direction.x, direction.y, direction.z);
-                        printf("direction addr read %p, slid %i l31\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y);
-                        printf("voxel_size %f, %f, %f l31\n", voxel_size.x, voxel_size.y, voxel_size.z);
-                        printf("point %f, %f, %f l31\n", point.x, point.y, point.z);
-                        printf("tc %i l31\n", tissue_class);
-                }
-                __syncwarp(WMASK);
-#endif
-
-                if (tissue_class == ENDPOINT ||
-                    tissue_class == INVALIDPOINT ||
-                    tissue_class == OUTSIDEIMAGE) {
-                        break;
-                }
-        }
-        nsteps[0] = i/step_frac;
-        if (((i % step_frac) != 0) && i < step_frac*(MAX_SLINE_LEN - 1)){
-                nsteps[0]++;
-                if (tidx == 0) {
-                        streamline[nsteps[0]] = point;
-                }
-        }
-
-        return tissue_class;
-}
-
-template<int BDIM_X,
-         int BDIM_Y,
-         typename REAL_T,
-         typename REAL3_T>
-__global__ void getNumStreamlinesBoot_k(
-                                    const ModelType model_type,
-                                    const REAL_T max_angle,
-		                    const REAL_T min_signal,
-		                    const REAL_T relative_peak_thres,
-	                            const REAL_T min_separation_angle,
-		                    const long long rndSeed,
-                                    const int nseed,
-                                    const REAL3_T *__restrict__ seeds,
-                                    const int dimx,
-                                    const int dimy,
-                                    const int dimz,
-                                    const int dimt,
-                                    const REAL_T *__restrict__ dataf,
-                                    const REAL_T *__restrict__ H,
-                                    const REAL_T *__restrict__ R,
-		                    const int delta_nr,
-                                    const REAL_T *__restrict__ delta_b,
-                                    const REAL_T *__restrict__ delta_q,
-                                    const int  *__restrict__ b0s_mask, // change to int
-		                    const int samplm_nr,
-                                    const REAL_T *__restrict__ sampling_matrix,
-                                    const REAL3_T *__restrict__ sphere_vertices,
-                                    const int2 *__restrict__ sphere_edges,
-                                    const int num_edges,
-                                    REAL3_T *__restrict__ shDir0,
-                                    int *slineOutOff) {
-
-        const int tidx = threadIdx.x;
-        const int slid = blockIdx.x*blockDim.y + threadIdx.y;
-        const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
-
-        if (slid >= nseed) {
-                return;
-        }
-
-        REAL3_T seed = seeds[slid]; 
-        // seed = lin_mat*seed + offset
-
-        REAL3_T *__restrict__ __shDir = shDir0+slid*samplm_nr;
-
-	// const int hr_side = dimt-1;
-
-        curandStatePhilox4_32_10_t st;
-        //curand_init(rndSeed, slid + rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
-        curand_init(rndSeed, gid, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
-                                                                                   // elements of the same sequence
-        // python:
-        //directions = get_direction(None, dataf, dwi_mask, sphere, s, H, R, model, max_angle,
-        //                pmf_threshold, b0s_mask, min_signal, fit_matrix,
-        //                sampling_matrix, init=True)
-
-	//if (!tidx && !threadIdx.y && !blockIdx.x) {
-	//	printf("seed: %f, %f, %f\n", seed.x, seed.y, seed.z);
-	//}
-
-        int ndir;
-        switch(model_type) {
-            case OPDT:
-                ndir = get_direction_boot_d<BDIM_X,
-                                            BDIM_Y,
-                                            1,
-                                            OPDT>(
-                                                &st,
-                                                max_angle,
-                                                min_signal,
-                                                relative_peak_thres,
-                                                min_separation_angle,
-                                                MAKE_REAL3(0,0,0),
-                                                dimx, dimy, dimz, dimt, dataf,
-                                                b0s_mask /* !dwi_mask */,
-                                                seed,
-                                                H, R,
-                                                // model unused
-                                                // max_angle, pmf_threshold from global defines
-                                                // b0s_mask already passed
-                                                // min_signal from global defines
-                                                delta_nr,
-                                                delta_b, delta_q, // fit_matrix
-                                                samplm_nr,
-                                                sampling_matrix,
-                                                sphere_vertices,
-                                                sphere_edges,
-                                                num_edges,
-                                                __shDir);
-                break;
-            case CSA:
-                ndir = get_direction_boot_d<BDIM_X,
-                                            BDIM_Y,
-                                            1,
-                                            CSA>(
-                                                &st,
-                                                max_angle,
-                                                min_signal,
-                                                relative_peak_thres,
-                                                min_separation_angle,
-                                                MAKE_REAL3(0,0,0),
-                                                dimx, dimy, dimz, dimt, dataf,
-                                                b0s_mask /* !dwi_mask */,
-                                                seed,
-                                                H, R,
-                                                // model unused
-                                                // max_angle, pmf_threshold from global defines
-                                                // b0s_mask already passed
-                                                // min_signal from global defines
-                                                delta_nr,
-                                                delta_b, delta_q, // fit_matrix
-                                                samplm_nr,
-                                                sampling_matrix,
-                                                sphere_vertices,
-                                                sphere_edges,
-                                                num_edges,
-                                                __shDir);
-                break;
-            default:
-                printf("FATAL: Invalid Model Type.\n");
-                break;
-        }
-
-        if (tidx == 0) {
-                slineOutOff[slid] = ndir;
-        }
-
-        return;
-}
-
-template<int BDIM_X,
-         int BDIM_Y,
-         typename REAL_T,
-         typename REAL3_T>
-__global__ void getNumStreamlinesProb_k(const REAL_T max_angle,
-				        const REAL_T relative_peak_thres,
-				        const REAL_T min_separation_angle,
-				        const long long rndSeed,
-                                        const int nseed,
-                                        const REAL3_T *__restrict__ seeds,
-                                        const int dimx,
-                                        const int dimy,
-                                        const int dimz,
-                                        const int dimt,
-                                        const REAL_T *__restrict__ dataf,
-                                        const REAL3_T *__restrict__ sphere_vertices,
-                                        const int2 *__restrict__ sphere_edges,
-                                        const int num_edges,
-                                        REAL3_T *__restrict__ shDir0,
-                                        int *slineOutOff) {
-
-        const int tidx = threadIdx.x;
-        const int slid = blockIdx.x*blockDim.y + threadIdx.y;
-        const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
-
-        if (slid >= nseed) {
-                return;
-        }
-
-        REAL3_T *__restrict__ __shDir = shDir0+slid*dimt;
-        curandStatePhilox4_32_10_t st;
-        curand_init(rndSeed, gid, 0, &st);
-
-        int ndir = get_direction_prob_d<BDIM_X,
-                                        BDIM_Y,
-                                        1>(
-                                                &st,
-                                                dataf,
-                                                max_angle,
-                                                relative_peak_thres,
-                                                min_separation_angle,
-                                                MAKE_REAL3(0,0,0),
-                                                dimx, dimy, dimz, dimt,
-                                                seeds[slid],
-                                                sphere_vertices,
-                                                sphere_edges,
-                                                num_edges,
-                                                __shDir);
-        if (tidx == 0) {
-                slineOutOff[slid] = ndir;
-        }
-
-        return;
-}
-
-template<int BDIM_X,
-         int BDIM_Y,
-         ModelType MODEL_T,
-         typename REAL_T,
-         typename REAL3_T>
-__global__ void genStreamlinesMerge_k(
-				      const REAL_T max_angle,
-				      const REAL_T min_signal,
-				      const REAL_T tc_threshold,
-				      const REAL_T step_size,
-				      const REAL_T relative_peak_thres,
-				      const REAL_T min_separation_angle,
-				      const long long rndSeed,
-                                      const int rndOffset,
-                                      const int nseed,
-                                      const REAL3_T *__restrict__ seeds,
-                                      const int dimx,
-                                      const int dimy,
-                                      const int dimz,
-                                      const int dimt,
-                                      const REAL_T *__restrict__ dataf,
-                                      const REAL_T *__restrict__ H,
-                                      const REAL_T *__restrict__ R,
-				      const int delta_nr,
-                                      const REAL_T *__restrict__ delta_b,
-                                      const REAL_T *__restrict__ delta_q,
-                                      const int    *__restrict__ b0s_mask, // change to int
-                                      const REAL_T *__restrict__ metric_map,
-				      const int samplm_nr,
-                                      const REAL_T *__restrict__ sampling_matrix,
-                                      const REAL3_T *__restrict__ sphere_vertices,
-                                      const int2 *__restrict__ sphere_edges,
-                                      const int num_edges,
-                                      const int    *__restrict__ slineOutOff,
-                                            REAL3_T *__restrict__ shDir0,
-                                            int     *__restrict__ slineSeed,
-                                            int     *__restrict__ slineLen,
-                                            REAL3_T *__restrict__ sline) {
-
-        const int tidx = threadIdx.x;
-        const int tidy = threadIdx.y;
-
-        const int slid = blockIdx.x*blockDim.y + threadIdx.y;
-
-        const int lid = (tidy*BDIM_X + tidx) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        __shared__ REAL_T frame_sh[((MODEL_T == PTT) ? BDIM_Y*18 : 1)]; // Only used by PTT, TODO: way to remove this in other cases
-        REAL_T* __ptt_frame = frame_sh + tidy*18;
-	// const int hr_side = dimt-1;
-
-        curandStatePhilox4_32_10_t st;
-        // const int gbid = blockIdx.y*gridDim.x + blockIdx.x;
-        const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
-        //curand_init(rndSeed, slid+rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(HR_SIDE/BDIM_X)
-        curand_init(rndSeed, gid+1, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
-                                                                                 // elements of the same sequence
-        if (slid >= nseed) {
-                return;
-        }
-
-        REAL3_T seed = seeds[slid]; 
-
-        int ndir = slineOutOff[slid+1]-slineOutOff[slid];
-#if 0
-        if (threadIdx.y == 0 && threadIdx.x == 0) {
-                printf("%s: ndir: %d\n", __func__, ndir);
-                for(int i = 0; i < ndir; i++) {
-                        printf("__shDir[%d][%d]: (%f, %f, %f)\n",
-                                tidy, i, __shDir[tidy][i].x, __shDir[tidy][i].y, __shDir[tidy][i].z);
-                }
-        }
-#endif
-        __syncwarp(WMASK);
-
-        int slineOff = slineOutOff[slid];
-
-        for(int i = 0; i < ndir; i++) {
-                REAL3_T first_step = shDir0[slid*samplm_nr + i];
-
-		REAL3_T *__restrict__ currSline = sline + slineOff*MAX_SLINE_LEN*2;
-
-                if (tidx == 0) {
-                        slineSeed[slineOff] = slid;
-                }
-#if 0
-                if (threadIdx.y == 0 && threadIdx.x == 0) {
-                        printf("calling trackerF from: (%f, %f, %f)\n", first_step.x, first_step.y, first_step.z);
-                }
-#endif
-
-                if (MODEL_T == PTT) {
-                        if (!init_frame_ptt_d<BDIM_X, BDIM_Y>(
-                                &st,
-                                dataf,
-                                max_angle,
-                                step_size,
-                                first_step,
-                                dimx, dimy, dimz, dimt,
-                                seed,
-                                sphere_vertices,
-                                __ptt_frame
-                        )) { // this fails rarely
-                                if (tidx == 0) {
-                                        slineLen[slineOff] = 1;
-                                        currSline[0] = seed;
-                                }
-                                __syncwarp(WMASK);
-                                slineOff += 1;
-                                continue;
-                        }
-                }
-
-
-                int stepsB;
-                const int tissue_classB = tracker_d<BDIM_X,
-                                                    BDIM_Y,
-                                                    MODEL_T>(&st,
-		                		             max_angle,
-			        		             min_signal,
-			        			     tc_threshold,
-	                        			     step_size,
-	                        			     relative_peak_thres,
-	                        			     min_separation_angle,
-                                                             seed,
-                                                             MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
-                                                             __ptt_frame,
-                                                             MAKE_REAL3(1, 1, 1),
-                                                             dimx, dimy, dimz, dimt, dataf,
-                                                             b0s_mask,
-                                                             H, R,
-                                                             metric_map,
-			                                     delta_nr,
-                                                             delta_b, delta_q, //fit_matrix
-		                			     samplm_nr,
-                                                             sampling_matrix,
-                                                             sphere_vertices,
-                                                             sphere_edges,
-                                                             num_edges,
-                                                             &stepsB,
-                                                             currSline);
-                //if (tidx == 0) {
-                //        slineLenB[slineOff] = stepsB;
-                //}
-
-                // reverse backward sline
-                for(int j = 0; j < stepsB/2; j += BDIM_X) {
-                        if (j+tidx < stepsB/2) {
-                                const REAL3_T __p = currSline[j+tidx];
-                                currSline[j+tidx] = currSline[stepsB-1 - (j+tidx)];
-                                currSline[stepsB-1 - (j+tidx)] = __p;
-                        }
-                }
-
-                int stepsF;
-                const int tissue_classF = tracker_d<BDIM_X,
-                                                    BDIM_Y,
-                                                    MODEL_T>(&st,
-     	                    			             max_angle,
-			        		             min_signal,
-		        				     tc_threshold,
-	                				     step_size,
-			    				     relative_peak_thres,
-			            			     min_separation_angle,
-                                                             seed,
-                                                             first_step,
-                                                             __ptt_frame + 9,
-                                                             MAKE_REAL3(1, 1, 1),
-                                                             dimx, dimy, dimz, dimt, dataf,
-                                                             b0s_mask,
-                                                             H, R,
-                                                             metric_map,
-			        			     delta_nr,
-                                                             delta_b, delta_q, //fit_matrix
-			        			     samplm_nr,
-                                                             sampling_matrix,
-                                                             sphere_vertices,
-                                                             sphere_edges,
-                                                             num_edges,
-                                                             &stepsF,
-                                                             currSline + stepsB-1);
-                if (tidx == 0) {
-                        slineLen[slineOff] = stepsB-1+stepsF;
-                }
-                
-                slineOff += 1;
-#if 0
-                if (threadIdx.y == 0 && threadIdx.x == 0) {
-                        printf("%s: stepsF: %d, tissue_classF: %d\n", __func__, stepsF, tissue_classF);
-                }
-                __syncwarp(WMASK);
-#endif
-                //if (/* !return_all || */0 &&
-                //    tissue_classF != ENDPOINT &&
-                //    tissue_classF != OUTSIDEIMAGE) {
-                //        continue;
-                //}
-                //if (/* !return_all || */ 0 &&
-                //    tissue_classB != ENDPOINT &&
-                //    tissue_classB != OUTSIDEIMAGE) {
-                //        continue;
-                //}
-        }
-        return;
-}
-
-void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
-                                    const REAL relative_peak_thresh, const REAL min_separation_angle,
-                                    const int nseeds, const std::vector<REAL*> &seeds_d,
-                                    const int dimx, const int dimy, const int dimz, const int dimt,
-                                    const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
-			            const int delta_nr,
-                                    const std::vector<REAL*> &delta_b_d, const std::vector<REAL*> &delta_q_d,
-                                    const std::vector<int*> &b0s_mask_d, const std::vector<REAL*> &metric_map_d,
-			            const int samplm_nr,
-                                    const std::vector<REAL*> &sampling_matrix_d,
-                                    const std::vector<REAL*> &sphere_vertices_d, const std::vector<int*> &sphere_edges_d, const int nedges,
-                                    std::vector<REAL*> &slines_h, std::vector<int*> &slinesLen_h, std::vector<int> &nSlines_h,
-                                    const std::vector<int> nSlines_old_h, const int rng_seed, const int rng_offset,
-                                    const int ngpus, const std::vector<cudaStream_t> &streams) {
-
-  int nseeds_per_gpu = (nseeds + ngpus - 1) / ngpus;
-
-  std::vector<int*> slinesOffs_d(ngpus, nullptr);
-  std::vector<REAL3*> shDirTemp0_d(ngpus, nullptr);
-
-  //#pragma omp parallel for
-  for (int n = 0; n < ngpus; ++n) {
-    CHECK_CUDA(cudaSetDevice(n));
-    int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu));
-    dim3 block(THR_X_SL, THR_X_BL/THR_X_SL);
-    dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL));
-
-    CHECK_CUDA(cudaMalloc(&slinesOffs_d[n], sizeof(*slinesOffs_d[n])*(nseeds_gpu+1)));
-    CHECK_CUDA(cudaMalloc(&shDirTemp0_d[n], sizeof(*shDirTemp0_d[n])*samplm_nr*grid.x*block.y));
-  }
-
-  int n32dimt = ((dimt+31)/32)*32;
-
-  size_t shSizeGNS;
-  //#pragma omp parallel for
-  for (int n = 0; n < ngpus; ++n) {
-    CHECK_CUDA(cudaSetDevice(n));
-    int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu));
-    if (nseeds_gpu == 0) continue;
-    dim3 block(THR_X_SL, THR_X_BL/THR_X_SL);
-    dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL));
-
-    // Precompute number of streamlines before allocating memory
-    if (!((model_type == PTT) || (model_type == PROB))) {
-        shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d
-                    sizeof(int)*samplm_nr;						      // for peak_directions_d	
-        getNumStreamlinesBoot_k<THR_X_SL,
-                                THR_X_BL/THR_X_SL>
-                                <<<grid, block, shSizeGNS>>>(
-                                        model_type,
-                                        max_angle,
-                                        min_signal,
-                                        relative_peak_thresh,
-                                        min_separation_angle,
-                                        rng_seed,
-                                        nseeds_gpu,
-                                        reinterpret_cast<const REAL3 *>(seeds_d[n]),
-                                        dimx,
-                                        dimy,
-                                        dimz,
-                                        dimt,
-                                        dataf_d[n],
-                                        H_d[n],
-                                        R_d[n],
-                                        delta_nr,
-                                        delta_b_d[n],
-                                        delta_q_d[n],
-                                        b0s_mask_d[n],
-                                        samplm_nr,
-                                        sampling_matrix_d[n],
-                                        reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]),
-                                        reinterpret_cast<const int2 *>(sphere_edges_d[n]),
-                                        nedges,
-                                        shDirTemp0_d[n],
-                                        slinesOffs_d[n]);
-    } else {
-        shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt;
-        getNumStreamlinesProb_k<THR_X_SL,
-                                THR_X_BL/THR_X_SL>
-                                <<<grid, block, shSizeGNS>>>(
-                                        max_angle,
-                                        relative_peak_thresh,
-                                        min_separation_angle,
-                                        rng_seed,
-                                        nseeds_gpu,
-                                        reinterpret_cast<const REAL3 *>(seeds_d[n]),
-                                        dimx,
-                                        dimy,
-                                        dimz,
-                                        dimt,
-                                        dataf_d[n],
-                                        reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]),
-                                        reinterpret_cast<const int2 *>(sphere_edges_d[n]),
-                                        nedges,
-                                        shDirTemp0_d[n],
-                                        slinesOffs_d[n]);
-    }
-  }
-
-  std::vector<int> slinesOffs_h;
-  //#pragma omp parallel for
-  for (int n = 0; n < ngpus; ++n) {
-    //std::vector<int> slinesOffs_h;
-    int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu));
-    if (nseeds_gpu == 0) {
-      nSlines_h[n] = 0;
-      continue;
-    }
-    slinesOffs_h.resize(nseeds_gpu+1);
-    CHECK_CUDA(cudaMemcpy(slinesOffs_h.data(), slinesOffs_d[n], sizeof(*slinesOffs_h.data())*(nseeds_gpu+1), cudaMemcpyDeviceToHost));
-
-    int __pval = slinesOffs_h[0];
-    slinesOffs_h[0] = 0;
-    for(int i = 1; i < nseeds_gpu+1; i++) {
-      const int __cval = slinesOffs_h[i];
-      slinesOffs_h[i] = slinesOffs_h[i-1] + __pval;
-      __pval = __cval;
-    }
-    nSlines_h[n] = slinesOffs_h[nseeds_gpu];
-    CHECK_CUDA(cudaMemcpy(slinesOffs_d[n], slinesOffs_h.data(), sizeof(*slinesOffs_d[n])*(nseeds_gpu+1), cudaMemcpyHostToDevice));
-  }
-
-  std::vector<int*> slineSeed_d(ngpus, nullptr);
-
-  //#pragma omp parallel for
-  for (int n = 0; n < ngpus; ++n) {
-    CHECK_CUDA(cudaSetDevice(n));
-    int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu));
-
-    CHECK_CUDA(cudaMalloc(&slineSeed_d[n], sizeof(*slineSeed_d[n])*nSlines_h[n]));
-    CHECK_CUDA(cudaMemset(slineSeed_d[n], -1, sizeof(*slineSeed_d[n])*nSlines_h[n]));
-
-    // Allocate/reallocate output arrays if necessary
-    if (nSlines_h[n] > EXCESS_ALLOC_FACT*nSlines_old_h[n]) {
-      if(slines_h[n]) cudaFreeHost(slines_h[n]);
-      if(slinesLen_h[n]) cudaFreeHost(slinesLen_h[n]);
-      slines_h[n] = nullptr;
-      slinesLen_h[n] = nullptr;
-    }
-
-#ifdef DEBUG
-    printf("buffer size %zu\n", sizeof(*slines_h[n])*EXCESS_ALLOC_FACT*2*3*MAX_SLINE_LEN*nSlines_h[n]);
-#endif
-
-    if (!slines_h[n]) CHECK_CUDA(cudaMallocHost(&slines_h[n], sizeof(*slines_h[n])*EXCESS_ALLOC_FACT*2*3*MAX_SLINE_LEN*nSlines_h[n]));
-    if (!slinesLen_h[n]) CHECK_CUDA(cudaMallocHost(&slinesLen_h[n], sizeof(*slinesLen_h[n])*EXCESS_ALLOC_FACT*nSlines_h[n]));
-  }
-
-  //if (nSlines_h) {
-
-  std::vector<int*> slineLen_d(ngpus, nullptr);
-  std::vector<REAL3*> sline_d(ngpus, nullptr);
-  //#pragma omp parallel for
-  for (int n = 0; n < ngpus; ++n) {
-    CHECK_CUDA(cudaSetDevice(n));
-    CHECK_CUDA(cudaMalloc(&slineLen_d[n], sizeof(*slineLen_d[n])*nSlines_h[n]));
-
-    CHECK_CUDA(cudaMalloc(&sline_d[n], sizeof(*sline_d[n])*2*MAX_SLINE_LEN*nSlines_h[n]));
-
-#if 0
-    size_t free_mem, total_mem;
-    CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
-    std::cerr << "GPU " << n << ": ";
-    std::cerr << "GPU Memory Usage before genStreamlinesMerge_k: ";
-    std::cerr << (total_mem-free_mem)/(1024*1024) << " MiB used, ";
-    std::cerr << total_mem/(1024*1024) << " MiB total ";
-    std::cerr << std::endl;
-#endif
-  }
-
-  //#pragma omp parallel for
-  for (int n = 0; n < ngpus; ++n) {
-    CHECK_CUDA(cudaSetDevice(n));
-    int nseeds_gpu = std::min(nseeds_per_gpu, std::max(0, nseeds - n*nseeds_per_gpu));
-    if (nseeds_gpu == 0) continue;
-    dim3 block(THR_X_SL, THR_X_BL/THR_X_SL);
-    dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL));
-#if 0
-    std::cerr << "GPU " << n << ": ";
-    std::cerr << "Generating " << nSlines_h[n] << " streamlines (from " << nseeds_gpu << " seeds)" << std::endl; 
-#endif
-
-    //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y);
-    switch(model_type) {
-        case OPDT:
-            genStreamlinesMerge_k<THR_X_SL, THR_X_BL/THR_X_SL, OPDT> <<<grid, block, shSizeGNS, streams[n]>>>(
-                max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle,
-                rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast<const REAL3 *>(seeds_d[n]),
-                dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
-                b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
-                reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
-                nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
-            break;
-
-        case CSA:
-            genStreamlinesMerge_k<THR_X_SL, THR_X_BL/THR_X_SL, CSA> <<<grid, block, shSizeGNS, streams[n]>>>(
-                max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle,
-                rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast<const REAL3 *>(seeds_d[n]),
-                dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
-                b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
-                reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
-                nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
-            break;
-
-        case PROB:
-            // Shared memory requirements are smaller for probabilistic for main run
-            // than for preliminary run
-            shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt;
-            genStreamlinesMerge_k<THR_X_SL, THR_X_BL/THR_X_SL, PROB> <<<grid, block, shSizeGNS, streams[n]>>>(
-                max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle,
-                rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast<const REAL3 *>(seeds_d[n]),
-                dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
-                b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
-                reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
-                nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
-            break;
-
-        case PTT:
-            shSizeGNS = 0; // PTT uses exclusively static shared memory
-            genStreamlinesMerge_k<THR_X_SL, THR_X_BL/THR_X_SL, PTT> <<<grid, block, shSizeGNS, streams[n]>>>(
-                max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle,
-                rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast<const REAL3 *>(seeds_d[n]),
-                dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
-                b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
-                reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
-                nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
-            break;
-
-        default:
-            printf("FATAL: Invalid Model Type.\n");
-            break;
-    }
-
-    CHECK_ERROR("genStreamlinesMerge_k");
-  }
-
-  //CHECK_CUDA(cudaDeviceSynchronize());
-
-  //#pragma omp parallel for
-  for (int n = 0; n < ngpus; ++n) {
-    CHECK_CUDA(cudaSetDevice(n));
-    CHECK_CUDA(cudaMemcpyAsync(slines_h[n],
-                          reinterpret_cast<REAL *>(sline_d[n]),
-                          sizeof(*slines_h[n])*2*MAX_SLINE_LEN*nSlines_h[n]*3,
-                          cudaMemcpyDeviceToHost, streams[n]));
-    CHECK_CUDA(cudaMemcpyAsync(slinesLen_h[n],
-                          slineLen_d[n],
-                          sizeof(*slinesLen_h[n])*nSlines_h[n],
-                          cudaMemcpyDeviceToHost, streams[n]));
-
-  }
-  //};
-
-  //#pragma omp parallel for
-  for (int n = 0; n < ngpus; ++n) {
-    CHECK_CUDA(cudaSetDevice(n));
-    CHECK_CUDA(cudaStreamSynchronize(streams[n]));
-    CHECK_CUDA(cudaFree(slineSeed_d[n]));
-    CHECK_CUDA(cudaFree(slinesOffs_d[n]));
-    CHECK_CUDA(cudaFree(shDirTemp0_d[n]));
-    CHECK_CUDA(cudaFree(slineLen_d[n]));
-    CHECK_CUDA(cudaFree(sline_d[n]));
-  }
-
-}
-
-#if 1
-void write_trk(const char *fname,
-               const /*short*/ int *dims,
-               const REAL *voxel_size,
-               const char *voxel_order,
-               const REAL *vox_to_ras,
-               const int nsline,
-               const int *slineLen,
-               const REAL3 *sline) {
-
-        FILE *fp = fopen(fname, "w");
-        if (!fp) {
-                fprintf(stderr, "Cannot open file %s for writing...\n", fname);
-                exit(EXIT_FAILURE);
-        }
-
-        const char ID_STRING[6] = "TRACK";
-        short DIM[3] = {1, 1, 1};
-        float VOXEL_SIZE[3] = {1.0f, 1.0f, 1.0f};
-        float VOX_TO_RAS[4][4] = {{1.0f, 0.0f, 0.0, 0.0f},
-                                  {0.0f, 1.0f, 0.0, 0.0f},
-                                  {0.0f, 0.0f, 1.0, 0.0f},
-                                  {0.0f, 0.0f, 0.0, 1.0f}};
-        //const char VOXEL_ORDER[2][4] = {"RAS", "LAS"};
-        const float ORIGIN[3] = {0.0f, 0.0f, 0.0f};
-        const float IMAGE_ORIENTATION_PATIENT[6] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
-        const int VERSION = 2;
-        const int HDR_SIZE = 1000;
-
-        // write header
-        unsigned char header[1000];
-        memset(&header[0], 0, sizeof(header));
-
-        long long int off = 0;
-
-        memcpy(header, ID_STRING, sizeof(ID_STRING));
-        off += sizeof(ID_STRING);
-
-        if (dims) {
-                DIM[0] = dims[0];
-                DIM[1] = dims[1];
-                DIM[2] = dims[2];
-        }
-        memcpy(header+off, DIM, sizeof(DIM));
-        off += sizeof(DIM);
-
-        if (voxel_size) {
-                VOXEL_SIZE[0] = (float)voxel_size[0];
-                VOXEL_SIZE[1] = (float)voxel_size[1];
-                VOXEL_SIZE[2] = (float)voxel_size[2];
-        }
-        memcpy(header+off, VOXEL_SIZE, sizeof(VOXEL_SIZE));
-        off += sizeof(VOXEL_SIZE);
-
-        memcpy(header+off, ORIGIN, sizeof(ORIGIN));
-        off += sizeof(ORIGIN);
-
-        // skip n_scalaer(2b) + scalar_name(200b) +
-        //      n_properties(2b) + property_name(200b)
-        off += 404;
-
-        if (vox_to_ras) {
-                for(int i = 0; i < 4; i++) {
-                        for(int j = 0; j < 4; j++) {
-                                VOX_TO_RAS[i][j] = (float)vox_to_ras[i*4+j];
-                        }
-                }
-        }
-        memcpy(header+off, VOX_TO_RAS, sizeof(VOX_TO_RAS));
-        off += sizeof(VOX_TO_RAS);
-
-        // skip reserved(444b)
-        off += 444;
-
-        if (voxel_order) {
-                memcpy(header+off, voxel_order, 4);
-        } else {
-                memcpy(header+off, "LAS", 4);
-        }
-        off += 4; //sizeof(VOXEL_ORDER[voxel_order]);
-
-        // skip pad2(4b)
-        off += 4;
-
-        memcpy(header+off, IMAGE_ORIENTATION_PATIENT, sizeof(IMAGE_ORIENTATION_PATIENT));
-        off += sizeof(IMAGE_ORIENTATION_PATIENT);
-
-        // skip pad1(2b)
-        off += 2;
-
-        // skip invert_x(1b), invert_y(1b), invert_x(1b), swap_xy(1b), swap_yz(1b), swap_zx(1b)
-        off += 6;
-
-        memcpy(header+off, &nsline, sizeof(int));
-        off += sizeof(int);
-
-        memcpy(header+off, &VERSION, sizeof(VERSION));
-        off += sizeof(VERSION);
-
-        memcpy(header+off, &HDR_SIZE, sizeof(HDR_SIZE));
-        off += sizeof(HDR_SIZE);
-
-        //assert(off == 1000);
-        if (off != 1000) {
-                fprintf(stderr, "%s:%s:%d: heder size = %lld, (!= 1000)!\n", __FILE__, __func__, __LINE__, off);
-                exit(EXIT_FAILURE);
-        }
-        
-        size_t nw = fwrite(header, sizeof(header), 1, fp);
-        if (nw != 1) {
-                fprintf(stderr, "Error while writing to file!\n");
-                exit(EXIT_FAILURE);
-        }
-#if 0
-        // write body
-        long long maxSlineLen = slineLen[0];
-        for(long long i = 1; i < nsline; i++) {
-                maxSlineLen = MAX(maxSlineLen, slineLen[i]);
-        }
-
-        float *slineData = (float *)Malloc((1+3*maxSlineLen)*sizeof(*slineData));
-#else
-        float slineData[1 + 3*(2*MAX_SLINE_LEN)];
-#endif
-        for(int i = 0; i < nsline; i++) {
-                reinterpret_cast<int *>(slineData)[0] = slineLen[i];
-                for(int j = 0; j < slineLen[i]; j++) {
-                        slineData[1+3*j+0] = (float)((sline[i*2*MAX_SLINE_LEN + j].x+0.5)*VOXEL_SIZE[0]);
-                        slineData[1+3*j+1] = (float)((sline[i*2*MAX_SLINE_LEN + j].y+0.5)*VOXEL_SIZE[1]);
-                        slineData[1+3*j+2] = (float)((sline[i*2*MAX_SLINE_LEN + j].z+0.5)*VOXEL_SIZE[2]);
-                }
-                nw = fwrite(slineData, (1+3*slineLen[i])*sizeof(*slineData), 1, fp);
-                if (nw != 1) {
-                        fprintf(stderr, "Error while writing to file!\n");
-                        exit(EXIT_FAILURE);
-                }
-        }
-#if 0
-        free(slineData);
-#endif
-        fclose(fp);
-
-        return;
-}
-#else
-void write_trk(const int num_threads,
-               const char *fname,
-               const /*short*/ int *dims,
-               const REAL *voxel_size,
-               const char *voxel_order,
-               const REAL *vox_to_ras,
-               const int nsline,
-               const int *slineLen,
-               const REAL3 *sline) {
-
-        FILE *fp = fopen(fname, "w");
-        if (!fp) {
-                fprintf(stderr, "Cannot open file %s for writing...\n", fname);
-                exit(EXIT_FAILURE);
-        }
-
-        const char ID_STRING[6] = "TRACK";
-        short DIM[3] = {1, 1, 1};
-        float VOXEL_SIZE[3] = {1.0f, 1.0f, 1.0f};
-        float VOX_TO_RAS[4][4] = {{1.0f, 0.0f, 0.0, 0.0f},
-                                  {0.0f, 1.0f, 0.0, 0.0f},
-                                  {0.0f, 0.0f, 1.0, 0.0f},
-                                  {0.0f, 0.0f, 0.0, 1.0f}};
-        //const char VOXEL_ORDER[2][4] = {"RAS", "LAS"};
-        const float ORIGIN[3] = {0.0f, 0.0f, 0.0f};
-        const float IMAGE_ORIENTATION_PATIENT[6] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
-        const int VERSION = 2;
-        const int HDR_SIZE = 1000;
-
-        // write header
-        unsigned char header[1000];
-        memset(&header[0], 0, sizeof(header));
-
-        long long int off = 0;
-
-        memcpy(header, ID_STRING, sizeof(ID_STRING));
-        off += sizeof(ID_STRING);
-
-        if (dims) {
-                DIM[0] = dims[0];
-                DIM[1] = dims[1];
-                DIM[2] = dims[2];
-        }
-        memcpy(header+off, DIM, sizeof(DIM));
-        off += sizeof(DIM);
-
-        if (voxel_size) {
-                VOXEL_SIZE[0] = (float)voxel_size[0];
-                VOXEL_SIZE[1] = (float)voxel_size[1];
-                VOXEL_SIZE[2] = (float)voxel_size[2];
-        }
-        memcpy(header+off, VOXEL_SIZE, sizeof(VOXEL_SIZE));
-        off += sizeof(VOXEL_SIZE);
-
-        memcpy(header+off, ORIGIN, sizeof(ORIGIN));
-        off += sizeof(ORIGIN);
-
-        // skip n_scalaer(2b) + scalar_name(200b) +
-        //      n_properties(2b) + property_name(200b)
-        off += 404;
-
-        if (vox_to_ras) {
-                for(int i = 0; i < 4; i++) {
-                        for(int j = 0; j < 4; j++) {
-                                VOX_TO_RAS[i][j] = (float)vox_to_ras[i*4+j];
-                        }
-                }
-        }
-        memcpy(header+off, VOX_TO_RAS, sizeof(VOX_TO_RAS));
-        off += sizeof(VOX_TO_RAS);
-
-        // skip reserved(444b)
-        off += 444;
-
-        if (voxel_order) {
-                memcpy(header+off, voxel_order, 4);
-        } else {
-                memcpy(header+off, "LAS", 4);
-        }
-        off += 4; //sizeof(VOXEL_ORDER[voxel_order]);
-
-        // skip pad2(4b)
-        off += 4;
-
-        memcpy(header+off, IMAGE_ORIENTATION_PATIENT, sizeof(IMAGE_ORIENTATION_PATIENT));
-        off += sizeof(IMAGE_ORIENTATION_PATIENT);
-
-        // skip pad1(2b)
-        off += 2;
-
-        // skip invert_x(1b), invert_y(1b), invert_x(1b), swap_xy(1b), swap_yz(1b), swap_zx(1b)
-        off += 6;
-
-        memcpy(header+off, &nsline, sizeof(int));
-        off += sizeof(int);
-
-        memcpy(header+off, &VERSION, sizeof(VERSION));
-        off += sizeof(VERSION);
-
-        memcpy(header+off, &HDR_SIZE, sizeof(HDR_SIZE));
-        off += sizeof(HDR_SIZE);
-
-        //assert(off == 1000);
-        if (off != 1000) {
-                fprintf(stderr, "%s:%s:%d: heder size = %lld, (!= 1000)!\n", __FILE__, __func__, __LINE__, off);
-                exit(EXIT_FAILURE);
-        }
-        
-        size_t nw = fwrite(header, sizeof(header), 1, fp);
-        if (nw != 1) {
-                fprintf(stderr, "Error while writing to file!\n");
-                exit(EXIT_FAILURE);
-        }
-
-        // write body
-        long long maxSlineLen = slineLen[0];
-        for(long long i = 1; i < nsline; i++) {
-                maxSlineLen = MAX(maxSlineLen, slineLen[i]);
-        }
-
-        //omp_set_dynamic(0);
-        const int NTHREADS = num_threads > 0 ? num_threads : 1;
-        omp_set_num_threads(NTHREADS);
-
-        const int NFLTS_PER_TH = 1 + 2*(3*MAX_SLINE_LEN);
-        float *slineData = (float *)Malloc(NFLTS_PER_TH*NTHREADS*sizeof(*slineData));
-
-        #pragma omp parallel 
-        {
-                const int tid = omp_get_thread_num();
-                float *__mySlineData = slineData+tid*NFLTS_PER_TH;
-#if 1
-                //#pragma omp for schedule(static)
-                for(int i = 0; i < nsline; i += NTHREADS) {
-                        if (i+tid < nsline) {
-                                reinterpret_cast<int *>(__mySlineData)[0] = slineLen[i+tid];
-                                for(int j = 0; j < slineLen[i+tid]; j++) {
-                                        __mySlineData[1+3*j+0] = (float)((sline[(i+tid)*2*MAX_SLINE_LEN + j].x+0.5)*VOXEL_SIZE[0]);
-                                        __mySlineData[1+3*j+1] = (float)((sline[(i+tid)*2*MAX_SLINE_LEN + j].y+0.5)*VOXEL_SIZE[1]);
-                                        __mySlineData[1+3*j+2] = (float)((sline[(i+tid)*2*MAX_SLINE_LEN + j].z+0.5)*VOXEL_SIZE[2]);
-                                }
-                        }
-                        #pragma omp barrier
-                        if (tid == 0) {
-                                for(int j = 0; j < NTHREADS; j++) {
-                                        if (i+j >= nsline) {
-                                               break;
-                                        }
-                                        nw = fwrite(slineData+j*NFLTS_PER_TH, (1+3*slineLen[i+j])*sizeof(*slineData), 1, fp);
-                                        if (nw != 1) {
-                                                fprintf(stderr, "Error while writing to file!\n");
-                                                exit(EXIT_FAILURE);
-                                        }
-                                }
-                        }
-                        #pragma omp barrier
-                }
-#else
-                // streamlines are not required to be in any specific order inside the trk file...
-                #pragma omp for
-                for(int i = 0; i < nsline; i++) {
-                        reinterpret_cast<int *>(__mySlineData)[0] = slineLen[i];
-                        for(int j = 0; j < slineLen[i]; j++) {
-                                __mySlineData[1+3*j+0] = (float)((sline[i*2*MAX_SLINE_LEN + j].x+0.5)*VOXEL_SIZE[0]);
-                                __mySlineData[1+3*j+1] = (float)((sline[i*2*MAX_SLINE_LEN + j].y+0.5)*VOXEL_SIZE[1]);
-                                __mySlineData[1+3*j+2] = (float)((sline[i*2*MAX_SLINE_LEN + j].z+0.5)*VOXEL_SIZE[2]);
-                        }
-                        nw = fwrite(__mySlineData, (1+3*slineLen[i])*sizeof(*__mySlineData), 1, fp);
-                        if (nw != 1) {
-                                fprintf(stderr, "Error while writing to file!\n");
-                                exit(EXIT_FAILURE);
-                        }
-                }
-#endif
-        }
-        free(slineData);
-        fclose(fp);
-
-        return;
-}
-#endif
diff --git a/cuslines/generate_streamlines_cuda.h b/cuslines/generate_streamlines_cuda.h
deleted file mode 100644
index 14105ce..0000000
--- a/cuslines/generate_streamlines_cuda.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __GENERATE_STREAMLINES_CUDA_H__
-#define __GENERATE_STREAMLINES_CUDA_H__
-
-#include <vector>
-
-#include "globals.h"
-
-void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
-                                    const REAL relative_peak_thresh, const REAL min_separation_angle,
-                                    const int nseeds, const std::vector<REAL*> &seeds_d,
-                                    const int dimx, const int dimy, const int dimz, const int dimt,
-                                    const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
-                                    const int delta_nr,
-                                    const std::vector<REAL*> &delta_b_d, const std::vector<REAL*> &delta_q_d,
-                                    const std::vector<int*> &b0s_mask_d, const std::vector<REAL*> &metric_map_d,
-                                    const int samplm_nr,
-                                    const std::vector<REAL*> &sampling_matrix_d,
-                                    const std::vector<REAL*> &sphere_vertices_d, const std::vector<int*> &sphere_edges_d, const int nedges,
-                                    std::vector<REAL*> &slines_h, std::vector<int*> &slinesLen_h, std::vector<int> &nSlines_h,
-                                    const std::vector<int> nSlines_old_h, const int rng_seed, const int rng_offset,
-                                    const int ngpus, const std::vector<cudaStream_t> &streams);
-#if 1
-void write_trk(const char *fname,
-               const /*short*/ int *dims,
-	       const REAL *voxel_size,
-	       const char *voxel_order,
-	       const REAL *vox_to_ras,
-	       const int nsline,
-	       const int *slineLen,
-	       const REAL3 *sline);
-#else
-void write_trk(const int num_threads,
-	       const char *fname,
-               const /*short*/ int *dims,
-	       const REAL *voxel_size,
-	       const char *voxel_order,
-	       const REAL *vox_to_ras,
-	       const int nsline,
-	       const int *slineLen,
-	       const REAL3 *sline);
-#endif
-#endif
diff --git a/cuslines/utils.cu b/cuslines/utils.cu
deleted file mode 100644
index c7fe47f..0000000
--- a/cuslines/utils.cu
+++ /dev/null
@@ -1,143 +0,0 @@
-
-template<int BDIM_X, typename REAL_T>
-__device__ void prefix_sum_sh_d(REAL_T *num_sh, int __len) {
-    const int tidx = threadIdx.x;
-
-    const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-    const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-#if 0
-    // for debugging
-    __syncwarp(WMASK);
-    if (tidx == 0) {
-        for (int j = 1; j < __len; j++) {
-            num_sh[j] += num_sh[j-1];
-        }
-    }
-    __syncwarp(WMASK);
-#else
-    for (int j = 0; j < __len; j += BDIM_X) {
-        if ((tidx == 0) && (j != 0)) {
-            num_sh[j] += num_sh[j-1];
-        }
-        __syncwarp(WMASK);
-
-        REAL_T __t_pmf;
-        if (j+tidx < __len) {
-            __t_pmf = num_sh[j+tidx];
-        }
-        for (int i = 1; i < BDIM_X; i*=2) {
-            REAL_T __tmp = __shfl_up_sync(WMASK, __t_pmf, i, BDIM_X);
-            if ((tidx >= i) && (j+tidx < __len)) {
-                    __t_pmf += __tmp;
-            }
-        }
-        if (j+tidx < __len) {
-            num_sh[j+tidx] = __t_pmf;
-        }
-        __syncwarp(WMASK);
-    }
-#endif
-}
-
-template<typename REAL_T>
-__device__ void printArrayAlways(const char *name, int ncol, int n, REAL_T *arr) {
-        printf("%s:\n", name);
-
-        for(int j = 0; j < n; j++) {
-                if ((j%ncol)==0) printf("\n");
-                printf("%10.8f ", arr[j]);
-        }
-        printf("\n");
-}
-
-template<typename REAL_T>
-__device__ void printArray(const char *name, int ncol, int n, REAL_T *arr) {
-	if (!threadIdx.x && !threadIdx.y && !blockIdx.x) {
-		printArrayAlways(name, ncol, n, arr);
-	}
-}
-
-template<typename REAL_T>
-__device__ REAL_T interpolation_helper_d(const REAL_T* dataf, const REAL_T wgh[3][2], const long long coo[3][2], int dimy, int dimz, int dimt, int t) {
-    REAL_T __tmp = 0;
-    #pragma unroll
-    for (int i = 0; i < 2; i++) {
-        #pragma unroll
-        for (int j = 0; j < 2; j++) {
-            #pragma unroll
-            for (int k = 0; k < 2; k++) {
-                __tmp += wgh[0][i] * wgh[1][j] * wgh[2][k] *
-                         dataf[coo[0][i] * dimy * dimz * dimt +
-                               coo[1][j] * dimz * dimt +
-                               coo[2][k] * dimt +
-                               t];
-            }
-        }
-    }
-    return __tmp;
-}
-
-template<int BDIM_X,
-         typename REAL_T,
-         typename REAL3_T>
-__device__ int trilinear_interp_d(const int dimx,
-                                  const int dimy,
-                                  const int dimz,
-                                  const int dimt,
-                                  int dimt_idx, // If -1, get all
-                                  const REAL_T *__restrict__ dataf,
-                                  const REAL3_T point,
-                                  REAL_T *__restrict__ __vox_data) {
-        const REAL_T HALF = static_cast<REAL_T>(0.5);
-
-        // all thr compute the same here
-        if (point.x < -HALF || point.x+HALF >= dimx ||
-            point.y < -HALF || point.y+HALF >= dimy ||
-               point.z < -HALF || point.z+HALF >= dimz) {
-                return -1;
-        }
-
-        long long  coo[3][2];
-        REAL_T wgh[3][2]; // could use just one...
-
-        const REAL_T ONE  = static_cast<REAL_T>(1.0);
-
-        const REAL3_T fl = MAKE_REAL3(FLOOR(point.x),
-                                      FLOOR(point.y),
-                                      FLOOR(point.z));
-
-        wgh[0][1] = point.x - fl.x; 
-        wgh[0][0] = ONE-wgh[0][1]; 
-        coo[0][0] = MAX(0, fl.x);
-        coo[0][1] = MIN(dimx-1, coo[0][0]+1);
-
-        wgh[1][1] = point.y - fl.y; 
-        wgh[1][0] = ONE-wgh[1][1]; 
-        coo[1][0] = MAX(0, fl.y);
-        coo[1][1] = MIN(dimy-1, coo[1][0]+1);
-
-        wgh[2][1] = point.z - fl.z; 
-        wgh[2][0] = ONE-wgh[2][1]; 
-        coo[2][0] = MAX(0, fl.z);
-        coo[2][1] = MIN(dimz-1, coo[2][0]+1);
-
-        if (dimt_idx == -1) {
-                for (int t = threadIdx.x; t < dimt; t += BDIM_X) {
-                        __vox_data[t] = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, t);
-                }
-        } else {
-                *__vox_data = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, dimt_idx);
-        }
-
-        /*
-        __syncwarp(WMASK);
-        if (tidx == 0 && threadIdx.y == 0) {
-                printf("point: %f, %f, %f\n", point.x, point.y, point.z);
-                for(int i = 0; i < dimt; i++) {
-                        printf("__vox_data[%d]: %f\n", i, __vox_data[i]);
-                }
-        }
-        */
-        return 0;
-}
diff --git a/merge_trk.sh b/merge_trk.sh
deleted file mode 100755
index c47412f..0000000
--- a/merge_trk.sh
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-usage() {
-	echo "$(basename $0) [ -v ] -o trk_outfile <trk_infile0> <trk_infile1> ..."
-}
-
-if [ $# -eq 0 ]; then
-	usage
-	exit 1
-fi
-
-OUT_FILE=""
-VERBOSE=""
-
-OPTS=$(getopt -o "vho:" -- "$@")
-eval set -- "$OPTS"
-
-while true; do
-	case "$1" in
-		-o)
-			OUT_FILE=$2
-			shift
-			shift
-			;;
-		-v)
-			VERBOSE="1"
-			shift
-			;;
-		-h)
-			usage
-			exit 1
-			;;
-		--)
-			shift
-			break
-			;;
-	esac
-done
-
-if [ -z $OUT_FILE ]; then
-	echo "Please provide an output file name with the -o option!"
-	exit 1
-fi
-
-# necessary when running via docker to expand again
-# the parameter list turning spaces into separators
-set -- $*
-
-TRK_FILES=("$@")
-NTRKF=$(($#))
-
-#echo $TRK_FILES
-#echo $NTRKF
-
-if [ $VERBOSE ]; then
-	echo "Merging $NTRKF files into $OUT_FILE..."
-fi
-
-head -c1000 ${TRK_FILES[0]} > $OUT_FILE
-
-NTRACK=0
-for((i=0; i<$NTRKF; i++)); do
-	if [ $VERBOSE ]; then
-		printf "%8d/%8d\r" $i $NTRKF
-	fi
-	NTRACK=$(($NTRACK + $(od -A none -t dI -j 988 -N4 ${TRK_FILES[$i]})));
-	tail -c+1001 ${TRK_FILES[$i]} >> $OUT_FILE
-done
-
-NTRACK=$(printf "%08X" $NTRACK)
-
-printf "\x${NTRACK:6:2}\x${NTRACK:4:2}\x${NTRACK:2:2}\x${NTRACK:0:2}" | dd of=$OUT_FILE bs=1 seek=988 count=4 conv=notrunc &> /dev/null
diff --git a/pyproject.toml b/pyproject.toml
index 7ad8645..67877c1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [build-system]
-requires = ["scikit-build-core", "pybind11"]
-build-backend = "scikit_build_core.build"
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
 
 [project]
 name = "cuslines"
@@ -10,8 +10,15 @@ readme = "README.md"
 requires-python = ">=3.7"
 dependencies = [
     "numpy",
-    "pybind11"
+    "nibabel",
+    "tqdm",
+    "dipy",
+    "trx-python",
+    "cuda-python",
+    "cuda-core",
+    "cuda-cccl"
 ]
 
-[tool.scikit-build]
-cmake.build-type = "Release"
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["cuslines*"]
diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py
index e627978..0d6c447 100644
--- a/run_gpu_streamlines.py
+++ b/run_gpu_streamlines.py
@@ -30,34 +30,38 @@
 import argparse
 import random
 import time
-import zipfile
 
 import numpy as np
-import numpy.linalg as npl
 
 import dipy.reconst.dti as dti
 from dipy.io import read_bvals_bvecs
-from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
+from dipy.io.stateful_tractogram import Space, StatefulTractogram
 from dipy.io.streamline import save_tractogram
 from dipy.tracking import utils
 from dipy.core.gradients import gradient_table, unique_bvals_magnitude
 from dipy.data import default_sphere
-from dipy.direction import (BootDirectionGetter, ProbabilisticDirectionGetter, PTTDirectionGetter)
+from dipy.direction import (
+  BootDirectionGetter as cpu_BootDirectionGetter,
+  ProbabilisticDirectionGetter as cpu_ProbDirectionGetter,
+  PTTDirectionGetter as cpu_PTTDirectionGetter)
 from dipy.reconst.shm import OpdtModel, CsaOdfModel
 from dipy.reconst.csdeconv import ConstrainedSphericalDeconvModel, auto_response_ssst
 from dipy.tracking.local_tracking import LocalTracking
 from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion
-from dipy.reconst import shm
 from dipy.data import get_fnames
 from dipy.data import read_stanford_pve_maps
 
 import nibabel as nib
 from nibabel.orientations import aff2axcodes
 
-from trx.trx_file_memmap import TrxFile, zip_from_folder
+from trx.io import save as save_trx
 
-# Import custom module
-import cuslines
+from cuslines import (
+    BootDirectionGetter,
+    GPUTracker,
+    ProbDirectionGetter,
+    PttDirectionGetter,
+)
 
 t0 = time.time()
 
@@ -68,7 +72,7 @@
 #Get Gradient values
 def get_gtab(fbval, fbvec):
     bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
-    gtab = gradient_table(bvals, bvecs)
+    gtab = gradient_table(bvals=bvals, bvecs=bvecs)
     return gtab
 
 def get_img(ep2_seq):
@@ -87,7 +91,7 @@ def get_img(ep2_seq):
 parser.add_argument("--chunk-size", type=int, default=100000, help="how many seeds to process per sweep, per GPU")
 parser.add_argument("--nseeds", type=int, default=100000, help="how many seeds to process in total")
 parser.add_argument("--ngpus", type=int, default=1, help="number of GPUs to use if using gpu")
-parser.add_argument("--write-method", type=str, default="fast", help="Can be trx, fast, or standard")
+parser.add_argument("--write-method", type=str, default="trk", help="Can be trx or trk")
 parser.add_argument("--max-angle", type=float, default=60, help="max angle (in degrees)")
 parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
 parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
@@ -101,9 +105,9 @@ def get_img(ep2_seq):
 
 args = parser.parse_args()
 
-if args.device == "cpu" and args.write_method != "standard":
-  print("WARNING: only standard write method is implemented for cpu testing.")
-  write_method = "standard"
+if args.device == "cpu" and args.write_method != "trk":
+  print("WARNING: only trk write method is implemented for cpu testing.")
+  write_method = "trk"
 else:
   write_method = args.write_method
 
@@ -111,7 +115,8 @@ def get_img(ep2_seq):
   if not all(arg == 'hardi' for arg in [args.nifti_file, args.bvals, args.bvecs, args.mask_nifti, args.roi_nifti]):
     raise ValueError("If any of the arguments is 'hardi', all must be 'hardi'")
   # Get Stanford HARDI data
-  hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames('stanford_hardi')
+  hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames(
+     name='stanford_hardi')
   csf, gm, wm = read_stanford_pve_maps()
   wm_data = wm.get_fdata()
 
@@ -135,18 +140,14 @@ def get_img(ep2_seq):
 
 tenmodel = dti.TensorModel(gtab, fit_method='WLS')
 print('Fitting Tensor')
-tenfit = tenmodel.fit(data, mask)
+tenfit = tenmodel.fit(data, mask=mask)
 print('Computing anisotropy measures (FA,MD,RGB)')
 FA = tenfit.fa
-FA[np.isnan(FA)] = 0
 
 # Setup tissue_classifier args
 tissue_classifier = ThresholdStoppingCriterion(FA, args.fa_threshold)
-metric_map = np.asarray(FA, 'float64')
 
 # Create seeds for ROI
-# seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
-# seed_mask = seed_mask[0:args.nseeds]
 seed_mask = np.asarray(utils.random_seeds_from_mask(
   roi_data, seeds_count=args.nseeds,
   seed_count_per_voxel=False,
@@ -155,20 +156,27 @@ def get_img(ep2_seq):
 # Setup model
 sphere = default_sphere
 if args.model == "opdt":
-  model_type = cuslines.ModelType.OPDT
-  print("Running OPDT model...")
-  model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
-  fit_matrix = model._fit_matrix
-  delta_b, delta_q = fit_matrix
+  if args.device == "cpu":
+    model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
+    dg = cpu_BootDirectionGetter
+  else:
+    dg = BootDirectionGetter.from_dipy_opdt(
+      gtab,
+      sphere,
+      sh_order_max=args.sh_order,
+      sh_lambda=args.sm_lambda,
+      min_signal=args.min_signal)
 elif args.model == "csa":
-  model_type = cuslines.ModelType.CSA
-  print("Running CSA model...")
-  model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
-  fit_matrix = model._fit_matrix
-  # Unlike OPDT, CSA has a single matrix used for fit_matrix. Populating delta_b and delta_q with necessary values for
-  # now.
-  delta_b = fit_matrix
-  delta_q = fit_matrix
+  if args.device == "cpu":
+    model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
+    dg = cpu_BootDirectionGetter
+  else:
+    dg = BootDirectionGetter.from_dipy_csa(
+      gtab,
+      sphere,
+      sh_order_max=args.sh_order,
+      sh_lambda=args.sm_lambda,
+      min_signal=args.min_signal)
 else:
   print("Running CSD model...")
   unique_bvals = unique_bvals_magnitude(gtab.bvals)
@@ -186,158 +194,67 @@ def get_img(ep2_seq):
     roi_radii=10,
     fa_thr=0.7)
   model = ConstrainedSphericalDeconvModel(gtab, response, sh_order=args.sh_order)
-  # TODO: we shouldnt have to do this, also for CSA, but we populate delta_b, delta_q.
-  # we need to name change delta_b/delta_q and make it possible for them to be None, or something like this
-  delta_b = model._X
-  delta_q = model.B_reg
-
-if args.dg != "boot":
-  if args.dg == "prob":
-    model_type = cuslines.ModelType.PROB
-    dg = ProbabilisticDirectionGetter
-  else:
-    model_type = cuslines.ModelType.PTT
-    dg = PTTDirectionGetter
-  fit = model.fit(data, mask=(metric_map >= args.fa_threshold))
+  fit = model.fit(data, mask=(FA >= args.fa_threshold))
   data = fit.odf(sphere).clip(min=0)
-else:
-  dg = BootDirectionGetter
-
-global_chunk_size = args.chunk_size
+  if args.dg == "ptt":
+      if args.device == "cpu":
+          dg = cpu_PTTDirectionGetter()
+      else:
+        # Set FOD to 0 outside mask for probing
+        data[FA < args.fa_threshold, :] = 0
+        dg = PttDirectionGetter()
+  elif args.dg == "prob":
+      if args.device == "cpu":
+        dg = cpu_ProbDirectionGetter()
+      else:
+        dg = ProbDirectionGetter()
+  else:
+      raise ValueError("Unknown direction getter type: {}".format(args.dg))
 
 # Setup direction getter args
 if args.device == "cpu":
   if args.dg != "boot":
     dg = dg.from_pmf(data, max_angle=args.max_angle, sphere=sphere, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
   else:
-    dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
-else:
-  # Setup direction getter args
-  b0s_mask = gtab.b0s_mask
-  dwi_mask = ~b0s_mask
-
-  # setup sampling matrix
-  theta = sphere.theta
-  phi = sphere.phi
-  sampling_matrix, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)
-
-  ## from BootPmfGen __init__
-  # setup H and R matrices
-  # TODO: figure out how to get H, R matrices from direction getter object
-  x, y, z = model.gtab.gradients[dwi_mask].T
-  r, theta, phi = shm.cart2sphere(x, y, z)
-  B, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)
-  H = shm.hat(B)
-  R = shm.lcr_matrix(H)
-
-  # create floating point copy of data
-  dataf = np.asarray(data, dtype=np.float64)
-
-  gpu_tracker = cuslines.GPUTracker(model_type,
-                                    args.max_angle * np.pi/180,
-                                    args.min_signal,
-                                    args.fa_threshold,
-                                    args.step_size,
-                                    args.relative_peak_threshold,
-                                    args.min_separation_angle * np.pi/180,
-                                    dataf.astype(np.float64), H.astype(np.float64), R.astype(np.float64), delta_b.astype(np.float64), delta_q.astype(np.float64),
-                                    b0s_mask.astype(np.int32), metric_map.astype(np.float64), sampling_matrix.astype(np.float64),
-                                    sphere.vertices.astype(np.float64), sphere.edges.astype(np.int32),
-                                    ngpus=args.ngpus, rng_seed=0)
-
-print('streamline gen')
-nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
-
-t1 = time.time()
-streamline_time = 0
-io_time = 0
-
-if args.output_prefix and write_method == "trx":
-  # Will resize by a factor of 2 if these are exceeded
-  sl_len_guess = 100
-  sl_per_seed_guess = 3
-  n_sls_guess = sl_per_seed_guess*len(seed_mask)
-
-  # trx files use memory mapping
-  trx_file = TrxFile(
-    reference=hardi_nifti_fname,
-    nb_streamlines=n_sls_guess,
-    nb_vertices=n_sls_guess*sl_len_guess)
-  offsets_idx = 0
-  sls_data_idx = 0
-
-for idx in range(int(nchunks)):
-  # Main streamline computation
-  ts = time.time()
-  if args.device == "cpu":
-    streamline_generator = LocalTracking(dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=args.step_size)
-    streamlines = [s for s in streamline_generator]
-  else:
-    streamlines = gpu_tracker.generate_streamlines(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size])
-  te = time.time()
-  streamline_time += (te-ts)
-  print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
-                                                                    seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
-                                                                    te-ts))
+    dg = dg.from_data(data, model, max_angle=args.max_angle, sphere=sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
 
-  # Save tracklines file
-  if args.output_prefix:
     ts = time.time()
-    if write_method == "standard":
-      fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
-      sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX)
-      save_tractogram(sft, fname)
-      te = time.time()
-      print("Saved streamlines to {}, time {} s".format(fname, te-ts))
-    elif write_method == "trx":
-      tractogram = nib.streamlines.Tractogram(streamlines, affine_to_rasmm=img.affine)
-      tractogram.to_world()
-      sls = tractogram.streamlines
-
-      new_offsets_idx = offsets_idx + len(sls._offsets)
-      new_sls_data_idx = sls_data_idx + len(sls._data)
-
-      if new_offsets_idx > trx_file.header["NB_STREAMLINES"]\
-          or new_sls_data_idx > trx_file.header["NB_VERTICES"]:
-        print("TRX resizing...")
-        trx_file.resize(nb_streamlines=new_offsets_idx*2, nb_vertices=new_sls_data_idx*2)
-
-      # TRX uses memmaps here
-      trx_file.streamlines._data[sls_data_idx:new_sls_data_idx] = sls._data
-      trx_file.streamlines._offsets[offsets_idx:new_offsets_idx] = offsets_idx + sls._offsets
-      trx_file.streamlines._lengths[offsets_idx:new_offsets_idx] = sls._lengths
-
-      offsets_idx = new_offsets_idx
-      sls_data_idx = new_sls_data_idx
-      
-      te = time.time()
-      print("Streamlines to TRX format, time {} s".format(te-ts))
-    else:
-      fname = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
-      gpu_tracker.dump_streamlines(fname, voxel_order, wm.shape, wm.header.get_zooms(), img.affine)
-      te = time.time()
-      print("Saved streamlines to {}, time {} s".format(fname, te-ts))
-
-    io_time += (te-ts)
-
-if args.output_prefix and write_method == "trx":
-  ts = time.time()
-  fname = "{}.trx".format(args.output_prefix)
-  trx_file.resize()
-  zip_from_folder(
-    trx_file._uncompressed_folder_handle.name,
-    fname,
-    zipfile.ZIP_STORED)
-  trx_file.close()
-  te = time.time()
-  print("Saved streamlines to {}, time {} s".format(fname, te-ts))
-  io_time += (te-ts)
-
-t2 = time.time()
+    streamline_generator = LocalTracking(dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=args.step_size)
+    sft = StatefulTractogram(streamline_generator, img, Space.VOX)
+    n_sls = len(sft.streamlines)
+    te = time.time()
+else:
+    with GPUTracker(
+        dg,
+        data,
+        FA,
+        args.fa_threshold,
+        sphere.vertices,
+        sphere.edges,
+        max_angle=args.max_angle * np.pi/180,
+        step_size=args.step_size,
+        relative_peak_thresh=args.relative_peak_threshold,
+        min_separation_angle=args.min_separation_angle * np.pi/180,
+        ngpus=args.ngpus,
+        rng_seed=0,
+        chunk_size=args.chunk_size
+    ) as gpu_tracker:
+        ts = time.time()
+        if args.output_prefix and write_method == "trx":
+            trx_file = gpu_tracker.generate_trx(seed_mask, img)
+            n_sls = len(trx_file.streamlines)
+        else:
+            sft = gpu_tracker.generate_sft(seed_mask, img)
+            n_sls = len(sft.streamlines)
+        te = time.time()
+print("Generated {} streamlines from {} seeds, time: {} s".format(n_sls,
+                                                                  seed_mask.shape[0],
+                                                                  te-ts))
 
-print("Completed processing {} seeds.".format(seed_mask.shape[0]))
-print("Initialization time: {} sec".format(t1-t0))
-print("Streamline generation total time: {} sec".format(t2-t1))
-print("\tStreamline processing: {} sec".format(streamline_time))
 if args.output_prefix:
-  print("\tFile writing: {} sec".format(io_time))
+  if write_method == "trx":
+    fname = "{}.trx".format(args.output_prefix)
+    save_trx(trx_file, fname)
+  else:
+    fname = "{}.trk".format(args.output_prefix)
+    save_tractogram(sft, fname)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..ae15d6a
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,46 @@
+from setuptools import setup
+from setuptools.command.build_py import build_py
+from pathlib import Path
+import re
+
+
+def defines_to_python(src, dst):
+    src = Path(src)
+    dst = Path(dst)
+
+    INT_DEFINE = re.compile(
+        r"#define\s+(\w+)\s+\(?\s*([0-9]+)\s*\)?"
+    )
+
+    REAL_CAST_DEFINE = re.compile(
+        r"#define\s+(\w+)\s+\(\(REAL\)\s*([0-9eE\.\+\-]+)\s*\)"
+    )
+
+    defines = {}
+
+    for line in src.read_text().splitlines():
+        if m := INT_DEFINE.match(line):
+            defines[m.group(1)] = int(m.group(2))
+        elif m := REAL_CAST_DEFINE.match(line):
+            defines[m.group(1)] = float(m.group(2))
+
+    dst.parent.mkdir(parents=True, exist_ok=True)
+
+    with dst.open("w") as f:
+        f.write("# AUTO-GENERATED FROM globals.h — DO NOT EDIT\n\n")
+        for k, v in sorted(defines.items()):
+            f.write(f"{k} = {v}\n")
+
+class build_py_with_cuda(build_py):
+    def run(self):
+        root = Path(__file__).parent
+
+        globals_src = str(root / "cuslines" / "globals.h")
+        globals_dst = str(root / "cuslines" / "cuda_python" / "_globals.py")
+        defines_to_python(globals_src, globals_dst)
+
+        super().run()
+
+setup(
+    cmdclass={"build_py": build_py_with_cuda},
+)