diff --git a/README.md b/README.md
index 0e38ddb..b7acfec 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,171 @@ CUDA Stream Compaction

 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* SPENCER WEBSTER-BASS
+  * [LinkedIn](https://www.linkedin.com/in/spencer-webster-bass/)
+* Tested on: Windows 10, i7-6700 @ 3.40GHz 16GB, Quadro P1000 4GB (MOR100B-19)

-### (TODO: Your README)
+### DESCRIPTION

-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+This project implements the scan (prefix-sum) and stream compaction parallel algorithms on the GPU using CUDA and C++.
+
+Features:
+* Serial implementation of the scan and stream compaction algorithms on the CPU
+* Naive, parallel implementation of the scan and stream compaction algorithms on the GPU
+* Attempted work-efficient, parallel implementation of the scan and stream compaction algorithms on the GPU
+* Comparison of my implementations against Thrust's implementation of the exclusive scan algorithm
+
+TODOs:
+* Include performance analysis
+
+Test program output (SIZE = 1 << 4):
+
+```
+****************
+** SCAN TESTS **
+****************
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0ms (std::chrono Measured)
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0ms (std::chrono Measured)
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 ]
+    passed
+==== naive scan, power-of-two ====
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+    [ 0 49 57 10 29 47 64 65 48 76 52 23 23 19 19 34 ]
+    [ 0 49 57 59 86 57 93 112 112 141 100 99 75 42 42 53 ]
+    [ 0 49 57 59 86 106 150 171 198 198 193 211 187 183 142 152 ]
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
+   elapsed time: 7.58922ms (CUDA Measured)
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
+    passed
+==== 1s array for finding bugs ====
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+    [ 0 49 57 10 29 47 64 65 48 76 52 23 23 19 19 34 ]
+    [ 0 49 57 59 86 57 93 112 112 141 100 99 75 42 42 53 ]
+    [ 0 49 57 59 86 106 150 171 198 198 193 211 187 183 142 152 ]
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
+==== naive scan, non-power-of-two ====
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 ]
+    [ 0 49 57 10 29 47 64 65 48 76 52 23 23 3 0 0 ]
+    [ 0 49 57 59 86 57 93 112 112 141 100 99 75 26 23 3 ]
+    [ 0 49 57 59 86 106 150 171 198 198 193 211 187 167 123 102 ]
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 273 273 273 ]
+   elapsed time: 15.0825ms (CUDA Measured)
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 0 0 0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0ms (CUDA Measured)
+    a[1] = 49, b[1] = 0
+    FAIL VALUE
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0ms (CUDA Measured)
+    a[1] = 49, b[1] = 0
+    FAIL VALUE
+==== thrust scan, power-of-two ====
+   elapsed time: 0.083008ms (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.069632ms (CUDA Measured)
+    passed
+```
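+
+As a small worked example (illustrative values, not taken from the runs above or below), stream compaction removes zeros in three steps: map to booleans, exclusive-scan the booleans, then scatter.
+
+```
+input:    [ 3 0 1 0 2 ]
+bools:    [ 1 0 1 0 1 ]
+indices:  [ 0 1 1 2 2 ]   (exclusive scan of bools)
+scatter:  output[indices[i]] = input[i] wherever bools[i] == 1
+output:   [ 3 1 2 ]       (count = indices[n-1] + bools[n-1] = 3)
+```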
+
+```
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 1 2 2 1 2 0 1 3 1 3 0 3 0 3 3 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.0034ms (std::chrono Measured)
+    [ 0 0 0 0 0 0 0 0 0 0 0 0 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.004ms (std::chrono Measured)
+    [ 0 0 0 0 0 0 0 0 0 0 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 0.0023ms (std::chrono Measured)
+    [ ]
+    expected 12 elements, got -1
+    FAIL COUNT
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0ms (CUDA Measured)
+    expected 12 elements, got -1
+    FAIL COUNT
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0ms (CUDA Measured)
+    expected 10 elements, got -1
+    FAIL COUNT
+```
diff --git a/notes.txt b/notes.txt
new file mode 100644
index 0000000..c1f8511
--- /dev/null
+++ b/notes.txt
@@ -0,0 +1,34 @@
+How does dim3 work, and how do dimensions work in CUDA? Does dim3 set unspecified arguments to 1 or 0?
+Does CUDA expect unused dimensions to be 1 or 0?
+
+How do these checkCUDAError calls work? They sometimes report that an error is occurring at an incorrect location if
+I don't have one at every CUDA function call.
+
+Do we need to declare new functions in header files?
+
+Inside of CUDA files, are we writing C or C++?
+Don't use classes; the coding style might be closer to C than to C++. You can pass structs to CUDA kernels.
+
+When using the memory window, is it showing GPU or CPU memory when you copy and paste an address
+from the Locals or Autos window?
+
+You can memcpy from the device back to the host.
+Start with a smaller buffer so that you can check the values in the buffers by hand.
+
+int *dev_data1, *dev_data2;
+// cudaMalloc, cudaMemcpy, etc.
+// for each iteration, launch kernels on dev_data1 and dev_data2
+int* temp = dev_data1;
+dev_data1 = dev_data2;
+dev_data2 = temp;
+
+The weird alternating output where every other value was zero was due to the book's funky way of ping-ponging.
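+
+A fuller sketch of the pointer ping-pong (kernStep, grid, and block are placeholder names, not this project's API):
+
+int *dev_ping, *dev_pong;
+// cudaMalloc both buffers; copy the (shifted) input into dev_pong
+for (int offset = 1; offset < pN; offset *= 2) {
+    // read from dev_pong, write to dev_ping
+    kernStep<<<grid, block>>>(pN, dev_ping, dev_pong, offset);
+    // swap the pointers instead of copying whole buffers
+    int* tmp = dev_ping;
+    dev_ping = dev_pong;
+    dev_pong = tmp;
+}
+// after the loop, the most recent pass's output is in dev_pong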
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..13791af 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"

-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 4; // 1 << 8; // feel free to change the size of the array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -49,22 +49,25 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
+    printArray(SIZE, a, true);
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);

-    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
+    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan */
     onesArray(SIZE, c);
     printDesc("1s array for finding bugs");
+    printArray(SIZE, a, true);
     StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
+    printArray(SIZE, c, true);

     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
+    printArray(SIZE, a, true);
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);

     zeroArray(SIZE, c);
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..222ee88 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,13 @@ namespace StreamCompaction {
          */
        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
            // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n)
+                return;
+            // flag each element: 1 if it should be kept (nonzero), 0 otherwise
+            if (idata[index])
+                bools[index] = 1;
+            else
+                bools[index] = 0;
        }

        /**
@@ -33,6 +40,11 @@ namespace StreamCompaction {
        __global__ void kernScatter(int n, int *odata, const int *idata,
                const int *bools, const int *indices) {
            // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n)
+                return;
+            // each kept element lands at the slot named by its scanned index
+            if (bools[index])
+                odata[indices[index]] = idata[index];
        }
    }
}
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..2b6b05e 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,4 +1,5 @@
 #include <cstdio>
+#include <vector>
 #include "cpu.h"
 #include "common.h"
@@ -18,9 +19,12 @@ namespace StreamCompaction {
         * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
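+         * Example: an exclusive scan of [3 1 7 0 4] is [0 3 4 11 11].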
         */
        void scan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
+            // timer disabled: compactWithScan() calls scan() while its own CPU
+            // timer is already running, and nested startCpuTimer() calls throw
+            // timer().startCpuTimer();
            // TODO
-            timer().endCpuTimer();
+            odata[0] = 0;
+            for (int i = 1; i < n; i++)
+                odata[i] = odata[i - 1] + idata[i - 1];
+            // timer().endCpuTimer();
        }

        /**
@@ -31,8 +35,13 @@ namespace StreamCompaction {
        int compactWithoutScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+            // write the kept (nonzero) elements straight into odata
+            int count = 0;
+            for (int i = 0; i < n; i++)
+                if (idata[i])
+                    odata[count++] = idata[i];
            timer().endCpuTimer();
-            return -1;
+            return count;
        }

        /**
@@ -43,8 +52,38 @@ namespace StreamCompaction {
        int compactWithScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+            // step 1: compute the 0/1 mask
+            std::vector<int> mask(n);
+            for (int i = 0; i < n; i++)
+                mask[i] = (idata[i] == 0) ? 0 : 1;
+
+            // step 2: exclusive scan of the mask gives each kept element's output index
+            std::vector<int> indices(n);
+            scan(n, indices.data(), mask.data());
+
+            // step 3: scatter; because the scan is exclusive, the total count is
+            // the last index plus the last mask bit
+            int count = indices[n - 1] + mask[n - 1];
+            for (int i = 0; i < n; i++)
+                if (mask[i])
+                    odata[indices[i]] = idata[i];
+
            timer().endCpuTimer();
-            return -1;
+            return count;
        }
    }
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..cb799f3 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,13 +12,122 @@ namespace StreamCompaction {
            return timer;
        }

+        // Reference implementation from GPU Gems 3, Chapter 39 ("Parallel
+        // Prefix Sum (Scan) with CUDA"); shared-memory, single-block only,
+        // kept here for study.
+        __global__ void prescan(int n, float* g_odata, float* g_idata) {
+            extern __shared__ float temp[]; // allocated on invocation
+            int thid = threadIdx.x;
+            int offset = 1;
+            // load input into shared memory
+            temp[2 * thid] = g_idata[2 * thid];
+            temp[2 * thid + 1] = g_idata[2 * thid + 1];
+
+            // build sums in place up the tree
+            for (int d = n >> 1; d > 0; d >>= 1) {
+                __syncthreads();
+                if (thid < d) {
+                    int ai = offset * (2 * thid + 1) - 1;
+                    int bi = offset * (2 * thid + 2) - 1;
+                    temp[bi] += temp[ai];
+                }
+                offset *= 2;
+            }
+            if (thid == 0) { temp[n - 1] = 0; } // clear the last element
+
+            // traverse down the tree and build the scan
+            for (int d = 1; d < n; d *= 2) {
+                offset >>= 1;
+                __syncthreads();
+                if (thid < d) {
+                    int ai = offset * (2 * thid + 1) - 1;
+                    int bi = offset * (2 * thid + 2) - 1;
+                    float t = temp[ai];
+                    temp[ai] = temp[bi];
+                    temp[bi] += t;
+                }
+            }
+            __syncthreads();
+            // write results to device memory
+            g_odata[2 * thid] = temp[2 * thid];
+            g_odata[2 * thid + 1] = temp[2 * thid + 1];
+        }
+
+        // one up-sweep (reduce) pass over global memory: each of the d active
+        // threads adds a left partial sum into its right sibling, in place
+        __global__ void kernUpSweep(int pN, int* data, int offset, int d) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx >= d)
+                return;
+            int ai = offset * (2 * idx + 1) - 1;
+            int bi = offset * (2 * idx + 2) - 1;
+            data[bi] += data[ai];
+        }
+
+        // one down-sweep pass: swap each left partial sum with its parent and
+        // accumulate, turning the reduction tree into an exclusive scan
+        __global__ void kernDownSweep(int pN, int* data, int offset, int d) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx >= d)
+                return;
+            int ai = offset * (2 * idx + 1) - 1;
+            int bi = offset * (2 * idx + 2) - 1;
+            int t = data[ai];
+            data[ai] = data[bi];
+            data[bi] += t;
+        }
+
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
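+         * Work-efficient (Blelloch) scan: an up-sweep builds a reduction tree in
+         * place, then a down-sweep converts it into an exclusive scan, for O(n)
+         * total adds versus the naive version's O(n log n).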
         */
        void scan(int n, int *odata, const int *idata) {
+            int* dev_data;
+
+            int depth = ilog2ceil(n);
+            // remember: numbers are read from right to left
+            int pN = 1 << depth; // n rounded up to the next power of two = n after padding
+
+            // zero-pad the buffer so the balanced reduction tree is complete
+            cudaMalloc((void**)&dev_data, pN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_data failed!");
+            cudaMemset(dev_data, 0, pN * sizeof(int));
+            cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy idata to dev_data failed!");
+
+            // for most GPUs, 1024 is the maximum number of threads per block
+            int threadsPerBlock = 1024;
+            // unused dim3 dimensions must be 1, not 0
+            dim3 blockDim(threadsPerBlock, 1, 1);
+
            timer().startGpuTimer();
+            // up-sweep: d active threads per pass, halving every pass
+            int offset = 1;
+            for (int d = pN >> 1; d > 0; d >>= 1) {
+                dim3 gridDim((d + threadsPerBlock - 1) / threadsPerBlock, 1, 1);
+                kernUpSweep<<<gridDim, blockDim>>>(pN, dev_data, offset, d);
+                checkCUDAError("kernUpSweep failed!");
+                offset *= 2;
+            }
+            // clear the root, then down-sweep back toward the leaves
+            cudaMemset(dev_data + pN - 1, 0, sizeof(int));
+            for (int d = 1; d < pN; d *= 2) {
+                offset >>= 1;
+                dim3 gridDim((d + threadsPerBlock - 1) / threadsPerBlock, 1, 1);
+                kernDownSweep<<<gridDim, blockDim>>>(pN, dev_data, offset, d);
+                checkCUDAError("kernDownSweep failed!");
+            }
            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy dev_data to odata failed!");
+            cudaFree(dev_data);
        }

        /**
@@ -30,10 +139,54 @@ namespace StreamCompaction {
         * @param idata The array of elements to compact.
         * @returns The number of elements remaining after compaction.
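+         * Example: compacting [1 0 2 0 3] (zeroes removed) yields [1 2 3] and returns 3.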
         */
+        using namespace StreamCompaction::Common;
        int compact(int n, int *odata, const int *idata) {
+            int* dev_idata;
+            int* dev_odata;
+            int* dev_bools;
+            int* dev_indices;
+
+            int pN = 1 << ilog2ceil(n); // padded size for the scan tree
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy idata to dev_idata failed!");
+
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+
+            // the flags and indices are ints, matching kernMapToBoolean / kernScatter
+            cudaMalloc((void**)&dev_bools, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_bools failed!");
+            cudaMalloc((void**)&dev_indices, pN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_indices failed!");
+
+            int threadsPerBlock = 1024;
+            int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; // ceiling of n / threadsPerBlock
+            dim3 blockDim(threadsPerBlock, 1, 1); // unused dim3 dimensions must be 1, not 0
+            dim3 gridDim(blocksPerGrid, 1, 1);
+
            timer().startGpuTimer();
+            // step 1: map each element to a 0/1 keep flag
+            kernMapToBoolean<<<gridDim, blockDim>>>(n, dev_bools, dev_idata);
+
+            // step 2: exclusive scan of the flags (the same up/down sweeps as
+            // scan(), inlined here so the two GPU timers do not nest)
+            cudaMemset(dev_indices, 0, pN * sizeof(int));
+            cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            int offset = 1;
+            for (int d = pN >> 1; d > 0; d >>= 1) {
+                dim3 grid((d + threadsPerBlock - 1) / threadsPerBlock, 1, 1);
+                kernUpSweep<<<grid, blockDim>>>(pN, dev_indices, offset, d);
+                offset *= 2;
+            }
+            cudaMemset(dev_indices + pN - 1, 0, sizeof(int));
+            for (int d = 1; d < pN; d *= 2) {
+                offset >>= 1;
+                dim3 grid((d + threadsPerBlock - 1) / threadsPerBlock, 1, 1);
+                kernDownSweep<<<grid, blockDim>>>(pN, dev_indices, offset, d);
+            }
+
+            // step 3: scatter the kept elements to their scanned indices
+            kernScatter<<<gridDim, blockDim>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
            timer().endGpuTimer();
+
+            // because the scan is exclusive, count = last index + last flag
+            int lastIndex, lastBool;
+            cudaMemcpy(&lastIndex, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&lastBool, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_bools);
+            cudaFree(dev_indices);
+
+            return lastIndex + lastBool;
        }
    }
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..235d47b 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,6 +3,10 @@
 #include "common.h"
 #include "naive.h"

+#include <iostream>
+#include <vector>
+#include <cstdio>
+
 namespace StreamCompaction {
    namespace Naive {
        using StreamCompaction::Common::PerformanceTimer;
@@ -11,15 +15,169 @@ namespace StreamCompaction {
            static PerformanceTimer timer;
            return timer;
        }
+        // TODO: __global__
+        // Reference from GPU Gems 3, Ch. 39: this version can handle arrays only
+        // as large as can be processed by a single thread block running on one
+        // multiprocessor of a GPU.
+        //__global__ void scan(float* g_odata, float* g_idata, int n) {
+        //    extern __shared__ float temp[]; // allocated on invocation
+        //    int thid = threadIdx.x;
+        //    int pout = 0, pin = 1;
+        //    // Load input into shared memory.
+        //    // This is an exclusive scan, so shift right by one
+        //    // and set the first element to 0.
+        //    temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;
+        //    __syncthreads();
+        //    for (int offset = 1; offset < n; offset *= 2) {
+        //        pout = 1 - pout; // swap double buffer indices
+        //        pin = 1 - pout;
+        //        if (thid >= offset)
+        //            temp[pout*n+thid] += temp[pin*n+thid - offset];
+        //        else
+        //            temp[pout*n+thid] = temp[pin*n+thid];
+        //        __syncthreads();
+        //    }
+        //    g_odata[thid] = temp[pout*n+thid]; // write output
+        //}
+
+        __global__ void kernInitExScan(int n, int pN, int* idata, int* pong) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx >= pN)
+                return;
+
+            // zero the padding of the input buffer
+            if (idx >= n)
+                idata[idx] = 0;
+            // shift the array right by one for an exclusive scan; the padding
+            // write above is not guaranteed to be visible to every thread by the
+            // time the next line runs, so pong's padding is initialized from the
+            // bounds check here instead of from idata
+            pong[idx] = (idx > 0 && idx < n) ? idata[idx - 1] : 0;
+        }
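+
+        // Note: __syncthreads() only synchronizes threads within a single block;
+        // there is no grid-wide barrier inside a kernel. Each scan pass is
+        // therefore its own kernel launch, and the in-order execution of
+        // launches on a stream is what makes the ping-pong buffers safe to read.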
+        __global__ void kernExScan(int pN, int* ping, int* pong, int offset) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx >= pN)
+                return;
+
+            // read from pong, write to ping; the host swaps the two pointers
+            // after every pass
+            if (idx >= offset)
+                ping[idx] = pong[idx] + pong[idx - offset];
+            else
+                ping[idx] = pong[idx];
+        }
+
+        using namespace std;
+        // local debug copy of printArray from testing_helpers.hpp
+        void printArray(int n, const int* a, bool abridged = false) {
+            cout << "    [ ";
+            for (int i = 0; i < n; i++) {
+                if (abridged && i + 2 == 15 && n > 16) {
+                    i = n - 2;
+                    cout << "... ";
+                }
+                cout << a[i] << " ";
+            }
+            cout << "]\n";
+        }

        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            int* dev_idata;
+            int* dev_ping;
+            int* dev_pong;
+
+            int depth = ilog2ceil(n);
+            // remember: numbers are read from right to left
+            int pN = 1 << depth; // n rounded up to the next power of two = n after padding
+
+            // allocate dev_idata (padded size) and copy idata over
+            cudaMalloc((void**)&dev_idata, pN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy idata to dev_idata failed!");
+
+            // allocate the ping-pong buffers
+            cudaMalloc((void**)&dev_ping, pN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_ping failed!");
+            cudaMalloc((void**)&dev_pong, pN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_pong failed!");
+
+            // for most GPUs, 1024 is the maximum number of threads per block
+            int threadsPerBlock = 1024;
+            int blocksPerGrid = (pN + threadsPerBlock - 1) / threadsPerBlock; // ceiling of pN / threadsPerBlock
+            dim3 blockDim(threadsPerBlock);
+            dim3 gridDim(blocksPerGrid);
+
            timer().startGpuTimer();
-            // TODO
+            // initialize the shifted, padded buffers for the naive exclusive scan
+            kernInitExScan<<<gridDim, blockDim>>>(n, pN, dev_idata, dev_pong);
+            checkCUDAError("kernInitExScan failed!");
+
+            printArray(n, idata, false);
+
+            // naive exclusive scan, run in parallel; global-memory ping-pong
+            // buffers are used instead of shared memory so the data can be of
+            // arbitrary size
+            for (int offset = 1; offset < pN; offset *= 2) {
+                kernExScan<<<gridDim, blockDim>>>(pN, dev_ping, dev_pong, offset);
+                checkCUDAError("kernExScan failed!");
+
+                // debug read-back: this runs inside the timer and dominates the
+                // measured time; remove it before taking performance numbers
+                vector<int> temp_test(pN);
+                cudaMemcpy(temp_test.data(), dev_ping, sizeof(int) * pN, cudaMemcpyDeviceToHost);
+                checkCUDAError("cudaMemcpy dev_ping to temp_test failed!");
+                printArray(pN, temp_test.data(), false);
+
+                // swap the pointers rather than copying whole buffers:
+                /*cudaMemcpy(dev_temp, dev_ping, pN * sizeof(int), cudaMemcpyDeviceToDevice);
+                cudaMemcpy(dev_ping, dev_pong, pN * sizeof(int), cudaMemcpyDeviceToDevice);
+                cudaMemcpy(dev_pong, dev_temp, pN * sizeof(int), cudaMemcpyDeviceToDevice);*/
+                int* temp = dev_ping;
+                dev_ping = dev_pong;
+                dev_pong = temp;
+            }
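+            // after the final swap, the most recent pass's output lives in
+            // dev_pong, which is what gets copied back to the host below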
            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_pong, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy dev_pong to odata failed!");
+
+            cudaFree(dev_idata);
+            cudaFree(dev_ping);
+            cudaFree(dev_pong);
        }
    }
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..07fea05 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,31 @@ namespace StreamCompaction {
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            int* dev_idata;
+            int* dev_odata;
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy idata to dev_idata failed!");
+
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+
+            // wrap the raw device pointers so Thrust treats them as device iterators
+            thrust::device_ptr<int> thrust_dev_idata(dev_idata);
+            thrust::device_ptr<int> thrust_dev_odata(dev_odata);
+
            timer().startGpuTimer();
            // TODO use `thrust::exclusive_scan`
            // example: for device_vectors dv_in and dv_out:
            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(thrust_dev_idata, thrust_dev_idata + n, thrust_dev_odata);
            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
        }
    }
}