diff --git a/README.md b/README.md index 0e38ddb..7047571 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,87 @@ CUDA Stream Compaction ====================== **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +* Haorong Yang +* [LinkedIn](https://www.linkedin.com/in/haorong-henry-yang/) +* Tested on: Windows 10 Home, i7-10750H @ 2.60GHz 16GB, GTX 2070 Super Max-Q (Personal) -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +The goal of this project was to implement a stream compaction algorithm on the GPU in CUDA from scratch. +The algorithm will remove `0`s from an array of `int`s utilizing a scan function, which performs parallel reduction on the array to obtain an exclusive prefix sum. -### (TODO: Your README) +Although the goal is to obtain an efficient parallel solution, for comparison, a few variations of the algorithm were also implemented. +A list of algorithms that will be compared to each other: +* CPU scan function +* CPU stream compaction without scan +* CPU stream compaction with scan +* GPU naive scan +* GPU work-efficient scan & compaction +* thrust library's implementation -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +The test results for an array size of 2^8 are: +``` +**************** +** SCAN TESTS ** +**************** + [ 17 20 19 34 19 3 6 3 27 2 14 5 21 ... 36 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0005ms (std::chrono Measured) + [ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 6163 6199 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0004ms (std::chrono Measured) + [ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 6092 6101 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.029056ms (CUDA Measured) + [ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 
6163 6199 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.026752ms (CUDA Measured) + [ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.012768ms (CUDA Measured) + [ 6199 6216 6236 6255 6289 6308 6311 6317 6320 6347 6349 6363 6368 ... 3050 3086 ] + a[0] = 0, b[0] = 6199 + FAIL VALUE +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.012512ms (CUDA Measured) + [ 6138 6155 6175 6194 6228 6247 6250 6256 6259 6286 6288 6302 6307 ... 2979 2988 ] + a[0] = 0, b[0] = 6138 + FAIL VALUE +==== thrust scan, power-of-two ==== + elapsed time: 0.055264ms (CUDA Measured) + [ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 6163 6199 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.054368ms (CUDA Measured) + [ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 6092 6101 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 0 1 2 1 1 0 1 1 0 2 1 3 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0008ms (std::chrono Measured) + [ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 3 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0008ms (std::chrono Measured) + [ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 1 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.004ms (std::chrono Measured) + [ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 3 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.020992ms (CUDA Measured) + [ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 0 0 ] +expected count is 185, count is 185 + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.021888ms (CUDA Measured) + [ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 
1 3 ] +expected count is 185, count is 183 + passed +``` \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..f649d1b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,8 @@ #include "testing_helpers.hpp" const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two +//const int SIZE = 8; +const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; @@ -27,7 +28,8 @@ int main(int argc, char* argv[]) { printf("** SCAN TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case // 3rd argument is maxValue + //testArray(SIZE - 1, a, 50); // test a[SIZE - 1] = 0; printArray(SIZE, a, true); @@ -51,7 +53,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -64,35 +66,35 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient 
scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -137,14 +139,16 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(SIZE, c, true); + std::cout << "expected count is " << expectedCount << ", count is " << count << std::endl; printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); + std::cout << "expected count is " << expectedCount << ", count is " << count << std::endl; printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 025e94a..fe01387 100644 --- 
a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -57,6 +57,15 @@ void genArray(int n, int *a, int maxval) { } } +void testArray(int n, int* a, int maxval) { + srand(time(nullptr)); + int q = 0; + for (int i = 0; i < n; i++) { + a[i] = q; + q++; + } +} + void printArray(int n, int *a, bool abridged = false) { printf(" [ "); for (int i = 0; i < n; i++) { diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..0e30336 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -18,8 +18,12 @@ namespace StreamCompaction { * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { + // we are assuming that the arrays have at least one element timer().startCpuTimer(); - // TODO + odata[0] = 0; + for (int k = 1; k < n; k++) { + odata[k] = odata[k - 1] + idata[k - 1]; + } timer().endCpuTimer(); } @@ -30,9 +34,15 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int optr = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[optr] = idata[i]; + optr++; + } + } timer().endCpuTimer(); - return -1; + return optr; } /** @@ -42,9 +52,34 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int* zeroOnes = new int[n]; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + zeroOnes[i] = 1; + } + else { + zeroOnes[i] = 0; + } + } + int* scanResult = new int[n]; + + // scan + scanResult[0] = 0; + for (int k = 1; k < n; k++) { + scanResult[k] = scanResult[k - 1] + zeroOnes[k - 1]; + } + // end of scan + + for (int i = 0; i < n; i++) { + if (zeroOnes[i] == 1) { + odata[scanResult[i]] = idata[i]; + } + } + int count = scanResult[n - 1]; + delete[] zeroOnes; + delete[] scanResult; timer().endCpuTimer(); - return -1; + return count; } } } diff --git 
a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..624d446 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,8 @@ #include "common.h" #include "efficient.h" +#define blockSize 128 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,15 +14,227 @@ namespace StreamCompaction { return timer; } + int* dev_scan_odata; + int* dev_dummy; + int* dev_zeroOnes; + + __global__ void kernUpSweep(int n, int powDplus1, int powD, int* out) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n || (index % powDplus1 != 0)) { + return; + } + out[index + powDplus1 - 1] += out[index + powD - 1]; + } + + __global__ void kernDownSweep(int n, int powDplus1, int powD, int* out) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n || (index % powDplus1 != 0)) { + return; + } + int t = out[index + powD - 1]; + out[index + powD - 1] = out[index + powDplus1 - 1]; + out[index + powDplus1 - 1] += t; + } + + + + __global__ void kernSweep(int n, int lgn, int* out) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + for (int d = 0; d <= lgn - 1; d++) { + int powD = 1; + for (int i = 0; i < d; i++) { + powD *= 2; + } + int powDplus1 = powD * 2; + if (index % powDplus1 == 0) { + out[index + powDplus1 - 1] += out[index + powD - 1]; + } + __syncthreads(); + } + + if (index == n - 1) { + out[index] = 0; + } + __syncthreads(); + + for (int d = lgn - 1; d >= 0; d--) { + int powD = 1; + for (int i = 0; i < d; i++) { + powD *= 2; + } + int powDplus1 = powD * 2; + if (index % powDplus1 == 0) { + int t = out[index + powD - 1]; + out[index + powD - 1] = out[index + powDplus1 - 1]; + out[index + powDplus1 - 1] += t; + } + __syncthreads(); + } + } + + + + __global__ void kernSetZero(int n, int* out) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + out[index] = 
0; + } + + __global__ void kernCopy(int n, int* out, int* in) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + out[index] = in[index]; + } + + + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) { // need to pad zeros if n is not pow of 2 + int lgn = ilog2ceil(n); + int pow2Ceil = 1; + for (int i = 0; i < lgn; i++) { + pow2Ceil *= 2; + } + // set dimension + int fullBlocksPerGrid = (pow2Ceil + blockSize - 1) / blockSize; // test 1d grid + + // cudaMalloc + cudaMalloc((void**)&dev_scan_odata, pow2Ceil * sizeof(int)); + cudaMalloc((void**)&dev_dummy, n * sizeof(int)); + checkCUDAError("cudaMalloc failed!"); + // cudaMemCpy + cudaMemcpy(dev_dummy, idata, sizeof(int) * n, cudaMemcpyHostToDevice); // copy idata + + // set zeros + kernSetZero <<<fullBlocksPerGrid, blockSize>>> (pow2Ceil, dev_scan_odata); + checkCUDAError("kernSetZero failed!"); + // copy dummy over to array padded with zeros + kernCopy <<<fullBlocksPerGrid, blockSize>>> (n, dev_scan_odata, dev_dummy); + checkCUDAError("kernCopy failed!"); + + timer().startGpuTimer(); - // TODO + kernSweep <<<fullBlocksPerGrid, blockSize>>> (pow2Ceil, lgn, dev_scan_odata); + checkCUDAError("kernSweep failed!"); timer().endGpuTimer(); + + + kernCopy <<<fullBlocksPerGrid, blockSize>>> (n, dev_dummy, dev_scan_odata); + checkCUDAError("kernCopy failed!"); + + // cudaMemcpy back + cudaMemcpy(odata, dev_dummy, sizeof(int) * n, cudaMemcpyDeviceToHost); + //cudaFree + cudaFree(dev_scan_odata); + cudaFree(dev_dummy); + } + + + + __global__ void kernZeroOnes(int n, int* out, int* in) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + if (in[index] != 0) { + out[index] = 1; + } + else { + out[index] = 0; + } } + __global__ void kernSweepCompact(int n, int lgn, int* out, int* dummy, int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + bool isOne = false; + if (out[index] == 1) 
{ + isOne = true; + } + for (int d = 0; d <= lgn - 1; d++) { + int powD = 1; + for (int i = 0; i < d; i++) { + powD *= 2; + } + int powDplus1 = powD * 2; + if (index % powDplus1 == 0) { + out[index + powDplus1 - 1] += out[index + powD - 1]; + } + __syncthreads(); + } + + /*if (index == n - 1) { + out[index] = 0; + } + __syncthreads();*/ + + /* for (int d = lgn - 1; d >= 0; d--) { + int powD = 1; + for (int i = 0; i < d; i++) { + powD *= 2; + } + int powDplus1 = powD * 2; + if (index % powDplus1 == 0) { + int t = out[index + powD - 1]; + out[index + powD - 1] = out[index + powDplus1 - 1]; + out[index + powDplus1 - 1] += t; + } + __syncthreads(); + }*/ + + //if (isOne) { + // dummy[out[index]] = idata[index]; + //} + + } + + __global__ void kernSetLastZero(int n, int* out) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index == n - 1) { + out[index] = 0; + } + } + + + __global__ void kernSweepDown(int n, int lgn, int* out, int* dummy, int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + + for (int d = lgn - 1; d >= 0; d--) { + int powD = 1; + for (int i = 0; i < d; i++) { + powD *= 2; + } + int powDplus1 = powD * 2; + if (index % powDplus1 == 0) { + int t = out[index + powD - 1]; + out[index + powD - 1] = out[index + powDplus1 - 1]; + out[index + powDplus1 - 1] += t; + } + __syncthreads(); + } + + if (out[index] != out[index + 1]) { + dummy[out[index]] = idata[index]; + } + + } + + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -31,10 +245,79 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { + int lgn = ilog2ceil(n); + int pow2Ceil = 1; + for (int i = 0; i < lgn; i++) { + pow2Ceil *= 2; + } + //int pow2Ceil = n; // test + + // set dimension + int fullBlocksPerGrid = (pow2Ceil + blockSize - 1) / blockSize; // test 1d grid + + // cudaMalloc + cudaMalloc((void**)&dev_scan_odata, pow2Ceil * sizeof(int)); + cudaMalloc((void**)&dev_zeroOnes, pow2Ceil * sizeof(int)); + cudaMalloc((void**)&dev_dummy, n * sizeof(int)); + checkCUDAError("cudaMalloc failed!"); + // cudaMemCpy + cudaMemcpy(dev_dummy, idata, sizeof(int) * n, cudaMemcpyHostToDevice); // copy idata + + // set zeros + kernSetZero <<<fullBlocksPerGrid, blockSize>>> (pow2Ceil, dev_scan_odata); + checkCUDAError("kernSetZero failed!"); + // copy dummy over to array padded with zeros + kernCopy <<<fullBlocksPerGrid, blockSize>>> (n, dev_scan_odata, dev_dummy); + checkCUDAError("kernCopy failed!"); + + // set dummy to zero for later + kernSetZero <<<fullBlocksPerGrid, blockSize>>> (n, dev_dummy); + checkCUDAError("kernSetZero failed!"); + + + // -------------------------------------------------------------------------------------- timer().startGpuTimer(); - // TODO + // create ZeroOne array + kernZeroOnes <<<fullBlocksPerGrid, blockSize>>> (pow2Ceil, dev_zeroOnes, dev_scan_odata); + checkCUDAError("kernZeroOnes failed!"); + + kernSweepCompact <<<fullBlocksPerGrid, blockSize>>> (pow2Ceil, lgn, dev_zeroOnes, dev_dummy, dev_scan_odata); + checkCUDAError("kernSweepCompact failed!"); + + // set zeros + kernSetLastZero <<<fullBlocksPerGrid, blockSize>>> (pow2Ceil, dev_zeroOnes); + checkCUDAError("kernSetZero failed!"); + + kernSweepDown <<<fullBlocksPerGrid, blockSize>>> (pow2Ceil, lgn, dev_zeroOnes, dev_dummy, dev_scan_odata); + checkCUDAError("kernSweepCompact failed!"); + timer().endGpuTimer(); - return -1; + // -------------------------------------------------------------------------------------- + + + /*kernCopy <<<fullBlocksPerGrid, blockSize>>> (n, dev_dummy, dev_zeroOnes); + checkCUDAError("kernCopy failed!");*/ + + int* counter = new int[pow2Ceil]; + cudaMemcpy(counter, dev_zeroOnes, sizeof(int) * pow2Ceil, cudaMemcpyDeviceToHost); // should change for non 2 pows + int 
count = 0; + for (int i = 0; i < pow2Ceil; i++) { + if (counter[i] == 1) { + count++; + } + } + count = counter[pow2Ceil - 1]; + + // cudaMemcpy back + //cudaMemcpy(odata, dev_dummy, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_dummy, sizeof(int) * n, cudaMemcpyDeviceToHost); + //cudaFree + cudaFree(dev_scan_odata); + cudaFree(dev_dummy); + cudaFree(dev_zeroOnes); + delete(counter); + + return count; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..40100b0 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" +#define blockSize 128 + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -11,15 +13,86 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + int* dev_idata; + int* dev_odata; + + __global__ void kernUpdateK(int n, int offset, int* out, int* in) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + if (index < offset) { + out[index] = in[index]; + } + else { + out[index] = in[index - offset] + in[index]; + } + } + + __global__ void kernShiftRight(int n, int* out, int* in) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + if (index == 0) { + out[index] = 0; + } + else { + out[index] = in[index - 1]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + // cudaMalloc + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + // cudaMemCpy + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + // set dimension + int fullBlocksPerGrid = (n + blockSize - 1) / blockSize; // test 1d grid + int lgn = ilog2ceil(n); + int* temp; + for (int d = 1; d <= lgn; d++) { + // calculate offset from d + int offset = 1; + if (d > 1) { + for (int i = 0; i < d - 1; i++) { + offset *= 2; + } + } + + kernUpdateK <<<fullBlocksPerGrid, blockSize>>> (n, offset, dev_odata, dev_idata); + checkCUDAError("kernUpdateK failed!"); + // now o = curr, i = old. + + // ping-pong + temp = dev_odata; + dev_odata = dev_idata; + dev_idata = temp; + // now, o = old, i = curr + } + + // shift right to make exclusive + kernShiftRight <<<fullBlocksPerGrid, blockSize>>> (n, dev_odata, dev_idata); + checkCUDAError("kernShiftRight failed!"); + // now o = curr, i = old timer().endGpuTimer(); + + // cudaMemcpy back + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + //cudaFree + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..8ddf089 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -17,12 +17,29 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ + + void scan(int n, int *odata, const int *idata) { + thrust::host_vector<int> h_out(n); + thrust::host_vector<int> h_in(n); + for (int i = 0; i < n; i++) { + h_in[i] = idata[i]; + h_out[i] = idata[i]; + } + + + thrust::device_vector<int> dev_out = h_out; + thrust::device_vector<int> dev_in = h_in; + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin()); timer().endGpuTimer(); + + thrust::copy(dev_out.begin(), dev_out.end(), h_out.begin()); + for (int i = 0; i < n; i++) { + odata[i] = h_out[i]; + } + } } }