diff --git a/README.md b/README.md index 0e38ddb..c7ebfb8 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,165 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Jie Meng + * [LinkedIn](https://www.linkedin.com/in/jie-meng/), [twitter](https://twitter.com/JieMeng6). +* Tested on: Windows 10, i7-7700HQ @ 2.80GHz, 16GB, GTX 1050 4GB (My personal laptop) -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +## Background +### Scan +*Prefix Sum*, A.K.A. *Scan*, is a widely used basic algorithm that: +given an array, for each index, compute the sum of the array elements before it. +It is trivial to implement this on the CPU, but we want to parallelize it to run on the GPU. +On the GPU, every thread will only access a small part of the data. A naive parallel method is shown in the following image: + +![](img/prefixsum.png) + +Given this naive algorithm, a more efficient algorithm that uses two sweeps is shown as: + +Up-Sweep: + +![](img/upsweep.jpg) + +Down-Sweep: + +![](img/downsweep.jpg) + +This work-efficient algorithm reduces the complexity from *O(n log n)* to *O(n)*. + + +### Stream Compaction +It is another widely used basic algorithm that: +given an array and a condition, wipe out all the elements that fail the condition.
+On the CPU, it is also trivial to implement; on the GPU, we can also create a parallel version based on the previous scan algorithm: +* First map each element to a 1 or 0 based on whether it meets the condition; +* Then scan the mapped array using the former parallel scan algorithm +* Finally scatter the original array + +![](img/scatter.jpg) + + +## Project Description +In this project, I implemented: +* CPU scan +* Naive GPU scan +* Work-Efficient scan +and +* CPU stream compaction +* CPU stream compaction with scan +* Work-Efficient GPU stream compaction +Performance analysis on these algorithms is conducted. + +*Extra Credits* +* Optimization of Work-efficient GPU scan and stream compaction, and performance comparison against the non-optimized version. + +### Performance Analysis + +- #### SCAN Algorithm Running Time w.r.t. Array Size + ##### Under Release x64, all @128 block size, work-efficient scan is *optimized* + +![](img/scanperformances.png) + + ##### Analysis + 1. As array size increases, all algorithms take more time to finish, where: + * CPU Scan running time increases linearly + * Naive methods perform well; running time increases slowly and substantially slower than CPU methods when array size goes up exponentially + * Work-efficient methods have fluctuating performance, and I'm not sure why. But they still outperform CPU methods when the array goes large. + 2. As for power-of-two against NPT performance: + * In the work-efficient methods case, NPT usually takes longer to finish the scan. + * For other algorithms, NPT doesn't make noticeable differences. + 3. Thrust performs weirdly: + * Before 2^15 elements, thrust performs stably, and significantly faster than any other algorithms, but: + * At 2^15 elements, thrust suddenly takes a much longer time to finish than expected, and remains stable afterwards.
+ * From NSight Performance Analysis, not much information could be gained pertaining to the mechanism behind thrust: you can only notice one memory allocation & free, one Device => Host and one Host => Device. + But I guess thrust treats data at different stages, like before/after 2^15 elements. Performance within each stage is stable and optimized, but drops significantly across stages. + + 4. When the array size is at 2^20 (not plotted on the image), work-efficient methods are much better than naive methods: + + ![](img/220.png) + +- #### SCAN Algorithms Performance w.r.t. Block Size + #### Under Release x64, all @2048 elements, work-efficient scan is *optimized* + +![](img/scanblocksize.png) + + ##### Analysis + 1. For the Naive method, block size doesn't make any noticeable differences + 2. For Work-Efficient methods, they perform badly at specific block sizes. I'm not sure why; a reasonable guess is error. + +- #### Work-Efficient Method: optimized vs. non-optimized + ##### Under Release x64, all @128 block size + +![](img/optscan.png) + + ##### Analysis + As we can see from the data chart, after removal of the branch statement in the kernel, and only launching the threads that actually work, the performance +increases noticeably: I actually didn't expect this, since we are using just global memory here, and the constraint on performance should be memory access. + +- #### STREAM COMPACTION performance is similar to Scans: + #### @128 block size +![](img/sc2.png) + +- #### Console Output +``` +**************** +** SCAN TESTS ** +**************** + [ 0 8 40 26 45 29 26 49 4 49 38 5 14 ... 9 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.002188ms (std::chrono Measured) + [ 0 0 8 48 74 119 148 174 223 227 276 314 319 ... 6397 6406 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.002188ms (std::chrono Measured) + [ 0 0 8 48 74 119 148 174 223 227 276 314 319 ... 
6354 6355 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.022944ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.021824ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.0392ms (CUDA Measured) + [ 0 0 8 48 74 119 148 174 223 227 276 314 319 ... 6397 6406 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.039936ms (CUDA Measured) + [ 0 0 8 48 74 119 148 174 223 227 276 314 319 ... 6354 6355 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.009088ms (CUDA Measured) + [ 0 0 8 48 74 119 148 174 223 227 276 314 319 ... 6397 6406 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.009152ms (CUDA Measured) + [ 0 0 8 48 74 119 148 174 223 227 276 314 319 ... 6354 6355 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 1 2 0 1 1 2 1 3 1 2 0 3 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.004376ms (std::chrono Measured) + [ 3 1 2 1 1 2 1 3 1 2 3 3 3 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.004011ms (std::chrono Measured) + [ 3 1 2 1 1 2 1 3 1 2 3 3 3 ... 3 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.006564ms (std::chrono Measured) + [ 3 1 2 1 1 2 1 3 1 2 3 3 3 ... 1 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.047104ms (CUDA Measured) + [ 3 1 2 1 1 2 1 3 1 2 3 3 3 ... 1 1 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.047104ms (CUDA Measured) + [ 3 1 2 1 1 2 1 3 1 2 3 3 3 ... 3 1 ] + passed +Press any key to continue . . . 
+``` diff --git a/img/220.png b/img/220.png new file mode 100644 index 0000000..9a00293 Binary files /dev/null and b/img/220.png differ diff --git a/img/downsweep.jpg b/img/downsweep.jpg new file mode 100644 index 0000000..88be7ff Binary files /dev/null and b/img/downsweep.jpg differ diff --git a/img/optscan.png b/img/optscan.png new file mode 100644 index 0000000..e368065 Binary files /dev/null and b/img/optscan.png differ diff --git a/img/prefixsum.png b/img/prefixsum.png new file mode 100644 index 0000000..fa7f882 Binary files /dev/null and b/img/prefixsum.png differ diff --git a/img/sc2.png b/img/sc2.png new file mode 100644 index 0000000..580ffb5 Binary files /dev/null and b/img/sc2.png differ diff --git a/img/scanblocksize.png b/img/scanblocksize.png new file mode 100644 index 0000000..f408ca7 Binary files /dev/null and b/img/scanblocksize.png differ diff --git a/img/scanperformances.png b/img/scanperformances.png new file mode 100644 index 0000000..96ddf38 Binary files /dev/null and b/img/scanperformances.png differ diff --git a/img/scatter.jpg b/img/scatter.jpg new file mode 100644 index 0000000..d3438dc Binary files /dev/null and b/img/scatter.jpg differ diff --git a/img/upsweep.jpg b/img/upsweep.jpg new file mode 100644 index 0000000..22c4da0 Binary files /dev/null and b/img/upsweep.jpg differ diff --git a/src/main.cpp b/src/main.cpp index 1850161..84eaae2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 20; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -54,11 +54,11 @@ int main(int argc, char* argv[]) { //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - 
StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + ///* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + //onesArray(SIZE, c); + //printDesc("1s array for finding bugs"); + //StreamCompaction::Naive::scan(SIZE, c, a); + //printArray(SIZE, c, true); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); @@ -71,28 +71,28 @@ int main(int argc, char* argv[]) { printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -117,6 +117,7 @@ int main(int argc, char* argv[]) { expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); + zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); @@ -137,14 +138,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, 
power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..67fa521 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,6 @@ #include "common.h" +#include +#include void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ -24,6 +26,13 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= n) + return; + if (idata[index] == 0) + bools[index] = 0; + else + bools[index] = 1; } /** @@ -33,6 +42,12 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= n) + return; + if (bools[index]) + odata[indices[index]] = idata[index]; + } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..2b075c5 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,15 +1,15 @@ #include #include "cpu.h" -#include "common.h" +#include "common.h" namespace StreamCompaction { namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - 
PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** @@ -20,6 +20,11 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + odata[0] = 0; + for (int ind = 1; ind < n; ind++) + { + odata[ind] = odata[ind - 1] + idata[ind - 1]; + } timer().endCpuTimer(); } @@ -31,20 +36,63 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int nonZeros = 0; + for (int ind = 0; ind < n; ind++) + { + if (idata[ind] != 0) + { + odata[nonZeros] = idata[ind]; + nonZeros++; + } + } timer().endCpuTimer(); - return -1; + return nonZeros; } + /* + * CPU scatter function + */ + void cpuScatter(int n, int* odata, const int* idata, int* indicator, int* scanned) + { + for (int ind = 0; ind < n; ind++) + { + if (indicator[ind] != 0) + { + odata[scanned[ind]] = idata[ind]; + } + } + } /** * CPU stream compaction using scan and scatter, like the parallel version. * * @returns the number of elements remaining after compaction. 
*/ int compactWithScan(int n, int *odata, const int *idata) { + int* indicator= new int[n]; + int* scanned = new int[n]; timer().startCpuTimer(); // TODO + for (int ind = 0; ind < n; ind++) + { + if (idata[ind] == 0) + indicator[ind] = 0; + else + indicator[ind] = 1; + } + scanned[0] = 0; + for (int ind = 1; ind < n; ind++) + { + scanned[ind] = scanned[ind - 1] + indicator[ind - 1]; + } + cpuScatter(n, odata, idata, indicator, scanned); timer().endCpuTimer(); - return -1; + + int num = scanned[n - 1]; + if (indicator[n - 1] != 0) + num++; + delete[] indicator; + delete[] scanned; + return num; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..96e1663 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,23 +2,151 @@ #include #include "common.h" #include "efficient.h" +#include +#include + +#define blockSize 128 +//toggle between opimization threads and not (for EC1 or part 5) +#define OPT 1 namespace StreamCompaction { namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } + __global__ void kernDataCopy(int n, int* odata, const int* idata) + { + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= n) + return; + odata[index] = idata[index]; + } + + __global__ void kernDataSet(int n, int* idata, int value) + { + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= n) + return; + idata[index] = value; + } + __global__ void kernRootSet(int n, int* idata) + { + idata[n-1] = 0; + } + __global__ void kernEfficientUpsweep(int n, int* idata, int d) + { + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= n) + { + return; + } +#if OPT + //With optimization : only launch active threads + //Basically hack on the indices 
+ int targetInd = (index + 1) * (1 << (d + 1)) - 1; //for example: index = 0,1,2,3 targetInd = 1,3,5,7 (d = 0) + idata[targetInd] += idata[targetInd - (1 << d)]; + //With optimization - end +#else + //No optimization + if ((index + 1) % (1 << (d + 1)) == 0) + { + idata[index] = idata[index] + idata[index - (1 << d)]; + } + //No optimization - end +#endif + + + } + __global__ void kernEfficientDownsweep(int n, int* idata, int d) + { + int index = threadIdx.x + (blockDim.x * blockIdx.x); + if (index >= n) + { + return; + } +#if OPT + //With optimization : only launch active threads + int targetInd = (index + 1) * (1 << (d + 1)) - 1; + int t = idata[targetInd - (1 << d)]; + idata[targetInd - (1 << d)] = idata[targetInd]; + idata[targetInd] += t; + //With optimization - end +#else + //No optimization + if ((index + 1) % (1 << (d + 1)) == 0) + { + int t = idata[index - (1 << d)]; + idata[index - (1 << d)] = idata[index]; + idata[index] += t; + } + //No optimization - end +#endif + + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + //allocate and copy memories on CUDA device + //need to round n to the next power of 2 + int deg = ilog2ceil(n); + int nround2 = 1 << deg; + int* dev_idata; + cudaMalloc((void**)&dev_idata, sizeof(int) * nround2); + checkCUDAError("cudamalloc failed!"); + cudaDeviceSynchronize(); + + //set the extra memory to zeros + dim3 fullBlocksPerGrid = (nround2 + blockSize - 1) / blockSize; + kernDataSet << >> (nround2, dev_idata, 0); + checkCUDAError("kerndataset failed!"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudamemcpy failed!"); timer().startGpuTimer(); - // TODO + // TODO +#if OPT + //With optimization + for (int d = 0; d <= deg - 1; d++) + { + int threadCount = 1 << (deg - 1 - d); + kernEfficientUpsweep << > >(threadCount, dev_idata, d); + } + checkCUDAError("kernEffcientUpSweep failed!"); + kernRootSet << > > (nround2, dev_idata); + checkCUDAError("kernRootSet failed!"); + for (int d = deg - 1; d >= 0; d--) + { + int threadCount = 1 << (deg - 1 - d); + kernEfficientDownsweep << > >(threadCount, dev_idata, d); + } + checkCUDAError("kernEffcientDownSweep failed!"); + //With optimization - end +#else + //No optimization: always launching same amount of threads + for (int d = 0; d <= deg - 1; d++) + { + kernEfficientUpsweep << > >(nround2, dev_idata, d); + + } + checkCUDAError("kernEffcientUpSweep failed!"); + kernRootSet << >> (nround2, dev_idata); + checkCUDAError("kernRootSet failed!"); + for (int d = deg - 1; d >= 0; d--) + { + + kernEfficientDownsweep << > >(nround2, dev_idata, d); + } + checkCUDAError("kernEffcientDownSweep failed!"); + //No optimization - end +#endif timer().endGpuTimer(); + cudaMemcpy(odata, dev_idata, sizeof(int)*n, cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + } /** @@ -31,10 +159,90 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { + int nround2 = 1 << (ilog2ceil(n)); + int deg = ilog2ceil(n); + dim3 fullBlocksPerGrid = (nround2 + blockSize - 1) / blockSize; + dim3 originGrid = (n + blockSize - 1) / blockSize; + int* dev_odata, *dev_idata, *dev_bools, *dev_scanned; + cudaMalloc((void**)&dev_idata, sizeof(int) * n); + cudaMalloc((void**)&dev_bools, sizeof(int) * n); + cudaMalloc((void**)&dev_scanned, sizeof(int) * nround2); + cudaMalloc((void**)&dev_odata, sizeof(int) * n); + cudaDeviceSynchronize(); + //set the extra memory to zeros + + kernDataSet << > > (nround2, dev_scanned, 0); + checkCUDAError("kerndataset failed!"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + + //dim3 fullBlocksPerGrid2 = (n + blockSize - 1) / blockSize; + int* dev_count; + cudaMalloc((void**)&dev_count, sizeof(int)); + int count; + cudaDeviceSynchronize(); + timer().startGpuTimer(); // TODO + //step1: map to bools + Common::kernMapToBoolean << > >(n, dev_bools, dev_idata); + + //kernDataCopy << > >(nround2, dev_scanned, dev_bools); + cudaMemcpy(dev_scanned, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + //step2: scan +#if OPT + //With optimization + for (int d = 0; d <= deg - 1; d++) + { + int threadCount = 1 << (deg - 1 - d); + kernEfficientUpsweep << > >(threadCount, dev_scanned, d); + } + checkCUDAError("kernEffcientUpSweep failed!"); + kernRootSet << > > (nround2, dev_scanned); + checkCUDAError("kernRootSet failed!"); + for (int d = deg - 1; d >= 0; d--) + { + int threadCount = 1 << (deg - 1 - d); + kernEfficientDownsweep << > >(threadCount, dev_scanned, d); + } + checkCUDAError("kernEffcientDownSweep failed!"); + //With optimization - end +#else + //No optimization: always launching same amount of threads + for (int d = 0; d <= deg - 1; d++) + { + kernEfficientUpsweep << > >(nround2, dev_scanned, d); + + } + checkCUDAError("kernEffcientUpSweep failed!"); + kernRootSet << > > (nround2, dev_scanned); + 
checkCUDAError("kernRootSet failed!"); + for (int d = deg - 1; d >= 0; d--) + { + + kernEfficientDownsweep << > >(nround2, dev_scanned, d); + } + checkCUDAError("kernEffcientDownSweep failed!"); + //No optimization - end +#endif + + //step3: scatter + + Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_bools, dev_scanned); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, sizeof(int)*n, cudaMemcpyDeviceToHost); + + cudaMemcpy(&count, dev_scanned + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + + if (idata[n - 1] != 0) + count++; + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_scanned); + return count; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..f87e6d3 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,24 +2,86 @@ #include #include "common.h" #include "naive.h" +#include +#include +#define blockSize 128 namespace StreamCompaction { namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } // TODO: __global__ + __global__ void kernCopyValues(int n, const int* idata, int* temp, bool inclusive) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + { + return; + } + if (inclusive) + { + temp[index] = idata[index]; + } + else + { + if (index == 0) + temp[index] = 0; + else + temp[index] = idata[index - 1]; + } + } + __global__ void kernScan(int n, int* odata, int* temp, int d) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + { + return; + } + if (index >= 1<<(d-1)) + { + odata[index] = temp[index - (1 << (d - 1))] + temp[index]; + } + else + { + odata[index] = temp[index]; + } + __syncthreads(); + temp[index] = odata[index]; + __syncthreads(); + } /** * 
Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* dev_idata, *dev_odata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, odata, n * sizeof(int), cudaMemcpyHostToDevice); + + int *temp; + cudaMalloc((void**)&temp, n * sizeof(int)); + dim3 fullBlocksPerGrid = (n + blockSize - 1) / blockSize; + kernCopyValues << > >(n, dev_idata, temp,false); + cudaThreadSynchronize(); timer().startGpuTimer(); // TODO + for (int d = 1; d <= ilog2ceil(n); d++) + { + kernScan << > >(n, dev_odata, temp,d); + } timer().endGpuTimer(); + kernCopyValues << > >(n, temp, dev_odata,true); + cudaThreadSynchronize(); + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(temp); + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..88d6940 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -8,21 +8,30 @@ namespace StreamCompaction { namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + //create thrust vectors: + thrust::host_vector host_idata(idata , idata + n ); + thrust::device_vector dev_idata = host_idata; + thrust::device_vector dev_odata(n,0); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); + timer().endGpuTimer(); + int* out_data = thrust::raw_pointer_cast(dev_odata.data()); + cudaMemcpy(odata, out_data, n * sizeof(int), cudaMemcpyDeviceToHost); } } }