diff --git a/README.md b/README.md index 0e38ddb..ccff2d0 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,145 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Qiaosen Chen + * [LinkedIn](https://www.linkedin.com/in/qiaosen-chen-725699141/), etc. +* Tested on: Windows 10, i5-9400 @ 2.90GHz 16GB, GeForce RTX 2060 6GB (personal computer). -### (TODO: Your README) +## Implemented Features -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +This project implements several scan and stream compaction algorithms, most of them in parallel with CUDA. The serial CPU versions serve as a baseline, which makes it easier to judge the performance of the parallel GPU implementations. + +- CPU Scan & Stream Compaction + +- Naive GPU Scan Algorithm + +- Work-Efficient GPU Scan & Stream Compaction + +- Thrust Scan + +- Radix Sort (Extra Credit) + + Please see the details in the last part of the report. + +## Performance Analysis + +### Performances under Different Block Size + +![Different Block Size](https://github.com/giaosame/Project2-Stream-Compaction/blob/master/img/different_blocksize_perf.png) + +In general, the parallel scan and compaction algorithms reach roughly their best performance at ```blockSize = 128```. + +### Scan Algorithms Performances + +![Scan Algorithms Performances](https://github.com/giaosame/Project2-Stream-Compaction/blob/master/img/different_arraysize_scan_perf.png) + +This comparison was measured with ```blockSize = 256```, and every algorithm was given a power-of-two array size as input. + +When the input array is small (```SIZE < 2^15```), the differences between the scan algorithms are small, and the serial CPU scan actually performs better than the parallel GPU scans. + +When the input array is large enough (```SIZE > 2^17```), the gaps keep widening, and ```Thrust::Scan``` clearly performs best. As expected, ```CPU::Scan``` performs much worse than ```Thrust::Scan```; it is even slower than the ```Naive::Scan``` algorithm. Surprisingly, though, the naive scan consistently runs faster than the work-efficient scan, which suggests that this "efficient" implementation can still be optimized (a possible optimization is sketched below). + +### Compact Algorithm Performances + +![Compact Algorithm Performances](https://github.com/giaosame/Project2-Stream-Compaction/blob/master/img/different_arraysize_compact_perf.png) + +When the input array is large enough (```SIZE > 2^17```), the gap becomes more and more obvious, and, as expected, both compaction algorithms slow down as the input size increases. The serial CPU compaction performs better than the parallel GPU version when the input is small; however, for a large input such as ```SIZE = 2^20```, the work-efficient compaction run in parallel on the GPU clearly outperforms the CPU version.
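+### Possible Optimization of the Work-Efficient Scan
+
+One likely reason the work-efficient scan loses to the naive scan here is that ```kernUpSweep``` and ```kernDownSweep``` launch one thread per (padded) array element at every depth, and each thread tests ```index % 2^(d+1) == 0```; at the deeper levels almost every thread returns immediately, and the modulo itself is not cheap. A common improvement is to launch only as many threads as there are active nodes at each depth and derive the node index by multiplication. The code below is only a minimal sketch of that idea for the up-sweep, not what is currently in ```efficient.cu```; ```kernUpSweepCompacted``` and ```numActiveNodes``` are hypothetical names, and the sketch assumes the same ```blockSize``` macro and padded power-of-two buffer used there.
+
+```c++
+// Sketch: one thread per *active* node at depth d, so no thread is wasted on a
+// failing modulo test. The index arithmetic mirrors kernUpSweep in efficient.cu.
+__global__ void kernUpSweepCompacted(int numActiveNodes, int* data, int d)
+{
+    int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (tid >= numActiveNodes)
+    {
+        return;
+    }
+
+    int stride = 1 << (d + 1);     // 2^(d+1)
+    int nodeIdx = tid * stride;    // exactly the indices kernUpSweep accepts via index % 2^(d+1) == 0
+    data[nodeIdx + stride - 1] += data[nodeIdx + (stride >> 1) - 1];
+}
+
+// Host side (sketch): the grid shrinks by half at every level instead of staying at `size` threads.
+// for (int d = 0; d < depth; d++)
+// {
+//     int numActiveNodes = size >> (d + 1);
+//     dim3 blocksPerGrid((numActiveNodes + blockSize - 1) / blockSize);
+//     kernUpSweepCompacted<<<blocksPerGrid, threadsPerBlock>>>(numActiveNodes, dev_tempData, d);
+// }
+```
+
+The down-sweep can be reorganized the same way, and a further step would be to scan each block in shared memory, which is closer to what Thrust does.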
+ +### Output + +These output tests were run with an array of ```SIZE = 1024``` and ```blockSize = 128```: + +```bash +**************** +** SCAN TESTS ** +**************** + [ 46 46 37 6 29 0 28 22 25 23 3 11 29 ... 20 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0013ms (std::chrono Measured) + [ 0 46 92 129 135 164 164 192 214 239 262 265 276 ... 24775 24795 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0007ms (std::chrono Measured) + [ 0 46 92 129 135 164 164 192 214 239 262 265 276 ... 24725 24751 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.02192ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.021952ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.046304ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.045856ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.038912ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.038176ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 0 1 2 3 2 0 0 1 1 3 1 1 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0026ms (std::chrono Measured) + [ 2 1 2 3 2 1 1 3 1 1 1 2 1 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0024ms (std::chrono Measured) + [ 2 1 2 3 2 1 1 3 1 1 1 2 1 ... 1 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.0054ms (std::chrono Measured) + [ 2 1 2 3 2 1 1 3 1 1 1 2 1 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.062976ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.052992ms (CUDA Measured) + passed +``` + +## Extra Credit + +- **Radix sort** + + I declared the functions used by radix sort in [radix.h](https://github.com/giaosame/Project2-Stream-Compaction/blob/master/stream_compaction/radix.h) and implemented them in [radix.cu](https://github.com/giaosame/Project2-Stream-Compaction/blob/master/stream_compaction/radix.cu) under the directory [/stream_compaction](https://github.com/giaosame/Project2-Stream-Compaction/tree/master/stream_compaction). In [main.cpp](https://github.com/giaosame/Project2-Stream-Compaction/blob/master/src/main.cpp), ```StreamCompaction::Radix::sort``` is called at the end of the ```main``` function.
+ + ```c++ + zeroArray(SIZE, c); + printDesc("radix sort, power-of-two"); + StreamCompaction::CPU::sort(SIZE, b, a); + StreamCompaction::Radix::sort(SIZE, c, a); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("radix sort, non-power-of-two"); + StreamCompaction::CPU::sort(NPOT, b, a); + StreamCompaction::Radix::sort(NPOT, c, a); + printCmpResult(NPOT, b, c); + ``` + + Examples of output of Radix Sort: + + ```bash + ********************** + ** RADIX SORT TESTS ** + ********************** + [ 10 31 19 93 79 96 60 46 46 85 44 56 52 53 85 39 ] + ==== radix sort, power-of-two ==== + [ 10 19 31 39 44 46 46 52 53 56 60 79 85 85 93 96 ] + passed + ==== radix sort, non-power-of-two ==== + [ 10 19 31 44 46 46 52 56 60 79 85 93 96 ] + passed + ``` + + + + diff --git a/img/different_arraysize_compact_perf.png b/img/different_arraysize_compact_perf.png new file mode 100644 index 0000000..98c7f6d Binary files /dev/null and b/img/different_arraysize_compact_perf.png differ diff --git a/img/different_arraysize_scan_perf.png b/img/different_arraysize_scan_perf.png new file mode 100644 index 0000000..ee54f0a Binary files /dev/null and b/img/different_arraysize_scan_perf.png differ diff --git a/img/different_blocksize_perf.png b/img/different_blocksize_perf.png new file mode 100644 index 0000000..332234a Binary files /dev/null and b/img/different_blocksize_perf.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..33c28c4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,10 +10,11 @@ #include #include #include +#include #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 5; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -137,16 +138,36 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + printf("\n"); + printf("**********************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("**********************\n"); + + genArray(SIZE, a, 100); // Leave a 0 at the end to test that edge case + printArray(SIZE, a, true); + + zeroArray(SIZE, c); + printDesc("radix sort, power-of-two"); + StreamCompaction::CPU::sort(SIZE, b, a); + StreamCompaction::Radix::sort(SIZE, c, a); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("radix sort, non-power-of-two"); + StreamCompaction::CPU::sort(NPOT, b, a); + StreamCompaction::Radix::sort(NPOT, c, a); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 567795b..eb71175 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -4,6 +4,7 @@ set(headers "naive.h" "efficient.h" "thrust.h" + "radix.h" ) set(sources @@ -12,6 
+13,7 @@ set(sources "naive.cu" "efficient.cu" "thrust.cu" + "radix.cu" ) list(SORT headers) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..6483339 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -14,26 +14,48 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) { exit(EXIT_FAILURE); } - namespace StreamCompaction { namespace Common { + // Initialize array in gpu + __global__ void kernInitializeArray(int n, int* a, int value) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) + { + a[index] = value; + } + } + /** * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + { + return; + } + + bools[index] = idata[index] != 0 ? 1 : 0; } /** * Performs scatter on an array. That is, for each element in idata, - * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. + * if bools[index] == 1, it copies idata[index] to odata[indices[index]]. */ - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO - } + __global__ void kernScatter(int n, int* odata, const int* idata, const int* bools, const int* indices) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + { + return; + } + if (bools[index]) + { + odata[indices[index]] = idata[index]; + } + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..734bdb9 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -2,7 +2,6 @@ #include #include - #include #include #include @@ -13,6 +12,9 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +// Block size used for CUDA kernel launch +#define blockSize 256 + /** * Check for CUDA errors; print and exit if there was a problem. */ @@ -32,6 +34,8 @@ inline int ilog2ceil(int x) { namespace StreamCompaction { namespace Common { + __global__ void kernInitializeArray(int n, int* a, int value); + __global__ void kernMapToBoolean(int n, int *bools, const int *idata); __global__ void kernScatter(int n, int *odata, diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..d6ba6c9 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,14 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int prefixSum = 0; + for (int i = 0; i < n; i++) + { + odata[i] = prefixSum; + prefixSum += idata[i]; + } + timer().endCpuTimer(); } @@ -30,9 +37,16 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int cnt = 0; + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + odata[cnt++] = idata[i]; + } + timer().endCpuTimer(); - return -1; + return cnt; } /** @@ -41,10 +55,40 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. 
*/ int compactWithScan(int n, int *odata, const int *idata) { + // Compute temporary array and run exclusive scan on temporary array + int prefixSum = 0; + int* tdata = new int[n]; + int* sdata = new int[n]; + timer().startCpuTimer(); - // TODO + for (int i = 0; i < n; i++) + { + tdata[i] = idata[i] != 0 ? 1 : 0; + sdata[i] = prefixSum; + prefixSum += tdata[i]; + } + + // Scatter + int idx = 0; + for (int i = 0; i < n; i++) + { + if (tdata[i]) + { + idx = sdata[i]; + odata[idx] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return idx + 1; + } + + void sort(int n, int* odata, const int* idata) + { + for (int i = 0; i < n; i++) + { + odata[i] = idata[i]; + } + std::sort(odata, odata + n); } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..8a6efa9 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -8,6 +8,8 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); + void sort(int n, int* odata, const int* idata); + int compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..4c21385 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -5,6 +5,12 @@ namespace StreamCompaction { namespace Efficient { + int* dev_tempData; + int* dev_inputData; + int* dev_boolData; + int* dev_idxData; + int* dev_outputData; + using StreamCompaction::Common::PerformanceTimer; PerformanceTimer& timer() { @@ -12,13 +18,77 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep(int n, int* tdata, int d) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + { + return; + } + + int leftChildOffset = 1 << d; // 2^d + int rightChildOffset = leftChildOffset << 1; // 2^(d+1) + if (index % rightChildOffset == 0) + { + tdata[index + rightChildOffset - 1] += tdata[index + leftChildOffset - 1]; + } + } + + __global__ void kernDownSweep(int n, int* odata, int d) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + { + return; + } + + int leftChildOffset = 1 << d; // 2^d + int rightChildOffset = leftChildOffset << 1; // 2^(d+1) + if (index % rightChildOffset == 0) + { + // Save left child value + int preLeftChildVal = odata[index + leftChildOffset - 1]; + // Set the left child of the next round as the current node's value + odata[index + leftChildOffset - 1] = odata[index + rightChildOffset - 1]; + // Set the right child (the node itself) of the next round as the current node's value + previous left child value + odata[index + rightChildOffset - 1] += preLeftChildVal; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { + int depth = ilog2ceil(n); + int size = 1 << depth; // sizes of arrays are rounded up to the next power of two + dim3 threadsPerBlock(blockSize); + dim3 blocksPerGrid((size + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_tempData, size * sizeof(int)); + Common::kernInitializeArray<<<blocksPerGrid, threadsPerBlock>>>(size, dev_tempData, 0); + cudaMemcpy(dev_tempData, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + // ------------------------------------- Performance Measurement ------------------------------------------- timer().startGpuTimer(); - // TODO + + for (int d = 0; d < depth; d++) + { + kernUpSweep<<<blocksPerGrid, threadsPerBlock>>>(size, dev_tempData, d); + } + + // Set root to zero + cudaMemset(dev_tempData + size - 1, 0, sizeof(int)); + for (int d = depth - 1; d >= 0; d--) + { + kernDownSweep<<<blocksPerGrid, threadsPerBlock>>>(size, dev_tempData, d); + } + timer().endGpuTimer(); + // -------------------------------------------------------------------------------------------------------- + + cudaMemcpy(odata, dev_tempData, n * sizeof(int), cudaMemcpyKind::cudaMemcpyDeviceToHost); + cudaFree(dev_tempData); } /** @@ -30,11 +100,56 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int *odata, const int *idata) + { + int depth = ilog2ceil(n); + int size = 1 << depth; // sizes of arrays are rounded up to the next power of two + dim3 threadsPerBlock(blockSize); + dim3 blocksPerGrid((size + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_inputData, size * sizeof(int)); + cudaMalloc((void**)&dev_boolData, size * sizeof(int)); + cudaMalloc((void**)&dev_idxData, size * sizeof(int)); + cudaMalloc((void**)&dev_outputData, n * sizeof(int)); + Common::kernInitializeArray<<<blocksPerGrid, threadsPerBlock>>>(size, dev_inputData, 0); + Common::kernInitializeArray<<<blocksPerGrid, threadsPerBlock>>>(size, dev_boolData, 0); + Common::kernInitializeArray<<<blocksPerGrid, threadsPerBlock>>>(size, dev_idxData, 0); + Common::kernInitializeArray<<<blocksPerGrid, threadsPerBlock>>>(n, dev_outputData, 0); + cudaMemcpy(dev_inputData, idata, n * sizeof(int), cudaMemcpyKind::cudaMemcpyHostToDevice); + + // ------------------------------------- Performance Measurement ------------------------------------------- timer().startGpuTimer(); - // TODO + // Step 1: Compute temporary array + Common::kernMapToBoolean<<<blocksPerGrid, threadsPerBlock>>>(n, dev_boolData, dev_inputData); + + // Step 2: Run exclusive scan on temporary array + cudaMemcpy(dev_idxData, dev_boolData, n * sizeof(int), cudaMemcpyKind::cudaMemcpyDeviceToDevice); + for (int d = 0; d < depth; d++) + { + kernUpSweep<<<blocksPerGrid, threadsPerBlock>>>(size, dev_idxData, d); + } + + cudaMemset(dev_idxData + size - 1, 0, sizeof(int)); + for (int d = depth - 1; d >= 0; d--) + { + kernDownSweep<<<blocksPerGrid, threadsPerBlock>>>(size, dev_idxData, d); + } + + // Step 3: Scatter + Common::kernScatter<<<blocksPerGrid, threadsPerBlock>>>(n, dev_outputData, dev_inputData, dev_boolData, dev_idxData); + timer().endGpuTimer(); - return -1; + // -------------------------------------------------------------------------------------------------------- + + int compactedSize = -1; + cudaMemcpy(&compactedSize, dev_idxData + size - 1, sizeof(int), cudaMemcpyKind::cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_outputData, n * sizeof(int), cudaMemcpyKind::cudaMemcpyDeviceToHost); + + cudaFree(dev_inputData); + cudaFree(dev_boolData); + cudaFree(dev_idxData); + cudaFree(dev_outputData); + return compactedSize; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index
803cb4f..77891fb 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,6 +6,10 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); + __global__ void kernUpSweep(int n, int* tdata, int d); + + __global__ void kernDownSweep(int n, int* odata, int d); + void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..d6a94ac 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,55 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + __global__ void kernParallelScan(int n, int* odata, const int* idata, int d) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + { + return; + } + + int offset = 1 << d; + if (index >= offset) + { + odata[index] = idata[index - offset] + idata[index]; + } + else + { + odata[index] = idata[index]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { + dim3 threadsPerBlock(blockSize); + dim3 blocksPerGrid((n + blockSize - 1) / blockSize); + int* dev_tempData; + int* dev_outputData; + cudaMalloc((void**)&dev_tempData, n * sizeof(int)); + cudaMalloc((void**)&dev_outputData, n * sizeof(int)); + cudaMemcpy(dev_outputData, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + // ------------------------------------- Performance Measurement ------------------------------------------ timer().startGpuTimer(); - // TODO + int depth = ilog2ceil(n); + for (int d = 0; d < depth; d++) + { + kernParallelScan<<<blocksPerGrid, threadsPerBlock>>>(n, dev_tempData, dev_outputData, d); + std::swap(dev_tempData, dev_outputData); + } timer().endGpuTimer(); + // -------------------------------------------------------------------------------------------------------- + + // Shift the results right by one element when copying from GPU to CPU, to convert the inclusive scan to an exclusive scan + odata[0] = 0; + cudaMemcpy(odata + 1, dev_outputData, (n - 1) * sizeof(int), cudaMemcpyKind::cudaMemcpyDeviceToHost); + cudaFree(dev_tempData); + cudaFree(dev_outputData); } } } diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 0000000..34653d8 --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,145 @@ +#include +#include +#include "common.h" +#include "radix.h" +#include "efficient.h" + +namespace StreamCompaction { + namespace Radix { + int* dev_tempData; + int* dev_inputData; + int* dev_boolData; + int* dev_notBoolData; + int* dev_scanData; + int* dev_outputData; + + // Returns the position of the most significant bit + int getMSB(int x) + { + int bit = 1 << 31; + for (int i = 31; i >= 0; i--, bit >>= 1) + { + if (x & bit) + return i + 1; + } + return 0; + } + + // Returns the maximum of the array + int getMax(int n, const int* a) + { + int maximum = a[0]; + for (int i = 1; i < n; i++) + { + maximum = std::max(maximum, a[i]); + } + return maximum; + } + + // Maps an array to 2 arrays that only contain 1s and 0s.
+ // _bools_ is just the logic NOT of _notBools_ + __global__ void kernMapTo2Bools(int n, int bit, int* bools, int* notBools, const int* idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + { + return; + } + + bool b = idata[index] & bit; + bools[index] = b; + notBools[index] = !b; + } + + // Computes the temp array _temps_ which stores address for writing true keys + __global__ void kernComputeAddressOfTrueKeys(int n, int* temps, const int* notBools, const int* scanData) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + { + return; + } + + int totalFalses = notBools[n - 1] + scanData[n - 1]; + temps[index] = index - scanData[index] + totalFalses; + } + + // Scatters based on address _temps_ + __global__ void kernRadixScatter(int n, int* odata, const int* temps, const int* bools, const int* scanData, const int* idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) + { + return; + } + + int newIdx = bools[index] ? temps[index] : scanData[index]; + odata[newIdx] = idata[index]; + } + + /** + * Performs radix sort on idata, storing the result into odata. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to sort. + */ + void sort(int n, int* odata, const int* idata) + { + int depth = ilog2ceil(n); + int size = 1 << depth; // sizes of arrays will are rounded to the next power of two + int maximum = getMax(n, idata); + int highestBit = getMSB(maximum); + + dim3 threadsPerBlock(blockSize); + dim3 blocksPerGrid((n + blockSize - 1) / blockSize); + dim3 scanBlocksPerGrid((n + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_inputData, n * sizeof(int)); + cudaMalloc((void**)&dev_boolData, n * sizeof(int)); + cudaMalloc((void**)&dev_notBoolData, n * sizeof(int)); + cudaMalloc((void**)&dev_scanData, size * sizeof(int)); + cudaMalloc((void**)&dev_tempData, n * sizeof(int)); + cudaMalloc((void**)&dev_outputData, n * sizeof(int)); + Common::kernInitializeArray<<>>(size, dev_scanData, 0); + cudaMemcpy(dev_inputData, idata, n * sizeof(int), cudaMemcpyKind::cudaMemcpyHostToDevice); + + // Do radix sort for _bits_ times + for (int i = 0, bit = 1; i < highestBit; i++, bit <<= 1) + { + // Step 1: Compute the bool array and notBool array + kernMapTo2Bools<<>>(n, bit, dev_boolData, dev_notBoolData, dev_inputData); + + // Step 2: Exclusive scan array + cudaMemcpy(dev_scanData, dev_notBoolData, n * sizeof(int), cudaMemcpyKind::cudaMemcpyDeviceToDevice); + for (int d = 0; d < depth; d++) + { + Efficient::kernUpSweep<<>>(size, dev_scanData, d); + } + + cudaMemset(dev_scanData + size - 1, 0, sizeof(int)); + for (int d = depth - 1; d >= 0; d--) + { + Efficient::kernDownSweep<<>>(size, dev_scanData, d); + } + + // Step 3: Compute temp array _dev_tempData_ + kernComputeAddressOfTrueKeys<<>>(n, dev_tempData, dev_notBoolData, dev_scanData); + + // Step 4: Scatter + kernRadixScatter<<>>(n, dev_outputData, dev_tempData, dev_boolData, dev_scanData, dev_inputData); + + // Swap for next round of radix sort + std::swap(dev_outputData, dev_inputData); + } + + cudaMemcpy(odata, dev_inputData, n * sizeof(int), cudaMemcpyKind::cudaMemcpyDeviceToHost); + cudaFree(dev_inputData); + cudaFree(dev_boolData); + cudaFree(dev_notBoolData); + cudaFree(dev_scanData); + cudaFree(dev_tempData); + cudaFree(dev_outputData); + } + } +} diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 0000000..e97f7a2 --- 
/dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,8 @@ +#pragma once +#include "common.h" + +namespace StreamCompaction { + namespace Radix { + void sort(int n, int* odata, const int* idata); + } +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..0c123ee 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -17,12 +17,17 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { + thrust::host_vector<int> host_idata(idata, idata + n); + thrust::device_vector<int> dev_idata = host_idata; // copy host_vector to device_vector + thrust::device_vector<int> dev_odata(n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); timer().endGpuTimer(); + + thrust::copy(dev_odata.begin(), dev_odata.end(), odata); } } }