diff --git a/README.md b/README.md index 0e38ddb..c196685 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,99 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Thy (Tea) Tran + * [LinkedIn](https://www.linkedin.com/in/thy-tran-97a30b148/), [personal website](https://tatran5.github.io/), [email](thytran316@outlook.com) +* Tested on: Windows 10, i7-8750H @ 2.20GHz 22GB, GTX 1070 -### (TODO: Your README) +# Scan, Stream Compaction, and Radix Sort +![](img/cover.png) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +The main goal of this project is to implement several methods for scan (primarily exclusive scan) and stream compaction. Scan is then used as a building block for stream compaction and radix sort. For both scan and stream compaction, CPU and GPU implementations are compared for efficiency. +## Performance Analysis + +The naive GPU method has the worst runtime, followed by the CPU method, then the work-efficient GPU method, with thrust the fastest. + +Both the naive and work-efficient GPU methods rely only on global memory instead of utilizing shared memory, which likely leads to a much slower runtime than the thrust method (and than the CPU for the naive method). The naive and work-efficient methods also do not exploit warp partitioning or memory coalescing, features that thrust may use to reduce its runtime. + +![](img/scan-runtime.png) + +The same explanation applies to the stream-compaction results below. + +![](img/stream-compaction-runtime.png) + +Another factor that greatly affects the performance of the naive and work-efficient GPU methods, compared with the CPU method, is that they only operate on arrays whose length is a power of two. If the input size is not a power of two, the array is padded up to the next power of two, so in that case the GPU scan and stream compaction are not that much faster than the CPU versions. + + +Below is an example of the results with 1,000,000 elements. Note that the CPU scan time is reported as 0ms (which is not accurate) because the CPU timer for scan is disabled; otherwise the nested timer call from CPU stream compaction, which uses CPU scan internally, would abort the program. + +``` + +**************** +** SCAN TESTS ** +**************** + [ 2 30 36 0 31 22 46 36 16 3 45 48 20 ... 37 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0ms (std::chrono Measured) + [ 0 2 32 68 68 99 121 167 203 219 222 267 315 ... 24482173 24482210 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0ms (std::chrono Measured) + [ 0 2 32 68 68 99 121 167 203 219 222 267 315 ... 
24482069 24482112 ] + passed +==== naive scan, power-of-two ==== +kernel invoke count: 20 + elapsed time: 2.18784ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== +kernel invoke count: 20 + elapsed time: 2.35523ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 1.20525ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 1.19664ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 1.94966ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 1.9712ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 0 0 0 2 3 2 1 0 3 1 2 1 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 2.3196ms (std::chrono Measured) + [ 1 2 3 2 1 3 1 2 1 3 1 2 2 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 2.3835ms (std::chrono Measured) + [ 1 2 3 2 1 3 1 2 1 3 1 2 2 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 8.1008ms (std::chrono Measured) + [ 1 2 3 2 1 3 1 2 1 3 1 2 2 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 1.1991ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 1.19808ms (CUDA Measured) + passed + +***************************** +** RADIX SORT TESTS ** +***************************** + [ 5 10 32 46 46 31 8 9 28 23 47 42 41 ... 8 0 ] +largest number of bits: 6 +Correctly sorted: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] +Yours: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] + passed +Press any key to continue . . . + +``` \ No newline at end of file diff --git a/img/cover.png b/img/cover.png new file mode 100644 index 0000000..6870242 Binary files /dev/null and b/img/cover.png differ diff --git a/img/scan-runtime.png b/img/scan-runtime.png new file mode 100644 index 0000000..bcfd312 Binary files /dev/null and b/img/scan-runtime.png differ diff --git a/img/stream-compaction-runtime.png b/img/stream-compaction-runtime.png new file mode 100644 index 0000000..fcc8472 Binary files /dev/null and b/img/stream-compaction-runtime.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..dffde8d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,144 +11,175 @@ #include #include #include +#include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1000000;// 1 << 8; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; +int* a = new int[SIZE]; +int* b = new int[SIZE]; +int* c = new int[SIZE]; int main(int argc, char* argv[]) { - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. 
- zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
- zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; -} + // Scan tests + + printf("\n"); + printf("****************\n"); + printf("** SCAN TESTS **\n"); + printf("****************\n"); + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + /* a[0] = 1; + a[1] = 4; + a[2] = 9; + a[3] = 0; + a[4] = 2; + a[5] = 1; + a[6] = 4;*/ + printArray(SIZE, a, true); + + // initialize b using StreamCompaction::CPU::scan you implement + // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. + // At first all cases passed because b && c are all zeroes. 
+ zeroArray(SIZE, b); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("cpu scan, non-power-of-two"); + StreamCompaction::CPU::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(NPOT, b, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan, power-of-two"); + StreamCompaction::Naive::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + onesArray(SIZE, c); + printDesc("1s array for finding bugs"); + StreamCompaction::Naive::scan(SIZE, c, a); + printArray(SIZE, c, true); */ + + zeroArray(SIZE, c); + printDesc("naive scan, non-power-of-two"); + StreamCompaction::Naive::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("thrust scan, non-power-of-two"); + StreamCompaction::Thrust::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + printf("\n"); + printf("*****************************\n"); + printf("** STREAM COMPACTION TESTS **\n"); + printf("*****************************\n"); + + // Compaction tests + + genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + /* a[0] = 1; + a[1] = 4; + a[2] = 9; + a[3] = 0; + a[4] = 2; + a[5] = 1; + a[6] = 4;*/ + printArray(SIZE, a, true); + + int count, expectedCount, expectedNPOT; + + // initialize b using StreamCompaction::CPU::compactWithoutScan you implement + // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
+ zeroArray(SIZE, b); + printDesc("cpu compact without scan, power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedCount = count; + printArray(count, b, true); + printCmpLenResult(count, expectedCount, b, b); + + zeroArray(SIZE, c); + printDesc("cpu compact without scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedNPOT = count; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + printDesc("cpu compact with scan"); + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, power-of-two"); + count = StreamCompaction::Efficient::compact(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, non-power-of-two"); + count = StreamCompaction::Efficient::compact(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + printf("\n"); + printf("*****************************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("*****************************\n"); + + genArray(SIZE - 1, a, 50); + printArray(SIZE, a, true); + std::copy(a, a + SIZE, b); + std::sort(b, b + SIZE); + StreamCompaction::Radix::sort(SIZE, c, a); + std::cout << "Correctly sorted:" << std::endl; + printArray(SIZE, b, true); + std::cout << "Yours:" << std::endl; + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + system("pause"); // stop Win32 console from closing on exit + delete[] a; + delete[] b; + delete[] c; +} \ No newline at end of file diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 567795b..eb71175 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -4,6 +4,7 @@ set(headers "naive.h" "efficient.h" "thrust.h" + "radix.h" ) set(sources @@ -12,6 +13,7 @@ set(sources "naive.cu" "efficient.cu" "thrust.cu" + "radix.cu" ) list(SORT headers) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..8c99a0e 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,12 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + bools[index] = idata[index] != 0? 
1 : 0; } /** @@ -32,8 +37,38 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } } + __global__ void shiftRight(int n, int* idata, int* odata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + if (index == 0) { + odata[index] = 0; + } + else { + odata[index] = idata[index - 1]; + } + } + + /* Copy initial data over and pad 0's if out of scope of initial size + * aka the input array has a smaller initial size than the final array, + * and anything larger than index [size of input array] will be 0 in the output array + */ + __global__ void formatInitData(int initSize, int finalSize, int* data) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= initSize && index < finalSize) { + data[index] = 0; + } + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..acf6a29 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -37,6 +37,10 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); + __global__ void shiftRight(int n, int* idata, int* odata); + + __global__ void formatInitData(int initSize, int finalSize, int* data); + /** * This class is used for timing the performance * Uncopyable and unmovable diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..0b4719d 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -18,9 +18,17 @@ namespace StreamCompaction { * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); + // Compute exclusive prefix sum + //timer().startCpuTimer(); // TODO - timer().endCpuTimer(); + if (n < 0) { + return; + } + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + //timer().endCpuTimer(); } /** @@ -31,8 +39,15 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int o = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[o] = idata[i]; + o++; + } + } timer().endCpuTimer(); - return -1; + return o; } /** @@ -43,8 +58,27 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int* bdata = new int[n]; + for (int i = 0; i < n; i++) { + bdata[i] = idata[i] != 0 ? 
1 : 0; + } + + int* sdata = new int[n]; + scan(n, sdata, bdata); + + int o = 0; + for (int i = 0; i < n; i++) { + if (bdata[i] == 1) { + odata[o] = idata[i]; + o++; + } + } + + delete[] bdata; + delete[] sdata; + timer().endCpuTimer(); - return -1; + return o; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..fef098a 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,7 @@ #include #include "common.h" #include "efficient.h" +#include namespace StreamCompaction { namespace Efficient { @@ -12,13 +13,112 @@ namespace StreamCompaction { return timer; } + __global__ void upSweepEfficient(int n, int offset, int sDataSize, int* data) { + extern __shared__ int sData[]; + int tIdx = threadIdx.x; + sData[tIdx] = data[blockIdx.x * blockDim.x + tIdx]; + for (int stride = blockDim.x / 2; stride > 0; stride /= 2) { + if (tIdx < stride) { + __syncthreads(); + int s1 = sData[2 * tIdx]; + int s2 = sData[2 * tIdx + 1]; + __syncthreads(); + sData[tIdx] = s1 + s2; + } + } + __syncthreads(); + data[blockIdx.x * blockDim.x + tIdx] = sData[sDataSize - tIdx]; + } + + __global__ void upSweep(int n, int offset, int* data) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n && index % offset == 0) { + data[index + offset - 1] = data[index + offset / 2 - 1] + data[index + offset - 1]; + } + } + + __global__ void downSweep(int n, int offset, int* data) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n && index % offset == 0) { + int halfOffset = offset / 2; // Helps to find left child + int t = data[index + halfOffset - 1]; + // Set right child to be the same as parent's value + data[index + halfOffset - 1] = data[index + offset - 1]; + // Set left child to be the sum of parent and parent's sibling + data[index + offset - 1] += t; + } + } + + __global__ void setLastElementZero(int n, int* data) { + data[n - 1] = 0; + } + + __global__ void formatFinalData(int n, int* odata, const int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + odata[index] = idata[index]; + } + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + if (n < 1) { + return; + } + + // Round the element count up to the nearest power of two + int kernelInvokeCount = ilog2ceil(n); + int n2 = (int)pow(2, kernelInvokeCount); + + // Declare, allocate, and transfer data on gpu from cpu + int* dev_odata; + int* dev_odata2; + + cudaMalloc((void**)&dev_odata, n2 * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + + cudaMalloc((void**)&dev_odata2, n2 * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata2 failed!"); + + cudaMemcpy(dev_odata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata to dev_odata failed!"); + timer().startGpuTimer(); - // TODO + + int blockSize = 128; + dim3 blockCount((n2 + blockSize - 1) / blockSize); + + // Format input data (pad with 0s up to the next power-of-two length) + StreamCompaction::Common::formatInitData << < blockCount, blockSize >> > (n, n2, dev_odata); + + for (int i = 0; i < kernelInvokeCount; i++) { + int offset = (int) pow(2, i + 1); + upSweep << < blockCount, blockSize >> > (n2, offset, dev_odata); + } + + setLastElementZero << < 1, 1 >> > (n2, dev_odata); + + for (int i = kernelInvokeCount - 1; i >= 0; i--) { + int offset = (int) pow(2, i + 1); + downSweep << < blockCount, blockSize >> > (n2, offset, dev_odata); + } + + formatFinalData << < blockCount, blockSize >> > (n, dev_odata2, dev_odata); + timer().endGpuTimer(); + + // Transfer data from gpu to cpu + cudaMemcpy(odata, dev_odata2, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_odata to odata failed!"); + + cudaFree(dev_odata); + checkCUDAError("cudaFree dev_odata failed!"); + + cudaFree(dev_odata2); + checkCUDAError("cudaFree dev_odata2 failed!"); } /** @@ -31,10 +131,66 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + //timer().startGpuTimer(); + // TODO - timer().endGpuTimer(); - return -1; + + int blockSize = 128; + dim3 blockCount((n + blockSize - 1) / blockSize); + + // Declare, allocate memory in GPU and transfer memory from CPU to GPU + int* dev_idata; + int* dev_bools; + int* dev_indices; + int* dev_odata; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_bools failed!"); + + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_indices failed!"); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_idata failed!"); + + // Compute a temporary array containing + // 1 if the corresponding element meets the criteria (not equal to 0) + // 0 if the element does not meet the criteria + StreamCompaction::Common::kernMapToBoolean << < blockCount, blockSize >> > (n, dev_bools, dev_idata); + + // Run exclusive scan on the temporary array + scan(n, dev_indices, dev_bools); + StreamCompaction::Common::kernScatter << < blockCount, blockSize >> > (n, dev_odata, dev_idata, dev_bools, dev_indices); + + std::unique_ptr<int[]> bools{ new int[n] }; + std::unique_ptr<int[]> indices{ new int[n] }; + + cudaMemcpy(bools.get(), dev_bools, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_bools failed!"); + + cudaMemcpy(indices.get(), dev_indices, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_indices failed!"); + + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_odata failed!"); + + // Clean up + cudaFree(dev_idata); + cudaFree(dev_bools); + cudaFree(dev_indices); + cudaFree(dev_odata); + + int remaining = bools[n - 1] == 1 ? indices[n - 1] : indices[n - 1] - 1; + remaining++; + + //timer().endGpuTimer(); + return remaining; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..d7b4e39 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,6 +2,8 @@ #include #include "common.h" #include "naive.h" +#include +#include namespace StreamCompaction { namespace Naive { @@ -16,9 +18,87 @@ /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ + + + /* Copy initial data over and pad 0's if out of scope of initial size + * aka the input array has a smaller initial size than the final array, + * and anything larger than index [size of input array] will be 0 in the output array + */ + __global__ void formatInitData(int initSize, int finalSize, int* data) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= initSize && index < finalSize) { + data[index] = 0; + } + } + + __global__ void add(int n, int ignoreIndexCount, int* odata, const int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < ignoreIndexCount) { + odata[index] = idata[index]; + } else if (index < n) { + int x1 = idata[index - ignoreIndexCount]; + int x2 = idata[index]; + odata[index] = x1 + x2; + } + } + + // Careful with non-power of 2 void scan(int n, int *odata, const int *idata) { + if (n < 1) { + return; + } + // Round the element count up to the nearest power of two + int kernelInvokeCount = ilog2ceil(n); + int n2 = (int)pow(2, kernelInvokeCount); + + // Declare data to be on the gpu + int* dev_odata; + int* dev_idata; + std::unique_ptr<int[]> tdata{ new int[n2] }; + + // Allocate data to be on the gpu + cudaMalloc((void**)&dev_odata, n2 * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + + cudaMalloc((void**)&dev_idata, n2 * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + + // Transfer data from cpu to gpu + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_idata failed!"); + timer().startGpuTimer(); - // TODO + + int blockSize = 128; + dim3 blockCount((n2 + blockSize - 1) / blockSize); + + // Format input data (pad with 0s up to the next power-of-two length) + StreamCompaction::Common::formatInitData << < blockCount, blockSize >> > (n, n2, dev_idata); + + std::cout << "kernel invoke count: " << kernelInvokeCount << std::endl; + + for (int i = 1; i <= kernelInvokeCount; i++) { + int ignoreIndexCount = pow(2, i - 1); + add << < blockCount, blockSize >> > (n2, ignoreIndexCount, dev_odata, dev_idata); + + int* temp = dev_idata; + dev_idata = dev_odata; + dev_odata = temp; + } + + // Shift everything right to turn the inclusive scan into an exclusive scan + StreamCompaction::Common::shiftRight << < blockCount, blockSize >> > (n, dev_idata, dev_odata); + + // Transfer data from gpu to cpu + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_odata failed!"); + + cudaFree(dev_odata); + checkCUDAError("cudaFree dev_odata failed!"); + cudaFree(dev_idata); + checkCUDAError("cudaFree dev_idata failed!"); + timer().endGpuTimer(); } } diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 0000000..92d5b2e --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,143 @@ +#include +#include +#include "common.h" +#include "efficient.h" +#include + +namespace StreamCompaction { + namespace Radix { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __global__ void computeBoolsForBits(int n, int bit, int* data, int* bools0, int* bools1) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + int bitVal = (data[index] >> bit) & 1; + bools0[index] = 1 - bitVal; + bools1[index] = bitVal; + } + + __global__ void computeTotalFalses(int n, int* totalFalses, int* bools0, int* falseAddreses) { + totalFalses[0] = 
bools0[n - 1] + falseAddreses[n - 1]; + } + + __global__ void computeAddressForTrueKeys(int n, const int* totalFalses, int* trueAddresses, const int* falseAddreses) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + trueAddresses[index] = index - falseAddreses[index] + totalFalses[0]; + } + + __global__ void computeAddresses(int n, int* bools1, const int* trueAddresses, const int* falseAddresses, int* allAddresses) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + allAddresses[index] = bools1[index] > 0 ? trueAddresses[index] : falseAddresses[index]; + } + + __global__ void scatter(int n, int* odata, int* idata, int* addresses) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + odata[addresses[index]] = idata[index]; + } + + void sort(int n, int* odata, const int* idata) { + // TODO + if (n < 1) { + return; + } + int blockSize = 128; + dim3 blockCount((n + blockSize - 1) / blockSize); + + int* dev_idata; + int* dev_odata; + int* dev_bools0; + int* dev_bools1; + int* dev_falseAddresses; + int* dev_trueAddresses; + int* dev_addresses; + int* dev_totalFalses; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + + cudaMalloc((void**)&dev_bools0, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_bools0 failed!"); + + cudaMalloc((void**)&dev_bools1, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_bools1 failed!"); + + cudaMalloc((void**)&dev_falseAddresses, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_falseAddresses failed!"); + + cudaMalloc((void**)&dev_trueAddresses, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_trueAddresses failed!"); + + cudaMalloc((void**)&dev_addresses, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_addresses failed!"); + + cudaMalloc((void**)&dev_totalFalses, 1 * sizeof(int)); + checkCUDAError("cudaMalloc dev_totalFalses failed!"); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_idata failed!"); + + int largestNumBit = 0; + for (int i = 0; i < n; i++) { + int numBit = idata[i] > 0 ? (int)log2(idata[i]) + 1 : 1; // treat 0 as a 1-bit key to avoid log2(0) + if (largestNumBit < numBit) { + largestNumBit = numBit; + } + } + std::cout << "largest number of bits: " << largestNumBit << std::endl; + // For each bit, split the keys into 0s and 1s, compute destination addresses, and scatter + for (int i = 0; i < largestNumBit; i++) { + computeBoolsForBits << < blockCount, blockSize >> > (n, i, dev_idata, dev_bools0, dev_bools1); + StreamCompaction::Efficient::scan(n, dev_falseAddresses, dev_bools0); + computeTotalFalses << < 1, 1 >> > (n, dev_totalFalses, dev_bools0, dev_falseAddresses); + computeAddressForTrueKeys << < blockCount, blockSize >> > (n, dev_totalFalses, dev_trueAddresses, dev_falseAddresses); + computeAddresses << < blockCount, blockSize >> > (n, dev_bools1, dev_trueAddresses, dev_falseAddresses, dev_addresses); + scatter << < blockCount, blockSize >> > (n, dev_odata, dev_idata, dev_addresses); + + int* temp = dev_idata; + dev_idata = dev_odata; + dev_odata = temp; + } + + // Transfer data from gpu to cpu + cudaMemcpy(odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_idata to odata failed!"); + + // Cleanup + cudaFree(dev_addresses); + checkCUDAError("cudaFree dev_addresses failed!"); + cudaFree(dev_bools0); + checkCUDAError("cudaFree dev_bools0 failed!"); + cudaFree(dev_bools1); + checkCUDAError("cudaFree dev_bools1 failed!"); + cudaFree(dev_falseAddresses); + checkCUDAError("cudaFree 
dev_falseAddresses failed!"); + cudaFree(dev_idata); + checkCUDAError("cudaFree dev_idata failed!"); + cudaFree(dev_odata); + checkCUDAError("cudaFree dev_odata failed!"); + cudaFree(dev_trueAddresses); + checkCUDAError("cudaFree dev_trueAddresses failed!"); + cudaFree(dev_totalFalses); + checkCUDAError("cudaFree dev_totalFalses failed!"); + } + } +} diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 0000000..aa489d9 --- /dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,11 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace Radix { + StreamCompaction::Common::PerformanceTimer& timer(); + + void sort(int n, int *odata, const int *idata); + } +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..b275387 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include "common.h" #include "thrust.h" @@ -19,10 +19,17 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::host_vector h_in(idata, idata + n); + thrust::device_vector dv_in = h_in; + thrust::device_vector dv_out(n); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + timer().endGpuTimer(); + + odata = thrust::copy(dv_out.begin(), dv_out.end(), odata); + + } } }