diff --git a/README.md b/README.md index 0e38ddb..bf8894d 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,90 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Eric Chiu +* Tested on: Windows 10 Education, Intel(R) Xeon(R) CPU E5-1630 v4 @ 3.60GHz 32GB, NVIDIA GeForce GTX 1070 (SIGLAB) -### (TODO: Your README) +## Description -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +This project explores and compares different CPU and GPU implementations of the scan (prefix sum) and stream compaction algorithms. The implementations include naive and work-efficient methods. +## Performance Analysis + + +The following images show a comparison between the CPU, naive, work-efficient, and thrust implementations of the scan algorithm. + + +![](./img/scan-data.png) + +![](./img/scan-chart.png) + + +The following images show a comparison between the CPU without scan, CPU with scan, and work-efficient implementations of the compaction algorithm. + + +![](./img/compaction-data.png) + +![](./img/compaction-chart.png) + + +Overall, the thrust implementation had the best performance in the long run. For smaller array sizes, however, the CPU implementation was the fastest, followed by the naive implementation, then the work-efficient implementation, and then the thrust implementation. As the array size increased, different implementations experienced different bottlenecks. I believe the CPU implementation experienced a bottleneck in the number of operations because it processes memory serially (as opposed to in parallel). I believe the naive and work-efficient implementations experienced a bottleneck in global memory access.
+ + +## Program Output + +``` +**************** +** SCAN TESTS ** +**************** + [ 26 38 4 11 28 35 3 0 3 4 36 3 39 ... 22 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0006ms (std::chrono Measured) + [ 0 26 64 68 79 107 142 145 145 148 152 188 191 ... 6071 6093 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0007ms (std::chrono Measured) + [ 0 26 64 68 79 107 142 145 145 148 152 188 191 ... 6031 6046 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.023552ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.023552ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.095232ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.105472ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.058368ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.05632ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 0 0 3 2 1 1 1 0 0 2 1 1 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0009ms (std::chrono Measured) + [ 3 3 2 1 1 1 2 1 1 3 2 1 2 ... 1 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0009ms (std::chrono Measured) + [ 3 3 2 1 1 1 2 1 1 3 2 1 2 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.0035ms (std::chrono Measured) + [ 3 3 2 1 1 1 2 1 1 3 2 1 2 ... 1 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.365568ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.359424ms (CUDA Measured) + passed +Press any key to continue . . . 
+ +``` \ No newline at end of file diff --git a/img/compaction-chart.png b/img/compaction-chart.png new file mode 100644 index 0000000..ce3b645 Binary files /dev/null and b/img/compaction-chart.png differ diff --git a/img/compaction-data.png b/img/compaction-data.png new file mode 100644 index 0000000..2cbc514 Binary files /dev/null and b/img/compaction-data.png differ diff --git a/img/scan-chart.png b/img/scan-chart.png new file mode 100644 index 0000000..1d53f82 Binary files /dev/null and b/img/scan-chart.png differ diff --git a/img/scan-data.png b/img/scan-data.png new file mode 100644 index 0000000..2ee2752 Binary files /dev/null and b/img/scan-data.png differ diff --git a/src/main.cpp b/src/main.cpp index 1850161..459cda3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 256; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..4bb0dc2 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_61 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..4dd7b66 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -22,8 +22,11 @@ namespace StreamCompaction { * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. 
*/ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + __global__ void kernMapToBoolean(int n, int *bools, const int *idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + bools[index] = idata[index] == 0 ? 0 : 1; } /** @@ -31,8 +34,14 @@ namespace StreamCompaction { * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. */ __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO + const int *idata, const int *bools, const int *indices) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + if (bools[index] == 1) + { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..112d8d0 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -17,9 +17,25 @@ namespace StreamCompaction { * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ - void scan(int n, int *odata, const int *idata) { + + void scanHelper(int n, int *odata, const int *idata) + { + if (n == 0) return; + + odata[0] = 0; + for (int i = 1; i < n; i++) + { + odata[i] = odata[i - 1] + idata[i - 1]; + } + + } + + void scan(int n, int *odata, const int *idata) + { timer().startCpuTimer(); - // TODO + + scanHelper(n, odata, idata); + timer().endCpuTimer(); } @@ -29,10 +45,21 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. 
*/ int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO + + timer().startCpuTimer(); + + int oIndex = 0; + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[oIndex] = idata[i]; + oIndex++; + } + } + timer().endCpuTimer(); - return -1; + return oIndex; } /** @@ -40,11 +67,40 @@ namespace StreamCompaction { * * @returns the number of elements remaining after compaction. */ - int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; + int compactWithScan(int n, int *odata, const int *idata) + { + timer().startCpuTimer(); + + int* bdata = new int[n]; + int* sdata = new int[n]; + + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + bdata[i] = 1; + } + else + { + bdata[i] = 0; + } + } + + // cannot call scan because it uses startCpuTimer as well + scanHelper(n, sdata, bdata); + + int sum = 0; + for (int i = 0; i < n; i++) + { + if (bdata[i] != 0) + { + odata[sdata[i]] = idata[i]; + sum = sdata[i]; + } + } + + timer().endCpuTimer(); + return sum + 1; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..07360fc 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,13 +12,117 @@ namespace StreamCompaction { return timer; } + const int BLOCK_SIZE = 256; + + __global__ void kernUpSweep(int n, int stride, int *data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + int fullIndex = (index + 1) * stride - 1; + int halfIndex = index * stride - 1 + (stride / 2); + data[fullIndex] += data[halfIndex]; + } + + __global__ void kernDownSweep(int n, int stride, int *data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + int fullIndex = (index + 1) * stride - 1; + int halfIndex = index * stride - 1 + (stride / 2); + + int temp = data[halfIndex]; + data[halfIndex] = data[fullIndex]; + 
data[fullIndex] += temp; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { + int *dev_data; + int width = 1; + int squareN = pow(2, ilog2ceil(n)); + int iterations = ilog2(squareN) - 1; + int numThreads, numBlocks; + + cudaMalloc((void**)&dev_data, squareN * sizeof(int)); + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + // up sweep + for (int i = 0; i <= iterations; i++) + { + width = width * 2; + numThreads = squareN / width; + numBlocks = (numThreads + BLOCK_SIZE - 1) / BLOCK_SIZE; + kernUpSweep<<>>(numThreads, width, dev_data); + checkCUDAErrorFn("kernUpSweep failed!"); + } + + int zero = 0; + cudaMemcpy(&dev_data[squareN - 1], &zero, sizeof(int), cudaMemcpyHostToDevice); + + // down sweep + width = pow(2, iterations + 2); + for (int i = iterations; i >= 0; i--) + { + width = width / 2; + numThreads = squareN / width; + numBlocks = (numThreads + BLOCK_SIZE - 1) / BLOCK_SIZE; + kernDownSweep<<>>(numThreads, width, dev_data); + checkCUDAErrorFn("kernDownSweep failed!"); + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_data); + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ + void scanNoTimer(int n, int *odata, const int *idata) + { + int *dev_data; + int width = 1; + int squareN = pow(2, ilog2ceil(n)); + int iterations = ilog2(squareN) - 1; + int numThreads, numBlocks; + + cudaMalloc((void**)&dev_data, squareN * sizeof(int)); + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + // up sweep + for (int i = 0; i <= iterations; i++) + { + width = width * 2; + numThreads = squareN / width; + numBlocks = (numThreads + BLOCK_SIZE - 1) / BLOCK_SIZE; + kernUpSweep << > > (numThreads, width, dev_data); + checkCUDAErrorFn("kernUpSweep failed!"); + } + + int zero = 0; + cudaMemcpy(&dev_data[squareN - 1], &zero, sizeof(int), cudaMemcpyHostToDevice); + + // down sweep + width = pow(2, iterations + 2); + for (int i = iterations; i >= 0; i--) + { + width = width / 2; + numThreads = squareN / width; + numBlocks = (numThreads + BLOCK_SIZE - 1) / BLOCK_SIZE; + kernDownSweep << > > (numThreads, width, dev_data); + checkCUDAErrorFn("kernDownSweep failed!"); + } + + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_data); } /** @@ -30,11 +134,43 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. 
*/ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int *odata, const int *idata) + { + int count; + int *dev_idata, *dev_odata, *dev_bools, *dev_indices; + int numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + // copy idata to dev_idata + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + // map idata to booleans + Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); + // copy booleans to odata + cudaMemcpy(odata, dev_bools, n * sizeof(int), cudaMemcpyDeviceToHost); + + count = odata[n - 1]; + scanNoTimer(n, odata, odata); + count += odata[n - 1]; + + cudaMemcpy(dev_indices, odata, n * sizeof(int), cudaMemcpyHostToDevice); + Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_bools, dev_indices); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_bools); + cudaFree(dev_indices); + cudaFree(dev_odata); + + return count; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..ecb6177 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,57 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + const int BLOCK_SIZE = 512; + + __global__ void kernScan(int n, int *odata, int *idata, int offset) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + if (index >= offset) + { + odata[index] = idata[index - offset] + idata[index]; + } + else + { + odata[index] = idata[index]; + } + } + + __global__ void kernShift(int n, int *odata, int *idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + 
odata[index] = (index > 0) ? idata[index - 1] : 0; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { + int numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; + int *dev_idata, *dev_odata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + for (int offset = 1; offset <= ilog2ceil(n); offset++) + { + kernScan<<>>(n, dev_odata, dev_idata, 1 << offset - 1); + std::swap(dev_idata, dev_odata); + } + + kernShift<<>>(n, dev_odata, dev_idata); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..0c6e156 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -17,12 +17,18 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(odata, odata + n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + timer().endGpuTimer(); + + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } }