diff --git a/README.md b/README.md
index 0e38ddb..b93c779 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,112 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Zichuan Yu
+  * [LinkedIn](https://www.linkedin.com/in/zichuan-yu/), [Behance](https://www.behance.net/zainyu717ebcc)
+* Tested on: Windows 10 (Build 17134), i7-4710 @ 2.50GHz 16GB, GTX 980M 4096MB GDDR5
+
+## Features
+
+- CPU Scan
+- CPU Stream Compaction
+- Naive GPU Scan
+- Work-Efficient GPU Scan
+- Work-Efficient GPU Stream Compaction
+- Thrust Implementation
+
+## Performance Analysis
+
+### Block Size Analysis
+
+We fix the array size at 2^21 and vary the block size.
+
+![block_size](img/block_size.png)
+
+As we can see, once the block size is larger than 32, increasing it further makes little difference.
+
+### Array Size Analysis on Scan
+
+We fix the block size at 1024 and vary the array size.
+
+![scan](img/scan.png)
+
+As we can see, the CPU scan is, unsurprisingly, the slowest. My own GPU implementations are still much
+slower than the Thrust implementation; I think this is because my code does not exploit the hardware
+(e.g. shared memory) as well as Thrust does.
+
+### Array Size Analysis on Compaction
+
+We fix the block size at 1024 and vary the array size.
+
+![compaction](img/compaction.png)
+
+As we can see, CPU compaction with scan is the slowest. The scan pass is pure overhead on the CPU,
+so a CPU implementation is better off not using scan at all.
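+
+For reference, here is what the two primitives do on a tiny input (an
+illustrative sketch, not actual program output):
+
+```
+input:          [ 3 0 1 0 2 ]
+exclusive scan: [ 0 3 3 4 4 ]
+compact (!= 0): [ 3 1 2 ]
+```
+
+Compaction uses the scan: scanning the 0/1 keep-flags `[ 1 0 1 0 1 ]` gives
+`[ 0 1 1 2 2 ]`, which is each kept element's output index.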
+
+## Output
+
+Array size 2^28, block size 1024
+
+```shell
+
+****************
+** SCAN TESTS **
+****************
+    [ 1 1 1 1 1 1 1 1 1 1 1 1 1 ... 1 1 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 1535.85ms    (std::chrono Measured)
+    [ 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 268435454 268435455 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 594.798ms    (std::chrono Measured)
+    [ 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 268435451 268435452 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 510.046ms    (CUDA Measured)
+    [ 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 268435454 268435455 ]
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 510.037ms    (CUDA Measured)
+    [ 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 0 0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 175.304ms    (CUDA Measured)
+    [ 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 268435454 268435455 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 175.151ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 28.8416ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 28.8394ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 0 1 0 0 1 0 3 1 3 3 0 3 1 ... 3 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 708.621ms    (std::chrono Measured)
+    [ 1 1 3 1 3 3 3 1 1 1 1 1 1 ... 1 3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 680.761ms    (std::chrono Measured)
+    [ 1 1 3 1 3 3 3 1 1 1 1 1 1 ... 1 1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 1471.92ms    (std::chrono Measured)
+    [ 1 1 3 1 3 3 3 1 1 1 1 1 1 ... 1 3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 213.044ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 212.931ms    (CUDA Measured)
+    passed
+Press any key to continue . . .
+```
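+
+Note on methodology: the GPU timings bracket only the kernel work; device
+memory allocation and host-device copies are excluded (the timers start after
+`cudaMemcpy` in `naive.cu` and `efficient.cu`).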
+
+
-### (TODO: Your README)
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/img/block_size.png b/img/block_size.png
new file mode 100644
index 0000000..afdd5f5
Binary files /dev/null and b/img/block_size.png differ
diff --git a/img/compaction.png b/img/compaction.png
new file mode 100644
index 0000000..7d552d4
Binary files /dev/null and b/img/compaction.png differ
diff --git a/img/scan.png b/img/scan.png
new file mode 100644
index 0000000..f8ab885
Binary files /dev/null and b/img/scan.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 1850161..4ee1922 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -6,6 +6,9 @@
  * @copyright University of Pennsylvania
  */
 
+
+
+
 #include <cstdio>
 #include <stream_compaction/cpu.h>
 #include <stream_compaction/naive.h>
@@ -13,7 +16,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 28; // feel free to change the size of array
 const int NPOT = SIZE - 3;  // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -29,6 +32,7 @@ int main(int argc, char* argv[]) {
     genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
     a[SIZE - 1] = 0;
+    onesArray(SIZE, a);
     printArray(SIZE, a, true);
 
     // initialize b using StreamCompaction::CPU::scan you implement
@@ -51,7 +55,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -64,14 +68,14 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
index 46337ab..d6cc4e3 100644
--- a/src/testing_helpers.hpp
+++ b/src/testing_helpers.hpp
@@ -1,8 +1,8 @@
 #pragma once
 
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
 #include <ctime>
 #include <cmath>
@@ -69,8 +69,8 @@ void printArray(int n, int *a, bool abridged = false) {
     printf("]\n");
 }
 
-template<typename T>
-void printElapsedTime(T time, std::string note = "")
-{
-    std::cout << "   elapsed time: " << time << "ms    " << note << std::endl;
+template<typename T>
+void printElapsedTime(T time, std::string note = "")
+{
+    std::cout << "   elapsed time: " << time << "ms    " << note << std::endl;
 }
\ No newline at end of file
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..6ad13f6 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_52
     )
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 99a1b04..bcca058 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -13,6 +13,11 @@
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
 
+
+#define BLOCK_SIZE 1024
+
+#define PLUS_OP_IDENTITY 0
+
 /**
  * Check for CUDA errors; print and exit if there was a problem.
  */
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 05ce667..99acff7 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,15 +1,15 @@
 #include <cstdio>
 #include "cpu.h"
-#include "common.h"
+#include "common.h"
 
 namespace StreamCompaction {
     namespace CPU {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
 
@@ -19,7 +19,10 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            odata[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
             timer().endCpuTimer();
         }
 
@@ -30,9 +33,15 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int non_zero_idx = 0;
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] != 0) {
+                    odata[non_zero_idx] = idata[i];
+                    ++non_zero_idx;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return non_zero_idx;
         }
 
@@ -41,10 +50,31 @@ namespace StreamCompaction {
          * @returns the number of elements remaining after compaction.
          */
         int compactWithScan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
-            // TODO
+            // allocate a temporary array for the scanned 0/1 keep-flags
+            int* temp = new int[n];
+
+            temp[0] = 0;
+            timer().startCpuTimer();
+            // exclusive scan over the 0/1 keep-flags
+            for (int i = 1; i < n; ++i) {
+                temp[i] = temp[i - 1] + (idata[i - 1] != 0);
+            }
+
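+            // temp[i] now holds the number of nonzero elements strictly before i,
+            // i.e. exactly the output index for a kept element. For example,
+            // idata = [3 0 1] gives keep-flags [1 0 1] and temp = [0 1 1], so 3
+            // lands at odata[0] and 1 at odata[1].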
+            int count = 0;
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] != 0) {
+                    ++count;
+                    odata[temp[i]] = idata[i];
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            delete[] temp;
+            return count;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 36c5ef2..e217897 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,39 +2,201 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
+#include <memory>
 
 namespace StreamCompaction {
-    namespace Efficient {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
+    namespace Efficient {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
 
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-        }
-        /**
-         * Performs stream compaction on idata, storing the result into odata.
-         * All zeroes are discarded.
-         *
-         * @param n      The number of elements in idata.
-         * @param odata  The array into which to store elements.
-         * @param idata  The array of elements to compact.
-         * @returns      The number of elements remaining after compaction.
-         */
-        int compact(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-            return -1;
-        }
+        // up-sweep: each thread adds the left child's partial sum into the right child
+        __global__ void kernEfficientScanUp(int n, int bitShift, int* d_data_in) {
+            int idx = threadIdx.x + blockDim.x * blockIdx.x;
+            if (idx >= n) {
+                return;
+            }
+            int pow1 = 1 << bitShift;
+            int pow2 = 1 << (bitShift + 1);
+            d_data_in[idx * pow2 + pow2 - 1] += d_data_in[idx * pow2 + pow1 - 1];
+        }
+
+        // down-sweep: pass the node's value to the left child, and
+        // left child + old value to the right child
+        __global__ void kernEfficientScanDown(int n, int bitShift, int* d_data_in) {
+            int idx = threadIdx.x + blockDim.x * blockIdx.x;
+            if (idx >= n) {
+                return;
+            }
+            int pow1 = 1 << bitShift;
+            int pow2 = 1 << (bitShift + 1);
+            int pos1 = idx * pow2 + pow1 - 1;
+            int pos2 = idx * pow2 + pow2 - 1;
+            int temp = d_data_in[pos1];
+            d_data_in[pos1] = d_data_in[pos2];
+            d_data_in[pos2] += temp;
+        }
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int *odata, const int *idata) {
+            // round the length up to the next power of two
+            int level = ilog2ceil(n);
+            int trueN = 1 << level;
+
+            std::unique_ptr<int[]> trueIData{ new int[trueN] };
+
+            // copy the input and pad the tail with 0
+            for (int i = 0; i < n; ++i) {
+                trueIData[i] = idata[i];
+            }
+            for (int i = n; i < trueN; ++i) {
+                trueIData[i] = 0;
+            }
+
+            // allocate device memory
+            int* d_data_in;
+            cudaMalloc((void**)&d_data_in, trueN * sizeof(int));
+            cudaMemcpy(d_data_in, trueIData.get(), trueN * sizeof(int), cudaMemcpyHostToDevice);
+            timer().startGpuTimer();
+
+            dim3 blockSize(BLOCK_SIZE);
+            dim3 gridSize;
+            int pow2;
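+            // Blelloch scan: the up-sweep builds a binary tree of partial sums
+            // in place; the down-sweep then pushes prefix sums back down. On
+            // [1 1 1 1]: the up-sweep gives [1 2 1 4]; zeroing the root gives
+            // [1 2 1 0]; the down-sweep yields [0 1 2 3], the exclusive scan.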
+            // up-sweep
+            for (int i = 0; i < level; ++i)
+            {
+                pow2 = 1 << (i + 1);
+                gridSize = ((trueN / pow2 + BLOCK_SIZE - 1) / BLOCK_SIZE);
+                kernEfficientScanUp<<<gridSize, blockSize>>>(trueN / pow2, i, d_data_in);
+            }
+            cudaMemset(d_data_in + trueN - 1, 0, sizeof(int));
+
+            // down-sweep
+            for (int i = level - 1; i > -1; --i)
+            {
+                pow2 = 1 << (i + 1);
+                gridSize = ((trueN / pow2 + BLOCK_SIZE - 1) / BLOCK_SIZE);
+                kernEfficientScanDown<<<gridSize, blockSize>>>(trueN / pow2, i, d_data_in);
+            }
+
+            timer().endGpuTimer();
+            // only n results are needed on the host, no need to copy trueN
+            cudaMemcpy(odata, d_data_in, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(d_data_in);
+        }
+
+        __global__ void kernValueMapToOne(int n, int* d_ones_out, int* d_data_in) {
+            int idx = threadIdx.x + blockDim.x * blockIdx.x;
+            if (idx >= n) {
+                return;
+            }
+            if (d_data_in[idx] != 0) {
+                d_ones_out[idx] = 1;
+            }
+        }
+
+        __global__ void kernCompact(int n, int* d_indices, int* d_data_in, int* d_data_to_compact) {
+            int idx = threadIdx.x + blockDim.x * blockIdx.x;
+            if (idx >= n) {
+                return;
+            }
+            int data = d_data_in[idx];
+            if (data != 0) {
+                d_data_to_compact[d_indices[idx]] = data;
+            }
+        }
+
+        /**
+         * Performs stream compaction on idata, storing the result into odata.
+         * All zeroes are discarded.
+         *
+         * @param n      The number of elements in idata.
+         * @param odata  The array into which to store elements.
+         * @param idata  The array of elements to compact.
+         * @returns      The number of elements remaining after compaction.
+         */
+        int compact(int n, int *odata, const int *idata) {
+            // reuses the scan kernels inline; see the TODO below
+            int* d_data_in;
+            int* d_compacted_data;
+            cudaMalloc((void**)&d_data_in, n * sizeof(int));
+            cudaMalloc((void**)&d_compacted_data, n * sizeof(int));
+
+            cudaMemset(d_compacted_data, 0, n * sizeof(int));
+
+            cudaMemcpy(d_data_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            // holds the 0/1 keep-flags and then their scan result
+            int* d_ones_scan_result;
+
+            // round the length up to the next power of two
+            int level = ilog2ceil(n);
+            int trueN = 1 << level;
+
+            // zero the scan buffer
+            cudaMalloc((void**)&d_ones_scan_result, trueN * sizeof(int));
+            cudaMemset(d_ones_scan_result, 0, trueN * sizeof(int));
+
+            // launch configuration
+            dim3 blockSize(BLOCK_SIZE);
+            dim3 gridSize((n + BLOCK_SIZE - 1) / BLOCK_SIZE);
+            int pow2;
+
+            timer().startGpuTimer();
+            // map nonzero values to 1
+            kernValueMapToOne<<<gridSize, blockSize>>>(n, d_ones_scan_result, d_data_in);
+
+            // TODO(zichuanyu): factor this scan into a shared helper in the future
+            // up-sweep
+            for (int i = 0; i < level; ++i)
+            {
+                pow2 = 1 << (i + 1);
+                gridSize = ((trueN / pow2 + BLOCK_SIZE - 1) / BLOCK_SIZE);
+                kernEfficientScanUp<<<gridSize, blockSize>>>(trueN / pow2, i, d_ones_scan_result);
+            }
+            cudaMemset(d_ones_scan_result + trueN - 1, 0, sizeof(int));
+            // down-sweep
+            for (int i = level - 1; i > -1; --i)
+            {
+                pow2 = 1 << (i + 1);
+                gridSize = ((trueN / pow2 + BLOCK_SIZE - 1) / BLOCK_SIZE);
+                kernEfficientScanDown<<<gridSize, blockSize>>>(trueN / pow2, i, d_ones_scan_result);
+            }
+
+            // scatter, using only the useful part of the array
+            gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+            kernCompact<<<gridSize, blockSize>>>(n, d_ones_scan_result, d_data_in, d_compacted_data);
+            timer().endGpuTimer();
+            cudaMemcpy(odata, d_compacted_data, n * sizeof(int), cudaMemcpyDeviceToHost);
+            // count the surviving elements on the CPU (could also read the last
+            // entry of the scan result on the GPU)
+            int num = 0;
+            for (int i = 0; i < n; ++i) {
+                if (odata[i] == 0) {
+                    break;
+                }
+                ++num;
+            }
+            cudaFree(d_compacted_data);
+            cudaFree(d_ones_scan_result);
+            cudaFree(d_data_in);
+            return num;
+        }
+    }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 9218f8e..153e5bc 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -1,25 +1,63 @@
 #include <cuda.h>
+#include <utility>
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
 
 namespace StreamCompaction {
-    namespace Naive {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
-        // TODO: __global__
+    namespace Naive {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
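+        // One pass of the naive scan: each element adds in the value `offset`
+        // positions to its left. With offset = 1, 2, 4, ... and ping-ponged
+        // buffers, ilog2ceil(n) passes produce an inclusive scan, e.g.
+        // [1 1 1 1] -> (offset 1) [1 2 2 2] -> (offset 2) [1 2 3 4].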
+        __global__ void kernNaiveGPUScan(const int n, const int offset, const int* d_data_in, int* d_data_out) {
+            int idx = threadIdx.x + blockDim.x * blockIdx.x;
+            if (idx >= n) return;
+            if (idx < offset) {
+                d_data_out[idx] = d_data_in[idx];
+            }
+            else {
+                d_data_out[idx] = d_data_in[idx] + d_data_in[idx - offset];
+            }
+        }
+
+        // kept for reference: shifts an inclusive scan into an exclusive one
+        // (the final cudaMemcpy below performs the same shift during the copy)
+        __global__ void kernIncToExc(const int n, const int* d_data_in, int* d_data_out) {
+            int idx = threadIdx.x + blockDim.x * blockIdx.x;
+            if (idx == 0) {
+                d_data_out[0] = PLUS_OP_IDENTITY;
+            }
+            else if (idx < n) {
+                d_data_out[idx] = d_data_in[idx - 1];
+            }
+        }
 
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-        }
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int* odata, const int* idata) {
+            dim3 blockSize(BLOCK_SIZE);
+            dim3 gridSize((n + BLOCK_SIZE - 1) / BLOCK_SIZE);
+
+            // allocate device memory
+            int* d_data_in;
+            int* d_data_out;
+            cudaMalloc((void**)&d_data_in, n * sizeof(int));
+            cudaMalloc((void**)&d_data_out, n * sizeof(int));
+            cudaMemcpy(d_data_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            timer().startGpuTimer();
+            for (int i = 0; i < ilog2ceil(n); ++i) {
+                kernNaiveGPUScan<<<gridSize, blockSize>>>(n, 1 << i, d_data_in, d_data_out);
+                std::swap(d_data_in, d_data_out);
+            }
+            timer().endGpuTimer();
+            // for readability: after the final swap the result lives in d_data_in,
+            // so swap once more and read from d_data_out
+            std::swap(d_data_in, d_data_out);
+            // shift by one during the copy to turn the inclusive scan into an exclusive one
+            cudaMemcpy(odata + 1, d_data_out, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+            odata[0] = 0;
+            cudaFree(d_data_out);
+            cudaFree(d_data_in);
+        }
+    }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 36b732d..9693dca 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -7,22 +7,26 @@
 #include "thrust.h"
 
 namespace StreamCompaction {
-    namespace Thrust {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
-            timer().endGpuTimer();
-        }
+    namespace Thrust {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int *odata, const int *idata) {
+            thrust::device_vector<int> d_data_in(idata, idata + n);
+            thrust::device_vector<int> d_data_out(odata, odata + n);
+            timer().startGpuTimer();
+            thrust::exclusive_scan(d_data_in.begin(), d_data_in.end(), d_data_out.begin());
+            timer().endGpuTimer();
+            thrust::copy(d_data_out.begin(), d_data_out.end(), odata);
+        }
+    }
 }