diff --git a/README.md b/README.md index 0e38ddb..de503ff 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,133 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Jilin Liu + * [LinkedIn](https://www.linkedin.com/in/jilin-liu97/), [twitter](https://twitter.com/Jilin18043110). +* Tested on: Windows 10, i7-8750H @ 2.20GHz, 16GB, GTX 1050Ti 4096MB (personal) -### (TODO: Your README) +## Features and Results + +This project includes several parallel algorithms such as exclusive scan and stream compaction. These algorithms can be very useful as independent components of later projects like GPU based path tracer. A serial CPU version of these algorithms is also implemented and used as a performance comparison baseline. + +Features: +1. CPU Scan +2. CPU Stream Compact +3. Naive GPU Scan +4. Work-Efficient Scan +5. Work-Efficient Stream Compact + +Extra Credit: +1. Faster GPU Implementation + +``` +**************** +** SCAN TESTS ** +**************** + [ 30 44 27 31 9 34 15 27 24 39 44 33 38 ... 18 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 17.3105ms (std::chrono Measured) + [ 0 30 74 101 132 141 175 190 217 241 280 324 357 ... 102764441 102764459 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 2.102ms (std::chrono Measured) + [ 0 30 74 101 132 141 175 190 217 241 280 324 357 ... 
102764362 102764372 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 8.62189ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 8.62202ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 3.79942ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 3.77037ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.560736ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.639808ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 0 1 3 3 0 1 1 2 3 0 3 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 9.2661ms (std::chrono Measured) + [ 1 3 3 1 1 2 3 3 2 3 2 3 3 ... 2 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 9.253ms (std::chrono Measured) + [ 1 3 3 1 1 2 3 3 2 3 2 3 3 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 24.8881ms (std::chrono Measured) + [ 1 3 3 1 1 2 3 3 2 3 2 3 3 ... 2 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 5.13638ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 5.15882ms (CUDA Measured) + passed +``` + +## Performance Analysis + +A more detailed performance comparison of exclusive scan with respect to the array size is shown below. + +![](./images/comp1.JPG) + +![](./images/comp2.JPG) + +The thrust implementation has the best scalability and runs about 10 times faster than my work-efficient scan. But its execution time still follows a linear increase with respect to the array size. My hypothesis about its efficiency is that it's using shared memory. + +In the timeline below, we can easily spot the bottleneck of my implementation. 
As I excluded the time measurement of device-to-host and host-to-device memory copy, the majority of the execution time comes from the calling of the up-sweep kernel function and down-sweep kernel function. The up-sweep and down-sweep in total take 5 times longer than scatter and 10 times longer than map-to-boolean. + +![](./images/t.JPG) + +![](./images/t2.JPG) + +The thrust scan seems to have two stages as well, as you can see below. But there is a noticeable gap between the two kernel functions. My guess is that they are packing data into shared memory so the access can be much faster. + +![](./images/thrust.JPG) + +In each step of up-sweep and down-sweep, I only used those threads that are necessary to the algorithm. Since in each step we are only accessing and modifying a subset of the array elements, the number of threads we need is proportional to the size of this subset. This trick eliminates a large number of idle threads and thus makes the program run faster. + +When the array has a trivial size, i.e. 256 elements, the CPU version runs faster (even faster than the thrust implementation). The reason is that the GPU has an overhead of packing and scheduling threads, which outweighs the benefits from parallelism when the array size is small, as you can see below. + +``` +**************** +** SCAN TESTS ** +**************** + [ 38 13 31 10 45 3 41 44 26 12 44 28 12 ... 10 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0005ms (std::chrono Measured) + [ 0 38 51 82 92 137 140 181 225 251 263 307 335 ... 6247 6257 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0004ms (std::chrono Measured) + [ 0 38 51 82 92 137 140 181 225 251 263 307 335 ... 
6169 6198 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.017408ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.017408ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.050176ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.044032ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.04448ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.041984ms (CUDA Measured) + passed +``` -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/images/comp1.JPG b/images/comp1.JPG new file mode 100644 index 0000000..5e39029 Binary files /dev/null and b/images/comp1.JPG differ diff --git a/images/comp2.JPG b/images/comp2.JPG new file mode 100644 index 0000000..d4bea0a Binary files /dev/null and b/images/comp2.JPG differ diff --git a/images/t.JPG b/images/t.JPG new file mode 100644 index 0000000..61dc4fe Binary files /dev/null and b/images/t.JPG differ diff --git a/images/t2.JPG b/images/t2.JPG new file mode 100644 index 0000000..198e231 Binary files /dev/null and b/images/t2.JPG differ diff --git a/images/thrust.JPG b/images/thrust.JPG new file mode 100644 index 0000000..83e5f45 Binary files /dev/null and b/images/thrust.JPG differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..4687dd8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 22; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -34,6 +34,7 @@ int main(int argc, char* argv[]) { // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. 
Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. + zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..7bb73b8 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,11 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) { + return; + } + bools[index] = idata[index] == 0 ? 0 : 1; } /** @@ -33,6 +38,13 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) { + return; + } + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..c137c0d 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,11 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int sum = 0; // identity + for (int i = 0; i < n; i++) { + odata[i] = sum; + sum += idata[i]; + } timer().endCpuTimer(); } @@ -31,8 +36,14 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int p = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { // remove 0 + odata[p++] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return p; } /** @@ -43,8 +54,37 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + if (n < 1) { + return 0; + } + int* e = new int[n]; + for (int i = 0; i < n; i++) { + e[i] = idata[i] != 0; + } + + // scan + int* prefix = new int[n]; + int 
sum = 0; + for (int i = 0; i < n; i++) { + prefix[i] = sum; + sum += e[i]; + } + + // scatter + for (int i = 0; i < n - 1; i++) { + if (prefix[i] != prefix[i + 1]) { + odata[prefix[i]] = idata[i]; + } + } + int len = prefix[n - 1]; + if (e[n - 1] == 1) { + len++; + odata[prefix[n - 1]] = idata[n - 1]; + } + delete[] e; + delete[] prefix; timer().endCpuTimer(); - return -1; + return len; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..0d0ff21 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,13 +12,90 @@ namespace StreamCompaction { return timer; } + __global__ void kernEfficientUpSweep(int n, int offset, + int numNode, int* data) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= numNode) { + return; + } + index = (index + 1) * offset * 2 - 1; + + data[index] += data[index - offset]; + } + + __global__ void kernEfficientDownSweep(int n, int offset, + int numNode, int* data) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= numNode) { + return; + } + index = (index + 1) * 2 * offset - 1; + + int temp = data[index - offset]; + data[index - offset] = data[index]; + data[index] += temp; + } + + void scanHelper(int full, int d, int blockSize, int *dev_data) { + dim3 threadsPerBlock(blockSize); + + // Up-Sweep + int offset = 1; + for (int i = 0; i < d; i++) { + int numNode = 1 << (d - i - 1); + dim3 blocks(numNode / threadsPerBlock.x + 1); + kernEfficientUpSweep << > > + (full, offset, numNode, dev_data); + offset <<= 1; + } + + // Down-Sweep + cudaMemset(dev_data + full - 1, 0, sizeof(int)); + for (int i = 0; i < d; i++) { + offset >>= 1; + int numNode = 1 << i; + dim3 blocks(numNode / threadsPerBlock.x + 1); + kernEfficientDownSweep << > > + (full, offset, numNode, dev_data); + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) { // TODO + if (n < 1) { + return; + } + int d = ilog2ceil(n); + int full = 1 << d; + int* dev_data = nullptr; + + // Allocate memory on device + cudaMalloc((void**)&dev_data, full * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + + // Set additional memory values to zero + cudaMemset(dev_data + n, 0, (full - n) * sizeof(int)); + checkCUDAError("cudaMemset dev_data failed!"); + + // Copy data from host to device + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy to device failed!"); + timer().startGpuTimer(); - // TODO + + int blockSize = 128; + scanHelper(full, d, blockSize, dev_data); + timer().endGpuTimer(); + + // Copy data back to host + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("memcpy back failed!"); + + // Free device memory + cudaFree(dev_data); } /** @@ -30,11 +107,73 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. 
*/ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int *odata, const int *idata) { // TODO + if (n < 1) { + return 0; + } + int* dev_data = nullptr; + int* dev_bools = nullptr; + int* dev_indices = nullptr; + int* dev_final = nullptr; + + int d = ilog2ceil(n); + int full = 1 << d; + + // Allocate memory on device + cudaMalloc((void**)&dev_data, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_bools failed!"); + cudaMalloc((void**)&dev_indices, full * sizeof(int)); + checkCUDAError("cudaMalloc dev_indices failed!"); + cudaMalloc((void**)&dev_final, full * sizeof(int)); + checkCUDAError("cudaMalloc dev_final failed!"); + + // Copy data to device + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy to device failed!"); + + // Set additional elements zero + cudaMemset(dev_indices + n, 0, (full - n) * sizeof(int)); + checkCUDAError("cudaMemset dev_bools failed!"); + timer().startGpuTimer(); - // TODO + + int blockSize = 128; + + dim3 threadsPerBlock(blockSize); + dim3 blocks(n / threadsPerBlock.x + 1); + + StreamCompaction::Common::kernMapToBoolean + << > > (n, dev_bools, dev_data); + + cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), + cudaMemcpyDeviceToDevice); + + scanHelper(full, d, blockSize, dev_indices); + + StreamCompaction::Common::kernScatter<< > > + (n, dev_final, dev_data, dev_bools, dev_indices); + timer().endGpuTimer(); - return -1; + + // Get the number of remaining elements + int lastIndex; + int lastBool; + cudaMemcpy((void*)&lastIndex, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy((void*)&lastBool, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + int remain = lastIndex + lastBool; + + // Copy data back to host + cudaMemcpy(odata, dev_final, remain * sizeof(int), cudaMemcpyDeviceToHost); + + // Free device memory + cudaFree(dev_data); + 
cudaFree(dev_bools); + cudaFree(dev_indices); + cudaFree(dev_final); + + return remain; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..d2be253 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,6 +6,14 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); + __global__ void kernEfficientUpSweep(int n, int offset, + int numNode, int* data); + + __global__ void kernEfficientDownSweep(int n, int offset, + int numNode, int* data); + + void scanHelper(int full, int d, int blockSize, int* dev_data); + void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..57df169 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -13,13 +13,71 @@ namespace StreamCompaction { } // TODO: __global__ + __global__ void kernNaiveScan(int n, int offset, + int* odata, const int* idata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= offset && index < n) { + odata[index] = idata[index - offset] + idata[index]; + } + else if (index < offset) { + odata[index] = idata[index]; + } + } + + __global__ void kernRightShift(int n, int* odata, int* idata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index == 0) { + odata[index] = 0; + } + else if (index < n) { + odata[index] = idata[index - 1]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) { // TODO + // Two ping pong buffers + int* dev_data1 = nullptr; + int* dev_data2 = nullptr; + + // Allocate memory on device + cudaMalloc((void**)&dev_data1, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_data1 failed!"); + cudaMalloc((void**)&dev_data2, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_data2 failed!"); + + // Copy data from host to device + cudaMemcpy(dev_data1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + if (n > 0) { + dim3 threadsPerBlock(128); + int d = ilog2ceil(n); + int offset = 1; + dim3 blocks(n / threadsPerBlock.x + 1); + for (int i = 1; i <= d; i++) { + kernNaiveScan << > > + (n, offset, dev_data2, dev_data1); + std::swap(dev_data1, dev_data2); + offset <<= 1; + } + + // Right shift to get the exclusive prefix sum + kernRightShift << > > + (n, dev_data2, dev_data1); + } + timer().endGpuTimer(); + + // Copy data back from device to host + cudaMemcpy(odata, dev_data2, n * sizeof(int), cudaMemcpyDeviceToHost); + + // Free device memory + cudaFree(dev_data1); + cudaFree(dev_data2); } } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb06..6690a8b 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -6,6 +6,11 @@ namespace StreamCompaction { namespace Naive { StreamCompaction::Common::PerformanceTimer& timer(); + __global__ void kernNaiveScan(int n, int offset, + int* odata, const int* idata); + + __global__ void kernRightShift(int n, int* odata, int* idata); + void scan(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..efaf94d 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -14,15 +14,32 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + /** * Performs prefix-sum (aka scan) on idata, storing the result 
into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) { // TODO + thrust::host_vector host_idata(idata, idata + n); + + // You may encounter exception in debug build, press continue + + // Copy host_vector to device_vector + thrust::device_vector dev_idata(host_idata); + thrust::device_vector dev_odata(n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` + + // use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); + timer().endGpuTimer(); + + // Copy data back to host + thrust::copy(dev_odata.begin(), dev_odata.end(), odata); + + // device and host vector are automatically deleted when the function returns } } }