diff --git a/README.md b/README.md
index 0e38ddb..4ad3002 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,80 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Jacky Lu
+  * [LinkedIn](https://www.linkedin.com/in/jacky-lu-506968129/)
 
-### (TODO: Your README)
-
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+# README
+
+## Results
+### A block size of 128 threads per block appears optimal for all of the GPU implementations.
+
+### Performance Comparison: GPU Scan Implementations (Naive, Work-Efficient, Thrust) vs. CPU Scan
+#### (Tested With 128 Threads Per Block)
+![](img/plot.png)
+
+### Output Of The Test Program With 33,554,432 Array Elements (Tested With 128 Threads Per Block)
+```
+****************
+** SCAN TESTS **
+****************
+    [  12   3  19  17  37  34   6  17  18  22  23   8  49 ...  17   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 102.834ms    (std::chrono Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821676053 821676070 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 38.8949ms    (std::chrono Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821675987 821676034 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 19.5621ms    (CUDA Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821676053 821676070 ]
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 19.9352ms    (CUDA Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821675987 821676034 ]
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 14.7942ms    (CUDA Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821676053 821676070 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 14.8075ms    (CUDA Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821675987 821676034 ]
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.8088ms    (CUDA Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821676053 821676070 ]
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.806624ms    (CUDA Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821675987 821676034 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   0   2   1   2   0   0   3   3   1   2   2   2   2 ...   2   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 60.1642ms    (std::chrono Measured)
+    [   2   1   2   3   3   1   2   2   2   2   3   1   3 ...   3   2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 60.4809ms    (std::chrono Measured)
+    [   2   1   2   3   3   1   2   2   2   2   3   1   3 ...   1   3 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 205.018ms    (std::chrono Measured)
+    [   2   1   2   3   3   1   2   2   2   2   3   1   3 ...   3   2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 17.7603ms    (CUDA Measured)
+    [   2   1   2   3   3   1   2   2   2   2   3   1   3 ...   3   2 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 17.4516ms    (CUDA Measured)
+    [   2   1   2   3   3   1   2   2   2   2   3   1   3 ...   1   3 ]
+    passed
+```
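+
+### Quick Reference: What Scan And Compact Compute
+For context when reading the output above, here is a small worked example (illustrative only, not taken from the test program): an exclusive scan writes, at each position, the sum of everything before it (seeded with the identity 0), and stream compaction keeps only the non-zero elements, in order.
+```
+input            : [ 3 1 7 0 4 1 6 3 ]
+exclusive scan   : [ 0 3 4 11 11 15 16 22 ]
+compact (!= 0)   : [ 3 1 7 4 1 6 3 ]
+```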
diff --git a/img/plot.png b/img/plot.png
new file mode 100644
index 0000000..d25345c
Binary files /dev/null and b/img/plot.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..986a2bf 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = (1 << 25); // feel free to change the size of array (8 originally)
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -51,7 +51,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -64,35 +64,35 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     printf("\n");
@@ -137,14 +137,14 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
     system("pause"); // stop Win32 console from closing on exit
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..672da8c 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,5 +1,7 @@
 #include "common.h"
 
+#include
+
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
     cudaError_t err = cudaGetLastError();
     if (cudaSuccess == err) {
@@ -24,6 +26,15 @@ namespace StreamCompaction {
          */
        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
            // TODO
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < n) {
+               if (idata[index] != 0) {
+                   bools[index] = 1;
+               } else {
+                   bools[index] = 0;
+               }
+           }
        }
 
        /**
@@ -33,6 +44,10 @@ namespace StreamCompaction {
        __global__ void kernScatter(int n, int *odata,
                const int *idata, const int *bools, const int *indices) {
            // TODO
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < n && bools[index] == 1) {
+               odata[indices[index]] = idata[index];
+           }
        }
 
    }
 }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..69e33a5 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -20,6 +20,10 @@ namespace StreamCompaction {
        void scan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+           odata[0] = 0;
+           for (int i = 1; i < n; i++) {
+               odata[i] = odata[i - 1] + idata[i - 1];
+           }
            timer().endCpuTimer();
        }
 
@@ -31,8 +35,15 @@ namespace StreamCompaction {
        int compactWithoutScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+           int counter = 0;
+           for (int i = 0; i < n; i++) {
+               if (idata[i] != 0) {
+                   odata[counter] = idata[i];
+                   counter++;
+               }
+           }
            timer().endCpuTimer();
-           return -1;
+           return counter;
        }
 
        /**
@@ -43,8 +54,32 @@ namespace StreamCompaction {
        int compactWithScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+           // Construct criteria vector
+           int* criteriaVec = new int[n];
+           for (int i = 0; i < n; i++) {
+               criteriaVec[i] = idata[i] != 0 ? 1 : 0;
+           }
+
+           // Construct scan vector from criteria vector
+           int* scanVec = new int[n];
+           scanVec[0] = 0;
+           for (int i = 1; i < n; i++) {
+               scanVec[i] = scanVec[i - 1] + criteriaVec[i - 1];
+           }
+
+           // Scatter
+           int counter = 0;
+           for (int i = 0; i < n; i++) {
+               if (criteriaVec[i] == 1) {
+                   odata[scanVec[i]] = idata[i];
+                   counter++;
+               }
+           }
+
+           delete[] criteriaVec;
+           delete[] scanVec;
            timer().endCpuTimer();
-           return -1;
+           return counter;
        }
    }
 }
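+// Illustrative trace of the scan-then-scatter compaction used above (example
+// values only, not part of the measured code). For a hypothetical input
+// {3, 0, 5, 0, 2}:
+//   criteriaVec = {1, 0, 1, 0, 1}   // 1 wherever idata[i] != 0
+//   scanVec     = {0, 1, 1, 2, 2}   // exclusive scan of criteriaVec
+//   scatter     : odata[0] = 3, odata[1] = 5, odata[2] = 2, count = 3
+// Each kept element lands at odata[scanVec[i]]; the GPU path reproduces the same
+// mapping with kernMapToBoolean, the up-sweep/down-sweep scan, and kernScatter.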
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..a934ecd 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,6 +3,9 @@
 #include "common.h"
 #include "efficient.h"
 
+#include
+#include
+
 namespace StreamCompaction {
    namespace Efficient {
        using StreamCompaction::Common::PerformanceTimer;
@@ -15,12 +18,67 @@ namespace StreamCompaction {
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
+
+
+       __global__ void kernReduce(int nPadded, int d, int* dev_vec_padded) {
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < nPadded) {
+               if (index % (1 << (d + 1)) == 0) { // (int)fmodf(index, 1 << (d + 1))
+                   dev_vec_padded[index + (1 << (d + 1)) - 1] += dev_vec_padded[index + (1 << d) - 1];
+               }
+           }
+       }
+
+       __global__ void downSweep(int nPadded, int d, int* dev_vec_padded) {
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < nPadded) {
+               if (index % (1 << (d + 1)) == 0) {
+                   int t = dev_vec_padded[index + (1 << d) - 1];
+                   dev_vec_padded[index + (1 << d) - 1] = dev_vec_padded[index + (1 << (d + 1)) - 1];
+                   dev_vec_padded[index + (1 << (d + 1)) - 1] += t;
+               }
+           }
+       }
+
        void scan(int n, int *odata, const int *idata) {
+           int paddedSize = 1 << ilog2ceil(n);
+           int nPadded = n;
+           if (paddedSize > n) {
+               nPadded = paddedSize;
+           }
+
+           int* dev_vec_padded;
+           cudaMalloc((void**)&dev_vec_padded, nPadded * sizeof(int));
+           cudaMemset(dev_vec_padded, 0, nPadded * sizeof(int));
+           cudaMemcpy(dev_vec_padded, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
            timer().startGpuTimer();
            // TODO
+           int blockSize = 128;
+           dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+           dim3 fullBlocksPerGridPadded((nPadded + blockSize - 1) / blockSize);
+
+           // Reduce/Up-Sweep
+           for (int d = 0; d <= ilog2ceil(nPadded) - 1; d++) {
+               kernReduce<<<fullBlocksPerGridPadded, blockSize>>>(nPadded, d, dev_vec_padded);
+           }
+           // Set Root To Zero
+           cudaMemset(dev_vec_padded + nPadded - 1, 0, sizeof(int));
+
+           // Down-Sweep
+           for (int d = ilog2ceil(nPadded) - 1; d >= 0; d--) {
+               downSweep<<<fullBlocksPerGridPadded, blockSize>>>(nPadded, d, dev_vec_padded);
+           }
+
            timer().endGpuTimer();
+
+           cudaMemcpy(odata, dev_vec_padded, n * sizeof(int), cudaMemcpyDeviceToHost);
+           cudaFree(dev_vec_padded);
        }
+
+
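+       // Worked example of the up-sweep/down-sweep above (illustration only,
+       // assuming a padded 8-element buffer):
+       //   start              : [3 1 7 0 4 1 6 3]
+       //   up-sweep   (d = 0) : [3 4 7 7 4 5 6 9]
+       //   up-sweep   (d = 1) : [3 4 7 11 4 5 6 14]
+       //   up-sweep   (d = 2) : [3 4 7 11 4 5 6 25]
+       //   set root to zero   : [3 4 7 11 4 5 6 0]
+       //   down-sweep (d = 2) : [3 4 7 0 4 5 6 11]
+       //   down-sweep (d = 1) : [3 0 7 4 4 11 6 16]
+       //   down-sweep (d = 0) : [0 3 4 11 11 15 16 22]
+       // which is the exclusive scan of the input.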
        /**
         * Performs stream compaction on idata, storing the result into odata.
         * All zeroes are discarded.
@@ -30,11 +88,87 @@ namespace StreamCompaction {
         * @param odata  The array into which to store elements.
         * @param idata  The array of elements to compact.
         * @returns      The number of elements remaining after compaction.
         */
+
+       __global__ void kernMakeBool(int num, int* dev_vec) {
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < num && dev_vec[index] != 0) {
+               dev_vec[index] = 1;
+           }
+       }
+
+       __global__ void kernScatter(int n, int* dev_idata, int* dev_indices, int* dev_result) {
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < n && dev_idata[index] != 0) {
+               dev_result[dev_indices[index]] = dev_idata[index];
+           }
+       }
+       __global__ void kernCheckNonZeroNum(int n, int* dev_result, int* num) {
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < n) {
+               if (dev_result[index] == 0 && dev_result[index - 1] != 0) {
+                   *num = index;
+               }
+           }
+       }
+
        int compact(int n, int *odata, const int *idata) {
+           int paddedSize = 1 << ilog2ceil(n);
+           int nPadded = n;
+           if (paddedSize > n) {
+               nPadded = paddedSize;
+           }
+
+           int* dev_indices;
+           cudaMalloc((void**)&dev_indices, nPadded * sizeof(int));
+
+           int* dev_bool;
+           cudaMalloc((void**)&dev_bool, nPadded * sizeof(int));
+
+           int* dev_idata;
+           cudaMalloc((void**)&dev_idata, nPadded * sizeof(int));
+           cudaMemset(dev_idata, 0, nPadded * sizeof(int));
+           cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+           int* dev_result;
+           cudaMalloc((void**)&dev_result, n * sizeof(int));
+
            timer().startGpuTimer();
            // TODO
+           int blockSize = 128;
+           dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+           dim3 fullBlocksPerGridPadded((nPadded + blockSize - 1) / blockSize);
+
+           // Convert to 0s and 1s
+           StreamCompaction::Common::kernMapToBoolean<<<fullBlocksPerGridPadded, blockSize>>>(nPadded, dev_bool, dev_idata);
+           cudaMemcpy(dev_indices, dev_bool, nPadded * sizeof(int), cudaMemcpyDeviceToDevice);
+
+           // Reduce/Up-Sweep
+           for (int d = 0; d <= ilog2ceil(nPadded) - 1; d++) {
+               kernReduce<<<fullBlocksPerGridPadded, blockSize>>>(nPadded, d, dev_indices);
+           }
+           // Set Root To Zero
+           cudaMemset(dev_indices + nPadded - 1, 0, sizeof(int));
+
+           // Down-Sweep
+           for (int d = ilog2ceil(nPadded) - 1; d >= 0; d--) {
+               downSweep<<<fullBlocksPerGridPadded, blockSize>>>(nPadded, d, dev_indices);
+           }
+
+           // Scatter
+           StreamCompaction::Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_result, dev_idata, dev_bool, dev_indices);
+
            timer().endGpuTimer();
-           return -1;
+
+           cudaMemcpy(odata, dev_result, n * sizeof(int), cudaMemcpyDeviceToHost);
+           int nonZeroNum;
+           cudaMemcpy(&nonZeroNum, dev_indices + nPadded - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+           cudaFree(dev_indices);
+           cudaFree(dev_bool);
+           cudaFree(dev_idata);
+           cudaFree(dev_result);
+
+           return nonZeroNum;
        }
    }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..5ee230b 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "naive.h"
 
+#include
+
 namespace StreamCompaction {
    namespace Naive {
        using StreamCompaction::Common::PerformanceTimer;
@@ -12,14 +14,62 @@ namespace StreamCompaction {
            return timer;
        }
        // TODO: __global__
+       __global__ void kernNaiveScan(int n, int d, int* dev_vec1, int* dev_vec2) {
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < n) {
+               if (index >= (1 << (d - 1))) {
+                   dev_vec2[index] = dev_vec1[index - (1 << (d - 1))] + dev_vec1[index];
+               } else {
+                   dev_vec2[index] = dev_vec1[index];
+               }
+           }
+       }
+
+       __global__ void kernInsertIdentity(int n, int* dev_vec1, int* dev_vec2) {
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < n) {
+               if (index == 0) {
+                   dev_vec2[index] = 0; // Insert identity element
+               } else {
+                   dev_vec2[index] = dev_vec1[index - 1];
+               }
+           }
+       }
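+       // Worked example of the naive (Hillis-Steele) scan above (illustration
+       // only): pass d adds in the element 2^(d-1) positions to the left.
+       //   input               : [3 1 7 0 4 1 6 3]
+       //   d = 1 (offset 1)    : [3 4 8 7 4 5 7 9]
+       //   d = 2 (offset 2)    : [3 4 11 11 12 12 11 14]
+       //   d = 3 (offset 4)    : [3 4 11 11 15 16 22 25]   (inclusive scan)
+       //   shift right, add 0  : [0 3 4 11 11 15 16 22]    (exclusive scan)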
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+           int* dev_vec1;
+           int* dev_vec2;
+           int size = n * sizeof(int);
+           cudaMalloc((void**)&dev_vec1, size);
+           cudaMalloc((void**)&dev_vec2, size);
+           cudaMemcpy(dev_vec1, idata, size, cudaMemcpyHostToDevice);
+
            timer().startGpuTimer();
            // TODO
+           int blockSize = 128;
+           dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+           for (int d = 1; d <= ilog2ceil(n); d++) {
+               // Launch kernel for inclusive scan
+               kernNaiveScan<<<fullBlocksPerGrid, blockSize>>>(n, d, dev_vec1, dev_vec2);
+               // Ping-pong buffers
+               int* temp = dev_vec1;
+               dev_vec1 = dev_vec2;
+               dev_vec2 = temp;
+           }
+           // Launch kernel for shifting elements right and inserting identity element
+           kernInsertIdentity<<<fullBlocksPerGrid, blockSize>>>(n, dev_vec1, dev_vec2);
+           // Ping-pong buffers again
+           int* temp = dev_vec1;
+           dev_vec1 = dev_vec2;
+           dev_vec2 = temp;
            timer().endGpuTimer();
+
+           cudaMemcpy(odata, dev_vec1, size, cudaMemcpyDeviceToHost);
+           cudaFree(dev_vec1);
+           cudaFree(dev_vec2);
        }
    }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..7dcb6e7 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,18 @@ namespace StreamCompaction {
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+           thrust::host_vector<int> h_in(idata, idata + n);
+           thrust::device_vector<int> dv_in = h_in;
+           thrust::device_vector<int> dv_out(n);
+
            timer().startGpuTimer();
            // TODO use `thrust::exclusive_scan`
            // example: for device_vectors dv_in and dv_out:
            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+           thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
            timer().endGpuTimer();
+
+           thrust::copy(dv_out.begin(), dv_out.end(), odata);
        }
    }
 }
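+// Possible variant (sketch only, not what the code above does): thrust can also
+// scan raw device pointers directly through an execution policy, which would skip
+// the host_vector/device_vector staging copies. The buffers dev_in/dev_out below
+// are hypothetical device allocations made with cudaMalloc, and the call needs
+// #include <thrust/execution_policy.h>:
+//   thrust::exclusive_scan(thrust::device, dev_in, dev_in + n, dev_out);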