Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 80 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,87 @@ CUDA Stream Compaction
======================

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
* Haorong Yang
* [LinkedIn](https://www.linkedin.com/in/haorong-henry-yang/)
* Tested on: Windows 10 Home, i7-10750H @ 2.60GHz 16GB, GTX 2070 Super Max-Q (Personal)

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
The goal of this project was to implement a stream compaction algorithm on the GPU in CUDA from scratch.
The algorithm will remove `0`s from an array of `int`s utilizing a scan function, which performs parallel reduction on the array to obtain an exclusive prefix sum.

### (TODO: Your README)
Although the goal is to obtain an efficient parallel solution, for comparison, a few variations of the algorithm were also implemented.
A list of algorithms that will be compared to each other:
* CPU scan function
* CPU stream compaction without scan
* CPU stream compaction with scan
* GPU naive scan
* GPU work-efficient scan & compaction
* thrust library's implementation

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
The test results for an array of size 2^8 are:

```
****************
** SCAN TESTS **
****************
[ 17 20 19 34 19 3 6 3 27 2 14 5 21 ... 36 0 ]
==== cpu scan, power-of-two ====
elapsed time: 0.0005ms (std::chrono Measured)
[ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 6163 6199 ]
==== cpu scan, non-power-of-two ====
elapsed time: 0.0004ms (std::chrono Measured)
[ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 6092 6101 ]
passed
==== naive scan, power-of-two ====
elapsed time: 0.029056ms (CUDA Measured)
[ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 6163 6199 ]
passed
==== naive scan, non-power-of-two ====
elapsed time: 0.026752ms (CUDA Measured)
[ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 0 0 ]
passed
==== work-efficient scan, power-of-two ====
elapsed time: 0.012768ms (CUDA Measured)
[ 6199 6216 6236 6255 6289 6308 6311 6317 6320 6347 6349 6363 6368 ... 3050 3086 ]
a[0] = 0, b[0] = 6199
FAIL VALUE
==== work-efficient scan, non-power-of-two ====
elapsed time: 0.012512ms (CUDA Measured)
[ 6138 6155 6175 6194 6228 6247 6250 6256 6259 6286 6288 6302 6307 ... 2979 2988 ]
a[0] = 0, b[0] = 6138
FAIL VALUE
==== thrust scan, power-of-two ====
elapsed time: 0.055264ms (CUDA Measured)
[ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 6163 6199 ]
passed
==== thrust scan, non-power-of-two ====
elapsed time: 0.054368ms (CUDA Measured)
[ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 6092 6101 ]
passed

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 3 0 1 2 1 1 0 1 1 0 2 1 3 ... 2 0 ]
==== cpu compact without scan, power-of-two ====
elapsed time: 0.0008ms (std::chrono Measured)
[ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 3 2 ]
passed
==== cpu compact without scan, non-power-of-two ====
elapsed time: 0.0008ms (std::chrono Measured)
[ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 1 3 ]
passed
==== cpu compact with scan ====
elapsed time: 0.004ms (std::chrono Measured)
[ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 3 2 ]
passed
==== work-efficient compact, power-of-two ====
elapsed time: 0.020992ms (CUDA Measured)
[ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 0 0 ]
expected count is 185, count is 185
passed
==== work-efficient compact, non-power-of-two ====
elapsed time: 0.021888ms (CUDA Measured)
[ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 1 3 ]
expected count is 185, count is 183
passed
```
24 changes: 14 additions & 10 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
//const int SIZE = 8;
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
int *c = new int[SIZE];
Expand All @@ -27,7 +28,8 @@ int main(int argc, char* argv[]) {
printf("** SCAN TESTS **\n");
printf("****************\n");

genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case // 3rd argument is maxValue
//testArray(SIZE - 1, a, 50); // test
a[SIZE - 1] = 0;
printArray(SIZE, a, true);

Expand All @@ -51,7 +53,7 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
Expand All @@ -64,35 +66,35 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, non-power-of-two");
StreamCompaction::Thrust::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

printf("\n");
Expand Down Expand Up @@ -137,14 +139,16 @@ int main(int argc, char* argv[]) {
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printArray(SIZE, c, true);
std::cout << "expected count is " << expectedCount << ", count is " << count << std::endl;
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printArray(count, c, true);
std::cout << "expected count is " << expectedCount << ", count is " << count << std::endl;
printCmpLenResult(count, expectedNPOT, b, c);

system("pause"); // stop Win32 console from closing on exit
Expand Down
9 changes: 9 additions & 0 deletions src/testing_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,15 @@ void genArray(int n, int *a, int maxval) {
}
}

// Debug helper: fills a[0..n-1] with the ascending sequence 0, 1, ..., n-1 so
// that scan/compaction output is easy to check by eye.
//   n      - number of elements to write; nothing is written when n <= 0.
//   a      - destination buffer, must hold at least n ints.
//   maxval - accepted but not used by this helper.
// Side effect: re-seeds the C random generator from the current time, even
// though no random values are drawn here.
void testArray(int n, int* a, int maxval) {
    srand(time(nullptr));
    int next = 0;
    while (next < n) {
        a[next] = next;  // each slot stores its own index
        ++next;
    }
}

void printArray(int n, int *a, bool abridged = false) {
printf(" [ ");
for (int i = 0; i < n; i++) {
Expand Down
45 changes: 40 additions & 5 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@ namespace StreamCompaction {
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
void scan(int n, int *odata, const int *idata) {
    // Exclusive prefix sum: odata[0] = 0 and
    // odata[k] = idata[0] + ... + idata[k - 1] for 0 < k < n.
    // An empty input (n <= 0) writes nothing; only the timer calls run.
    // odata may be the same array as idata, because every input element is
    // read before its slot is overwritten.
    timer().startCpuTimer();
    if (n > 0) {
        int pending = idata[0];  // input value that belongs to the next output slot
        odata[0] = 0;
        for (int k = 1; k < n; k++) {
            const int current = idata[k];  // read first: odata[k] may be this very slot
            odata[k] = odata[k - 1] + pending;
            pending = current;
        }
    }
    timer().endCpuTimer();
}

Expand All @@ -30,9 +34,15 @@ namespace StreamCompaction {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
    // Copies every non-zero element of idata[0..n-1] to the front of odata,
    // preserving the original order, and returns how many elements were kept.
    // For n <= 0 nothing is written and 0 is returned.
    timer().startCpuTimer();
    int kept = 0;  // next free slot in odata, and the running count of survivors
    for (int i = 0; i < n; i++) {
        if (idata[i] != 0) {
            odata[kept] = idata[i];
            kept++;
        }
    }
    timer().endCpuTimer();
    // The old placeholder `return -1;` used to sit here and hid the real count.
    return kept;
}

/**
Expand All @@ -42,9 +52,34 @@ namespace StreamCompaction {
*/
int compactWithScan(int n, int *odata, const int *idata) {
    // Stream compaction in three explicit steps: map to 0/1 flags, exclusive
    // scan of the flags, then scatter. Non-zero elements of idata[0..n-1] are
    // written to the front of odata in their original order.
    // Returns the number of elements kept; for n <= 0 nothing is written and
    // 0 is returned.
    timer().startCpuTimer();
    int count = 0;
    if (n > 0) {
        // Step 1 (map): flag each element that survives compaction.
        int* zeroOnes = new int[n];
        for (int i = 0; i < n; i++) {
            zeroOnes[i] = (idata[i] != 0) ? 1 : 0;
        }

        // Step 2 (scan): the exclusive prefix sum of the flags gives each
        // surviving element its destination index in odata.
        int* scanResult = new int[n];
        scanResult[0] = 0;
        for (int k = 1; k < n; k++) {
            scanResult[k] = scanResult[k - 1] + zeroOnes[k - 1];
        }

        // Step 3 (scatter): move the flagged elements to their destinations.
        for (int i = 0; i < n; i++) {
            if (zeroOnes[i] == 1) {
                odata[scanResult[i]] = idata[i];
            }
        }

        // The scan is exclusive, so its last entry does not include the last
        // element's own flag; add it or a trailing non-zero value is not counted.
        count = scanResult[n - 1] + zeroOnes[n - 1];

        delete[] zeroOnes;
        delete[] scanResult;
    }
    timer().endCpuTimer();
    return count;
}
}
}
Loading