CIS565-Fall-2020 · YangH34 · Sep 24, 2020 · Sep 24, 2020 · Sep 24, 2020 · Sep 24, 2020
diff --git a/README.md b/README.md
@@ -2,13 +2,87 @@ CUDA Stream Compaction
 ======================
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
+* Haorong Yang
+* [LinkedIn](https://www.linkedin.com/in/haorong-henry-yang/)
+* Tested on: Windows 10 Home, i7-10750H @ 2.60GHz 16GB, RTX 2070 Super Max-Q (Personal)
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+The goal of this project was to implement a stream compaction algorithm on the GPU in CUDA from scratch. 
+The algorithm will remove `0`s from an array of `int`s utilizing a scan function, which performs parallel reduction on the array to obtain an exclusive prefix sum.
 
-### (TODO: Your README)
+Although the goal is to obtain an efficient parallel solution, for comparison, a few variations of the algorithm were also implemented.
+A list of algorithms that will be compared to each other:
+* CPU scan function
+* CPU stream compaction without scan
+* CPU stream compaction with scan
+* GPU naive scan
+* GPU work-efficient scan & compaction
+* thrust library's implementation
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+The test results for an array size of 2^8 are:
 
+```
+****************
+** SCAN TESTS **
+****************
+    [  17  20  19  34  19   3   6   3  27   2  14   5  21 ...  36   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.0005ms    (std::chrono Measured)
+    [   0  17  37  56  90 109 112 118 121 148 150 164 169 ... 6163 6199 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.0004ms    (std::chrono Measured)
+    [   0  17  37  56  90 109 112 118 121 148 150 164 169 ... 6092 6101 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.029056ms    (CUDA Measured)
+    [   0  17  37  56  90 109 112 118 121 148 150 164 169 ... 6163 6199 ]
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.026752ms    (CUDA Measured)
+    [   0  17  37  56  90 109 112 118 121 148 150 164 169 ...   0   0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.012768ms    (CUDA Measured)
+    [ 6199 6216 6236 6255 6289 6308 6311 6317 6320 6347 6349 6363 6368 ... 3050 3086 ]
+    a[0] = 0, b[0] = 6199
+    FAIL VALUE
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.012512ms    (CUDA Measured)
+    [ 6138 6155 6175 6194 6228 6247 6250 6256 6259 6286 6288 6302 6307 ... 2979 2988 ]
+    a[0] = 0, b[0] = 6138
+    FAIL VALUE
+==== thrust scan, power-of-two ====
+   elapsed time: 0.055264ms    (CUDA Measured)
+    [   0  17  37  56  90 109 112 118 121 148 150 164 169 ... 6163 6199 ]
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.054368ms    (CUDA Measured)
+    [   0  17  37  56  90 109 112 118 121 148 150 164 169 ... 6092 6101 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   3   0   1   2   1   1   0   1   1   0   2   1   3 ...   2   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.0008ms    (std::chrono Measured)
+    [   3   1   2   1   1   1   1   2   1   3   1   2   2 ...   3   2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.0008ms    (std::chrono Measured)
+    [   3   1   2   1   1   1   1   2   1   3   1   2   2 ...   1   3 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 0.004ms    (std::chrono Measured)
+    [   3   1   2   1   1   1   1   2   1   3   1   2   2 ...   3   2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.020992ms    (CUDA Measured)
+    [   3   1   2   1   1   1   1   2   1   3   1   2   2 ...   0   0 ]
+expected count is 185, count is 185
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.021888ms    (CUDA Measured)
+    [   3   1   2   1   1   1   1   2   1   3   1   2   2 ...   1   3 ]
+expected count is 185, count is 183
+    passed
+```
diff --git a/src/main.cpp b/src/main.cpp
@@ -14,7 +14,8 @@
 #include "testing_helpers.hpp"
 
 const int SIZE = 1 << 8; // feel free to change the size of array
-const int NPOT = SIZE - 3; // Non-Power-Of-Two
+//const int SIZE = 8;
+const int NPOT = SIZE - 3; // Non-Power-Of-Two length (3 fewer than SIZE)
 int *a = new int[SIZE];
 int *b = new int[SIZE];
 int *c = new int[SIZE];
@@ -27,7 +28,8 @@ int main(int argc, char* argv[]) {
     printf("** SCAN TESTS **\n");
     printf("****************\n");
 
-    genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
+    genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case // 3rd argument is maxValue
+    //testArray(SIZE - 1, a, 50); // test
     a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
 
@@ -51,7 +53,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -64,35 +66,35 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     printf("\n");
@@ -137,14 +139,16 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(SIZE, c, true);
+    std::cout << "expected count is " << expectedCount << ", count is " << count << std::endl;
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
+    std::cout << "expected count is " << expectedNPOT << ", count is " << count << std::endl; // report the same expectation the check below uses
     printCmpLenResult(count, expectedNPOT, b, c);
 
     system("pause"); // stop Win32 console from closing on exit

diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
@@ -57,6 +57,15 @@ void genArray(int n, int *a, int maxval) {
     }
 }
 
+// Test-input helper: fills a[0..n) with the ascending sequence 0, 1, ..., n - 1.
+// maxval is ignored; the parameter only mirrors genArray()'s signature.
+void testArray(int n, int* a, int maxval) {
+    (void)maxval;  // intentionally unused
+    for (int i = 0; i < n; i++) {
+        a[i] = i;
+    }
+}
+
 void printArray(int n, int *a, bool abridged = false) {
     printf("    [ ");
     for (int i = 0; i < n; i++) {

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
@@ -18,8 +18,12 @@ namespace StreamCompaction {
          * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
          */
         void scan(int n, int *odata, const int *idata) {
+            // Exclusive prefix sum: odata[0] = 0, odata[k] = idata[0] + ... + idata[k - 1].
             timer().startCpuTimer();
-            // TODO
+            // k == 0 is handled inside the loop so that n <= 0 never writes odata[0].
+            for (int k = 0; k < n; k++) {
+                odata[k] = (k == 0) ? 0 : odata[k - 1] + idata[k - 1];
+            }
             timer().endCpuTimer();
         }
 
@@ -30,9 +34,15 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int optr = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[optr] = idata[i];
+                    optr++;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return optr;
         }
 
         /**
@@ -42,9 +52,34 @@ namespace StreamCompaction {
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int* zeroOnes = new int[n];
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    zeroOnes[i] = 1;
+                }
+                else {
+                    zeroOnes[i] = 0;
+                }
+            }
+            int* scanResult = new int[n];
+
+            // scan
+            scanResult[0] = 0;
+            for (int k = 1; k < n; k++) {
+                scanResult[k] = scanResult[k - 1] + zeroOnes[k - 1];
+            }
+            // end of scan
+
+            for (int i = 0; i < n; i++) {
+                if (zeroOnes[i] == 1) {
+                    odata[scanResult[i]] = idata[i];
+                }
+            }
+            int count = scanResult[n - 1];
+            delete[] zeroOnes;
+            delete[] scanResult;
             timer().endCpuTimer();
-            return -1;
+            return count;
         }
     }
 }