diff --git a/README.md b/README.md
index 0e38ddb..4ad3002 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,80 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Jacky Lu
+  * [LinkedIn](https://www.linkedin.com/in/jacky-lu-506968129/)
 
-### (TODO: Your README)
-
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+# README
+
+## Results
+### A block size of 128 threads per block appears optimal for all of the GPU implementations.
+
+### Performance Comparison: GPU Scan Implementations (Naive, Work-Efficient, Thrust) vs. CPU Scan
+#### (Tested With 128 Threads Per Block)
+![](img/plot.png)
+
+### Output Of The Test Program With 33,554,432 Array Elements (Tested With 128 Threads Per Block)
+```
+****************
+** SCAN TESTS **
+****************
+    [  12   3  19  17  37  34   6  17  18  22  23   8  49 ...  17   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 102.834ms    (std::chrono Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821676053 821676070 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 38.8949ms    (std::chrono Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821675987 821676034 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 19.5621ms    (CUDA Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821676053 821676070 ]
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 19.9352ms    (CUDA Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821675987 821676034 ]
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 14.7942ms    (CUDA Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821676053 821676070 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 14.8075ms    (CUDA Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821675987 821676034 ]
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.8088ms    (CUDA Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821676053 821676070 ]
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.806624ms    (CUDA Measured)
+    [   0  12  15  34  51  88 122 128 145 163 185 208 216 ... 821675987 821676034 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   0   2   1   2   0   0   3   3   1   2   2   2   2 ...   2   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 60.1642ms    (std::chrono Measured)
+    [   2   1   2   3   3   1   2   2   2   2   3   1   3 ...   3   2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 60.4809ms    (std::chrono Measured)
+    [   2   1   2   3   3   1   2   2   2   2   3   1   3 ...   1   3 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 205.018ms    (std::chrono Measured)
+    [   2   1   2   3   3   1   2   2   2   2   3   1   3 ...   3   2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 17.7603ms    (CUDA Measured)
+    [   2   1   2   3   3   1   2   2   2   2   3   1   3 ...   3   2 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 17.4516ms    (CUDA Measured)
+    [   2   1   2   3   3   1   2   2   2   2   3   1   3 ...   1   3 ]
+    passed
+```
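+
+### Quick Reference: What Scan And Compact Compute
+For context when reading the output above, here is a small worked example (illustrative only, not taken from the test program): an exclusive scan writes, at each position, the sum of everything before it (seeded with the identity 0), and stream compaction keeps only the non-zero elements, in order.
+```
+input            : [ 3 1 7 0 4 1 6 3 ]
+exclusive scan   : [ 0 3 4 11 11 15 16 22 ]
+compact (!= 0)   : [ 3 1 7 4 1 6 3 ]
+```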
diff --git a/img/plot.png b/img/plot.png
new file mode 100644
index 0000000..d25345c
Binary files /dev/null and b/img/plot.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..986a2bf 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = (1 << 25); // feel free to change the size of array (8 originally)
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -51,7 +51,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -64,35 +64,35 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     printf("\n");
@@ -137,14 +137,14 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
     system("pause"); // stop Win32 console from closing on exit
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..672da8c 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,5 +1,7 @@
 #include "common.h"
 
+#include
+
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
     cudaError_t err = cudaGetLastError();
     if (cudaSuccess == err) {
@@ -24,6 +26,15 @@ namespace StreamCompaction {
          */
        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
            // TODO
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < n) {
+               if (idata[index] != 0) {
+                   bools[index] = 1;
+               } else {
+                   bools[index] = 0;
+               }
+           }
        }
 
        /**
@@ -33,6 +44,10 @@ namespace StreamCompaction {
        __global__ void kernScatter(int n, int *odata,
                const int *idata, const int *bools, const int *indices) {
            // TODO
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < n && bools[index] == 1) {
+               odata[indices[index]] = idata[index];
+           }
        }
 
    }
 }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..69e33a5 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -20,6 +20,10 @@ namespace StreamCompaction {
        void scan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+           odata[0] = 0;
+           for (int i = 1; i < n; i++) {
+               odata[i] = odata[i - 1] + idata[i - 1];
+           }
            timer().endCpuTimer();
        }
 
@@ -31,8 +35,15 @@ namespace StreamCompaction {
        int compactWithoutScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+           int counter = 0;
+           for (int i = 0; i < n; i++) {
+               if (idata[i] != 0) {
+                   odata[counter] = idata[i];
+                   counter++;
+               }
+           }
            timer().endCpuTimer();
-           return -1;
+           return counter;
        }
 
        /**
@@ -43,8 +54,32 @@ namespace StreamCompaction {
        int compactWithScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+           // Construct criteria vector
+           int* criteriaVec = new int[n];
+           for (int i = 0; i < n; i++) {
+               criteriaVec[i] = idata[i] != 0 ? 1 : 0;
+           }
+
+           // Construct scan vector from criteria vector
+           int* scanVec = new int[n];
+           scanVec[0] = 0;
+           for (int i = 1; i < n; i++) {
+               scanVec[i] = scanVec[i - 1] + criteriaVec[i - 1];
+           }
+
+           // Scatter
+           int counter = 0;
+           for (int i = 0; i < n; i++) {
+               if (criteriaVec[i] == 1) {
+                   odata[scanVec[i]] = idata[i];
+                   counter++;
+               }
+           }
+
+           delete[] criteriaVec;
+           delete[] scanVec;
            timer().endCpuTimer();
-           return -1;
+           return counter;
        }
    }
 }
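+// Illustrative trace of the scan-then-scatter compaction used above (example
+// values only, not part of the measured code). For a hypothetical input
+// {3, 0, 5, 0, 2}:
+//   criteriaVec = {1, 0, 1, 0, 1}   // 1 wherever idata[i] != 0
+//   scanVec     = {0, 1, 1, 2, 2}   // exclusive scan of criteriaVec
+//   scatter     : odata[0] = 3, odata[1] = 5, odata[2] = 2, count = 3
+// Each kept element lands at odata[scanVec[i]]; the GPU path reproduces the same
+// mapping with kernMapToBoolean, the up-sweep/down-sweep scan, and kernScatter.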
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..a934ecd 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,6 +3,9 @@
 #include "common.h"
 #include "efficient.h"
 
+#include
+#include
+
 namespace StreamCompaction {
    namespace Efficient {
        using StreamCompaction::Common::PerformanceTimer;
@@ -15,12 +18,67 @@ namespace StreamCompaction {
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
+
+
+       __global__ void kernReduce(int nPadded, int d, int* dev_vec_padded) {
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < nPadded) {
+               if (index % (1 << (d + 1)) == 0) { // (int)fmodf(index, 1 << (d + 1))
+                   dev_vec_padded[index + (1 << (d + 1)) - 1] += dev_vec_padded[index + (1 << d) - 1];
+               }
+           }
+       }
+
+       __global__ void downSweep(int nPadded, int d, int* dev_vec_padded) {
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < nPadded) {
+               if (index % (1 << (d + 1)) == 0) {
+                   int t = dev_vec_padded[index + (1 << d) - 1];
+                   dev_vec_padded[index + (1 << d) - 1] = dev_vec_padded[index + (1 << (d + 1)) - 1];
+                   dev_vec_padded[index + (1 << (d + 1)) - 1] += t;
+               }
+           }
+       }
+
        void scan(int n, int *odata, const int *idata) {
+           int paddedSize = 1 << ilog2ceil(n);
+           int nPadded = n;
+           if (paddedSize > n) {
+               nPadded = paddedSize;
+           }
+
+           int* dev_vec_padded;
+           cudaMalloc((void**)&dev_vec_padded, nPadded * sizeof(int));
+           cudaMemset(dev_vec_padded, 0, nPadded * sizeof(int));
+           cudaMemcpy(dev_vec_padded, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
            timer().startGpuTimer();
            // TODO
+           int blockSize = 128;
+           dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+           dim3 fullBlocksPerGridPadded((nPadded + blockSize - 1) / blockSize);
+
+           // Reduce/Up-Sweep
+           for (int d = 0; d <= ilog2ceil(nPadded) - 1; d++) {
+               kernReduce<<<fullBlocksPerGridPadded, blockSize>>>(nPadded, d, dev_vec_padded);
+           }
+           // Set Root To Zero
+           cudaMemset(dev_vec_padded + nPadded - 1, 0, sizeof(int));
+
+           // Down-Sweep
+           for (int d = ilog2ceil(nPadded) - 1; d >= 0; d--) {
+               downSweep<<<fullBlocksPerGridPadded, blockSize>>>(nPadded, d, dev_vec_padded);
+           }
+
            timer().endGpuTimer();
+
+           cudaMemcpy(odata, dev_vec_padded, n * sizeof(int), cudaMemcpyDeviceToHost);
+           cudaFree(dev_vec_padded);
        }
+
+
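+       // Worked example of the up-sweep/down-sweep above (illustration only,
+       // assuming a padded 8-element buffer):
+       //   start              : [3 1 7 0 4 1 6 3]
+       //   up-sweep   (d = 0) : [3 4 7 7 4 5 6 9]
+       //   up-sweep   (d = 1) : [3 4 7 11 4 5 6 14]
+       //   up-sweep   (d = 2) : [3 4 7 11 4 5 6 25]
+       //   set root to zero   : [3 4 7 11 4 5 6 0]
+       //   down-sweep (d = 2) : [3 4 7 0 4 5 6 11]
+       //   down-sweep (d = 1) : [3 0 7 4 4 11 6 16]
+       //   down-sweep (d = 0) : [0 3 4 11 11 15 16 22]
+       // which is the exclusive scan of the input.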
        /**
         * Performs stream compaction on idata, storing the result into odata.
         * All zeroes are discarded.
@@ -30,11 +88,87 @@ namespace StreamCompaction {
         * @param odata  The array into which to store elements.
         * @param idata  The array of elements to compact.
         * @returns      The number of elements remaining after compaction.
         */
+
+       __global__ void kernMakeBool(int num, int* dev_vec) {
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < num && dev_vec[index] != 0) {
+               dev_vec[index] = 1;
+           }
+       }
+
+       __global__ void kernScatter(int n, int* dev_idata, int* dev_indices, int* dev_result) {
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < n && dev_idata[index] != 0) {
+               dev_result[dev_indices[index]] = dev_idata[index];
+           }
+       }
+       __global__ void kernCheckNonZeroNum(int n, int* dev_result, int* num) {
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < n) {
+               if (dev_result[index] == 0 && dev_result[index - 1] != 0) {
+                   *num = index;
+               }
+           }
+       }
+
        int compact(int n, int *odata, const int *idata) {
+           int paddedSize = 1 << ilog2ceil(n);
+           int nPadded = n;
+           if (paddedSize > n) {
+               nPadded = paddedSize;
+           }
+
+           int* dev_indices;
+           cudaMalloc((void**)&dev_indices, nPadded * sizeof(int));
+
+           int* dev_bool;
+           cudaMalloc((void**)&dev_bool, nPadded * sizeof(int));
+
+           int* dev_idata;
+           cudaMalloc((void**)&dev_idata, nPadded * sizeof(int));
+           cudaMemset(dev_idata, 0, nPadded * sizeof(int));
+           cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+           int* dev_result;
+           cudaMalloc((void**)&dev_result, n * sizeof(int));
+
            timer().startGpuTimer();
            // TODO
+           int blockSize = 128;
+           dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+           dim3 fullBlocksPerGridPadded((nPadded + blockSize - 1) / blockSize);
+
+           // Convert to 0s and 1s
+           StreamCompaction::Common::kernMapToBoolean<<<fullBlocksPerGridPadded, blockSize>>>(nPadded, dev_bool, dev_idata);
+           cudaMemcpy(dev_indices, dev_bool, nPadded * sizeof(int), cudaMemcpyDeviceToDevice);
+
+           // Reduce/Up-Sweep
+           for (int d = 0; d <= ilog2ceil(nPadded) - 1; d++) {
+               kernReduce<<<fullBlocksPerGridPadded, blockSize>>>(nPadded, d, dev_indices);
+           }
+           // Set Root To Zero
+           cudaMemset(dev_indices + nPadded - 1, 0, sizeof(int));
+
+           // Down-Sweep
+           for (int d = ilog2ceil(nPadded) - 1; d >= 0; d--) {
+               downSweep<<<fullBlocksPerGridPadded, blockSize>>>(nPadded, d, dev_indices);
+           }
+
+           // Scatter
+           StreamCompaction::Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_result, dev_idata, dev_bool, dev_indices);
+
            timer().endGpuTimer();
-           return -1;
+
+           cudaMemcpy(odata, dev_result, n * sizeof(int), cudaMemcpyDeviceToHost);
+           int nonZeroNum;
+           cudaMemcpy(&nonZeroNum, dev_indices + nPadded - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+           cudaFree(dev_indices);
+           cudaFree(dev_bool);
+           cudaFree(dev_idata);
+           cudaFree(dev_result);
+
+           return nonZeroNum;
        }
    }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..5ee230b 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "naive.h"
 
+#include
+
 namespace StreamCompaction {
    namespace Naive {
        using StreamCompaction::Common::PerformanceTimer;
@@ -12,14 +14,62 @@ namespace StreamCompaction {
            return timer;
        }
        // TODO: __global__
+       __global__ void kernNaiveScan(int n, int d, int* dev_vec1, int* dev_vec2) {
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < n) {
+               if (index >= (1 << (d - 1))) {
+                   dev_vec2[index] = dev_vec1[index - (1 << (d - 1))] + dev_vec1[index];
+               } else {
+                   dev_vec2[index] = dev_vec1[index];
+               }
+           }
+       }
+
+       __global__ void kernInsertIdentity(int n, int* dev_vec1, int* dev_vec2) {
+           int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+           if (index < n) {
+               if (index == 0) {
+                   dev_vec2[index] = 0; // Insert identity element
+               } else {
+                   dev_vec2[index] = dev_vec1[index - 1];
+               }
+           }
+       }
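+       // Worked example of the naive (Hillis-Steele) scan above (illustration
+       // only): pass d adds in the element 2^(d-1) positions to the left.
+       //   input               : [3 1 7 0 4 1 6 3]
+       //   d = 1 (offset 1)    : [3 4 8 7 4 5 7 9]
+       //   d = 2 (offset 2)    : [3 4 11 11 12 12 11 14]
+       //   d = 3 (offset 4)    : [3 4 11 11 15 16 22 25]   (inclusive scan)
+       //   shift right, add 0  : [0 3 4 11 11 15 16 22]    (exclusive scan)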
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+           int* dev_vec1;
+           int* dev_vec2;
+           int size = n * sizeof(int);
+           cudaMalloc((void**)&dev_vec1, size);
+           cudaMalloc((void**)&dev_vec2, size);
+           cudaMemcpy(dev_vec1, idata, size, cudaMemcpyHostToDevice);
+
            timer().startGpuTimer();
            // TODO
+           int blockSize = 128;
+           dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+           for (int d = 1; d <= ilog2ceil(n); d++) {
+               // Launch kernel for inclusive scan
+               kernNaiveScan<<<fullBlocksPerGrid, blockSize>>>(n, d, dev_vec1, dev_vec2);
+               // Ping-pong buffers
+               int* temp = dev_vec1;
+               dev_vec1 = dev_vec2;
+               dev_vec2 = temp;
+           }
+           // Launch kernel for shifting elements right and inserting identity element
+           kernInsertIdentity<<<fullBlocksPerGrid, blockSize>>>(n, dev_vec1, dev_vec2);
+           // Ping-pong buffers again
+           int* temp = dev_vec1;
+           dev_vec1 = dev_vec2;
+           dev_vec2 = temp;
            timer().endGpuTimer();
+
+           cudaMemcpy(odata, dev_vec1, size, cudaMemcpyDeviceToHost);
+           cudaFree(dev_vec1);
+           cudaFree(dev_vec2);
        }
    }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..7dcb6e7 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,18 @@ namespace StreamCompaction {
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+           thrust::host_vector<int> h_in(idata, idata + n);
+           thrust::device_vector<int> dv_in = h_in;
+           thrust::device_vector<int> dv_out(n);
+
            timer().startGpuTimer();
            // TODO use `thrust::exclusive_scan`
            // example: for device_vectors dv_in and dv_out:
            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+           thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
            timer().endGpuTimer();
+
+           thrust::copy(dv_out.begin(), dv_out.end(), odata);
        }
    }
 }
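+// Possible variant (sketch only, not what the code above does): thrust can also
+// scan raw device pointers directly through an execution policy, which would skip
+// the host_vector/device_vector staging copies. The buffers dev_in/dev_out below
+// are hypothetical device allocations made with cudaMalloc, and the call needs
+// #include <thrust/execution_policy.h>:
+//   thrust::exclusive_scan(thrust::device, dev_in, dev_in + n, dev_out);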