diff --git a/README.md b/README.md index 0e38ddb..bf8894d 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,90 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Eric Chiu +* Tested on: Windows 10 Education, Intel(R) Xeon(R) CPU E5-1630 v4 @ 3.60GHz 32GB, NVIDIA GeForce GTX 1070 (SIGLAB) -### (TODO: Your README) +## Description -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +This project explores and compares different CPU and GPU implementations of the scan (prefix sum) and stream compaction algorithms. The implementations include naive and work-efficient methods. +## Performance Analysis + + +The following images show a comparison between the CPU, naive, work-efficient, and thrust implementations of the scan algorithm. + + +![](./img/scan-data.png) + +![](./img/scan-chart.png) + + +The following images show a comparison between the CPU without scan, CPU with scan, and work-efficient implementations of the compaction algorithm. + + +![](./img/compaction-data.png) + +![](./img/compaction-chart.png) + + +Overall, the thrust implementation had the best performance in the long run. For smaller array sizes, however, the CPU implementation was the fastest, followed by the naive implementation, then the work-efficient implementation, and then the thrust implementation. As the array size increased, different implementations experienced different bottlenecks. I believe the CPU implementation experienced a bottleneck in the number of operations because it processes memory serially (as opposed to in parallel). I believe the naive and work-efficient implementations experienced a bottleneck in global memory access.
+ + +## Program Output + +``` +**************** +** SCAN TESTS ** +**************** + [ 26 38 4 11 28 35 3 0 3 4 36 3 39 ... 22 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0006ms (std::chrono Measured) + [ 0 26 64 68 79 107 142 145 145 148 152 188 191 ... 6071 6093 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0007ms (std::chrono Measured) + [ 0 26 64 68 79 107 142 145 145 148 152 188 191 ... 6031 6046 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.023552ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.023552ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.095232ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.105472ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.058368ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.05632ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 0 0 3 2 1 1 1 0 0 2 1 1 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0009ms (std::chrono Measured) + [ 3 3 2 1 1 1 2 1 1 3 2 1 2 ... 1 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0009ms (std::chrono Measured) + [ 3 3 2 1 1 1 2 1 1 3 2 1 2 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.0035ms (std::chrono Measured) + [ 3 3 2 1 1 1 2 1 1 3 2 1 2 ... 1 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.365568ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.359424ms (CUDA Measured) + passed +Press any key to continue . . . 
+ +``` \ No newline at end of file diff --git a/img/compaction-chart.png b/img/compaction-chart.png new file mode 100644 index 0000000..ce3b645 Binary files /dev/null and b/img/compaction-chart.png differ diff --git a/img/compaction-data.png b/img/compaction-data.png new file mode 100644 index 0000000..2cbc514 Binary files /dev/null and b/img/compaction-data.png differ diff --git a/img/scan-chart.png b/img/scan-chart.png new file mode 100644 index 0000000..1d53f82 Binary files /dev/null and b/img/scan-chart.png differ diff --git a/img/scan-data.png b/img/scan-data.png new file mode 100644 index 0000000..2ee2752 Binary files /dev/null and b/img/scan-data.png differ diff --git a/src/main.cpp b/src/main.cpp index 1850161..459cda3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 256; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..4bb0dc2 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_61 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..4dd7b66 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -22,8 +22,11 @@ namespace StreamCompaction { * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. 
*/ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + __global__ void kernMapToBoolean(int n, int *bools, const int *idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + bools[index] = idata[index] == 0 ? 0 : 1; } /** @@ -31,8 +34,14 @@ namespace StreamCompaction { * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. */ __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO + const int *idata, const int *bools, const int *indices) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + if (bools[index] == 1) + { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..112d8d0 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -17,9 +17,25 @@ namespace StreamCompaction { * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ - void scan(int n, int *odata, const int *idata) { + + void scanHelper(int n, int *odata, const int *idata) + { + if (n == 0) return; + + odata[0] = 0; + for (int i = 1; i < n; i++) + { + odata[i] = odata[i - 1] + idata[i - 1]; + } + + } + + void scan(int n, int *odata, const int *idata) + { timer().startCpuTimer(); - // TODO + + scanHelper(n, odata, idata); + timer().endCpuTimer(); } @@ -29,10 +45,21 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. 
*/ int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO + + timer().startCpuTimer(); + + int oIndex = 0; + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[oIndex] = idata[i]; + oIndex++; + } + } + timer().endCpuTimer(); - return -1; + return oIndex; } /** @@ -40,11 +67,40 @@ namespace StreamCompaction { * * @returns the number of elements remaining after compaction. */ - int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; + int compactWithScan(int n, int *odata, const int *idata) + { + timer().startCpuTimer(); + + int* bdata = new int[n]; + int* sdata = new int[n]; + + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + bdata[i] = 1; + } + else + { + bdata[i] = 0; + } + } + + // cannot call scan because it uses startCpuTimer as well + scanHelper(n, sdata, bdata); + + int sum = 0; + for (int i = 0; i < n; i++) + { + if (bdata[i] != 0) + { + odata[sdata[i]] = idata[i]; + sum = sdata[i]; + } + } + + timer().endCpuTimer(); + return sum + 1; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..07360fc 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,13 +12,117 @@ namespace StreamCompaction { return timer; } + const int BLOCK_SIZE = 256; + + __global__ void kernUpSweep(int n, int stride, int *data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + int fullIndex = (index + 1) * stride - 1; + int halfIndex = index * stride - 1 + (stride / 2); + data[fullIndex] += data[halfIndex]; + } + + __global__ void kernDownSweep(int n, int stride, int *data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + int fullIndex = (index + 1) * stride - 1; + int halfIndex = index * stride - 1 + (stride / 2); + + int temp = data[halfIndex]; + data[halfIndex] = data[fullIndex]; + 
data[fullIndex] += temp; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { + int *dev_data; + int width = 1; + int squareN = pow(2, ilog2ceil(n)); + int iterations = ilog2(squareN) - 1; + int numThreads, numBlocks; + + cudaMalloc((void**)&dev_data, squareN * sizeof(int)); + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + // up sweep + for (int i = 0; i <= iterations; i++) + { + width = width * 2; + numThreads = squareN / width; + numBlocks = (numThreads + BLOCK_SIZE - 1) / BLOCK_SIZE; + kernUpSweep<<>>(numThreads, width, dev_data); + checkCUDAErrorFn("kernUpSweep failed!"); + } + + int zero = 0; + cudaMemcpy(&dev_data[squareN - 1], &zero, sizeof(int), cudaMemcpyHostToDevice); + + // down sweep + width = pow(2, iterations + 2); + for (int i = iterations; i >= 0; i--) + { + width = width / 2; + numThreads = squareN / width; + numBlocks = (numThreads + BLOCK_SIZE - 1) / BLOCK_SIZE; + kernDownSweep<<>>(numThreads, width, dev_data); + checkCUDAErrorFn("kernDownSweep failed!"); + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_data); + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ + void scanNoTimer(int n, int *odata, const int *idata) + { + int *dev_data; + int width = 1; + int squareN = pow(2, ilog2ceil(n)); + int iterations = ilog2(squareN) - 1; + int numThreads, numBlocks; + + cudaMalloc((void**)&dev_data, squareN * sizeof(int)); + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + // up sweep + for (int i = 0; i <= iterations; i++) + { + width = width * 2; + numThreads = squareN / width; + numBlocks = (numThreads + BLOCK_SIZE - 1) / BLOCK_SIZE; + kernUpSweep << > > (numThreads, width, dev_data); + checkCUDAErrorFn("kernUpSweep failed!"); + } + + int zero = 0; + cudaMemcpy(&dev_data[squareN - 1], &zero, sizeof(int), cudaMemcpyHostToDevice); + + // down sweep + width = pow(2, iterations + 2); + for (int i = iterations; i >= 0; i--) + { + width = width / 2; + numThreads = squareN / width; + numBlocks = (numThreads + BLOCK_SIZE - 1) / BLOCK_SIZE; + kernDownSweep << > > (numThreads, width, dev_data); + checkCUDAErrorFn("kernDownSweep failed!"); + } + + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_data); } /** @@ -30,11 +134,43 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. 
*/ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int *odata, const int *idata) + { + int count; + int *dev_idata, *dev_odata, *dev_bools, *dev_indices; + int numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + // copy idata to dev_idata + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + // map idata to booleans + Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); + // copy booleans to odata + cudaMemcpy(odata, dev_bools, n * sizeof(int), cudaMemcpyDeviceToHost); + + count = odata[n - 1]; + scanNoTimer(n, odata, odata); + count += odata[n - 1]; + + cudaMemcpy(dev_indices, odata, n * sizeof(int), cudaMemcpyHostToDevice); + Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_bools, dev_indices); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_bools); + cudaFree(dev_indices); + cudaFree(dev_odata); + + return count; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..ecb6177 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,57 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + const int BLOCK_SIZE = 512; + + __global__ void kernScan(int n, int *odata, int *idata, int offset) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + if (index >= offset) + { + odata[index] = idata[index - offset] + idata[index]; + } + else + { + odata[index] = idata[index]; + } + } + + __global__ void kernShift(int n, int *odata, int *idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + 
odata[index] = (index > 0) ? idata[index - 1] : 0; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { + int numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; + int *dev_idata, *dev_odata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + for (int offset = 1; offset <= ilog2ceil(n); offset++) + { + kernScan<<>>(n, dev_odata, dev_idata, 1 << offset - 1); + std::swap(dev_idata, dev_odata); + } + + kernShift<<>>(n, dev_odata, dev_idata); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..0c6e156 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -17,12 +17,18 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(odata, odata + n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + timer().endGpuTimer(); + + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } }