diff --git a/README.md b/README.md index 0e38ddb..7047571 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,87 @@ CUDA Stream Compaction ====================== **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +* Haorong Yang +* [LinkedIn](https://www.linkedin.com/in/haorong-henry-yang/) +* Tested on: Windows 10 Home, i7-10750H @ 2.60GHz 16GB, GTX 2070 Super Max-Q (Personal) -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +The goal of this project was to implement a stream compaction algorithm on the GPU in CUDA from scratch. +The algorithm will remove `0`s from an array of `int`s utilizing a scan function, which performs parallel reduction on the array to obtain an exclusive prefix sum. -### (TODO: Your README) +Although the goal is to obtain an efficient parallel solution, for comparison, a few variations of the algorithm were also implemented. +A list of algorithms that will be compared to each other: +* CPU scan function +* CPU stream compaction without scan +* CPU stream compaction with scan +* GPU naive scan +* GPU work-efficient scan & compaction +* thrust library's implementation -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +The test results for an array size of 2^8 are: +``` +**************** +** SCAN TESTS ** +**************** + [ 17 20 19 34 19 3 6 3 27 2 14 5 21 ... 36 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0005ms (std::chrono Measured) + [ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 6163 6199 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0004ms (std::chrono Measured) + [ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 6092 6101 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.029056ms (CUDA Measured) + [ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 
6163 6199 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.026752ms (CUDA Measured) + [ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.012768ms (CUDA Measured) + [ 6199 6216 6236 6255 6289 6308 6311 6317 6320 6347 6349 6363 6368 ... 3050 3086 ] + a[0] = 0, b[0] = 6199 + FAIL VALUE +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.012512ms (CUDA Measured) + [ 6138 6155 6175 6194 6228 6247 6250 6256 6259 6286 6288 6302 6307 ... 2979 2988 ] + a[0] = 0, b[0] = 6138 + FAIL VALUE +==== thrust scan, power-of-two ==== + elapsed time: 0.055264ms (CUDA Measured) + [ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 6163 6199 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.054368ms (CUDA Measured) + [ 0 17 37 56 90 109 112 118 121 148 150 164 169 ... 6092 6101 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 0 1 2 1 1 0 1 1 0 2 1 3 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0008ms (std::chrono Measured) + [ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 3 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0008ms (std::chrono Measured) + [ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 1 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.004ms (std::chrono Measured) + [ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 3 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.020992ms (CUDA Measured) + [ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 0 0 ] +expected count is 185, count is 185 + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.021888ms (CUDA Measured) + [ 3 1 2 1 1 1 1 2 1 3 1 2 2 ... 
1 3 ] +expected count is 185, count is 183 + passed +``` \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..f649d1b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,8 @@ #include "testing_helpers.hpp" const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two +//const int SIZE = 8; +const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; @@ -27,7 +28,8 @@ int main(int argc, char* argv[]) { printf("** SCAN TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case // 3rd argument is maxValue + //testArray(SIZE - 1, a, 50); // test a[SIZE - 1] = 0; printArray(SIZE, a, true); @@ -51,7 +53,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -64,35 +66,35 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient 
scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -137,14 +139,16 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(SIZE, c, true); + std::cout << "expected count is " << expectedCount << ", count is " << count << std::endl; printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); + std::cout << "expected count is " << expectedCount << ", count is " << count << std::endl; printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 025e94a..fe01387 100644 --- 
a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -57,6 +57,15 @@ void genArray(int n, int *a, int maxval) { } } +void testArray(int n, int* a, int maxval) { + srand(time(nullptr)); + int q = 0; + for (int i = 0; i < n; i++) { + a[i] = q; + q++; + } +} + void printArray(int n, int *a, bool abridged = false) { printf(" [ "); for (int i = 0; i < n; i++) { diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..0e30336 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -18,8 +18,12 @@ namespace StreamCompaction { * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { + // we are assuming that the arrays have at least one element timer().startCpuTimer(); - // TODO + odata[0] = 0; + for (int k = 1; k < n; k++) { + odata[k] = odata[k - 1] + idata[k - 1]; + } timer().endCpuTimer(); } @@ -30,9 +34,15 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int optr = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[optr] = idata[i]; + optr++; + } + } timer().endCpuTimer(); - return -1; + return optr; } /** @@ -42,9 +52,34 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int* zeroOnes = new int[n]; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + zeroOnes[i] = 1; + } + else { + zeroOnes[i] = 0; + } + } + int* scanResult = new int[n]; + + // scan + scanResult[0] = 0; + for (int k = 1; k < n; k++) { + scanResult[k] = scanResult[k - 1] + zeroOnes[k - 1]; + } + // end of scan + + for (int i = 0; i < n; i++) { + if (zeroOnes[i] == 1) { + odata[scanResult[i]] = idata[i]; + } + } + int count = scanResult[n - 1]; + delete[] zeroOnes; + delete[] scanResult; timer().endCpuTimer(); - return -1; + return count; } } } diff --git 
a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..624d446 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,8 @@ #include "common.h" #include "efficient.h" +#define blockSize 128 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,15 +14,227 @@ namespace StreamCompaction { return timer; } + int* dev_scan_odata; + int* dev_dummy; + int* dev_zeroOnes; + + __global__ void kernUpSweep(int n, int powDplus1, int powD, int* out) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n || (index % powDplus1 != 0)) { + return; + } + out[index + powDplus1 - 1] += out[index + powD - 1]; + } + + __global__ void kernDownSweep(int n, int powDplus1, int powD, int* out) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n || (index % powDplus1 != 0)) { + return; + } + int t = out[index + powD - 1]; + out[index + powD - 1] = out[index + powDplus1 - 1]; + out[index + powDplus1 - 1] += t; + } + + + + __global__ void kernSweep(int n, int lgn, int* out) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + for (int d = 0; d <= lgn - 1; d++) { + int powD = 1; + for (int i = 0; i < d; i++) { + powD *= 2; + } + int powDplus1 = powD * 2; + if (index % powDplus1 == 0) { + out[index + powDplus1 - 1] += out[index + powD - 1]; + } + __syncthreads(); + } + + if (index == n - 1) { + out[index] = 0; + } + __syncthreads(); + + for (int d = lgn - 1; d >= 0; d--) { + int powD = 1; + for (int i = 0; i < d; i++) { + powD *= 2; + } + int powDplus1 = powD * 2; + if (index % powDplus1 == 0) { + int t = out[index + powD - 1]; + out[index + powD - 1] = out[index + powDplus1 - 1]; + out[index + powDplus1 - 1] += t; + } + __syncthreads(); + } + } + + + + __global__ void kernSetZero(int n, int* out) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + out[index] = 
0; + } + + __global__ void kernCopy(int n, int* out, int* in) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + out[index] = in[index]; + } + + + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) { // need to pad zeros if n is not pow of 2 + int lgn = ilog2ceil(n); + int pow2Ceil = 1; + for (int i = 0; i < lgn; i++) { + pow2Ceil *= 2; + } + // set dimension + int fullBlocksPerGrid = (pow2Ceil + blockSize - 1) / blockSize; // test 1d grid + + // cudaMalloc + cudaMalloc((void**)&dev_scan_odata, pow2Ceil * sizeof(int)); + cudaMalloc((void**)&dev_dummy, n * sizeof(int)); + checkCUDAError("cudaMalloc failed!"); + // cudaMemCpy + cudaMemcpy(dev_dummy, idata, sizeof(int) * n, cudaMemcpyHostToDevice); // copy idata + + // set zeros + kernSetZero <<<fullBlocksPerGrid, blockSize>>> (pow2Ceil, dev_scan_odata); + checkCUDAError("kernSetZero failed!"); + // copy dummy over to array padded with zeros + kernCopy <<<fullBlocksPerGrid, blockSize>>> (n, dev_scan_odata, dev_dummy); + checkCUDAError("kernCopy failed!"); + + timer().startGpuTimer(); - // TODO + kernSweep <<<fullBlocksPerGrid, blockSize>>> (pow2Ceil, lgn, dev_scan_odata); + checkCUDAError("kernSweep failed!"); timer().endGpuTimer(); + + + kernCopy <<<fullBlocksPerGrid, blockSize>>> (n, dev_dummy, dev_scan_odata); + checkCUDAError("kernCopy failed!"); + + // cudaMemcpy back + cudaMemcpy(odata, dev_dummy, sizeof(int) * n, cudaMemcpyDeviceToHost); + //cudaFree + cudaFree(dev_scan_odata); + cudaFree(dev_dummy); + } + + + + __global__ void kernZeroOnes(int n, int* out, int* in) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + if (in[index] != 0) { + out[index] = 1; + } + else { + out[index] = 0; + } } + __global__ void kernSweepCompact(int n, int lgn, int* out, int* dummy, int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + bool isOne = false; + if (out[index] == 1) 
{ + isOne = true; + } + for (int d = 0; d <= lgn - 1; d++) { + int powD = 1; + for (int i = 0; i < d; i++) { + powD *= 2; + } + int powDplus1 = powD * 2; + if (index % powDplus1 == 0) { + out[index + powDplus1 - 1] += out[index + powD - 1]; + } + __syncthreads(); + } + + /*if (index == n - 1) { + out[index] = 0; + } + __syncthreads();*/ + + /* for (int d = lgn - 1; d >= 0; d--) { + int powD = 1; + for (int i = 0; i < d; i++) { + powD *= 2; + } + int powDplus1 = powD * 2; + if (index % powDplus1 == 0) { + int t = out[index + powD - 1]; + out[index + powD - 1] = out[index + powDplus1 - 1]; + out[index + powDplus1 - 1] += t; + } + __syncthreads(); + }*/ + + //if (isOne) { + // dummy[out[index]] = idata[index]; + //} + + } + + __global__ void kernSetLastZero(int n, int* out) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index == n - 1) { + out[index] = 0; + } + } + + + __global__ void kernSweepDown(int n, int lgn, int* out, int* dummy, int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + + for (int d = lgn - 1; d >= 0; d--) { + int powD = 1; + for (int i = 0; i < d; i++) { + powD *= 2; + } + int powDplus1 = powD * 2; + if (index % powDplus1 == 0) { + int t = out[index + powD - 1]; + out[index + powD - 1] = out[index + powDplus1 - 1]; + out[index + powDplus1 - 1] += t; + } + __syncthreads(); + } + + if (out[index] != out[index + 1]) { + dummy[out[index]] = idata[index]; + } + + } + + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -31,10 +245,79 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { + int lgn = ilog2ceil(n); + int pow2Ceil = 1; + for (int i = 0; i < lgn; i++) { + pow2Ceil *= 2; + } + //int pow2Ceil = n; // test + + // set dimension + int fullBlocksPerGrid = (pow2Ceil + blockSize - 1) / blockSize; // test 1d grid + + // cudaMalloc + cudaMalloc((void**)&dev_scan_odata, pow2Ceil * sizeof(int)); + cudaMalloc((void**)&dev_zeroOnes, pow2Ceil * sizeof(int)); + cudaMalloc((void**)&dev_dummy, n * sizeof(int)); + checkCUDAError("cudaMalloc failed!"); + // cudaMemCpy + cudaMemcpy(dev_dummy, idata, sizeof(int) * n, cudaMemcpyHostToDevice); // copy idata + + // set zeros + kernSetZero <<<fullBlocksPerGrid, blockSize>>> (pow2Ceil, dev_scan_odata); + checkCUDAError("kernSetZero failed!"); + // copy dummy over to array padded with zeros + kernCopy <<<fullBlocksPerGrid, blockSize>>> (n, dev_scan_odata, dev_dummy); + checkCUDAError("kernCopy failed!"); + + // set dummy to zero for later + kernSetZero <<<fullBlocksPerGrid, blockSize>>> (n, dev_dummy); + checkCUDAError("kernSetZero failed!"); + + + // -------------------------------------------------------------------------------------- timer().startGpuTimer(); - // TODO + // create ZeroOne array + kernZeroOnes <<<fullBlocksPerGrid, blockSize>>> (pow2Ceil, dev_zeroOnes, dev_scan_odata); + checkCUDAError("kernZeroOnes failed!"); + + kernSweepCompact <<<fullBlocksPerGrid, blockSize>>> (pow2Ceil, lgn, dev_zeroOnes, dev_dummy, dev_scan_odata); + checkCUDAError("kernSweepCompact failed!"); + + // set zeros + kernSetLastZero <<<fullBlocksPerGrid, blockSize>>> (pow2Ceil, dev_zeroOnes); + checkCUDAError("kernSetZero failed!"); + + kernSweepDown <<<fullBlocksPerGrid, blockSize>>> (pow2Ceil, lgn, dev_zeroOnes, dev_dummy, dev_scan_odata); + checkCUDAError("kernSweepCompact failed!"); + timer().endGpuTimer(); - return -1; + // -------------------------------------------------------------------------------------- + + + /*kernCopy <<<fullBlocksPerGrid, blockSize>>> (n, dev_dummy, dev_zeroOnes); + checkCUDAError("kernCopy failed!");*/ + + int* counter = new int[pow2Ceil]; + cudaMemcpy(counter, dev_zeroOnes, sizeof(int) * pow2Ceil, cudaMemcpyDeviceToHost); // should change for non 2 pows + int 
count = 0; + for (int i = 0; i < pow2Ceil; i++) { + if (counter[i] == 1) { + count++; + } + } + count = counter[pow2Ceil - 1]; + + // cudaMemcpy back + //cudaMemcpy(odata, dev_dummy, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_dummy, sizeof(int) * n, cudaMemcpyDeviceToHost); + //cudaFree + cudaFree(dev_scan_odata); + cudaFree(dev_dummy); + cudaFree(dev_zeroOnes); + delete(counter); + + return count; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..40100b0 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" +#define blockSize 128 + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -11,15 +13,86 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + int* dev_idata; + int* dev_odata; + + __global__ void kernUpdateK(int n, int offset, int* out, int* in) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + if (index < offset) { + out[index] = in[index]; + } + else { + out[index] = in[index - offset] + in[index]; + } + } + + __global__ void kernShiftRight(int n, int* out, int* in) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + if (index == 0) { + out[index] = 0; + } + else { + out[index] = in[index - 1]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + // cudaMalloc + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + // cudaMemCpy + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + // set dimension + int fullBlocksPerGrid = (n + blockSize - 1) / blockSize; // test 1d grid + int lgn = ilog2ceil(n); + int* temp; + for (int d = 1; d <= lgn; d++) { + // calculate offset from d + int offset = 1; + if (d > 1) { + for (int i = 0; i < d - 1; i++) { + offset *= 2; + } + } + + kernUpdateK <<<fullBlocksPerGrid, blockSize>>> (n, offset, dev_odata, dev_idata); + checkCUDAError("kernUpdateK failed!"); + // now o = curr, i = old. + + // ping-pong + temp = dev_odata; + dev_odata = dev_idata; + dev_idata = temp; + // now, o = old, i = curr + } + + // shift right to make exclusive + kernShiftRight <<<fullBlocksPerGrid, blockSize>>> (n, dev_odata, dev_idata); + checkCUDAError("kernShiftRight failed!"); + // now o = curr, i = old timer().endGpuTimer(); + + // cudaMemcpy back + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + //cudaFree + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..8ddf089 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -17,12 +17,29 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ + + void scan(int n, int *odata, const int *idata) { + thrust::host_vector<int> h_out(n); + thrust::host_vector<int> h_in(n); + for (int i = 0; i < n; i++) { + h_in[i] = idata[i]; + h_out[i] = idata[i]; + } + + + thrust::device_vector<int> dev_out = h_out; + thrust::device_vector<int> dev_in = h_in; + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin()); timer().endGpuTimer(); + + thrust::copy(dev_out.begin(), dev_out.end(), h_out.begin()); + for (int i = 0; i < n; i++) { + odata[i] = h_out[i]; + } + } } }