diff --git a/README.md b/README.md
index 0e38ddb..b93c779 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,112 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Zichuan Yu
+  * [LinkedIn](https://www.linkedin.com/in/zichuan-yu/), [Behance](https://www.behance.net/zainyu717ebcc)
+* Tested on: Windows 10 (Build 17134), i7-4710 @ 2.50GHz 16GB, GTX 980M 4096MB GDDR5
+
+## Features
+
+- CPU Scan
+- CPU Stream Compaction
+- Naive GPU Scan
+- Work-Efficient GPU Scan
+- Work-Efficient GPU Stream Compaction
+- Thrust Implementation
+
+## Performance Analysis
+
+### Block Size Analysis
+
+We fix the array size at 2^21 and vary the block size.
+
+![block_size](img/block_size.png)
+
+As we can see, once the block size is larger than 32, increasing it further makes little difference.
+
+### Array Size Analysis on Scan
+
+We fix the block size at 1024 and vary the array size.
+
+![scan](img/scan.png)
+
+As we can see, the CPU scan is, unsurprisingly, the slowest. My own GPU implementations are still much
+slower than the Thrust implementation; I think this is because my code does not exploit the hardware
+(e.g. shared memory) as well as Thrust does.
+
+### Array Size Analysis on Compaction
+
+We fix the block size at 1024 and vary the array size.
+
+![compaction](img/compaction.png)
+
+As we can see, CPU compaction with scan is the slowest. The scan pass is pure overhead on the CPU,
+so a CPU implementation is better off not using scan at all.
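+
+For reference, here is what the two primitives do on a tiny input (an
+illustrative sketch, not actual program output):
+
+```
+input:          [ 3 0 1 0 2 ]
+exclusive scan: [ 0 3 3 4 4 ]
+compact (!= 0): [ 3 1 2 ]
+```
+
+Compaction uses the scan: scanning the 0/1 keep-flags `[ 1 0 1 0 1 ]` gives
+`[ 0 1 1 2 2 ]`, which is each kept element's output index.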
+
+## Output
+
+Array size 2^28, block size 1024
+
+```shell
+
+****************
+** SCAN TESTS **
+****************
+    [ 1 1 1 1 1 1 1 1 1 1 1 1 1 ... 1 1 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 1535.85ms    (std::chrono Measured)
+    [ 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 268435454 268435455 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 594.798ms    (std::chrono Measured)
+    [ 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 268435451 268435452 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 510.046ms    (CUDA Measured)
+    [ 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 268435454 268435455 ]
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 510.037ms    (CUDA Measured)
+    [ 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 0 0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 175.304ms    (CUDA Measured)
+    [ 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 268435454 268435455 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 175.151ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 28.8416ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 28.8394ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 0 1 0 0 1 0 3 1 3 3 0 3 1 ... 3 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 708.621ms    (std::chrono Measured)
+    [ 1 1 3 1 3 3 3 1 1 1 1 1 1 ... 1 3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 680.761ms    (std::chrono Measured)
+    [ 1 1 3 1 3 3 3 1 1 1 1 1 1 ... 1 1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 1471.92ms    (std::chrono Measured)
+    [ 1 1 3 1 3 3 3 1 1 1 1 1 1 ... 1 3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 213.044ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 212.931ms    (CUDA Measured)
+    passed
+Press any key to continue . . .
+```
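+
+Note on methodology: the GPU timings bracket only the kernel work; device
+memory allocation and host-device copies are excluded (the timers start after
+`cudaMemcpy` in `naive.cu` and `efficient.cu`).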
+
+
-### (TODO: Your README)
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/img/block_size.png b/img/block_size.png
new file mode 100644
index 0000000..afdd5f5
Binary files /dev/null and b/img/block_size.png differ
diff --git a/img/compaction.png b/img/compaction.png
new file mode 100644
index 0000000..7d552d4
Binary files /dev/null and b/img/compaction.png differ
diff --git a/img/scan.png b/img/scan.png
new file mode 100644
index 0000000..f8ab885
Binary files /dev/null and b/img/scan.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 1850161..4ee1922 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -6,6 +6,9 @@
  * @copyright University of Pennsylvania
  */
 
+
+
+
 #include <cstdio>
 #include <stream_compaction/cpu.h>
 #include <stream_compaction/naive.h>
@@ -13,7 +16,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 28; // feel free to change the size of array
 const int NPOT = SIZE - 3;  // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -29,6 +32,7 @@ int main(int argc, char* argv[]) {
     genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
     a[SIZE - 1] = 0;
+    onesArray(SIZE, a);
     printArray(SIZE, a, true);
 
     // initialize b using StreamCompaction::CPU::scan you implement
@@ -51,7 +55,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -64,14 +68,14 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
index 46337ab..d6cc4e3 100644
--- a/src/testing_helpers.hpp
+++ b/src/testing_helpers.hpp
@@ -1,8 +1,8 @@
 #pragma once
 
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
 #include <ctime>
 #include <cmath>
@@ -69,8 +69,8 @@ void printArray(int n, int *a, bool abridged = false) {
     printf("]\n");
 }
 
-template<typename T>
-void printElapsedTime(T time, std::string note = "")
-{
-    std::cout << "   elapsed time: " << time << "ms    " << note << std::endl;
+template<typename T>
+void printElapsedTime(T time, std::string note = "")
+{
+    std::cout << "   elapsed time: " << time << "ms    " << note << std::endl;
 }
\ No newline at end of file
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..6ad13f6 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_52
     )
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 99a1b04..bcca058 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -13,6 +13,11 @@
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
 
+
+#define BLOCK_SIZE 1024
+
+#define PLUS_OP_IDENTITY 0
+
 /**
  * Check for CUDA errors; print and exit if there was a problem.
  */
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 05ce667..99acff7 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,15 +1,15 @@
 #include <cstdio>
 #include "cpu.h"
-#include "common.h"
+#include "common.h"
 
 namespace StreamCompaction {
     namespace CPU {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
 
@@ -19,7 +19,10 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            odata[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
             timer().endCpuTimer();
         }
 
@@ -30,9 +33,15 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int non_zero_idx = 0;
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] != 0) {
+                    odata[non_zero_idx] = idata[i];
+                    ++non_zero_idx;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return non_zero_idx;
         }
 
@@ -41,10 +50,31 @@ namespace StreamCompaction {
          * @returns the number of elements remaining after compaction.
          */
         int compactWithScan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
-            // TODO
+            // allocate a temporary array for the scanned 0/1 keep-flags
+            int* temp = new int[n];
+
+            temp[0] = 0;
+            timer().startCpuTimer();
+            // exclusive scan over the 0/1 keep-flags
+            for (int i = 1; i < n; ++i) {
+                temp[i] = temp[i - 1] + (idata[i - 1] != 0);
+            }
+
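+            // temp[i] now holds the number of nonzero elements strictly before i,
+            // i.e. exactly the output index for a kept element. For example,
+            // idata = [3 0 1] gives keep-flags [1 0 1] and temp = [0 1 1], so 3
+            // lands at odata[0] and 1 at odata[1].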
+            int count = 0;
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] != 0) {
+                    ++count;
+                    odata[temp[i]] = idata[i];
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            delete[] temp;
+            return count;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 36c5ef2..e217897 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,39 +2,201 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
+#include <memory>
 
 namespace StreamCompaction {
-    namespace Efficient {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
+    namespace Efficient {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
 
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-        }
-        /**
-         * Performs stream compaction on idata, storing the result into odata.
-         * All zeroes are discarded.
-         *
-         * @param n      The number of elements in idata.
-         * @param odata  The array into which to store elements.
-         * @param idata  The array of elements to compact.
-         * @returns      The number of elements remaining after compaction.
-         */
-        int compact(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-            return -1;
-        }
+        // up-sweep: each thread adds the left child's partial sum into the right child
+        __global__ void kernEfficientScanUp(int n, int bitShift, int* d_data_in) {
+            int idx = threadIdx.x + blockDim.x * blockIdx.x;
+            if (idx >= n) {
+                return;
+            }
+            int pow1 = 1 << bitShift;
+            int pow2 = 1 << (bitShift + 1);
+            d_data_in[idx * pow2 + pow2 - 1] += d_data_in[idx * pow2 + pow1 - 1];
+        }
+
+        // down-sweep: pass the node's value to the left child, and
+        // left child + old value to the right child
+        __global__ void kernEfficientScanDown(int n, int bitShift, int* d_data_in) {
+            int idx = threadIdx.x + blockDim.x * blockIdx.x;
+            if (idx >= n) {
+                return;
+            }
+            int pow1 = 1 << bitShift;
+            int pow2 = 1 << (bitShift + 1);
+            int pos1 = idx * pow2 + pow1 - 1;
+            int pos2 = idx * pow2 + pow2 - 1;
+            int temp = d_data_in[pos1];
+            d_data_in[pos1] = d_data_in[pos2];
+            d_data_in[pos2] += temp;
+        }
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int *odata, const int *idata) {
+            // round the length up to the next power of two
+            int level = ilog2ceil(n);
+            int trueN = 1 << level;
+
+            std::unique_ptr<int[]> trueIData{ new int[trueN] };
+
+            // copy the input and pad the tail with 0
+            for (int i = 0; i < n; ++i) {
+                trueIData[i] = idata[i];
+            }
+            for (int i = n; i < trueN; ++i) {
+                trueIData[i] = 0;
+            }
+
+            // allocate device memory
+            int* d_data_in;
+            cudaMalloc((void**)&d_data_in, trueN * sizeof(int));
+            cudaMemcpy(d_data_in, trueIData.get(), trueN * sizeof(int), cudaMemcpyHostToDevice);
+            timer().startGpuTimer();
+
+            dim3 blockSize(BLOCK_SIZE);
+            dim3 gridSize;
+            int pow2;
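+            // Blelloch scan: the up-sweep builds a binary tree of partial sums
+            // in place; the down-sweep then pushes prefix sums back down. On
+            // [1 1 1 1]: the up-sweep gives [1 2 1 4]; zeroing the root gives
+            // [1 2 1 0]; the down-sweep yields [0 1 2 3], the exclusive scan.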
+            // up-sweep
+            for (int i = 0; i < level; ++i)
+            {
+                pow2 = 1 << (i + 1);
+                gridSize = ((trueN / pow2 + BLOCK_SIZE - 1) / BLOCK_SIZE);
+                kernEfficientScanUp<<<gridSize, blockSize>>>(trueN / pow2, i, d_data_in);
+            }
+            cudaMemset(d_data_in + trueN - 1, 0, sizeof(int));
+
+            // down-sweep
+            for (int i = level - 1; i > -1; --i)
+            {
+                pow2 = 1 << (i + 1);
+                gridSize = ((trueN / pow2 + BLOCK_SIZE - 1) / BLOCK_SIZE);
+                kernEfficientScanDown<<<gridSize, blockSize>>>(trueN / pow2, i, d_data_in);
+            }
+
+            timer().endGpuTimer();
+            // only n results are needed on the host, no need to copy trueN
+            cudaMemcpy(odata, d_data_in, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(d_data_in);
+        }
+
+        __global__ void kernValueMapToOne(int n, int* d_ones_out, int* d_data_in) {
+            int idx = threadIdx.x + blockDim.x * blockIdx.x;
+            if (idx >= n) {
+                return;
+            }
+            if (d_data_in[idx] != 0) {
+                d_ones_out[idx] = 1;
+            }
+        }
+
+        __global__ void kernCompact(int n, int* d_indices, int* d_data_in, int* d_data_to_compact) {
+            int idx = threadIdx.x + blockDim.x * blockIdx.x;
+            if (idx >= n) {
+                return;
+            }
+            int data = d_data_in[idx];
+            if (data != 0) {
+                d_data_to_compact[d_indices[idx]] = data;
+            }
+        }
+
+        /**
+         * Performs stream compaction on idata, storing the result into odata.
+         * All zeroes are discarded.
+         *
+         * @param n      The number of elements in idata.
+         * @param odata  The array into which to store elements.
+         * @param idata  The array of elements to compact.
+         * @returns      The number of elements remaining after compaction.
+         */
+        int compact(int n, int *odata, const int *idata) {
+            // reuses the scan kernels inline; see the TODO below
+            int* d_data_in;
+            int* d_compacted_data;
+            cudaMalloc((void**)&d_data_in, n * sizeof(int));
+            cudaMalloc((void**)&d_compacted_data, n * sizeof(int));
+
+            cudaMemset(d_compacted_data, 0, n * sizeof(int));
+
+            cudaMemcpy(d_data_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            // holds the 0/1 keep-flags and then their scan result
+            int* d_ones_scan_result;
+
+            // round the length up to the next power of two
+            int level = ilog2ceil(n);
+            int trueN = 1 << level;
+
+            // zero the scan buffer
+            cudaMalloc((void**)&d_ones_scan_result, trueN * sizeof(int));
+            cudaMemset(d_ones_scan_result, 0, trueN * sizeof(int));
+
+            // launch configuration
+            dim3 blockSize(BLOCK_SIZE);
+            dim3 gridSize((n + BLOCK_SIZE - 1) / BLOCK_SIZE);
+            int pow2;
+
+            timer().startGpuTimer();
+            // map nonzero values to 1
+            kernValueMapToOne<<<gridSize, blockSize>>>(n, d_ones_scan_result, d_data_in);
+
+            // TODO(zichuanyu): factor this scan into a shared helper in the future
+            // up-sweep
+            for (int i = 0; i < level; ++i)
+            {
+                pow2 = 1 << (i + 1);
+                gridSize = ((trueN / pow2 + BLOCK_SIZE - 1) / BLOCK_SIZE);
+                kernEfficientScanUp<<<gridSize, blockSize>>>(trueN / pow2, i, d_ones_scan_result);
+            }
+            cudaMemset(d_ones_scan_result + trueN - 1, 0, sizeof(int));
+            // down-sweep
+            for (int i = level - 1; i > -1; --i)
+            {
+                pow2 = 1 << (i + 1);
+                gridSize = ((trueN / pow2 + BLOCK_SIZE - 1) / BLOCK_SIZE);
+                kernEfficientScanDown<<<gridSize, blockSize>>>(trueN / pow2, i, d_ones_scan_result);
+            }
+
+            // scatter, using only the useful part of the array
+            gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+            kernCompact<<<gridSize, blockSize>>>(n, d_ones_scan_result, d_data_in, d_compacted_data);
+            timer().endGpuTimer();
+            cudaMemcpy(odata, d_compacted_data, n * sizeof(int), cudaMemcpyDeviceToHost);
+            // count the surviving elements on the CPU (could also read the last
+            // entry of the scan result on the GPU)
+            int num = 0;
+            for (int i = 0; i < n; ++i) {
+                if (odata[i] == 0) {
+                    break;
+                }
+                ++num;
+            }
+            cudaFree(d_compacted_data);
+            cudaFree(d_ones_scan_result);
+            cudaFree(d_data_in);
+            return num;
+        }
+    }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 9218f8e..153e5bc 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -1,25 +1,63 @@
 #include <cuda.h>
+#include <utility>
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
 
 namespace StreamCompaction {
-    namespace Naive {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
-        // TODO: __global__
+    namespace Naive {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
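+        // One pass of the naive scan: each element adds in the value `offset`
+        // positions to its left. With offset = 1, 2, 4, ... and ping-ponged
+        // buffers, ilog2ceil(n) passes produce an inclusive scan, e.g.
+        // [1 1 1 1] -> (offset 1) [1 2 2 2] -> (offset 2) [1 2 3 4].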
+        __global__ void kernNaiveGPUScan(const int n, const int offset, const int* d_data_in, int* d_data_out) {
+            int idx = threadIdx.x + blockDim.x * blockIdx.x;
+            if (idx >= n) return;
+            if (idx < offset) {
+                d_data_out[idx] = d_data_in[idx];
+            }
+            else {
+                d_data_out[idx] = d_data_in[idx] + d_data_in[idx - offset];
+            }
+        }
+
+        // kept for reference: shifts an inclusive scan into an exclusive one
+        // (the final cudaMemcpy below performs the same shift during the copy)
+        __global__ void kernIncToExc(const int n, const int* d_data_in, int* d_data_out) {
+            int idx = threadIdx.x + blockDim.x * blockIdx.x;
+            if (idx == 0) {
+                d_data_out[0] = PLUS_OP_IDENTITY;
+            }
+            else if (idx < n) {
+                d_data_out[idx] = d_data_in[idx - 1];
+            }
+        }
 
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-        }
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int* odata, const int* idata) {
+            dim3 blockSize(BLOCK_SIZE);
+            dim3 gridSize((n + BLOCK_SIZE - 1) / BLOCK_SIZE);
+
+            // allocate device memory
+            int* d_data_in;
+            int* d_data_out;
+            cudaMalloc((void**)&d_data_in, n * sizeof(int));
+            cudaMalloc((void**)&d_data_out, n * sizeof(int));
+            cudaMemcpy(d_data_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            timer().startGpuTimer();
+            for (int i = 0; i < ilog2ceil(n); ++i) {
+                kernNaiveGPUScan<<<gridSize, blockSize>>>(n, 1 << i, d_data_in, d_data_out);
+                std::swap(d_data_in, d_data_out);
+            }
+            timer().endGpuTimer();
+            // for readability: after the final swap the result lives in d_data_in,
+            // so swap once more and read from d_data_out
+            std::swap(d_data_in, d_data_out);
+            // shift by one during the copy to turn the inclusive scan into an exclusive one
+            cudaMemcpy(odata + 1, d_data_out, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+            odata[0] = 0;
+            cudaFree(d_data_out);
+            cudaFree(d_data_in);
+        }
+    }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 36b732d..9693dca 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -7,22 +7,26 @@
 #include "thrust.h"
 
 namespace StreamCompaction {
-    namespace Thrust {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
-            timer().endGpuTimer();
-        }
+    namespace Thrust {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int *odata, const int *idata) {
+            thrust::device_vector<int> d_data_in(idata, idata + n);
+            thrust::device_vector<int> d_data_out(odata, odata + n);
+            timer().startGpuTimer();
+            thrust::exclusive_scan(d_data_in.begin(), d_data_in.end(), d_data_out.begin());
+            timer().endGpuTimer();
+            thrust::copy(d_data_out.begin(), d_data_out.end(), odata);
+        }
+    }
 }