diff --git a/README.md b/README.md
index 0e38ddb..ade0b8e 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,122 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Shenyue Chen
+  * [LinkedIn](https://www.linkedin.com/in/shenyue-chen-5b2728119/), [personal website](http://github.com/EvsChen)
+* Tested on: Windows 10, Intel Xeon Platinum 8259CL @ 2.50GHz 16GB, Tesla T4 (AWS g4dn-xlarge)
+
+### Features
+* Implementation of cpu scan, cpu compact, naive scan, work-efficient scan and work-efficient compact
+* Optimization of the work-efficient scan algorithm by launching only the necessary number of threads in `upSweep` and `downSweep`
+    * Each thread index is mapped to the array index it updates by `interval * idx + interval - 1`, where `interval` is `1 << iteration`
+    * For example, for N = 8, 4 threads are launched in the first iteration, 2 in the second, etc.
+
+### Performance analysis
+Block size: 128
+
+Length of array: from 2^10 to 2^21
+
+**All times reported in this section are the average of 100 runs, to reduce the influence of caching and warm-up effects on any single measurement.**
+
+As N increases, the running time of the CPU algorithms grows much faster than that of the GPU algorithms. (The x-axis of the plots is log2(N), so growth that is linear in N appears exponential.)
+
+<p align="center">
+<image src="doc/scan_time.png">
+</p>
+
+<p align="center">
+<image src="doc/compact.png">
+</p>
+
+Comparing only the GPU algorithms, the naive scan performs best when N is small, but the thrust scan becomes the fastest as N grows larger.
+<p align="center">
+<image src="doc/gpu_scan.png">
+</p>
+
+In my experiments, there is no obvious difference between the power-of-two and non-power-of-two (NPOT) versions of the work-efficient scan algorithm.
+<p align="center">
+<image src="doc/scan_time_npot.png">
+</p>
+
+Similar things happen for the thrust scan.
+<p align="center">
+<image src="doc/thrust_scan_time_npot.png">
+</p>
+
+
+
+
+### Sample output
+I ran each algorithm 100 times; the output below also includes the individual time records in addition to the averaged elapsed time.
+```
+****************
+** SCAN TESTS **
+****************
+    [  41  23  25   1   5  46  28  37  30  42  42  25  35 ...  38   0 ]
+==== cpu scan, power-of-two ====
+    Time record is [11.006, 11.431, 11.774, 13.613, 14.736, 11.656, 11.874, 11.659, 11.823, 12.829, 11.737, 11.989, 11.872, ... 25.619]
+   elapsed time: 13.311ms    (std::chrono Measured)
+    [   0  41  64  89  90  95 141 169 206 236 278 320 345 ... 51331714 51331752 ]
+==== cpu scan, non-power-of-two ====
+    Time record is [6.4447, 6.2375, 20.608, 8.2194, 4.5156, 5.2029, 4.3476, 3.7569, 3.7672, 3.9463, 3.8041, 6.7647, 8.9613, ... 3.7673]
+   elapsed time: 4.7256ms    (std::chrono Measured)
+    [   0  41  64  89  90  95 141 169 206 236 278 320 345 ... 51331652 51331692 ]
+    passed
+==== naive scan, power-of-two ====
+    Time record is [1.6997, 1.6947, 1.6957, 1.6964, 1.6937, 1.6972, 1.6957, 1.6955, 1.6977, 1.6956, 1.6957, 1.697, 1.6957, ... 1.5053]
+   elapsed time: 1.5804ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+    Time record is [1.505, 1.5154, 1.5114, 1.5073, 1.5134, 1.5131, 1.5095, 1.5173, 1.5183, 1.5173, 1.5181, 1.5193, 1.5177, ... 1.5286]
+   elapsed time: 1.5617ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+    Time record is [1.0465, 0.9728, 0.92182, 0.9257, 0.93184, 2.3247, 0.92374, 0.9345, 0.92896, 0.92176, 0.92269, 0.92266, 0.93555, ... 0.9264]
+   elapsed time: 0.97037ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+    Time record is [0.93405, 0.92058, 0.92365, 0.91706, 0.93229, 0.92541, 0.9175, 0.93056, 0.9216, 0.9176, 0.91802, 0.93424, 0.91955, ... 0.91955]
+   elapsed time: 0.92851ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+    Time record is [0.27674, 0.29424, 0.37693, 0.28387, 0.26618, 0.26726, 0.29562, 0.27222, 0.27443, 0.29901, 0.31325, 0.30925, 0.27082, ... 0.26301]
+   elapsed time: 0.30972ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+    Time record is [0.2639, 0.36099, 0.28035, 0.26765, 0.39936, 0.30883, 0.29104, 0.27306, 0.26934, 0.2631, 0.29914, 0.26224, 0.2863, ... 0.29773]
+   elapsed time: 0.31379ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   0   1   1   3   0   1   0   2   3   3   1   1   1 ...   3   0 ]
+==== cpu compact without scan, power-of-two ====
+    Time record is [7.5451, 5.9954, 5.9785, 5.7061, 6.2471, 6.2241, 5.9236, 5.7947, 6.4508, 5.8406, 5.8629, 5.7438, 5.7671, ... 5.7358]
+   elapsed time: 6.2666ms    (std::chrono Measured)
+    [   1   1   3   1   2   3   3   1   1   1   3   1   2 ...   1   3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    Time record is [7.3508, 8.2913, 5.9938, 5.9339, 5.8238, 5.7023, 5.9023, 5.8208, 6.926, 6.5035, 5.7348, 6.8928, 7.7841, ... 5.8204]
+   elapsed time: 6.2747ms    (std::chrono Measured)
+    [   1   1   3   1   2   3   3   1   1   1   3   1   2 ...   2   3 ]
+    passed
+==== cpu compact with scan ====
+    Time record is [28.751, 24.819, 23.735, 23.725, 23.829, 23.911, 23.968, 26.001, 24.981, 24.231, 23.606, 24.335, 23.51, ... 26.916]
+   elapsed time: 24.973ms    (std::chrono Measured)
+    [   1   1   3   1   2   3   3   1   1   1   3   1   2 ...   1   3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    Time record is [1.9505, 1.6153, 1.6267, 1.6086, 1.6013, 1.6535, 1.6727, 1.7992, 1.7735, 2.1627, 1.7832, 1.7375, 1.7575, ... 1.682]
+   elapsed time: 1.8556ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+    Time record is [1.6855, 1.9108, 1.7044, 1.6872, 1.7303, 1.9761, 1.7651, 1.9041, 1.6835, 1.6977, 1.7791, 1.6812, 1.7178, ... 1.6495]
+   elapsed time: 1.7805ms    (CUDA Measured)
+    passed
+Result for n = 2097152 is :
+    Time record is [13.311, 4.7256, 1.5804, 1.5617, 0.97037, 0.92851, 0.30972, 0.31379, 6.2666, 6.2747, 24.973, 1.8556, 1.7805, ... 0]
+```
 
-### (TODO: Your README)
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
 
diff --git a/doc/analysis.py b/doc/analysis.py
new file mode 100644
index 0000000..ce7ab35
--- /dev/null
+++ b/doc/analysis.py
@@ -0,0 +1,75 @@
+import csv
+import matplotlib.pyplot as plt
+
+data = []
+with open('lab2.csv', 'r') as csvfile:
+  spamreader = csv.reader(csvfile, delimiter=',')
+  for row in spamreader:
+    drow = []
+    for data_str in row:
+      drow.append(float(data_str))
+    data.append(drow)
+
+start = 10
+end = 21
+
+cpu_scan = []
+naive_scan = []
+efficient_scan = []
+efficient_scan_npot = []
+thrust_scan = []
+thrust_scan_npot = []
+cpu_compact = []
+efficient_compact = []
+
+x = []
+
+# import pdb; pdb.set_trace()
+
+for i in range(start, end + 1):
+  x.append(i)  # x axis is the exponent, i.e. log2 of the array length
+  row = data[i - start]  # one CSV row per exponent, first row belongs to `start`
+  cpu_scan.append(row[0])
+  naive_scan.append(row[2])
+  efficient_scan.append(row[4])
+  efficient_scan_npot.append(row[5])  # column 5 is work-efficient NPOT; column 6 is thrust
+  thrust_scan.append(row[6])
+  thrust_scan_npot.append(row[7])
+  cpu_compact.append(row[8])
+  efficient_compact.append(row[11])
+
+fig, ax = plt.subplots()
+
+### Scan 
+# plt.plot(x, cpu_scan, label="cpu_scan")
+# plt.plot(x, naive_scan, label="naive_scan")
+# plt.plot(x, efficient_scan, label="work_efficient")
+# plt.plot(x, thrust_scan, label="thrust_scane")
+# plt.title("Scan time")
+
+### NPOT
+# plt.plot(x, efficient_scan, label="work-efficient scan")
+# plt.plot(x, efficient_scan_npot, label="work-efficient scan NPOT")
+# plt.title("Scan time")
+
+
+### Thrust NPOT
+# plt.plot(x, thrust_scan, label="thrust scan")
+# plt.plot(x, thrust_scan_npot, label="thrust scan NPOT")
+# plt.title("Scan time")
+
+### Compact
+# plt.plot(x, cpu_compact, label="cpu compact")
+# plt.plot(x, efficient_compact, label="work efficient compact")
+# plt.title("Compact time")
+
+### GPU
+plt.plot(x, naive_scan, label="naive_scan")
+plt.plot(x, efficient_scan, label="work_efficient")
+plt.plot(x, thrust_scan, label="thrust_scan")
+plt.title("GPU Scan time")
+
+plt.xlabel("Number (in base 2)")
+plt.ylabel("Time (ms)")
+plt.legend()
+plt.show()
\ No newline at end of file
diff --git a/doc/compact.png b/doc/compact.png
new file mode 100644
index 0000000..36a6aae
Binary files /dev/null and b/doc/compact.png differ
diff --git a/doc/gpu_scan.png b/doc/gpu_scan.png
new file mode 100644
index 0000000..9a8c82a
Binary files /dev/null and b/doc/gpu_scan.png differ
diff --git a/doc/lab2.csv b/doc/lab2.csv
new file mode 100644
index 0000000..bf95092
--- /dev/null
+++ b/doc/lab2.csv
@@ -0,0 +1,12 @@
+0.001885, 0.001899, 0.036519, 0.03666, 0.10034, 0.12692, 0.19843, 0.18431, 0.002746, 0.004293, 0.007446, 0.2877, 0.32142
+0.005164, 0.003933, 0.05982, 0.064216, 0.16541, 0.13125, 0.15975, 0.1403, 0.003983, 0.005303, 0.010544, 0.30586, 0.37939
+0.009546, 0.007338, 0.046396, 0.046294, 0.13717, 0.1225, 0.14251, 0.12847, 0.020095, 0.008572, 0.045547, 0.3119, 0.32465
+0.009546, 0.007338, 0.046396, 0.046294, 0.13717, 0.1225, 0.14251, 0.12847, 0.020095, 0.008572, 0.045547, 0.3119, 0.32465
+0.032347, 0.03035, 0.057651, 0.057629, 0.14566, 0.1432, 0.17856, 0.17249, 0.062693, 0.068325, 0.141, 0.39171, 0.40263
+0.072447, 0.066555, 0.065433, 0.065277, 0.16558, 0.17479, 0.16487, 0.20362, 0.12599, 0.11522, 0.27981, 0.34057, 0.50167
+0.13301, 0.1447, 0.087247, 0.087203, 0.16367, 0.19217, 0.13575, 0.18643, 0.28578, 0.25115, 0.55336, 0.44775, 0.34398
+0.28354, 0.33327, 0.12242, 0.11939, 0.19577, 0.18088, 0.1958, 0.15035, 0.70451, 0.6561, 1.6623, 0.51467, 0.52659
+1.4489, 0.57192, 0.20105, 0.18198, 0.21742, 0.25338, 0.64729, 0.28838, 0.79568, 0.85801, 3.0992, 0.55925, 0.56684
+3.1769, 1.1265, 0.35451, 0.3327, 0.31406, 0.34869, 0.29619, 0.31233, 1.602, 1.6526, 6.3687, 0.68551, 0.66141
+6.3527, 2.0997, 0.9041, 0.8269, 0.34781, 0.3672, 0.28274, 0.28195, 3.196, 3.239, 12.361, 0.89818, 0.90519
+13.311, 4.7256, 1.5804, 1.5617, 0.97037, 0.92851, 0.30972, 0.31379, 6.2666, 6.2747, 24.973, 1.8556, 1.7805
\ No newline at end of file
diff --git a/doc/scan_time.png b/doc/scan_time.png
new file mode 100644
index 0000000..1ea81e8
Binary files /dev/null and b/doc/scan_time.png differ
diff --git a/doc/scan_time_npot.png b/doc/scan_time_npot.png
new file mode 100644
index 0000000..5401296
Binary files /dev/null and b/doc/scan_time_npot.png differ
diff --git a/doc/thrust_scan_time_npot.png b/doc/thrust_scan_time_npot.png
new file mode 100644
index 0000000..c3bcdf4
Binary files /dev/null and b/doc/thrust_scan_time_npot.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..989423f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,12 +13,40 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 10; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
 int *c = new int[SIZE];
 
+const int repeatTime = 100;
+float *record = new float[repeatTime];
+
+float getTimeAvg(float *src) {
+  // Mean of the first repeatTime samples stored in src.
+  float total = 0.f;
+  for (int k = 0; k < repeatTime; ++k)
+    total += src[k];
+  return total / repeatTime;
+}
+
+void printTime(float *src) {  // Prints src on one line; src must hold at least repeatTime floats (src[repeatTime - 1] is read)
+  std::cout << "    Time record is [";
+  std::cout.precision(5);  // changes std::cout's precision; it is not restored afterwards
+  for (int i = 0; i < repeatTime; i++) {
+    if (repeatTime > 16 && i == 13) {  // long record: after the first 13 values, elide the middle
+          std::cout << "... ";
+          i = repeatTime - 2;  // the loop's i++ after `continue` lands on the last element
+          continue;
+    }
+    std::cout << src[i];
+    if (i != repeatTime - 1) {  // separator after every value except the last
+        std::cout << ", ";
+    }
+  }
+  std::cout << "]" << std::endl;
+}
+
 int main(int argc, char* argv[]) {
     // Scan tests
 
@@ -30,28 +58,45 @@ int main(int argc, char* argv[]) {
     genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
     a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
+    float res[repeatTime > 13 ? repeatTime : 13] = {};  // 13 averages are stored; printTime(res) reads repeatTime entries, so size for that and zero-fill the tail
+
 
     // initialize b using StreamCompaction::CPU::scan you implement
     // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
     // At first all cases passed because b && c are all zeroes.
-    zeroArray(SIZE, b);
     printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    for (int i = 0; i < repeatTime; i++) {
+      zeroArray(SIZE, b);
+      StreamCompaction::CPU::scan(SIZE, b, a);
+      record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+    }
+    printTime(record);
+    res[0] = getTimeAvg(record);
+    printElapsedTime(res[0], "(std::chrono Measured)");
     printArray(SIZE, b, true);
 
-    zeroArray(SIZE, c);
+    
     printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    for (int i = 0; i < repeatTime; i++) {
+      zeroArray(SIZE, c);
+      StreamCompaction::CPU::scan(NPOT, c, a);
+      record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+    }
+    printTime(record);
+    res[1] = getTimeAvg(record);
+    printElapsedTime(getTimeAvg(record), "(std::chrono Measured)");
     printArray(NPOT, b, true);
     printCmpResult(NPOT, b, c);
 
-    zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    for (int i = 0; i < repeatTime; i++) {
+      zeroArray(SIZE, c); 
+      StreamCompaction::Naive::scan(SIZE, c, a);
+      record[i] = StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation();
+    }
+    printTime(record);
+    res[2] = getTimeAvg(record);
+    printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -59,39 +104,62 @@ int main(int argc, char* argv[]) {
     printDesc("1s array for finding bugs");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printArray(SIZE, c, true); */
-
-    zeroArray(SIZE, c);
+    
     printDesc("naive scan, non-power-of-two");
-    StreamCompaction::Naive::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    for (int i = 0; i < repeatTime; i++) {
+      zeroArray(SIZE, c);
+      StreamCompaction::Naive::scan(NPOT, c, a);
+      record[i] = StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation();
+    }
+    printTime(record);
+    res[3] = getTimeAvg(record);
+    printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
     printCmpResult(NPOT, b, c);
 
-    zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    for (int i = 0; i < repeatTime; i++) {
+      zeroArray(SIZE, c);
+      StreamCompaction::Efficient::scan(SIZE, c, a);
+      record[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+    }
+    printTime(record);
+    res[4] = getTimeAvg(record);
+    printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
-    zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    for (int i = 0; i < repeatTime; i++) {
+      zeroArray(SIZE, c);
+      StreamCompaction::Efficient::scan(NPOT, c, a);
+      record[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+    }
+    printTime(record);
+    res[5] = getTimeAvg(record);
+    printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
     printCmpResult(NPOT, b, c);
 
-    zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
-    StreamCompaction::Thrust::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    for (int i = 0; i < repeatTime; i++) {
+      zeroArray(SIZE, c);
+      StreamCompaction::Thrust::scan(SIZE, c, a);
+      record[i] = StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation();
+    }
+    printTime(record);
+    res[6] = getTimeAvg(record);
+    printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
-    zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
-    StreamCompaction::Thrust::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    for (int i = 0; i < repeatTime; i++) {
+      zeroArray(SIZE, c);
+      StreamCompaction::Thrust::scan(NPOT, c, a);
+      record[i] = StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation();
+    }    
+    printTime(record);
+    res[7] = getTimeAvg(record);
+    printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
@@ -110,43 +178,71 @@ int main(int argc, char* argv[]) {
 
     // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
     // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
-    zeroArray(SIZE, b);
     printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    for (int i = 0; i < repeatTime; i++) {
+      zeroArray(SIZE, b);
+      count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+      record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+    }
+    printTime(record);
+    res[8] = getTimeAvg(record);
+    printElapsedTime(getTimeAvg(record), "(std::chrono Measured)");
     expectedCount = count;
     printArray(count, b, true);
     printCmpLenResult(count, expectedCount, b, b);
 
-    zeroArray(SIZE, c);
     printDesc("cpu compact without scan, non-power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    for (int i = 0; i < repeatTime; i++) {
+      zeroArray(SIZE, c);
+      count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
+      record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+    }
+    printTime(record);
+    res[9] = getTimeAvg(record);
+    printElapsedTime(getTimeAvg(record), "(std::chrono Measured)");
     expectedNPOT = count;
     printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
-    zeroArray(SIZE, c);
     printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    for (int i = 0; i < repeatTime; i++) {
+      zeroArray(SIZE, c);
+      count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+      record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+    }
+    printTime(record);
+    res[10] = getTimeAvg(record);
+    printElapsedTime(getTimeAvg(record), "(std::chrono Measured)");
     printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
-    zeroArray(SIZE, c);
     printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    for (int i = 0; i < repeatTime; i++) {
+      zeroArray(SIZE, c);
+      count = StreamCompaction::Efficient::compact(SIZE, c, a);
+      record[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+    }
+    printTime(record);
+    res[11] = getTimeAvg(record);
+    printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
     //printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
-    zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    for (int i = 0; i < repeatTime; i++) {
+      zeroArray(SIZE, c);
+      count = StreamCompaction::Efficient::compact(NPOT, c, a);
+      record[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+    }
+    printTime(record);
+    res[12] = getTimeAvg(record);
+    printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
     //printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
+    std::cout << "Result for n = " << SIZE << " is :" << std::endl;
+    printTime(res);
+
     system("pause"); // stop Win32 console from closing on exit
     delete[] a;
     delete[] b;
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..7a373c6 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,9 @@ namespace StreamCompaction {
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-            // TODO
+            const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+            // Threads past the end of the array do nothing; the rest write 1 for non-zero input, 0 otherwise.
+            if (tid < n) bools[tid] = (idata[tid] != 0) ? 1 : 0;
         }
 
         /**
@@ -32,7 +34,11 @@ namespace StreamCompaction {
          */
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices) {
-            // TODO
+            const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+            // Only in-range elements whose flag is 1 are written;
+            // indices[tid] gives the destination slot in odata.
+            if (tid < n && bools[tid] == 1)
+              odata[indices[tid]] = idata[tid];
         }
 
     }
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index d2c1fed..aa7d58c 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -11,7 +11,14 @@
 #include <stdexcept>
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#ifdef _DEBUG  // CUDA error checks are active only when _DEBUG is defined
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#else  // otherwise checkCUDAError(msg) expands to nothing, so CUDA errors go unreported
+#define checkCUDAError(msg)
+#endif // _DEBUG
+
+// Threads per block used by the kernel launches in efficient.cu and naive.cu.
+#define blockSize 128
 
 /**
  * Check for CUDA errors; print and exit if there was a problem.
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..2172a8a 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,7 +19,10 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int running = 0;  // sum of idata[0..i-1]
+            odata[0] = running;
+            for (int i = 1; i < n; ++i)
+              odata[i] = (running += idata[i - 1]);
             timer().endCpuTimer();
         }
 
@@ -30,9 +33,14 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int kept = 0;  // number of non-zero elements written so far
+            for (int i = 0; i < n; ++i) {
+              if (idata[i] == 0) continue;  // zeros are dropped
+              odata[kept] = idata[i];
+              ++kept;
+            }
             timer().endCpuTimer();
-            return -1;
+            return kept;
         }
 
         /**
@@ -41,10 +49,27 @@ namespace StreamCompaction {
          * @returns the number of elements remaining after compaction.
          */
         int compactWithScan(int n, int *odata, const int *idata) {
+            int *flags = new int[n],
+                *slots = new int[n];
             timer().startCpuTimer();
-            // TODO
+            for (int i = 0; i < n; ++i)
+              flags[i] = (idata[i] != 0) ? 1 : 0;  // 1 = keep, 0 = drop
+            // Exclusive scan of the flags: slots[i] is the output position of element i.
+            slots[0] = 0;
+            for (int i = 1; i < n; ++i) {
+              slots[i] = slots[i - 1] + flags[i - 1];
+            }
+            // Scatter the kept elements and count them.
+            int kept = 0;
+            for (int i = 0; i < n; ++i) {
+              if (flags[i] != 1) continue;
+              odata[slots[i]] = idata[i];
+              ++kept;
+            }
             timer().endCpuTimer();
-            return -1;
+            delete[] flags;
+            delete[] slots;
+            return kept;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..4878221 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,13 +12,73 @@ namespace StreamCompaction {
             return timer;
         }
 
+        __global__ void upSweep(int numThreads, int *data, int d) {
+          const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+          if (tid >= numThreads) return;  // surplus threads in the last block
+          const int stride = 1 << d;
+          const int last = stride * tid + stride - 1;  // final element of this thread's stride-wide segment
+          data[last] += data[last - (stride >> 1)];
+        }
+
+        __global__ void downSweep(int numThreads, int *data, int d) {
+          const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+          if (tid >= numThreads) return;  // surplus threads in the last block
+          const int stride = 1 << d;
+          const int parent = stride * tid + stride - 1;  // final element of this thread's segment
+          const int leftChild = parent - (stride >> 1);
+          const int oldLeft = data[leftChild];
+          data[leftChild] = data[parent];  // left slot receives the parent's value
+          data[parent] += oldLeft;         // parent slot receives parent + previous left value
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
+        void scan(int n, int *dev_odata, const int *dev_idata, bool callFromMain) {  // callFromMain: true = host pointers + GPU timing; false = device pointers, no timing
+          int iterations = ilog2ceil(n);
+          int nextN = 1 << iterations;  // working length; assumes ilog2ceil(n) is ceil(log2(n)) so nextN >= n (helper not visible here)
+          int *dev_idata_temp;  // scratch buffer of nextN ints; both sweeps run in place on it
+          cudaMalloc((void **) &dev_idata_temp, nextN * sizeof(int));
+          checkCUDAError("SCAN: cudaMalloc dev_idata_temp failed");
+          cudaMemset(dev_idata_temp, 0, nextN *sizeof(int));  // zero-fill so the slots beyond n start at 0
+          checkCUDAError("SCAN: cudaMemset dev_idata_temp failed");
+          if (callFromMain) {
+              cudaMemcpy(dev_idata_temp, dev_idata, sizeof(int) * n, cudaMemcpyHostToDevice);  // input is copied as host memory here
+              timer().startGpuTimer();  // timing starts after the allocation and the input copy
+          }
+          else {
+              cudaMemcpy(dev_idata_temp, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToDevice);  // input is copied as device memory here
+          }
+          checkCUDAError("SCAN: cudaMemcpy dev_idata_temp failed");
+
+          // Up-sweep
+          for (int d = 1; d <= iterations; d++) {
+            int numThreads = 1 << (iterations - d);  // nextN / 2^d threads, one per element updated at this level
+            dim3 blocks((numThreads + blockSize - 1) / blockSize);
+            upSweep<<<blocks, blockSize>>>(numThreads, dev_idata_temp, d);
+            checkCUDAError("SCAN: upSweep failed");
+          }
+
+          // Down-sweep
+          // Set the "root" to 0
+          cudaMemset(&dev_idata_temp[nextN - 1], 0, sizeof(int));
+          for (int d = iterations; d >= 1; d--) {
+            int numThreads = 1 << (iterations - d);  // starts at 1 thread and doubles each level
+            dim3 blocks((numThreads + blockSize - 1) / blockSize);
+            downSweep<<<blocks, blockSize>>>(numThreads, dev_idata_temp, d);
+            checkCUDAError("SCAN: downSweep failed");
+          }
+          // Copy the first n results out; the padding beyond n is discarded.
+          if (callFromMain) {
+              timer().endGpuTimer();  // timing stops before the copy back to the host
+              cudaMemcpy(dev_odata, dev_idata_temp, sizeof(int) * n, cudaMemcpyDeviceToHost);
+          }
+          else {
+              cudaMemcpy(dev_odata, dev_idata_temp, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+          }
+          checkCUDAError("SCAN: cudaMemcpy dev_odata failed");
+
+          cudaFree(dev_idata_temp);
         }
 
         /**
@@ -31,10 +91,42 @@ namespace StreamCompaction {
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-            return -1;
+          // Empty input: nothing is kept. Returning before any timing avoids a zero-block launch and the [n - 1] reads below.
+          if (n <= 0) return 0;
+          int *bools, *indices, *dev_idata, *dev_odata;  // device buffers of n ints each
+          cudaMalloc((void**) &bools, sizeof(int) * n);
+          checkCUDAError("COMPACT: cudaMalloc bools failed");
+          cudaMalloc((void**) &indices, sizeof(int) * n);
+          checkCUDAError("COMPACT: cudaMalloc indices failed");
+          cudaMalloc((void**) &dev_idata, sizeof(int) * n);
+          checkCUDAError("COMPACT: cudaMalloc dev_idata failed");
+          cudaMalloc((void**) &dev_odata, sizeof(int) * n);
+          checkCUDAError("COMPACT: cudaMalloc dev_odata failed");
+          cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+          checkCUDAError("COMPACT: cudaMemcpy idata->dev_idata failed");
+
+          timer().startGpuTimer();  // timed region: map, scan, scatter
+          dim3 blocks((n + blockSize - 1) / blockSize);
+          Common::kernMapToBoolean<<<blocks, blockSize>>>(n, bools, dev_idata);
+          checkCUDAError("COMPACT: kernMapToBoolean failed");
+          scan(n, indices, bools, false);  // false: device pointers, and scan leaves the timer alone
+          Common::kernScatter<<<blocks, blockSize>>>(n, dev_odata, dev_idata, bools, indices);
+          checkCUDAError("COMPACT: kernScatter failed");
+
+          timer().endGpuTimer();
+          int cnt, lastBool;
+          cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost);  // copies all n slots; only the first `count` were written by the scatter
+          checkCUDAError("COMPACT: cudaMemcpy dev_odata->odata failed");
+          // Copy the count back
+          cudaMemcpy(&cnt, &indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+          checkCUDAError("COMPACT: cudaMemcpy indices->cnt failed");
+          cudaMemcpy(&lastBool, &bools[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+          checkCUDAError("COMPACT: cudaMemcpy bools->lastBool failed");
+          cudaFree(bools);
+          cudaFree(indices);
+          cudaFree(dev_idata);
+          cudaFree(dev_odata);
+          return lastBool ? cnt + 1 : cnt;  // exclusive scan omits the last element, so add it when it was kept
         }
     }
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..5ba93b0 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -6,7 +6,7 @@ namespace StreamCompaction {
     namespace Efficient {
         StreamCompaction::Common::PerformanceTimer& timer();
 
-        void scan(int n, int *odata, const int *idata);
+        void scan(int n, int *odata, const int *idata, bool callFromMain = true);  // true: host pointers, GPU-timed; false: device pointers, untimed
 
         int compact(int n, int *odata, const int *idata);
     }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..9fd83e9 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -11,15 +11,45 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-        // TODO: __global__
+        __global__ void addPrev(int n, int *idata, int *odata, int d) {
+          const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+          if (tid >= n) return;  // surplus threads in the last block
+          const int offset = 1 << (d - 1);
+          odata[tid] = idata[tid] + (tid >= offset ? idata[tid - offset] : 0);  // elements below offset are copied unchanged
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int *dev_bufA, *dev_bufB;  // alternated as source and destination between passes
+            cudaMalloc((void **) &dev_bufA, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed");
+            cudaMalloc((void **) &dev_bufB, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed");
+            cudaMemcpy(dev_bufA, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy dev_idata failed");
+
             timer().startGpuTimer();
-            // TODO
+            const int iterations = ilog2ceil(n);
+            dim3 blocks((n + blockSize - 1) / blockSize);
+
+            // Ping-pong between the two buffers: each pass reads src and
+            // writes dst, then the roles are exchanged for the next pass.
+            int *src = dev_bufA;
+            int *dst = dev_bufB;
+            for (int d = 1; d <= iterations; ++d) {
+                addPrev<<<blocks, blockSize>>>(n, src, dst, d);
+                checkCUDAError("addPrev failed");
+                int *swapTmp = src; src = dst; dst = swapTmp;
+            }
+
             timer().endGpuTimer();
+            odata[0] = 0;  // first element of the exclusive scan
+            cudaMemcpy(odata + 1, src, sizeof(int) * (n - 1), cudaMemcpyDeviceToHost);  // src holds the latest pass's output; shift it right by one
+            checkCUDAError("cudaMemcpy odata failed");
+            cudaFree(dev_bufA);
+            cudaFree(dev_bufB);
         }
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..cb3bf9d 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,13 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            thrust::host_vector<int> hostIn(idata, idata + n);
+            thrust::device_vector<int> devIn(hostIn);  // host-to-device copy, done before timing starts
+            thrust::device_vector<int> devOut(n);
             timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(devIn.begin(), devIn.end(), devOut.begin());
             timer().endGpuTimer();
+            thrust::copy(devOut.begin(), devOut.end(), odata);  // device-to-host copy, outside the timed region
         }
     }
 }