diff --git a/README.md b/README.md
index 0e38ddb..f1023e2 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,90 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Xinyu Lin
+[Linkedin](https://www.linkedin.com/in/xinyu-lin-138352125/)
+* Tested on: Windows 10, Intel(R) Core(TM) i7-6700HQ CPU@2.60GHz, 16GB, GTX960M(Private Computer)
 
-### (TODO: Your README)
+### Features
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+* CPU Scan & Stream Compaction
+* Naive GPU Scan Algorithm
+* Work-Efficient GPU Scan
+* Work-Efficient GPU Stream Compaction
+* Thrust Implementation
 
+### Performance Analysis
+* BlockSize over Scan methods(ArraySize: 256)
+![](img/bsscan.png)
+
+* BlockSize over Compact methods(ArraySize: 256)
+![](img/bscompact.png)
+
+* ArraySize over Scan methods(BlockSize: 512)
+![](img/as256scan.png)
+![](img/as512scan'.png)
+![](img/as1024scan.png)
+
+* ArraySize over Compact methods(BlockSize: 512)
+![](img/as256compact.png)
+![](img/as512compact.png)
+![](img/as1024compact.png)
+
+### Result
+```
+****************
+** SCAN TESTS **
+****************
+    [  33  32  23  11   4  10  29  49   9  36  47  43  19 ...  20   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.001185ms    (std::chrono Measured)
+    [   0  33  65  88  99 103 113 142 191 200 236 283 326 ... 6332 6352 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.001185ms    (std::chrono Measured)
+    [   0  33  65  88  99 103 113 142 191 200 236 283 326 ... 6208 6251 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.240992ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.189888ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.262976ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.228768ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.001088ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.001088ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   1   0   2   2   0   2   3   2   0   0   3   1   0 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.001975ms    (std::chrono Measured)
+    [   1   2   2   2   3   2   3   1   3   2   1   2   1 ...   2   3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.001976ms    (std::chrono Measured)
+    [   1   2   2   2   3   2   3   1   3   2   1   2   1 ...   2   3 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 0.008692ms    (std::chrono Measured)
+    [   1   2   2   2   3   2   3   1   3   2   1   2   1 ...   2   3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.499232ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.43552ms    (CUDA Measured)
+    passed
+```
+    
+
+  
diff --git a/img/as1024compact.png b/img/as1024compact.png
new file mode 100644
index 0000000..d82518f
Binary files /dev/null and b/img/as1024compact.png differ
diff --git a/img/as1024scan.png b/img/as1024scan.png
new file mode 100644
index 0000000..adb9952
Binary files /dev/null and b/img/as1024scan.png differ
diff --git a/img/as256compact.png b/img/as256compact.png
new file mode 100644
index 0000000..0e3a631
Binary files /dev/null and b/img/as256compact.png differ
diff --git a/img/as256scan.png b/img/as256scan.png
new file mode 100644
index 0000000..9667086
Binary files /dev/null and b/img/as256scan.png differ
diff --git a/img/as512compact.png b/img/as512compact.png
new file mode 100644
index 0000000..9dbb95e
Binary files /dev/null and b/img/as512compact.png differ
diff --git a/img/as512scan'.png b/img/as512scan'.png
new file mode 100644
index 0000000..4d17317
Binary files /dev/null and b/img/as512scan'.png differ
diff --git a/img/bscompact.png b/img/bscompact.png
new file mode 100644
index 0000000..7cf1215
Binary files /dev/null and b/img/bscompact.png differ
diff --git a/img/bsscan.png b/img/bsscan.png
new file mode 100644
index 0000000..eb78182
Binary files /dev/null and b/img/bsscan.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 1850161..60494e7 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -79,6 +79,17 @@ int main(int argc, char* argv[]) {
     StreamCompaction::Efficient::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     //printArray(NPOT, c, true);
+	//std::cout << "my result: "<< std::endl;
+	//for (int i = 0; i < SIZE; ++i)
+	//{
+	//	std::cout << c[i] << " ";
+	//}
+	//std::cout << std::endl;
+	//std::cout << "correct result: " << std::endl;
+	//for (int i = 0; i < SIZE; ++i)
+	//{
+	//	std::cout << b[i] << " ";
+	//}
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..c8709e7 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_50
     )
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 99a1b04..ca558d6 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -12,7 +12,7 @@
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
-
+#define blockSize 1024
 /**
  * Check for CUDA errors; print and exit if there was a problem.
  */
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 05ce667..42fe477 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,15 +1,15 @@
 #include <cstdio>
 #include "cpu.h"
 
-#include "common.h"
+#include "common.h"
 
 namespace StreamCompaction {
     namespace CPU {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-	        static PerformanceTimer timer;
-	        return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+	        static PerformanceTimer timer;
+	        return timer;
         }
 
         /**
@@ -20,6 +20,13 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
 	        timer().startCpuTimer();
             // TODO
+			// Serial Scan: Exclusive
+			int acc = 0;
+			for (int i = 0; i < n; i++)
+			{
+				odata[i] = acc;
+				acc = acc + idata[i];			
+			}
 	        timer().endCpuTimer();
         }
 
@@ -31,8 +38,19 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata) {
 	        timer().startCpuTimer();
             // TODO
+			int flag = 0;
+			for (int i = 0; i < n; ++i)
+			{
+				if (idata[i] != 0)
+				{
+					odata[flag] = idata[i];
+					flag++;
+				}
+				
+			}
+
 	        timer().endCpuTimer();
-            return -1;
+            return flag;
         }
 
         /**
@@ -43,8 +61,46 @@ namespace StreamCompaction {
         int compactWithScan(int n, int *odata, const int *idata) {
 	        timer().startCpuTimer();
 	        // TODO
+			// predicate
+			int acc = 0;
+			int* tempdata = new int[n];
+			for (int i = 0; i < n; ++i)
+			{
+				if (idata[i] == 0)
+				{
+					tempdata[i] = 0;
+
+				}
+				else
+				{
+					tempdata[i] = 1;
+					acc++;
+				}
+			}
+			// ======= tempdata[] = 1110011111001111
+			// scan sum
+			int accSum = 0;
+			int* tempData2 = new int[n];
+			for (int i = 0; i < n; i++)
+			{
+				tempData2[i] = accSum;
+				accSum += tempdata[i];
+
+			}
+
+			// idata[] = 3120043215002222
+			// ======= tempdata[] = 1110011111001111
+			// tempData2[] = 0123334567899910....
+			// scatter
+			for (int i = 0; i < n; ++i)
+			{
+				odata[tempData2[i]] = idata[i];
+			}
+
+			delete[] tempdata;
+			delete[] tempData2;
 	        timer().endCpuTimer();
-            return -1;
+            return acc;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 36c5ef2..5c32efd 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,23 +2,118 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
-
+#include <iostream>
 namespace StreamCompaction {
     namespace Efficient {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
 
+		__global__ void kernelScanReduce(int n, int d, int* idata)
+		{
+			int thID = threadIdx.x + blockDim.x * blockIdx.x;
+			if (thID >= n) return;
+			int temp = 1 << d;
+			int temp2 = 1 << (d - 1);
+			if ((thID % temp) == 0)
+			{
+				idata[thID + temp - 1] = idata[thID + temp2 - 1] + idata[thID + temp - 1];
+			}
+		}
+		// two array to get result
+		//__global__ void kernelScanReduce(int n, int d, int* odata, int* idata)
+		//{
+		//	int thID = threadIdx.x + blockDim.x * blockIdx.x;
+		//	if (thID >= n) return;
+		//	int temp = 1 << d;
+		//	int temp2 = 1 << (d - 1);
+		//	odata[thID] = idata[thID];
+		//	if ((thID % temp) == 0)
+		//	{
+		//		odata[thID + temp - 1] = idata[thID + temp2 - 1] + idata[thID + temp - 1];
+		//	}
+		//}
+		//__global__ void kernelScanDownSweep(int n, int d, int* odata, int* idata)
+		//{
+ 	//		int thID = threadIdx.x + blockDim.x * blockIdx.x;
+		//	if (thID >= n) return;
+		//	int tempdp1 = 1 << (d + 1);
+		//	int tempd = 1 << d;
+		//	odata[thID] = idata[thID];
+		//	if ((thID % tempdp1) == 0)
+		//	{
+		//		int t = idata[thID + tempd - 1];
+		//		odata[thID + tempd - 1] = idata[thID + tempdp1 - 1];
+		//		odata[thID + tempdp1 - 1] = t + idata[thID + tempdp1 - 1];
+		//	}
+		//}
+
+
+		__global__ void kernelScanDownSweep(int n, int d, int* idata)
+		{
+			int thID = threadIdx.x + blockDim.x * blockIdx.x;
+			if (thID >= n) return;
+			int tempdp1 = 1 << (d + 1);
+			int tempd = 1 << d;
+			if ((thID % tempdp1) == 0)
+			{
+				int t = idata[thID + tempd - 1];
+				idata[thID + tempd - 1] = idata[thID + tempdp1 - 1];
+				idata[thID + tempdp1 - 1] = t + idata[thID + tempdp1 - 1];
+			}
+		}
+
+		__global__ void kernelChangeN1(int *arr, int index, int identity)
+		{
+				arr[index] = identity;			
+		}
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
+
+		void myScan(int n, int temp, int* idata)
+		{
+			dim3 fullBlocksPerGrid((temp + blockSize - 1) / blockSize);
+			int myIdentity = 0;
+			for (int d = 1; d <= ilog2ceil(n); ++d)
+			{
+				kernelScanReduce << <fullBlocksPerGrid, blockSize >> > (temp, d, idata);
+			}
+
+			kernelChangeN1 << < 1, 1 >> > (idata, temp - 1, myIdentity);
+
+			for (int d = ilog2ceil(n) - 1; d >= 0; --d)
+			{
+				kernelScanDownSweep << <fullBlocksPerGrid, blockSize >> > (temp, d, idata);
+			}
+		}
         void scan(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
             // TODO
+			int temp = 1 << ilog2ceil(n);
+
+			dim3 fullBlocksPerGrid((temp + blockSize - 1) / blockSize);
+			int* dev_In = NULL;
+			int* dev_Out = NULL;
+			cudaMalloc((void**)&dev_In, temp * sizeof(int));
+			checkCUDAError("Malloc dev_In failed!");
+			cudaMalloc((void**)&dev_Out, temp * sizeof(int));
+			checkCUDAError("Malloc dev_Out failed!");
+			cudaMemcpy(dev_In, idata, temp * sizeof(int), cudaMemcpyHostToDevice);
+			checkCUDAError("Memcpy from idata to dev_In failed!");
+			myScan(n, temp, dev_In);
+			
+			std::swap(dev_Out, dev_In);
+			cudaMemcpy(odata, dev_Out, temp * sizeof(int), cudaMemcpyDeviceToHost);
+			checkCUDAError("Memcoy from dev_Out to odata failed!");
+
             timer().endGpuTimer();
+
+			cudaFree(dev_In);
+			cudaFree(dev_Out);
         }
 
         /**
@@ -30,11 +125,84 @@ namespace StreamCompaction {
          * @param idata  The array of elements to compact.
          * @returns      The number of elements remaining after compaction.
          */
+
+
+		__global__ void kernelPredicate(int n, int* odata, const int* idata)
+		{
+			int thID = threadIdx.x + blockDim.x * blockIdx.x;
+			if (thID >= n) return;
+			if (idata[thID] == 0)
+			{
+				odata[thID] = 0;
+			}
+			else
+			{
+				odata[thID] = 1;
+			}
+
+		}
+		__global__ void kernelScatter(int n, int* odata, int* myBool ,int* address, int* idata)
+		{
+			int thID = threadIdx.x + blockDim.x * blockIdx.x;
+			if (thID >= n) return;
+			if (myBool[thID] == 1)
+			{
+				odata[address[thID]] = idata[thID];
+			}
+
+		}
         int compact(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
             // TODO
-            timer().endGpuTimer();
-            return -1;
+			int temp = 1 << ilog2ceil(n);
+			dim3 fullBlocksPerGrid((temp + blockSize - 1) / blockSize);
+			int* dev_In = NULL;
+			int* dev_Out = NULL;
+			int* dev_Bool = NULL;
+			int* dev_Address = NULL;
+
+			cudaMalloc((void**)&dev_In, temp * sizeof(int));
+			checkCUDAError("compact: malloc dev_In failed!");
+			cudaMalloc((void**)&dev_Out, temp * sizeof(int));
+			checkCUDAError("compact: malloc dev_Out failed!");
+			cudaMalloc((void**)&dev_Bool, temp * sizeof(int));
+			checkCUDAError("compact: malloc dev_Bool failed!");
+			cudaMalloc((void**)&dev_Address, temp * sizeof(int));
+			checkCUDAError("compact: malloc dev_Address failed!");
+
+			cudaMemcpy(dev_In, idata, temp * sizeof(int), cudaMemcpyHostToDevice);
+			checkCUDAError("compact: memcpy from idata to dev_In failed!");
+
+			kernelPredicate << < fullBlocksPerGrid, blockSize >> > (temp, dev_Bool, dev_In);
+
+			cudaMemcpy(dev_Address, dev_Bool, temp * sizeof(int), cudaMemcpyDeviceToDevice);
+			checkCUDAError("compact: memcoy from dev_Bool to dev_Address failed!");
+
+			myScan(n, temp, dev_Address);
+
+			kernelScatter << <fullBlocksPerGrid, blockSize >> > (n, dev_Out, dev_Bool, dev_Address, dev_In);
+			
+			cudaMemcpy(odata, dev_Out, temp * sizeof(int), cudaMemcpyDeviceToHost);
+
+			cudaFree(dev_In);
+			cudaFree(dev_Out);
+			cudaFree(dev_Bool);
+			cudaFree(dev_Address);
+
+			int flag = 0;
+			for (int i = 0; i < n; ++i)
+			{
+				if (odata[i] != 0)
+				{
+					flag++;
+				}
+				else
+				{
+					break;
+				}
+			}
+			timer().endGpuTimer();
+			return flag;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 9218f8e..55ede8c 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,24 +2,72 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
+#include <iostream>
+
+
 
 namespace StreamCompaction {
     namespace Naive {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
         // TODO: __global__
-
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
+
+		__global__ void kernelScan(int n, int d, int* odata, int* idata)
+		{
+			int thID = threadIdx.x + blockDim.x * blockIdx.x;
+			int temp = 1 << (d - 1);
+
+			if (thID >= n) return;
+			if (thID >= temp)
+			{
+				odata[thID] = idata[thID - temp] + idata[thID];
+			}
+			else
+			{
+				odata[thID] = idata[thID];
+			}
+			
+		}
         void scan(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
             // TODO
+
+			dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+			int* dev_tempIn = NULL;
+			int* dev_tempOut = NULL;
+
+			cudaMalloc((void**)&dev_tempIn, n * sizeof(int));
+			checkCUDAError("Malloc memory to dev_tempIn failed!");
+			cudaMalloc((void**)&dev_tempOut, n * sizeof(int));
+			checkCUDAError("Malloc memory to dev_tempOut failed!");
+			cudaMemcpy(dev_tempIn, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+			checkCUDAError("Memory copy from host to device failed!");
+
+			for (int d = 1; d <= ilog2ceil(n); ++d)
+			{
+				kernelScan << <fullBlocksPerGrid, blockSize >> > (n, d, dev_tempOut, dev_tempIn);
+				std::swap(dev_tempOut, dev_tempIn);
+			}
+			std::swap(dev_tempOut, dev_tempIn);
+
+			odata[0] = 0;
+			cudaMemcpy(odata + 1, dev_tempOut, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+			checkCUDAError("Memory copy from device to host failed!");
+			//for (int i = 0; i < n; ++i)
+			//{
+			//	std::cout << odata[i] << " ";
+			//}
             timer().endGpuTimer();
+
+			cudaFree(dev_tempIn);
+			cudaFree(dev_tempOut);
         }
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 36b732d..d1083c9 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -8,20 +8,29 @@
 
 namespace StreamCompaction {
     namespace Thrust {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
+
+			//thrust::host_vector<int> host_thrust_in = idata;
+			//thrust::host_vector<int> host_thrust_out = odata;
+
+			//thrust::host_vector<int> host_thrust_in(n);
+			
+			//thrust::device_vector<int> dev_thrust_in = host_thrust_in;
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+			thrust::exclusive_scan(idata, idata + n, odata);
             timer().endGpuTimer();
         }
     }