diff --git a/README.md b/README.md
index 0e38ddb..b7acfec 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,171 @@ CUDA Stream Compaction

 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* SPENCER WEBSTER-BASS
+  * [LinkedIn](https://www.linkedin.com/in/spencer-webster-bass/)
+* Tested on: Windows 10, i7-6700 @ 3.40GHz 16GB, Quadro P1000 4GB (MOR100B-19)

-### (TODO: Your README)
+### DESCRIPTION

-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+This project implements the scan (prefix-sum) and stream compaction parallel algorithms on the GPU using CUDA and C++.
+
+Features:
+* Serial implementation of the scan and stream compaction algorithms on the CPU
+* Naive, parallel implementation of the scan and stream compaction algorithms on the GPU
+* Attempted work-efficient, parallel implementation of the scan and stream compaction algorithms on the GPU
+* Comparison of my implementations against Thrust's implementation of the exclusive scan algorithm
+
+TODOs:
+* Include performance analysis
+
+Test program output (SIZE = 1 << 4):
+
+```
+****************
+** SCAN TESTS **
+****************
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0ms (std::chrono Measured)
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0ms (std::chrono Measured)
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 ]
+    passed
+==== naive scan, power-of-two ====
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+    [ 0 49 57 10 29 47 64 65 48 76 52 23 23 19 19 34 ]
+    [ 0 49 57 59 86 57 93 112 112 141 100 99 75 42 42 53 ]
+    [ 0 49 57 59 86 106 150 171 198 198 193 211 187 183 142 152 ]
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
+   elapsed time: 7.58922ms (CUDA Measured)
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
+    passed
+==== 1s array for finding bugs ====
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+    [ 0 49 57 10 29 47 64 65 48 76 52 23 23 19 19 34 ]
+    [ 0 49 57 59 86 57 93 112 112 141 100 99 75 42 42 53 ]
+    [ 0 49 57 59 86 106 150 171 198 198 193 211 187 183 142 152 ]
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 289 292 323 ]
+==== naive scan, non-power-of-two ====
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 3 31 0 ]
+    [ 49 8 2 27 20 44 21 27 49 3 20 3 16 ]
+    [ 0 49 57 10 29 47 64 65 48 76 52 23 23 3 0 0 ]
+    [ 0 49 57 59 86 57 93 112 112 141 100 99 75 26 23 3 ]
+    [ 0 49 57 59 86 106 150 171 198 198 193 211 187 167 123 102 ]
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 273 273 273 ]
+   elapsed time: 15.0825ms (CUDA Measured)
+    [ 0 49 57 59 86 106 150 171 198 247 250 270 273 0 0 0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0ms (CUDA Measured)
+    a[1] = 49, b[1] = 0
+    FAIL VALUE
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0ms (CUDA Measured)
+    a[1] = 49, b[1] = 0
+    FAIL VALUE
+==== thrust scan, power-of-two ====
+   elapsed time: 0.083008ms (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.069632ms (CUDA Measured)
+    passed
+```
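+
+As a small worked example (illustrative values, not taken from the runs above or below), stream compaction removes zeros in three steps: map to booleans, exclusive-scan the booleans, then scatter.
+
+```
+input:    [ 3 0 1 0 2 ]
+bools:    [ 1 0 1 0 1 ]
+indices:  [ 0 1 1 2 2 ]   (exclusive scan of bools)
+scatter:  output[indices[i]] = input[i] wherever bools[i] == 1
+output:   [ 3 1 2 ]       (count = indices[n-1] + bools[n-1] = 3)
+```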
+
+```
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 1 2 2 1 2 0 1 3 1 3 0 3 0 3 3 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.0034ms (std::chrono Measured)
+    [ 0 0 0 0 0 0 0 0 0 0 0 0 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.004ms (std::chrono Measured)
+    [ 0 0 0 0 0 0 0 0 0 0 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 0.0023ms (std::chrono Measured)
+    [ ]
+    expected 12 elements, got -1
+    FAIL COUNT
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0ms (CUDA Measured)
+    expected 12 elements, got -1
+    FAIL COUNT
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0ms (CUDA Measured)
+    expected 10 elements, got -1
+    FAIL COUNT
+```
diff --git a/notes.txt b/notes.txt
new file mode 100644
index 0000000..c1f8511
--- /dev/null
+++ b/notes.txt
@@ -0,0 +1,34 @@
+How does dim3 work, and how do dimensions work in CUDA? Does dim3 set unspecified arguments to 1 or 0?
+Does CUDA expect unused dimensions to be 1 or 0?
+
+How do these checkCUDAError calls work? They sometimes report that an error is occurring at an incorrect location if
+I don't have one at every CUDA function call.
+
+Do we need to declare new functions in header files?
+
+Inside of CUDA files, are we writing C or C++?
+Don't use classes; the coding style might be closer to C than to C++. You can pass structs to CUDA kernels.
+
+When using the memory window, is it showing GPU or CPU memory when you copy and paste an address
+from the Locals or Autos window?
+
+You can memcpy from the device back to the host.
+Start with a smaller buffer so that you can check the values in the buffers by hand.
+
+int *dev_data1, *dev_data2;
+// cudaMalloc, cudaMemcpy, etc.
+// for each iteration, launch kernels on dev_data1 and dev_data2
+int* temp = dev_data1;
+dev_data1 = dev_data2;
+dev_data2 = temp;
+
+The weird alternating output where every other value was zero was due to the book's funky way of ping-ponging.
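+
+A fuller sketch of the pointer ping-pong (kernStep, grid, and block are placeholder names, not this project's API):
+
+int *dev_ping, *dev_pong;
+// cudaMalloc both buffers; copy the (shifted) input into dev_pong
+for (int offset = 1; offset < pN; offset *= 2) {
+    // read from dev_pong, write to dev_ping
+    kernStep<<<grid, block>>>(pN, dev_ping, dev_pong, offset);
+    // swap the pointers instead of copying whole buffers
+    int* tmp = dev_ping;
+    dev_ping = dev_pong;
+    dev_pong = tmp;
+}
+// after the loop, the most recent pass's output is in dev_pong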
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..13791af 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"

-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 4; // 1 << 8; // feel free to change the size of the array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -49,22 +49,25 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
+    printArray(SIZE, a, true);
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);

-    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
+    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan */
     onesArray(SIZE, c);
     printDesc("1s array for finding bugs");
+    printArray(SIZE, a, true);
     StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
+    printArray(SIZE, c, true);

     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
+    printArray(SIZE, a, true);
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);

     zeroArray(SIZE, c);
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..222ee88 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,13 @@ namespace StreamCompaction {
          */
        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
            // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n)
+                return;
+            // flag each element: 1 if it should be kept (nonzero), 0 otherwise
+            if (idata[index])
+                bools[index] = 1;
+            else
+                bools[index] = 0;
        }

        /**
@@ -33,6 +40,11 @@ namespace StreamCompaction {
        __global__ void kernScatter(int n, int *odata, const int *idata,
                const int *bools, const int *indices) {
            // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n)
+                return;
+            // each kept element lands at the slot named by its scanned index
+            if (bools[index])
+                odata[indices[index]] = idata[index];
        }
    }
}
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..2b6b05e 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,4 +1,5 @@
 #include <cstdio>
+#include <vector>
 #include "cpu.h"
 #include "common.h"
@@ -18,9 +19,12 @@ namespace StreamCompaction {
         * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
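+         * Example: an exclusive scan of [3 1 7 0 4] is [0 3 4 11 11].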
         */
        void scan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
+            // timer disabled: compactWithScan() calls scan() while its own CPU
+            // timer is already running, and nested startCpuTimer() calls throw
+            // timer().startCpuTimer();
            // TODO
-            timer().endCpuTimer();
+            odata[0] = 0;
+            for (int i = 1; i < n; i++)
+                odata[i] = odata[i - 1] + idata[i - 1];
+            // timer().endCpuTimer();
        }

        /**
@@ -31,8 +35,13 @@ namespace StreamCompaction {
        int compactWithoutScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+            // write the kept (nonzero) elements straight into odata
+            int count = 0;
+            for (int i = 0; i < n; i++)
+                if (idata[i])
+                    odata[count++] = idata[i];
            timer().endCpuTimer();
-            return -1;
+            return count;
        }

        /**
@@ -43,8 +52,38 @@ namespace StreamCompaction {
        int compactWithScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+            // step 1: compute the 0/1 mask
+            std::vector<int> mask(n);
+            for (int i = 0; i < n; i++)
+                mask[i] = (idata[i] == 0) ? 0 : 1;
+
+            // step 2: exclusive scan of the mask gives each kept element's output index
+            std::vector<int> indices(n);
+            scan(n, indices.data(), mask.data());
+
+            // step 3: scatter; because the scan is exclusive, the total count is
+            // the last index plus the last mask bit
+            int count = indices[n - 1] + mask[n - 1];
+            for (int i = 0; i < n; i++)
+                if (mask[i])
+                    odata[indices[i]] = idata[i];
+
            timer().endCpuTimer();
-            return -1;
+            return count;
        }
    }
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..cb799f3 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,13 +12,122 @@ namespace StreamCompaction {
            return timer;
        }

+        // Reference implementation from GPU Gems 3, Chapter 39 ("Parallel
+        // Prefix Sum (Scan) with CUDA"); shared-memory, single-block only,
+        // kept here for study.
+        __global__ void prescan(int n, float* g_odata, float* g_idata) {
+            extern __shared__ float temp[]; // allocated on invocation
+            int thid = threadIdx.x;
+            int offset = 1;
+            // load input into shared memory
+            temp[2 * thid] = g_idata[2 * thid];
+            temp[2 * thid + 1] = g_idata[2 * thid + 1];
+
+            // build sums in place up the tree
+            for (int d = n >> 1; d > 0; d >>= 1) {
+                __syncthreads();
+                if (thid < d) {
+                    int ai = offset * (2 * thid + 1) - 1;
+                    int bi = offset * (2 * thid + 2) - 1;
+                    temp[bi] += temp[ai];
+                }
+                offset *= 2;
+            }
+            if (thid == 0) { temp[n - 1] = 0; } // clear the last element
+
+            // traverse down the tree and build the scan
+            for (int d = 1; d < n; d *= 2) {
+                offset >>= 1;
+                __syncthreads();
+                if (thid < d) {
+                    int ai = offset * (2 * thid + 1) - 1;
+                    int bi = offset * (2 * thid + 2) - 1;
+                    float t = temp[ai];
+                    temp[ai] = temp[bi];
+                    temp[bi] += t;
+                }
+            }
+            __syncthreads();
+            // write results to device memory
+            g_odata[2 * thid] = temp[2 * thid];
+            g_odata[2 * thid + 1] = temp[2 * thid + 1];
+        }
+
+        // one up-sweep (reduce) pass over global memory: each of the d active
+        // threads adds a left partial sum into its right sibling, in place
+        __global__ void kernUpSweep(int pN, int* data, int offset, int d) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx >= d)
+                return;
+            int ai = offset * (2 * idx + 1) - 1;
+            int bi = offset * (2 * idx + 2) - 1;
+            data[bi] += data[ai];
+        }
+
+        // one down-sweep pass: swap each left partial sum with its parent and
+        // accumulate, turning the reduction tree into an exclusive scan
+        __global__ void kernDownSweep(int pN, int* data, int offset, int d) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx >= d)
+                return;
+            int ai = offset * (2 * idx + 1) - 1;
+            int bi = offset * (2 * idx + 2) - 1;
+            int t = data[ai];
+            data[ai] = data[bi];
+            data[bi] += t;
+        }
+
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
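+         * Work-efficient (Blelloch) scan: an up-sweep builds a reduction tree in
+         * place, then a down-sweep converts it into an exclusive scan, for O(n)
+         * total adds versus the naive version's O(n log n).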
         */
        void scan(int n, int *odata, const int *idata) {
+            int* dev_data;
+
+            int depth = ilog2ceil(n);
+            // remember: numbers are read from right to left
+            int pN = 1 << depth; // n rounded up to the next power of two = n after padding
+
+            // zero-pad the buffer so the balanced reduction tree is complete
+            cudaMalloc((void**)&dev_data, pN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_data failed!");
+            cudaMemset(dev_data, 0, pN * sizeof(int));
+            cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy idata to dev_data failed!");
+
+            // for most GPUs, 1024 is the maximum number of threads per block
+            int threadsPerBlock = 1024;
+            // unused dim3 dimensions must be 1, not 0
+            dim3 blockDim(threadsPerBlock, 1, 1);
+
            timer().startGpuTimer();
+            // up-sweep: d active threads per pass, halving every pass
+            int offset = 1;
+            for (int d = pN >> 1; d > 0; d >>= 1) {
+                dim3 gridDim((d + threadsPerBlock - 1) / threadsPerBlock, 1, 1);
+                kernUpSweep<<<gridDim, blockDim>>>(pN, dev_data, offset, d);
+                checkCUDAError("kernUpSweep failed!");
+                offset *= 2;
+            }
+            // clear the root, then down-sweep back toward the leaves
+            cudaMemset(dev_data + pN - 1, 0, sizeof(int));
+            for (int d = 1; d < pN; d *= 2) {
+                offset >>= 1;
+                dim3 gridDim((d + threadsPerBlock - 1) / threadsPerBlock, 1, 1);
+                kernDownSweep<<<gridDim, blockDim>>>(pN, dev_data, offset, d);
+                checkCUDAError("kernDownSweep failed!");
+            }
            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy dev_data to odata failed!");
+            cudaFree(dev_data);
        }

        /**
@@ -30,10 +139,54 @@ namespace StreamCompaction {
         * @param idata The array of elements to compact.
         * @returns The number of elements remaining after compaction.
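+         * Example: compacting [1 0 2 0 3] (zeroes removed) yields [1 2 3] and returns 3.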
         */
+        using namespace StreamCompaction::Common;
        int compact(int n, int *odata, const int *idata) {
+            int* dev_idata;
+            int* dev_odata;
+            int* dev_bools;
+            int* dev_indices;
+
+            int pN = 1 << ilog2ceil(n); // padded size for the scan tree
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy idata to dev_idata failed!");
+
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+
+            // the flags and indices are ints, matching kernMapToBoolean / kernScatter
+            cudaMalloc((void**)&dev_bools, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_bools failed!");
+            cudaMalloc((void**)&dev_indices, pN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_indices failed!");
+
+            int threadsPerBlock = 1024;
+            int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; // ceiling of n / threadsPerBlock
+            dim3 blockDim(threadsPerBlock, 1, 1); // unused dim3 dimensions must be 1, not 0
+            dim3 gridDim(blocksPerGrid, 1, 1);
+
            timer().startGpuTimer();
+            // step 1: map each element to a 0/1 keep flag
+            kernMapToBoolean<<<gridDim, blockDim>>>(n, dev_bools, dev_idata);
+
+            // step 2: exclusive scan of the flags (the same up/down sweeps as
+            // scan(), inlined here so the two GPU timers do not nest)
+            cudaMemset(dev_indices, 0, pN * sizeof(int));
+            cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            int offset = 1;
+            for (int d = pN >> 1; d > 0; d >>= 1) {
+                dim3 grid((d + threadsPerBlock - 1) / threadsPerBlock, 1, 1);
+                kernUpSweep<<<grid, blockDim>>>(pN, dev_indices, offset, d);
+                offset *= 2;
+            }
+            cudaMemset(dev_indices + pN - 1, 0, sizeof(int));
+            for (int d = 1; d < pN; d *= 2) {
+                offset >>= 1;
+                dim3 grid((d + threadsPerBlock - 1) / threadsPerBlock, 1, 1);
+                kernDownSweep<<<grid, blockDim>>>(pN, dev_indices, offset, d);
+            }
+
+            // step 3: scatter the kept elements to their scanned indices
+            kernScatter<<<gridDim, blockDim>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
            timer().endGpuTimer();
+
+            // because the scan is exclusive, count = last index + last flag
+            int lastIndex, lastBool;
+            cudaMemcpy(&lastIndex, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&lastBool, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_bools);
+            cudaFree(dev_indices);
+
+            return lastIndex + lastBool;
        }
    }
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..235d47b 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,6 +3,10 @@
 #include "common.h"
 #include "naive.h"

+#include <iostream>
+#include <vector>
+#include <cstdio>
+
 namespace StreamCompaction {
    namespace Naive {
        using StreamCompaction::Common::PerformanceTimer;
@@ -11,15 +15,169 @@ namespace StreamCompaction {
            static PerformanceTimer timer;
            return timer;
        }
+        // TODO: __global__
+        // Reference from GPU Gems 3, Ch. 39: this version can handle arrays only
+        // as large as can be processed by a single thread block running on one
+        // multiprocessor of a GPU.
+        //__global__ void scan(float* g_odata, float* g_idata, int n) {
+        //    extern __shared__ float temp[]; // allocated on invocation
+        //    int thid = threadIdx.x;
+        //    int pout = 0, pin = 1;
+        //    // Load input into shared memory.
+        //    // This is an exclusive scan, so shift right by one
+        //    // and set the first element to 0.
+        //    temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;
+        //    __syncthreads();
+        //    for (int offset = 1; offset < n; offset *= 2) {
+        //        pout = 1 - pout; // swap double buffer indices
+        //        pin = 1 - pout;
+        //        if (thid >= offset)
+        //            temp[pout*n+thid] += temp[pin*n+thid - offset];
+        //        else
+        //            temp[pout*n+thid] = temp[pin*n+thid];
+        //        __syncthreads();
+        //    }
+        //    g_odata[thid] = temp[pout*n+thid]; // write output
+        //}
+
+        __global__ void kernInitExScan(int n, int pN, int* idata, int* pong) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx >= pN)
+                return;
+
+            // zero the padding of the input buffer
+            if (idx >= n)
+                idata[idx] = 0;
+            // shift the array right by one for an exclusive scan; the padding
+            // write above is not guaranteed to be visible to every thread by the
+            // time the next line runs, so pong's padding is initialized from the
+            // bounds check here instead of from idata
+            pong[idx] = (idx > 0 && idx < n) ? idata[idx - 1] : 0;
+        }
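+
+        // Note: __syncthreads() only synchronizes threads within a single block;
+        // there is no grid-wide barrier inside a kernel. Each scan pass is
+        // therefore its own kernel launch, and the in-order execution of
+        // launches on a stream is what makes the ping-pong buffers safe to read.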
+        __global__ void kernExScan(int pN, int* ping, int* pong, int offset) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx >= pN)
+                return;
+
+            // read from pong, write to ping; the host swaps the two pointers
+            // after every pass
+            if (idx >= offset)
+                ping[idx] = pong[idx] + pong[idx - offset];
+            else
+                ping[idx] = pong[idx];
+        }
+
+        using namespace std;
+        // local debug copy of printArray from testing_helpers.hpp
+        void printArray(int n, const int* a, bool abridged = false) {
+            cout << "    [ ";
+            for (int i = 0; i < n; i++) {
+                if (abridged && i + 2 == 15 && n > 16) {
+                    i = n - 2;
+                    cout << "... ";
+                }
+                cout << a[i] << " ";
+            }
+            cout << "]\n";
+        }

        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            int* dev_idata;
+            int* dev_ping;
+            int* dev_pong;
+
+            int depth = ilog2ceil(n);
+            // remember: numbers are read from right to left
+            int pN = 1 << depth; // n rounded up to the next power of two = n after padding
+
+            // allocate dev_idata (padded size) and copy idata over
+            cudaMalloc((void**)&dev_idata, pN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy idata to dev_idata failed!");
+
+            // allocate the ping-pong buffers
+            cudaMalloc((void**)&dev_ping, pN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_ping failed!");
+            cudaMalloc((void**)&dev_pong, pN * sizeof(int));
+            checkCUDAError("cudaMalloc dev_pong failed!");
+
+            // for most GPUs, 1024 is the maximum number of threads per block
+            int threadsPerBlock = 1024;
+            int blocksPerGrid = (pN + threadsPerBlock - 1) / threadsPerBlock; // ceiling of pN / threadsPerBlock
+            dim3 blockDim(threadsPerBlock);
+            dim3 gridDim(blocksPerGrid);
+
            timer().startGpuTimer();
-            // TODO
+            // initialize the shifted, padded buffers for the naive exclusive scan
+            kernInitExScan<<<gridDim, blockDim>>>(n, pN, dev_idata, dev_pong);
+            checkCUDAError("kernInitExScan failed!");
+
+            printArray(n, idata, false);
+
+            // naive exclusive scan, run in parallel; global-memory ping-pong
+            // buffers are used instead of shared memory so the data can be of
+            // arbitrary size
+            for (int offset = 1; offset < pN; offset *= 2) {
+                kernExScan<<<gridDim, blockDim>>>(pN, dev_ping, dev_pong, offset);
+                checkCUDAError("kernExScan failed!");
+
+                // debug read-back: this runs inside the timer and dominates the
+                // measured time; remove it before taking performance numbers
+                vector<int> temp_test(pN);
+                cudaMemcpy(temp_test.data(), dev_ping, sizeof(int) * pN, cudaMemcpyDeviceToHost);
+                checkCUDAError("cudaMemcpy dev_ping to temp_test failed!");
+                printArray(pN, temp_test.data(), false);
+
+                // swap the pointers rather than copying whole buffers:
+                /*cudaMemcpy(dev_temp, dev_ping, pN * sizeof(int), cudaMemcpyDeviceToDevice);
+                cudaMemcpy(dev_ping, dev_pong, pN * sizeof(int), cudaMemcpyDeviceToDevice);
+                cudaMemcpy(dev_pong, dev_temp, pN * sizeof(int), cudaMemcpyDeviceToDevice);*/
+                int* temp = dev_ping;
+                dev_ping = dev_pong;
+                dev_pong = temp;
+            }
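+            // after the final swap, the most recent pass's output lives in
+            // dev_pong, which is what gets copied back to the host below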
            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_pong, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy dev_pong to odata failed!");
+
+            cudaFree(dev_idata);
+            cudaFree(dev_ping);
+            cudaFree(dev_pong);
        }
    }
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..07fea05 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,31 @@ namespace StreamCompaction {
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            int* dev_idata;
+            int* dev_odata;
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy idata to dev_idata failed!");
+
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+
+            // wrap the raw device pointers so Thrust treats them as device iterators
+            thrust::device_ptr<int> thrust_dev_idata(dev_idata);
+            thrust::device_ptr<int> thrust_dev_odata(dev_odata);
+
            timer().startGpuTimer();
            // TODO use `thrust::exclusive_scan`
            // example: for device_vectors dv_in and dv_out:
            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(thrust_dev_idata, thrust_dev_idata + n, thrust_dev_odata);
            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
        }
    }
}