diff --git a/Project2/Project2.sln b/Project2/Project2.sln new file mode 100755 index 0000000..05eb50c --- /dev/null +++ b/Project2/Project2.sln @@ -0,0 +1,26 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Project2", "Project2\Project2.vcxproj", "{75C44C78-8F9A-474E-9DA0-44F16C438147}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Debug|Win32.ActiveCfg = Debug|Win32 + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Debug|Win32.Build.0 = Debug|Win32 + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Debug|x64.ActiveCfg = Debug|x64 + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Debug|x64.Build.0 = Debug|x64 + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Release|Win32.ActiveCfg = Release|Win32 + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Release|Win32.Build.0 = Release|Win32 + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Release|x64.ActiveCfg = Release|x64 + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Project2/Project2.suo b/Project2/Project2.suo new file mode 100755 index 0000000..c259d57 Binary files /dev/null and b/Project2/Project2.suo differ diff --git a/Project2/Project2/Debug/Project2.log b/Project2/Project2/Debug/Project2.log new file mode 100755 index 0000000..f3f70f7 --- /dev/null +++ b/Project2/Project2/Debug/Project2.log @@ -0,0 +1,53 @@ +Build started 9/28/2014 8:03:49 PM. + 1>Project "S:\CIS565\Project2-StreamCompaction\Project2\Project2\Project2.vcxproj" on node 2 (build target(s)). + 1>InitializeBuildStatus: + Creating "Debug\Project2.unsuccessfulbuild" because "AlwaysCreate" was specified. + AddCudaCompileDeps: + c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\cl.exe /E /nologo /showIncludes /TP /D__CUDACC__ /DWIN32 /D_DEBUG /D_CONSOLE /D_MBCS /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin" /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" /I. /FIcuda_runtime.h /c S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu + CudaBuild: + Compiling CUDA source file kernel.cu... 
+ cmd.exe /C "C:\Users\lejoyce\AppData\Local\Temp\tmp15e45a570e7549f9bbd0f46f7abf8904.cmd" + "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env --cl-version 2010 -ccbin "c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu.obj "S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu" + + C:\user>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env --cl-version 2010 -ccbin "c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu.obj "S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu" + ClCompile: + c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\CL.exe /c /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" /ZI /nologo /W3 /WX- /Od /Oy- /D WIN32 /D _DEBUG /D _CONSOLE /D _MBCS /Gm /EHsc /RTC1 /MDd /GS /fp:precise /Zc:wchar_t /Zc:forScope /Fo"Debug\\" /Fd"Debug\vc100.pdb" /Gd /TP /analyze- /errorReport:prompt main.cpp serial.cpp + serial.cpp + 1>c:\program files\nvidia gpu computing toolkit\cuda\v5.5\include\thrust\detail\allocator\allocator_traits.inl(180): warning C4003: not enough actual parameters for macro 'max' + 1>c:\program files\nvidia gpu computing toolkit\cuda\v5.5\include\thrust\system\detail\error_category.inl(102): warning C4996: 'strerror': This function or variable may be unsafe. Consider using strerror_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details. + c:\program files (x86)\microsoft visual studio 10.0\vc\include\string.h(157) : see declaration of 'strerror' + 1>c:\program files\nvidia gpu computing toolkit\cuda\v5.5\include\thrust\system\cuda\detail\for_each.inl(112): warning C4003: not enough actual parameters for macro 'max' + main.cpp + Generating Code... 
+ ManifestResourceCompile: + C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\bin\rc.exe /nologo /fo"Debug\Project2.exe.embed.manifest.res" Debug\Project2_manifest.rc + Link: + c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\link.exe /ERRORREPORT:PROMPT /OUT:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.exe" /INCREMENTAL /NOLOGO /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\lib\Win32" cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /MANIFEST /ManifestFile:"Debug\Project2.exe.intermediate.manifest" /MANIFESTUAC:"level='asInvoker' uiAccess='false'" /DEBUG /PDB:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.pdb" /SUBSYSTEM:CONSOLE /TLBID:1 /DYNAMICBASE /NXCOMPAT /IMPLIB:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.lib" /MACHINE:X86 "S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu.obj" + Debug\Project2.exe.embed.manifest.res + Debug\main.obj + Debug\serial.obj + Manifest: + C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\bin\mt.exe /nologo /verbose /out:"Debug\Project2.exe.embed.manifest" /manifest Debug\Project2.exe.intermediate.manifest + C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\bin\rc.exe /nologo /fo"Debug\Project2.exe.embed.manifest.res" Debug\Project2_manifest.rc + LinkEmbedManifest: + c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\link.exe /ERRORREPORT:PROMPT /OUT:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.exe" /INCREMENTAL /NOLOGO /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\lib\Win32" cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /MANIFEST /ManifestFile:"Debug\Project2.exe.intermediate.manifest" /MANIFESTUAC:"level='asInvoker' uiAccess='false'" /DEBUG /PDB:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.pdb" /SUBSYSTEM:CONSOLE /TLBID:1 /DYNAMICBASE /NXCOMPAT /IMPLIB:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.lib" /MACHINE:X86 "S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu.obj" + Debug\Project2.exe.embed.manifest.res + Debug\main.obj + Debug\serial.obj + Project2.vcxproj -> S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.exe + PostBuildEvent: + echo copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart*.dll" "S:\CIS565\Project2-StreamCompaction\Project2\Debug\" + copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart*.dll" "S:\CIS565\Project2-StreamCompaction\Project2\Debug\" + :VCEnd + copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart*.dll" "S:\CIS565\Project2-StreamCompaction\Project2\Debug\" + C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart32_55.dll + C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart64_55.dll + 2 file(s) copied. + FinalizeBuildStatus: + Deleting file "Debug\Project2.unsuccessfulbuild". + Touching "Debug\Project2.lastbuildstate". + 1>Done Building Project "S:\CIS565\Project2-StreamCompaction\Project2\Project2\Project2.vcxproj" (build target(s)). + +Build succeeded. 
+
+Time Elapsed 00:00:16.63
diff --git a/Project2/Project2.sln b/Project2/Project2/Project2.vcxproj
new file mode 100755
index 0000000..01928b8
--- /dev/null
+++ b/Project2/Project2/Project2.vcxproj
@@ -0,0 +1,164 @@
+[Project2.vcxproj: Visual Studio 2010 CUDA project file; the XML markup was lost in extraction.
+ Recoverable settings: Debug and Release configurations for Win32 and x64; ConfigurationType
+ Application with MultiByte character set; preprocessor definitions WIN32/WIN64, _DEBUG/NDEBUG,
+ _CONSOLE; links cudart.lib plus the standard Windows libraries; a post-build step that copies
+ "$(CudaToolkitBinDir)\cudart*.dll" to "$(OutDir)"; CudaCompile output set to
+ $(ProjectDir)%(Filename)%(Extension).obj.]
\ No newline at end of file
diff --git a/Project2/Project2/Project2.vcxproj.user b/Project2/Project2/Project2.vcxproj.user
new file mode 100755
index 0000000..695b5c7
--- /dev/null
+++ b/Project2/Project2/Project2.vcxproj.user
@@ -0,0 +1,3 @@
+[Project2.vcxproj.user: three-line per-user settings stub; the XML markup was lost in extraction.]
\ No newline at end of file
diff --git a/Project2/Project2/kernel.cu b/Project2/Project2/kernel.cu
new file mode 100755
index 0000000..bdeaeca
--- /dev/null
+++ b/Project2/Project2/kernel.cu
@@ -0,0 +1,173 @@
+// NOTE: the original include names and the <<<grid, block>>> launch arguments were lost
+// in extraction; the headers below and the launch configurations later in this file are
+// reconstructed from what the code requires and the dim3 variables in scope.
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <cstdio>
+#include <cmath>
+
+#include "kernel.h"
+
+#define BLOCK_SIZE 1024
+
+// PART 2
+__global__ void kernelScan(float* in, float* out, int size) {
+	out[0] = 0;
+	int x = threadIdx.x;
+	out[x + 1] = in[x];
+	__syncthreads();
+	for (int i = 1; i < size; i *= 2) {
+		// Bounds guard (adjusted so x + 1 - i is never negative); the in-place
+		// update can still race with reads of out[x + 1 - i] by other threads
+		// in the same iteration, since there is no double buffering.
+		if (x + 1 >= i) {
+			out[x + 1] += out[x + 1 - i];
+		}
+		__syncthreads();
+	}
+}
+
+// PART 3
+__global__ void sharedScan(float* in, float* out, int size) {
+	extern __shared__ float buffer[];
+
+	int x = threadIdx.x;
+	int offset = 1;
+	buffer[2 * x] = in[2 * x];
+	buffer[2 * x + 1] = in[2 * x + 1];
+	for (int d = (size + 1) / 2; d > 0; d /= 2) {
+		__syncthreads();
+		if (x < d) {
+			int a = offset * (2 * x + 1) - 1;
+			int b = offset * (2 * x + 2) - 1;
+			buffer[b] += buffer[a];
+		}
+		offset *= 2;
+		if (!x) {
+			buffer[size] = 0;
+		}
+		for (int d = 1; d < size + 1; d *= 2) {
+			offset /= 2;
+			__syncthreads();
+			if (x < d) {
+				int a = offset * (2 * x + 1) - 1;
+				int b = offset * (2 * x + 2) - 1;
+				float t = buffer[a];
+				buffer[a] = buffer[b];
+				buffer[b] += t;
+			}
+		}
+	}
+	out[2 * x] = buffer[2 * x];
+	out[2 * x + 1] = buffer[2 * x + 1];
+}
+
+float* scanKernel(float* in, int size) {
+	dim3 dimBlock(size + 1);
+	dim3 dimGrid(1, 1);
+
+	cudaEvent_t start, stop;
+	float time;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+
+	float *inD, *outD;
+	cudaMalloc(&inD, size * sizeof(float));
+	cudaMemcpy(inD, in, size * sizeof(float), cudaMemcpyHostToDevice);
+	cudaMalloc(&outD, (size + 1) * sizeof(float));
+	float* out = new float[size + 1];
+
+	// Running Part 2
+	cudaEventRecord(start, 0);
+	kernelScan<<<dimGrid, dimBlock>>>(inD, outD, size);
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time, start, stop);
+	printf("Naive: %f ms\t", time);
+	cudaMemcpy(out, outD, (size + 1) * sizeof(float), cudaMemcpyDeviceToHost);
+
+	// Running Part 3a
+	// No dynamic shared-memory size is passed in the launch below, so the extern
+	// buffer in sharedScan has zero length (consistent with the problem described
+	// in the README).
+	cudaEventRecord(start, 0);
+	sharedScan<<<dimGrid, dimBlock>>>(inD, outD, size);
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time, start, stop);
+	printf("Shared: %f ms\t", time);
+	//cudaMemcpy(out, outD, (size + 1) * sizeof(float), cudaMemcpyDeviceToHost);
+
+	// Running Part 3b
+	dim3 fullBlock(BLOCK_SIZE);
+	dim3 fullBlocksPerGrid((int)ceil(float(size) / float(BLOCK_SIZE)));
+	cudaEventRecord(start, 0);
+	sharedScan<<<fullBlocksPerGrid, fullBlock>>>(inD, outD, size);
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time, start, stop);
+	printf("Blocked: %f ms\n", time);
+	//cudaMemcpy(out, outD, (size + 1) * sizeof(float), cudaMemcpyDeviceToHost);
+
+	cudaFree(inD);
+	cudaFree(outD);
+	return out;
+}
+
+
+// PART 4
+__global__ void kernelScatter(float* in, float* out, int size) {
+	if (threadIdx.x < size) {
+		out[threadIdx.x] = in[threadIdx.x] != 0;
+	}
+}
+
+__global__ void streamCompactKernel(float* in, float* scat, float* scan, float* out, int size) {
+	int x = threadIdx.x;
+	if (x < size && scat[x] != 0)
+		out[int(scan[x])] = in[x];
+}
+
+float* scatterKernel(float* in, int size) {
+	dim3 dimBlock(size);
+	dim3 dimGrid(1, 1);
+
+	cudaEvent_t start, stop;
+	float time;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+
+	float *inD, *outD;
+	cudaMalloc(&inD, size * sizeof(float));
+	cudaMemcpy(inD, in, size * sizeof(float), cudaMemcpyHostToDevice);
+	cudaMalloc(&outD, size * sizeof(float));
+	float* out = new float[size];
+
+	cudaEventRecord(start, 0);
+	kernelScatter<<<dimGrid, dimBlock>>>(inD, outD, size);
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time, start, stop);
+	printf("Kernel: %f ms\n", time);
+
+	cudaMemcpy(out, outD, size * sizeof(float), cudaMemcpyDeviceToHost);
+	cudaFree(inD);
+	cudaFree(outD);
+	return out;
+}
+
+float* streamCompact(float* in, int size) {
+	dim3 dimBlock(BLOCK_SIZE);
+	dim3 dimGrid((int)ceil(float(size) / float(BLOCK_SIZE)));
+	float *inD, *outScat, *outScan, *outD;
+	cudaMalloc(&inD, size * sizeof(float));
+	cudaMemcpy(inD, in, size * sizeof(float), cudaMemcpyHostToDevice);
+	cudaMalloc(&outScat, size * sizeof(float));
+	cudaMalloc(&outScan, (size + 1) * sizeof(float));
+	cudaMalloc(&outD, size * sizeof(float));
+
+	kernelScatter<<<dimGrid, dimBlock>>>(inD, outScat, size);
+	kernelScan<<<dimGrid, dimBlock>>>(outScat, outScan, size);
+	// The scan total is stored as a float; read it back as a float and convert,
+	// rather than reinterpreting its bytes as an int.
+	float total;
+	cudaMemcpy(&total, &outScan[size], sizeof(float), cudaMemcpyDeviceToHost);
+	int s = (int)total;
+	s++;
+	float* out = new float[s];
+	streamCompactKernel<<<dimGrid, dimBlock>>>(inD, outScat, outScan, outD, s);
+
+	cudaMemcpy(out, outD, s * sizeof(float), cudaMemcpyDeviceToHost);
+	cudaFree(inD);
+	cudaFree(outScat);
+	cudaFree(outScan);
+	cudaFree(outD);
+	return out;
+}
diff --git a/Project2/Project2/kernel.h b/Project2/Project2/kernel.h
new file mode 100755
index 0000000..381b6a7
--- /dev/null
+++ b/Project2/Project2/kernel.h
@@ -0,0 +1,9 @@
+#pragma once
+#ifndef KERNEL_H
+#define KERNEL_H
+
+float* scanKernel(float* input, int size);
+float* scatterKernel(float* input, int size);
+float* streamCompact(float* input, int size);
+
+#endif
diff --git a/Project2/Project2/main.cpp b/Project2/Project2/main.cpp
new file mode 100755
index 0000000..523021d
--- /dev/null
+++ b/Project2/Project2/main.cpp
@@ -0,0 +1,88 @@
+// NOTE: the original include names were lost in extraction; these are the headers this file requires.
+#include <iostream>
+#include <conio.h>
+
+#include "kernel.h"
+#include "serial.h"
+
+#define SIZE 100
+#define PRINT
+
+using namespace std;
+
+int main() {
+	float input[SIZE];
+
+	for (int i = 0; i < SIZE; i++) {
+		input[i] = rand() % 10;
+		cout << input[i] << " ";
+	}
+	cout << endl << endl;
+
+	float* serScan = scan(input, SIZE);
+#ifdef PRINT
+	float* addr = serScan;
+	for (int i = 0; i < SIZE; i++) {
+		cout << *addr << " ";
+		addr++;
+	}
+	cout << endl << endl;
+#endif
+	delete[] serScan;
+
+	float* parScan = scanKernel(input, SIZE);
+#ifdef PRINT
+	addr = parScan;
+	for (int i = 0; i < SIZE; i++) {
+		cout << *addr << " ";
+		addr++;
+	}
+	cout << endl << endl;
+#endif
+	delete[] parScan;
+
+	float* serScatter = scatter(input, SIZE);
+#ifdef PRINT
+	addr = serScatter;
+	for (int i = 0; i < SIZE; i++) {
+		cout << *addr << " ";
+		addr++;
+	}
+	cout << endl << endl;
+#endif
+	delete[] serScatter;
+
+	float* parScatter = scatterKernel(input, SIZE);
+#ifdef PRINT
+	addr = parScatter;
+	for (int i = 0; i < SIZE; i++) {
+		cout << *addr << " ";
+		addr++;
+	}
+	cout << endl << endl;
+#endif
+	delete[] parScatter;
+
+	float* strCmp = streamCompact(input, SIZE);
+#ifdef PRINT
+	addr = strCmp;
+	for (int i = 0; i < SIZE; i++) {
+		cout << *addr << " ";
+		addr++;
+	}
+	cout << endl << endl;
+#endif
+	delete[] strCmp;
+
+	float* thr = serThrust(input, SIZE);
+#ifdef PRINT
+	addr = thr;
+	for (int i = 0; i < SIZE; i++) {
+		cout << *addr << " ";
+		addr++;
+	}
+	cout << endl << endl;
+#endif
+	delete[] thr;
+
+	_getch();
+}
diff --git a/Project2/Project2/serial.cpp b/Project2/Project2/serial.cpp
new file mode 100755
index 0000000..ab1286f
--- /dev/null
+++ b/Project2/Project2/serial.cpp
@@ -0,0 +1,57 @@
+// NOTE: the original include names were lost in extraction; these are the headers this file requires.
+#include <windows.h>
+#include <cstdio>
+#include <thrust/execution_policy.h>
+#include <thrust/copy.h>
+
+#include "serial.h"
+
+// PART 1
+float* scan(float* in, int size) {
+	LARGE_INTEGER begin, end;
+	LARGE_INTEGER frequency;
+	QueryPerformanceFrequency(&frequency);
+
+	float* out = new float[size + 1];
+	out[0] = 0;
+	QueryPerformanceCounter(&begin);
+	for (int i = 0; i < size; i++)
+		out[i+1] = out[i] + in[i];
+	QueryPerformanceCounter(&end);
+	printf("CPU: %f ms\t", (end.QuadPart - begin.QuadPart) * 1000.0 / frequency.QuadPart);
+	return out;
+}
+
+// PART 4
+float* scatter(float* in, int size) {
+	LARGE_INTEGER begin, end;
+	LARGE_INTEGER frequency;
+	QueryPerformanceFrequency(&frequency);
+
+	float* out = new float[size];
+	QueryPerformanceCounter(&begin);
+	for (int i = 0; i < size; i++)
+		out[i] = in[i] != 0;
+	QueryPerformanceCounter(&end);
+	printf("CPU: %f ms\t", (end.QuadPart - begin.QuadPart) * 1000.0 / frequency.QuadPart);
+	return out;
+}
+
+struct keep {
+	__host__ __device__ bool operator() (const int x) {
+		return x > 0;
+	}
+};
+
+float* serThrust(float* in, int size) {
+	LARGE_INTEGER begin, end;
+	LARGE_INTEGER 
frequency; + QueryPerformanceFrequency(&frequency); + + float* scat = scatter(in, size); + float* out = new float[size]; + QueryPerformanceCounter(&begin); + thrust::copy_if(thrust::host, in, in + size, scat, out, keep()); + QueryPerformanceCounter(&end); + printf("CPU: %f ms\t", (end.QuadPart - begin.QuadPart) * 1000.0 / frequency.QuadPart); + return out; +} diff --git a/Project2/Project2/serial.h b/Project2/Project2/serial.h new file mode 100755 index 0000000..350a0bc --- /dev/null +++ b/Project2/Project2/serial.h @@ -0,0 +1,9 @@ +#pragma once +#ifndef SERIAL_H +#define SERIAL_H + +float* scan(float* input, int size); +float* scatter(float* input, int size); +float* serThrust(float* input, int size); + +#endif diff --git a/README.md b/README.md index 6e02afa..0cd19f0 100644 --- a/README.md +++ b/README.md @@ -3,131 +3,22 @@ Project-2 A Study in Parallel Algorithms : Stream Compaction -# INTRODUCTION -Many of the algorithms you have learned thus far in your career have typically -been developed from a serial standpoint. When it comes to GPUs, we are mainly -looking at massively parallel work. Thus, it is necessary to reorient our -thinking. In this project, we will be implementing a couple different versions -of prefix sum. We will start with a simple single thread serial CPU version, -and then move to a naive GPU version. Each part of this homework is meant to -follow the logic of the previous parts, so please do not do this homework out of -order. - -This project will serve as a stream compaction library that you may use (and -will want to use) in your -future projects. For that reason, we suggest you create proper header and CUDA -files so that you can reuse this code later. You may want to create a separate -cpp file that contains your main function so that you can test the code you -write. - -# OVERVIEW -Stream compaction is broken down into two parts: (1) scan, and (2) scatter. - -## SCAN -Scan or prefix sum is the summation of the elements in an array such that the -resulting array is the summation of the terms before it. Prefix sum can either -be inclusive, meaning the current term is a summation of all the elements before -it and itself, or exclusive, meaning the current term is a summation of all -elements before it excluding itself. - -Inclusive: - -In : [ 3 4 6 7 9 10 ] - -Out : [ 3 7 13 20 29 39 ] - -Exclusive - -In : [ 3 4 6 7 9 10 ] - -Out : [ 0 3 7 13 20 29 ] - -Note that the resulting prefix sum will always be n + 1 elements if the input -array is of length n. Similarly, the first element of the exclusive prefix sum -will always be 0. In the following sections, all references to prefix sum will -be to the exclusive version of prefix sum. - -## SCATTER -The scatter section of stream compaction takes the results of the previous scan -in order to reorder the elements to form a compact array. - -For example, let's say we have the following array: -[ 0 0 3 4 0 6 6 7 0 1 ] - -We would only like to consider the non-zero elements in this zero, so we would -like to compact it into the following array: -[ 3 4 6 6 7 1 ] - -We can perform a transform on input array to transform it into a boolean array: - -In : [ 0 0 3 4 0 6 6 7 0 1 ] - -Out : [ 0 0 1 1 0 1 1 1 0 1 ] - -Performing a scan on the output, we get the following array : - -In : [ 0 0 1 1 0 1 1 1 0 1 ] - -Out : [ 0 0 0 1 2 2 3 4 5 5 ] - -Notice that the output array produces a corresponding index array that we can -use to create the resulting array for stream compaction. 
- -# PART 1 : REVIEW OF PREFIX SUM -Given the definition of exclusive prefix sum, please write a serial CPU version -of prefix sum. You may write this in the cpp file to separate this from the -CUDA code you will be writing in your .cu file. - -# PART 2 : NAIVE PREFIX SUM -We will now parallelize this the previous section's code. Recall from lecture -that we can parallelize this using a series of kernel calls. In this portion, -you are NOT allowed to use shared memory. - -### Questions -* Compare this version to the serial version of exclusive prefix scan. Please - include a table of how the runtimes compare on different lengths of arrays. -* Plot a graph of the comparison and write a short explanation of the phenomenon you - see here. - -# PART 3 : OPTIMIZING PREFIX SUM -In the previous section we did not take into account shared memory. In the -previous section, we kept everything in global memory, which is much slower than -shared memory. - -## PART 3a : Write prefix sum for a single block -Shared memory is accessible to threads of a block. Please write a version of -prefix sum that works on a single block. - -## PART 3b : Generalizing to arrays of any length. -Taking the previous portion, please write a version that generalizes prefix sum -to arbitrary length arrays, this includes arrays that will not fit on one block. - -### Questions -* Compare this version to the parallel prefix sum using global memory. -* Plot a graph of the comparison and write a short explanation of the phenomenon - you see here. - -# PART 4 : ADDING SCATTER -First create a serial version of scatter by expanding the serial version of -prefix sum. Then create a GPU version of scatter. Combine the function call -such that, given an array, you can call stream compact and it will compact the -array for you. Finally, write a version using thrust. - -### Questions -* Compare your version of stream compact to your version using thrust. How do - they compare? How might you optimize yours more, or how might thrust's stream - compact be optimized. - -# EXTRA CREDIT (+10) -For extra credit, please optimize your prefix sum for work parallelism and to -deal with bank conflicts. Information on this can be found in the GPU Gems -chapter listed in the references. - -# SUBMISSION -Please answer all the questions in each of the subsections above and write your -answers in the README by overwriting the README file. In future projects, we -expect your analysis to be similar to the one we have led you through in this -project. Like other projects, please open a pull request and email Harmony. - -# REFERENCES -"Parallel Prefix Sum (Scan) with CUDA." GPU Gems 3. +Prefix scan has a linear running time, and as the input size becomes larger, the +time it takes for the CPU to run increases linearly. In comparison, for the GPU, +because of the immense multithreading, the runtime is nearly constant. Thus even +drastically increasing the input size does not greatly affect the runtime. Scan and +scatter ran in virtually the same time on both the CPU and the GPU, which makes sense +because they run through the input data in the same manner.
+![Efficiency Comparison](https://github.com/leejcw/Project2-StreamCompaction/blob/master/chart.JPG)
+
+In Part 3, I was not quite able to get shared memory working. Using extern, the
+size of the dynamically allocated array was always zero, and I could not figure
+out why. When I hardcoded the shared-memory size based on the input size I knew
+I was providing, the runtime was exactly the same as with global memory, which
+leads me to think that either I was doing something wrong or the timers were
+inaccurate on the day I tested this (which also happened to be the last day).
+
+In Part 4, I was not able to get thrust working. Although I read the documentation
+and followed the examples provided online, I could not make sense of the error
+message on the console. Unfortunately, I therefore have no point of comparison
+between my implementation of stream compaction and thrust's.
diff --git a/chart.JPG b/chart.JPG
new file mode 100755
index 0000000..925b9d4
Binary files /dev/null and b/chart.JPG differ
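
A zero-sized `extern __shared__` array, as described in the Part 3 note above, is the usual symptom of launching a kernel without the optional third launch-configuration argument, which specifies the dynamic shared-memory size in bytes. Below is a minimal sketch of how that argument is normally supplied; the kernel and variable names are illustrative and not taken from the project code.

```cuda
#include <cuda_runtime.h>
#include <cstdio>

// Illustrative kernel: stages input through dynamically sized shared memory.
__global__ void stageThroughShared(const float* in, float* out, int n) {
    extern __shared__ float buffer[];   // length comes from the launch, not from here
    int i = threadIdx.x;
    buffer[i] = (i < n) ? in[i] : 0.0f;
    __syncthreads();
    if (i < n) out[i] = buffer[i];
}

int main() {
    const int n = 256;
    float host[n];
    for (int i = 0; i < n; ++i) host[i] = float(i);

    float *inD, *outD;
    cudaMalloc(&inD, n * sizeof(float));
    cudaMalloc(&outD, n * sizeof(float));
    cudaMemcpy(inD, host, n * sizeof(float), cudaMemcpyHostToDevice);

    // The third launch parameter is the dynamic shared-memory size in bytes;
    // omitting it leaves the extern buffer with zero length.
    stageThroughShared<<<1, n, n * sizeof(float)>>>(inD, outD, n);
    cudaDeviceSynchronize();

    cudaMemcpy(host, outD, n * sizeof(float), cudaMemcpyDeviceToHost);
    printf("out[%d] = %f\n", n - 1, host[n - 1]);

    cudaFree(inD);
    cudaFree(outD);
    return 0;
}
```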
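
For the Part 4 comparison that did not pan out, `thrust::copy_if` is the usual one-call way to express stream compaction in Thrust. A hedged sketch, assuming device vectors and a keep-nonzero predicate (the names are illustrative, not the project's):

```cuda
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <cstdio>

// Predicate: keep nonzero elements.
struct is_nonzero {
    __host__ __device__ bool operator()(float x) const { return x != 0.0f; }
};

int main() {
    float raw[10] = {0, 0, 3, 4, 0, 6, 6, 7, 0, 1};
    thrust::device_vector<float> in(raw, raw + 10);
    thrust::device_vector<float> out(in.size());

    // copy_if writes the surviving elements contiguously and returns an
    // iterator one past the last element written.
    thrust::device_vector<float>::iterator last =
        thrust::copy_if(in.begin(), in.end(), out.begin(), is_nonzero());
    out.resize(last - out.begin());

    for (size_t i = 0; i < out.size(); ++i)
        printf("%g ", (float)out[i]);
    printf("\n");   // expected: 3 4 6 6 7 1
    return 0;
}
```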