diff --git a/Project2/Project2.sln b/Project2/Project2.sln
new file mode 100755
index 0000000..05eb50c
--- /dev/null
+++ b/Project2/Project2.sln
@@ -0,0 +1,26 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Project2", "Project2\Project2.vcxproj", "{75C44C78-8F9A-474E-9DA0-44F16C438147}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Debug|x64 = Debug|x64
+ Release|Win32 = Release|Win32
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {75C44C78-8F9A-474E-9DA0-44F16C438147}.Debug|Win32.ActiveCfg = Debug|Win32
+ {75C44C78-8F9A-474E-9DA0-44F16C438147}.Debug|Win32.Build.0 = Debug|Win32
+ {75C44C78-8F9A-474E-9DA0-44F16C438147}.Debug|x64.ActiveCfg = Debug|x64
+ {75C44C78-8F9A-474E-9DA0-44F16C438147}.Debug|x64.Build.0 = Debug|x64
+ {75C44C78-8F9A-474E-9DA0-44F16C438147}.Release|Win32.ActiveCfg = Release|Win32
+ {75C44C78-8F9A-474E-9DA0-44F16C438147}.Release|Win32.Build.0 = Release|Win32
+ {75C44C78-8F9A-474E-9DA0-44F16C438147}.Release|x64.ActiveCfg = Release|x64
+ {75C44C78-8F9A-474E-9DA0-44F16C438147}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/Project2/Project2.suo b/Project2/Project2.suo
new file mode 100755
index 0000000..c259d57
Binary files /dev/null and b/Project2/Project2.suo differ
diff --git a/Project2/Project2/Debug/Project2.log b/Project2/Project2/Debug/Project2.log
new file mode 100755
index 0000000..f3f70f7
--- /dev/null
+++ b/Project2/Project2/Debug/Project2.log
@@ -0,0 +1,53 @@
+Build started 9/28/2014 8:03:49 PM.
+ 1>Project "S:\CIS565\Project2-StreamCompaction\Project2\Project2\Project2.vcxproj" on node 2 (build target(s)).
+ 1>InitializeBuildStatus:
+ Creating "Debug\Project2.unsuccessfulbuild" because "AlwaysCreate" was specified.
+ AddCudaCompileDeps:
+ c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\cl.exe /E /nologo /showIncludes /TP /D__CUDACC__ /DWIN32 /D_DEBUG /D_CONSOLE /D_MBCS /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin" /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" /I. /FIcuda_runtime.h /c S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu
+ CudaBuild:
+ Compiling CUDA source file kernel.cu...
+ cmd.exe /C "C:\Users\lejoyce\AppData\Local\Temp\tmp15e45a570e7549f9bbd0f46f7abf8904.cmd"
+ "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env --cl-version 2010 -ccbin "c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu.obj "S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu"
+
+ C:\user>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env --cl-version 2010 -ccbin "c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu.obj "S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu"
+ ClCompile:
+ c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\CL.exe /c /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" /ZI /nologo /W3 /WX- /Od /Oy- /D WIN32 /D _DEBUG /D _CONSOLE /D _MBCS /Gm /EHsc /RTC1 /MDd /GS /fp:precise /Zc:wchar_t /Zc:forScope /Fo"Debug\\" /Fd"Debug\vc100.pdb" /Gd /TP /analyze- /errorReport:prompt main.cpp serial.cpp
+ serial.cpp
+ 1>c:\program files\nvidia gpu computing toolkit\cuda\v5.5\include\thrust\detail\allocator\allocator_traits.inl(180): warning C4003: not enough actual parameters for macro 'max'
+ 1>c:\program files\nvidia gpu computing toolkit\cuda\v5.5\include\thrust\system\detail\error_category.inl(102): warning C4996: 'strerror': This function or variable may be unsafe. Consider using strerror_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
+ c:\program files (x86)\microsoft visual studio 10.0\vc\include\string.h(157) : see declaration of 'strerror'
+ 1>c:\program files\nvidia gpu computing toolkit\cuda\v5.5\include\thrust\system\cuda\detail\for_each.inl(112): warning C4003: not enough actual parameters for macro 'max'
+ main.cpp
+ Generating Code...
+ ManifestResourceCompile:
+ C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\bin\rc.exe /nologo /fo"Debug\Project2.exe.embed.manifest.res" Debug\Project2_manifest.rc
+ Link:
+ c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\link.exe /ERRORREPORT:PROMPT /OUT:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.exe" /INCREMENTAL /NOLOGO /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\lib\Win32" cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /MANIFEST /ManifestFile:"Debug\Project2.exe.intermediate.manifest" /MANIFESTUAC:"level='asInvoker' uiAccess='false'" /DEBUG /PDB:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.pdb" /SUBSYSTEM:CONSOLE /TLBID:1 /DYNAMICBASE /NXCOMPAT /IMPLIB:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.lib" /MACHINE:X86 "S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu.obj"
+ Debug\Project2.exe.embed.manifest.res
+ Debug\main.obj
+ Debug\serial.obj
+ Manifest:
+ C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\bin\mt.exe /nologo /verbose /out:"Debug\Project2.exe.embed.manifest" /manifest Debug\Project2.exe.intermediate.manifest
+ C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\bin\rc.exe /nologo /fo"Debug\Project2.exe.embed.manifest.res" Debug\Project2_manifest.rc
+ LinkEmbedManifest:
+ c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\link.exe /ERRORREPORT:PROMPT /OUT:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.exe" /INCREMENTAL /NOLOGO /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\lib\Win32" cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /MANIFEST /ManifestFile:"Debug\Project2.exe.intermediate.manifest" /MANIFESTUAC:"level='asInvoker' uiAccess='false'" /DEBUG /PDB:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.pdb" /SUBSYSTEM:CONSOLE /TLBID:1 /DYNAMICBASE /NXCOMPAT /IMPLIB:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.lib" /MACHINE:X86 "S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu.obj"
+ Debug\Project2.exe.embed.manifest.res
+ Debug\main.obj
+ Debug\serial.obj
+ Project2.vcxproj -> S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.exe
+ PostBuildEvent:
+ echo copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart*.dll" "S:\CIS565\Project2-StreamCompaction\Project2\Debug\"
+ copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart*.dll" "S:\CIS565\Project2-StreamCompaction\Project2\Debug\"
+ :VCEnd
+ copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart*.dll" "S:\CIS565\Project2-StreamCompaction\Project2\Debug\"
+ C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart32_55.dll
+ C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart64_55.dll
+ 2 file(s) copied.
+ FinalizeBuildStatus:
+ Deleting file "Debug\Project2.unsuccessfulbuild".
+ Touching "Debug\Project2.lastbuildstate".
+ 1>Done Building Project "S:\CIS565\Project2-StreamCompaction\Project2\Project2\Project2.vcxproj" (build target(s)).
+
+Build succeeded.
+
+Time Elapsed 00:00:16.63
diff --git a/Project2/Project2/Project2.vcxproj b/Project2/Project2/Project2.vcxproj
new file mode 100755
index 0000000..01928b8
--- /dev/null
+++ b/Project2/Project2/Project2.vcxproj
@@ -0,0 +1,164 @@
+<!-- Project2.vcxproj: Visual Studio 2010 MSBuild project for the CUDA 5.5 build.
+     Recoverable settings:
+     * Configurations: Debug and Release, each for Win32 and x64.
+     * ProjectGuid {75C44C78-8F9A-474E-9DA0-44F16C438147}; project name Project2.
+     * ConfigurationType Application; CharacterSet MultiByte; WarningLevel Level3.
+     * Preprocessor definitions: WIN32 or WIN64, _DEBUG or NDEBUG, _CONSOLE.
+     * Optimization: Disabled for Debug, MaxSpeed for Release.
+     * Linker: Console subsystem; cudart.lib plus the default Windows libraries
+       (kernel32, user32, gdi32, winspool, comdlg32, advapi32, shell32, ole32,
+       oleaut32, uuid, odbc32, odbccp32).
+     * CUDA compile output (Win32 configurations): $(ProjectDir)%(Filename)%(Extension).obj
+     * Post-build event:
+         echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
+         copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" -->
\ No newline at end of file
diff --git a/Project2/Project2/Project2.vcxproj.user b/Project2/Project2/Project2.vcxproj.user
new file mode 100755
index 0000000..695b5c7
--- /dev/null
+++ b/Project2/Project2/Project2.vcxproj.user
@@ -0,0 +1,3 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+</Project>
\ No newline at end of file
diff --git a/Project2/Project2/kernel.cu b/Project2/Project2/kernel.cu
new file mode 100755
index 0000000..bdeaeca
--- /dev/null
+++ b/Project2/Project2/kernel.cu
@@ -0,0 +1,173 @@
+#include <cstdio>
+#include <cmath>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "kernel.h"
+
+#define BLOCK_SIZE 1024
+
+// PART 2
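+// Naive scan in global memory: out is shifted by one (exclusive), and each pass
+// adds the partial sum from i elements back, doubling i every iteration.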
+__global__ void kernelScan(float* in, float* out, int size) {
+ out[0] = 0;
+ int x = threadIdx.x;
+ out[x + 1] = in[x];
+ __syncthreads();
+ for (int i = 1; i < size; i*=2) {
+ if (x > i / 2) {
+ out[x + 1] += out[x + 1 - i];
+ }
+ __syncthreads();
+ }
+}
+
+// PART 3
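+// Scan using a shared-memory buffer: load the input into shared memory, then
+// run an up-sweep (reduce) followed by a down-sweep over the buffer.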
+__global__ void sharedScan(float* in, float* out, int size) {
+ extern __shared__ float buffer[];
+
+ int x = threadIdx.x;
+ int offset = 1;
+ buffer[2 * x] = in[2 * x];
+ buffer[2 * x + 1] = in[2 * x + 1];
+ for (int d = (size + 1) / 2; d > 0; d /= 2) {
+ __syncthreads();
+ if (x < d) {
+ int a = offset * (2 * x + 1) - 1;
+ int b = offset * (2 * x + 2) - 1;
+ buffer[b] += buffer[a];
+ }
+ offset *= 2;
+ if (!x) {
+ buffer[size] = 0;
+ }
+ for (int d = 1; d < size + 1; d *= 2) {
+ offset /= 2;
+ __syncthreads();
+ if (x < d) {
+ int a = offset * (2 * x + 1) - 1;
+ int b = offset * (2 * x + 2) - 1;
+ float t = buffer[a];
+ buffer[a] = buffer[b];
+ buffer[b] += t;
+ }
+ }
+ }
+ out[2 * x] = buffer[2 * x];
+ out[2 * x + 1] = buffer[2 * x + 1];
+}
+
+float* scanKernel(float* in, int size) {
+ dim3 dimBlock(size + 1);
+ dim3 dimGrid(1, 1);
+
+ cudaEvent_t start, stop;
+ float time;
+ cudaEventCreate(&start);
+ cudaEventCreate(&stop);
+
+ float *inD, *outD;
+ cudaMalloc(&inD, size * sizeof(float));
+ cudaMemcpy(inD, in, size * sizeof(float), cudaMemcpyHostToDevice);
+ cudaMalloc(&outD, (size + 1) * sizeof(float));
+ float* out = new float[size + 1];
+
+ // Running Part 2
+ cudaEventRecord(start, 0);
+ kernelScan<<<dimGrid, dimBlock>>>(inD, outD, size);
+ cudaEventRecord(stop, 0);
+ cudaEventSynchronize(stop);
+ cudaEventElapsedTime(&time, start, stop);
+ printf("Naive: %f ms\t", time);
+ cudaMemcpy(out, outD, (size + 1) * sizeof(float), cudaMemcpyDeviceToHost);
+
+ // Running Part 3a
+ cudaEventRecord(start, 0);
+ sharedScan<<<dimGrid, dimBlock>>>(inD, outD, size);
+ cudaEventRecord(stop, 0);
+ cudaEventSynchronize(stop);
+ cudaEventElapsedTime(&time, start, stop);
+ printf("Shared: %f ms\t", time);
+ //cudaMemcpy(out, outD, (size + 1) * sizeof(float), cudaMemcpyDeviceToHost);
+
+ // Running Part 3b
+ dim3 fullBlock(BLOCK_SIZE);
+ dim3 fullBlocksPerGrid((int)ceil(float(size) / float(BLOCK_SIZE)));
+ cudaEventRecord(start, 0);
+ sharedScan<<<fullBlocksPerGrid, fullBlock>>>(inD, outD, size);
+ cudaEventRecord(stop, 0);
+ cudaEventSynchronize(stop);
+ cudaEventElapsedTime(&time, start, stop);
+ printf("Blocked: %f ms\n", time);
+ //cudaMemcpy(out, outD, (size + 1) * sizeof(float), cudaMemcpyDeviceToHost);
+
+ cudaFree(inD);
+ cudaFree(outD);
+ return out;
+}
+
+
+// PART 4
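+// Map each input element to a 0/1 flag (nonzero -> 1); scanning these flags
+// gives the output index of each kept element during compaction.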
+__global__ void kernelScatter(float* in, float* out, int size) {
+ if (threadIdx.x < size) {
+ out[threadIdx.x] = in[threadIdx.x] != 0;
+ }
+}
+
+__global__ void streamCompactKernel(float* in, float* scat, float* scan, float* out, int size) {
+ int x = threadIdx.x;
+ if (x < size && scat[x] != 0)
+ out[int(scan[x])] = in[x];
+}
+
+float* scatterKernel(float* in, int size) {
+ dim3 dimBlock(size);
+ dim3 dimGrid(1, 1);
+
+ cudaEvent_t start, stop;
+ float time;
+ cudaEventCreate(&start);
+ cudaEventCreate(&stop);
+
+ float *inD, *outD;
+ cudaMalloc(&inD, size * sizeof(float));
+ cudaMemcpy(inD, in, size * sizeof(float), cudaMemcpyHostToDevice);
+ cudaMalloc(&outD, size * sizeof(float));
+ float* out = new float[size];
+
+ cudaEventRecord(start, 0);
+ kernelScatter<<<dimGrid, dimBlock>>>(inD, outD, size);
+ cudaEventRecord(stop, 0);
+ cudaEventSynchronize(stop);
+ cudaEventElapsedTime(&time, start, stop);
+ printf("Kernel: %f ms\n", time);
+
+ cudaMemcpy(out, outD, size * sizeof(float), cudaMemcpyDeviceToHost);
+ cudaFree(inD);
+ cudaFree(outD);
+ return out;
+}
+
+float* streamCompact(float* in, int size) {
+ dim3 dimBlock(BLOCK_SIZE);
+ dim3 dimGrid((int)ceil(float(size) / float(BLOCK_SIZE)));
+ float *inD, *outScat, *outScan, *outD;
+ cudaMalloc(&inD, size * sizeof(float));
+ cudaMemcpy(inD, in, size * sizeof(float), cudaMemcpyHostToDevice);
+ cudaMalloc(&outScat, size * sizeof(float));
+ cudaMalloc(&outScan, (size + 1) * sizeof(float));
+ cudaMalloc(&outD, size * sizeof(float));
+
+ kernelScatter<<<dimGrid, dimBlock>>>(inD, outScat, size);
+ kernelScan<<<dimGrid, dimBlock>>>(outScat, outScan, size);
+ // The last element of the exclusive scan is the number of elements to keep.
+ float total;
+ cudaMemcpy(&total, &outScan[size], sizeof(float), cudaMemcpyDeviceToHost);
+ int s = (int)total;
+ s++;
+ float* out = new float[s];
+ streamCompactKernel<<<dimGrid, dimBlock>>>(inD, outScat, outScan, outD, s);
+
+ cudaMemcpy(out, outD, s * sizeof(float), cudaMemcpyDeviceToHost);
+ cudaFree(inD);
+ cudaFree(outScat);
+ cudaFree(outScan);
+ return out;
+}
diff --git a/Project2/Project2/kernel.h b/Project2/Project2/kernel.h
new file mode 100755
index 0000000..381b6a7
--- /dev/null
+++ b/Project2/Project2/kernel.h
@@ -0,0 +1,9 @@
+#pragma once
+#ifndef KERNEL_H
+#define KERNEL_H
+
+float* scanKernel(float* input, int size);
+float* scatterKernel(float* input, int size);
+float* streamCompact(float* input, int size);
+
+#endif
diff --git a/Project2/Project2/main.cpp b/Project2/Project2/main.cpp
new file mode 100755
index 0000000..523021d
--- /dev/null
+++ b/Project2/Project2/main.cpp
@@ -0,0 +1,88 @@
+#include <iostream>
+#include <conio.h>
+
+#include "kernel.h"
+#include "serial.h"
+
+#define SIZE 100
+#define PRINT
+
+using namespace std;
+
+int main() {
+ float input[SIZE];
+
+ for (int i = 0; i < SIZE; i++) {
+ input[i] = rand() % 10;
+ cout << input[i] << " ";
+ }
+ cout << endl << endl;
+
+ float* serScan = scan(input, SIZE);
+#ifdef PRINT
+ float* addr = serScan;
+ for (int i = 0; i < SIZE; i++) {
+ cout << *addr << " ";
+ addr++;
+ }
+ cout << endl << endl;
+#endif
+ delete[] serScan;
+
+ float* parScan = scanKernel(input, SIZE);
+#ifdef PRINT
+ addr = parScan;
+ for (int i = 0; i < SIZE; i++) {
+ cout << *addr << " ";
+ addr++;
+ }
+ cout << endl << endl;
+#endif
+ delete[] parScan;
+
+ float* serScatter = scatter(input, SIZE);
+#ifdef PRINT
+ addr = serScatter;
+ for (int i = 0; i < SIZE; i++) {
+ cout << *addr << " ";
+ addr++;
+ }
+ cout << endl << endl;
+#endif
+ delete[] serScatter;
+
+ float* parScatter = scatterKernel(input, SIZE);
+#ifdef PRINT
+ addr = parScatter;
+ for (int i = 0; i < SIZE; i++) {
+ cout << *addr << " ";
+ addr++;
+ }
+ cout << endl << endl;
+#endif
+ delete[] parScatter;
+
+ float* strCmp = streamCompact(input, SIZE);
+#ifdef PRINT
+ addr = strCmp;
+ for (int i = 0; i < SIZE; i++) {
+ cout << *addr << " ";
+ addr++;
+ }
+ cout << endl << endl;
+#endif
+ delete[] strCmp;
+
+ float* thr = serThrust(input, SIZE);
+#ifdef PRINT
+ addr = thr;
+ for (int i = 0; i < SIZE; i++) {
+ cout << *addr << " ";
+ addr++;
+ }
+ cout << endl << endl;
+#endif
+ delete[] thr;
+
+ _getch();
+}
diff --git a/Project2/Project2/serial.cpp b/Project2/Project2/serial.cpp
new file mode 100755
index 0000000..ab1286f
--- /dev/null
+++ b/Project2/Project2/serial.cpp
@@ -0,0 +1,57 @@
+#include <cstdio>
+#include <windows.h>
+#include <thrust/copy.h>
+#include <thrust/execution_policy.h>
+
+#include "serial.h"
+
+// PART 1
+float* scan(float* in, int size) {
+ LARGE_INTEGER begin, end;
+ LARGE_INTEGER frequency;
+ QueryPerformanceFrequency(&frequency);
+
+ float* out = new float[size + 1];
+ out[0] = 0;
+ QueryPerformanceCounter(&begin);
+ for (int i = 0; i < size; i++)
+ out[i+1] = out[i] + in[i];
+ QueryPerformanceCounter(&end);
+ printf("CPU: %f ms\t", (end.QuadPart - begin.QuadPart) * 1000.0 / frequency.QuadPart);
+ return out;
+}
+
+// PART 4
+float* scatter(float* in, int size) {
+ LARGE_INTEGER begin, end;
+ LARGE_INTEGER frequency;
+ QueryPerformanceFrequency(&frequency);
+
+ float* out = new float[size];
+ QueryPerformanceCounter(&begin);
+ for (int i = 0; i < size; i++)
+ out[i] = in[i] != 0;
+ QueryPerformanceCounter(&end);
+ printf("CPU: %f ms\t", (end.QuadPart - begin.QuadPart) * 1000.0 / frequency.QuadPart);
+ return out;
+}
+
+struct keep {
+ __host__ __device__ bool operator() (const int x) {
+ return x > 0;
+ }
+};
+
+float* serThrust(float* in, int size) {
+ LARGE_INTEGER begin, end;
+ LARGE_INTEGER frequency;
+ QueryPerformanceFrequency(&frequency);
+
+ float* scat = scatter(in, size);
+ float* out = new float[size];
+ QueryPerformanceCounter(&begin);
+ thrust::copy_if(thrust::host, in, in + size, scat, out, keep());
+ QueryPerformanceCounter(&end);
+ printf("CPU: %f ms\t", (end.QuadPart - begin.QuadPart) * 1000.0 / frequency.QuadPart);
+ return out;
+}
diff --git a/Project2/Project2/serial.h b/Project2/Project2/serial.h
new file mode 100755
index 0000000..350a0bc
--- /dev/null
+++ b/Project2/Project2/serial.h
@@ -0,0 +1,9 @@
+#pragma once
+#ifndef SERIAL_H
+#define SERIAL_H
+
+float* scan(float* input, int size);
+float* scatter(float* input, int size);
+float* serThrust(float* input, int size);
+
+#endif
diff --git a/README.md b/README.md
index 6e02afa..0cd19f0 100644
--- a/README.md
+++ b/README.md
@@ -3,131 +3,22 @@ Project-2
A Study in Parallel Algorithms : Stream Compaction
-# INTRODUCTION
-Many of the algorithms you have learned thus far in your career have typically
-been developed from a serial standpoint. When it comes to GPUs, we are mainly
-looking at massively parallel work. Thus, it is necessary to reorient our
-thinking. In this project, we will be implementing a couple different versions
-of prefix sum. We will start with a simple single thread serial CPU version,
-and then move to a naive GPU version. Each part of this homework is meant to
-follow the logic of the previous parts, so please do not do this homework out of
-order.
-
-This project will serve as a stream compaction library that you may use (and
-will want to use) in your
-future projects. For that reason, we suggest you create proper header and CUDA
-files so that you can reuse this code later. You may want to create a separate
-cpp file that contains your main function so that you can test the code you
-write.
-
-# OVERVIEW
-Stream compaction is broken down into two parts: (1) scan, and (2) scatter.
-
-## SCAN
-Scan or prefix sum is the summation of the elements in an array such that the
-resulting array is the summation of the terms before it. Prefix sum can either
-be inclusive, meaning the current term is a summation of all the elements before
-it and itself, or exclusive, meaning the current term is a summation of all
-elements before it excluding itself.
-
-Inclusive:
-
-In : [ 3 4 6 7 9 10 ]
-
-Out : [ 3 7 13 20 29 39 ]
-
-Exclusive
-
-In : [ 3 4 6 7 9 10 ]
-
-Out : [ 0 3 7 13 20 29 ]
-
-Note that the resulting prefix sum will always be n + 1 elements if the input
-array is of length n. Similarly, the first element of the exclusive prefix sum
-will always be 0. In the following sections, all references to prefix sum will
-be to the exclusive version of prefix sum.
-
-## SCATTER
-The scatter section of stream compaction takes the results of the previous scan
-in order to reorder the elements to form a compact array.
-
-For example, let's say we have the following array:
-[ 0 0 3 4 0 6 6 7 0 1 ]
-
-We would only like to consider the non-zero elements in this zero, so we would
-like to compact it into the following array:
-[ 3 4 6 6 7 1 ]
-
-We can perform a transform on input array to transform it into a boolean array:
-
-In : [ 0 0 3 4 0 6 6 7 0 1 ]
-
-Out : [ 0 0 1 1 0 1 1 1 0 1 ]
-
-Performing a scan on the output, we get the following array :
-
-In : [ 0 0 1 1 0 1 1 1 0 1 ]
-
-Out : [ 0 0 0 1 2 2 3 4 5 5 ]
-
-Notice that the output array produces a corresponding index array that we can
-use to create the resulting array for stream compaction.
-
-# PART 1 : REVIEW OF PREFIX SUM
-Given the definition of exclusive prefix sum, please write a serial CPU version
-of prefix sum. You may write this in the cpp file to separate this from the
-CUDA code you will be writing in your .cu file.
-
-# PART 2 : NAIVE PREFIX SUM
-We will now parallelize this the previous section's code. Recall from lecture
-that we can parallelize this using a series of kernel calls. In this portion,
-you are NOT allowed to use shared memory.
-
-### Questions
-* Compare this version to the serial version of exclusive prefix scan. Please
- include a table of how the runtimes compare on different lengths of arrays.
-* Plot a graph of the comparison and write a short explanation of the phenomenon you
- see here.
-
-# PART 3 : OPTIMIZING PREFIX SUM
-In the previous section we did not take into account shared memory. In the
-previous section, we kept everything in global memory, which is much slower than
-shared memory.
-
-## PART 3a : Write prefix sum for a single block
-Shared memory is accessible to threads of a block. Please write a version of
-prefix sum that works on a single block.
-
-## PART 3b : Generalizing to arrays of any length.
-Taking the previous portion, please write a version that generalizes prefix sum
-to arbitrary length arrays, this includes arrays that will not fit on one block.
-
-### Questions
-* Compare this version to the parallel prefix sum using global memory.
-* Plot a graph of the comparison and write a short explanation of the phenomenon
- you see here.
-
-# PART 4 : ADDING SCATTER
-First create a serial version of scatter by expanding the serial version of
-prefix sum. Then create a GPU version of scatter. Combine the function call
-such that, given an array, you can call stream compact and it will compact the
-array for you. Finally, write a version using thrust.
-
-### Questions
-* Compare your version of stream compact to your version using thrust. How do
- they compare? How might you optimize yours more, or how might thrust's stream
- compact be optimized.
-
-# EXTRA CREDIT (+10)
-For extra credit, please optimize your prefix sum for work parallelism and to
-deal with bank conflicts. Information on this can be found in the GPU Gems
-chapter listed in the references.
-
-# SUBMISSION
-Please answer all the questions in each of the subsections above and write your
-answers in the README by overwriting the README file. In future projects, we
-expect your analysis to be similar to the one we have led you through in this
-project. Like other projects, please open a pull request and email Harmony.
-
-# REFERENCES
-"Parallel Prefix Sum (Scan) with CUDA." GPU Gems 3.
+Prefix scan does linear work, so the serial CPU runtime grows linearly as the
+input size increases. On the GPU, the additions are spread across many threads,
+so for the sizes tested the runtime is nearly constant: the serial version
+performs its O(n) additions one after another, while the parallel version only
+pays for its O(log n) passes (plus launch overhead) in wall-clock time, so even
+drastically increasing the input size does not greatly affect the runtime. Scan
+and scatter ran in virtually the same time on both the CPU and the GPU, which
+makes sense because both make a single pass over the input data.
+
+
+In Part 3, I was not quite able to get shared memory working. Using an extern
+declaration, the dynamically allocated array always came back with a size of
+zero, and I could not figure out why. Using a hardcoded shared-memory size based
+on the input size I knew I was providing, the runtime was exactly the same as
+with global memory, which leads me to think that either I was doing something
+wrong or the timers were inaccurate on the day I tested this (which also
+happened to be the last day).
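+
+For reference, here is a minimal, self-contained sketch of how an `extern __shared__`
+array is normally sized: the byte count is passed as the third launch parameter, and
+leaving it out gives the array zero bytes, which matches the symptom above. The kernel
+and variable names below are illustrative only, not the project's actual code.
+
+```cpp
+#include <cstdio>
+#include <cuda_runtime.h>
+
+// Copies in[] to out[] by staging it through dynamically sized shared memory.
+__global__ void sharedCopy(const float* in, float* out, int n) {
+    // The size of this array is whatever the third <<<...>>> argument says, in bytes.
+    extern __shared__ float buffer[];
+    int i = threadIdx.x;
+    if (i < n) buffer[i] = in[i];
+    __syncthreads();
+    if (i < n) out[i] = buffer[i];
+}
+
+int main() {
+    const int n = 8;
+    float host[n] = {1, 2, 3, 4, 5, 6, 7, 8};
+    float *devIn, *devOut;
+    cudaMalloc(&devIn, n * sizeof(float));
+    cudaMalloc(&devOut, n * sizeof(float));
+    cudaMemcpy(devIn, host, n * sizeof(float), cudaMemcpyHostToDevice);
+
+    // Third launch argument = bytes of dynamic shared memory per block.
+    // Omitting it (<<<1, n>>>) leaves the extern array with zero bytes.
+    sharedCopy<<<1, n, n * sizeof(float)>>>(devIn, devOut, n);
+
+    cudaMemcpy(host, devOut, n * sizeof(float), cudaMemcpyDeviceToHost);
+    for (int i = 0; i < n; i++) printf("%g ", host[i]);
+    printf("\n");
+    cudaFree(devIn);
+    cudaFree(devOut);
+    return 0;
+}
+```
+
+In the project's sharedScan this would mean passing the buffer's byte count along
+with dimGrid and dimBlock at launch.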
+
+In Part 4, I was not able to get thrust working. Although I read the
+documentation and followed the examples provided online, I could not make sense
+of the error message on the console. Unfortunately, that leaves me with no point
+of comparison between my implementation of stream compaction and thrust's.
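+
+For reference, the call I was trying to use is the stencil overload of
+`thrust::copy_if`. A minimal host-side sketch (with illustrative data rather than
+the project's arrays) looks like this:
+
+```cpp
+#include <cstdio>
+#include <thrust/copy.h>
+#include <thrust/execution_policy.h>
+
+// Predicate applied to the stencil: keep elements whose flag is nonzero.
+struct keep {
+    __host__ __device__ bool operator()(const float flag) const { return flag != 0; }
+};
+
+int main() {
+    float in[10]      = {0, 0, 3, 4, 0, 6, 6, 7, 0, 1};
+    float stencil[10] = {0, 0, 1, 1, 0, 1, 1, 1, 0, 1};
+    float out[10];
+
+    // copy_if(policy, first, last, stencil, result, pred) copies in[i] whenever
+    // pred(stencil[i]) is true and returns an iterator past the last copied element.
+    float* end = thrust::copy_if(thrust::host, in, in + 10, stencil, out, keep());
+
+    for (float* p = out; p != end; ++p) printf("%g ", *p);
+    printf("\n");  // prints: 3 4 6 6 7 1
+    return 0;
+}
+```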
diff --git a/chart.JPG b/chart.JPG
new file mode 100755
index 0000000..925b9d4
Binary files /dev/null and b/chart.JPG differ