diff --git a/Project2/Project2.sln b/Project2/Project2.sln new file mode 100755 index 0000000..05eb50c --- /dev/null +++ b/Project2/Project2.sln @@ -0,0 +1,26 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Project2", "Project2\Project2.vcxproj", "{75C44C78-8F9A-474E-9DA0-44F16C438147}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Debug|Win32.ActiveCfg = Debug|Win32 + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Debug|Win32.Build.0 = Debug|Win32 + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Debug|x64.ActiveCfg = Debug|x64 + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Debug|x64.Build.0 = Debug|x64 + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Release|Win32.ActiveCfg = Release|Win32 + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Release|Win32.Build.0 = Release|Win32 + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Release|x64.ActiveCfg = Release|x64 + {75C44C78-8F9A-474E-9DA0-44F16C438147}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Project2/Project2.suo b/Project2/Project2.suo new file mode 100755 index 0000000..c259d57 Binary files /dev/null and b/Project2/Project2.suo differ diff --git a/Project2/Project2/Debug/Project2.log b/Project2/Project2/Debug/Project2.log new file mode 100755 index 0000000..f3f70f7 --- /dev/null +++ b/Project2/Project2/Debug/Project2.log @@ -0,0 +1,53 @@ +Build started 9/28/2014 8:03:49 PM. + 1>Project "S:\CIS565\Project2-StreamCompaction\Project2\Project2\Project2.vcxproj" on node 2 (build target(s)). + 1>InitializeBuildStatus: + Creating "Debug\Project2.unsuccessfulbuild" because "AlwaysCreate" was specified. + AddCudaCompileDeps: + c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\cl.exe /E /nologo /showIncludes /TP /D__CUDACC__ /DWIN32 /D_DEBUG /D_CONSOLE /D_MBCS /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin" /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" /I. /FIcuda_runtime.h /c S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu + CudaBuild: + Compiling CUDA source file kernel.cu... 
+ cmd.exe /C "C:\Users\lejoyce\AppData\Local\Temp\tmp15e45a570e7549f9bbd0f46f7abf8904.cmd" + "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env --cl-version 2010 -ccbin "c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu.obj "S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu" + + C:\user>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env --cl-version 2010 -ccbin "c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -G --keep-dir Debug -maxrregcount=0 --machine 32 --compile -cudart static -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu.obj "S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu" + ClCompile: + c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\CL.exe /c /I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" /ZI /nologo /W3 /WX- /Od /Oy- /D WIN32 /D _DEBUG /D _CONSOLE /D _MBCS /Gm /EHsc /RTC1 /MDd /GS /fp:precise /Zc:wchar_t /Zc:forScope /Fo"Debug\\" /Fd"Debug\vc100.pdb" /Gd /TP /analyze- /errorReport:prompt main.cpp serial.cpp + serial.cpp + 1>c:\program files\nvidia gpu computing toolkit\cuda\v5.5\include\thrust\detail\allocator\allocator_traits.inl(180): warning C4003: not enough actual parameters for macro 'max' + 1>c:\program files\nvidia gpu computing toolkit\cuda\v5.5\include\thrust\system\detail\error_category.inl(102): warning C4996: 'strerror': This function or variable may be unsafe. Consider using strerror_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details. + c:\program files (x86)\microsoft visual studio 10.0\vc\include\string.h(157) : see declaration of 'strerror' + 1>c:\program files\nvidia gpu computing toolkit\cuda\v5.5\include\thrust\system\cuda\detail\for_each.inl(112): warning C4003: not enough actual parameters for macro 'max' + main.cpp + Generating Code... 
+ ManifestResourceCompile: + C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\bin\rc.exe /nologo /fo"Debug\Project2.exe.embed.manifest.res" Debug\Project2_manifest.rc + Link: + c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\link.exe /ERRORREPORT:PROMPT /OUT:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.exe" /INCREMENTAL /NOLOGO /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\lib\Win32" cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /MANIFEST /ManifestFile:"Debug\Project2.exe.intermediate.manifest" /MANIFESTUAC:"level='asInvoker' uiAccess='false'" /DEBUG /PDB:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.pdb" /SUBSYSTEM:CONSOLE /TLBID:1 /DYNAMICBASE /NXCOMPAT /IMPLIB:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.lib" /MACHINE:X86 "S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu.obj" + Debug\Project2.exe.embed.manifest.res + Debug\main.obj + Debug\serial.obj + Manifest: + C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\bin\mt.exe /nologo /verbose /out:"Debug\Project2.exe.embed.manifest" /manifest Debug\Project2.exe.intermediate.manifest + C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\bin\rc.exe /nologo /fo"Debug\Project2.exe.embed.manifest.res" Debug\Project2_manifest.rc + LinkEmbedManifest: + c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\link.exe /ERRORREPORT:PROMPT /OUT:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.exe" /INCREMENTAL /NOLOGO /LIBPATH:"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\lib\Win32" cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /MANIFEST /ManifestFile:"Debug\Project2.exe.intermediate.manifest" /MANIFESTUAC:"level='asInvoker' uiAccess='false'" /DEBUG /PDB:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.pdb" /SUBSYSTEM:CONSOLE /TLBID:1 /DYNAMICBASE /NXCOMPAT /IMPLIB:"S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.lib" /MACHINE:X86 "S:\CIS565\Project2-StreamCompaction\Project2\Project2\kernel.cu.obj" + Debug\Project2.exe.embed.manifest.res + Debug\main.obj + Debug\serial.obj + Project2.vcxproj -> S:\CIS565\Project2-StreamCompaction\Project2\Debug\Project2.exe + PostBuildEvent: + echo copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart*.dll" "S:\CIS565\Project2-StreamCompaction\Project2\Debug\" + copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart*.dll" "S:\CIS565\Project2-StreamCompaction\Project2\Debug\" + :VCEnd + copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart*.dll" "S:\CIS565\Project2-StreamCompaction\Project2\Debug\" + C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart32_55.dll + C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\cudart64_55.dll + 2 file(s) copied. + FinalizeBuildStatus: + Deleting file "Debug\Project2.unsuccessfulbuild". + Touching "Debug\Project2.lastbuildstate". + 1>Done Building Project "S:\CIS565\Project2-StreamCompaction\Project2\Project2\Project2.vcxproj" (build target(s)). + +Build succeeded. 
+
+Time Elapsed 00:00:16.63
diff --git a/Project2/Project2.sln b/Project2/Project2/Project2.vcxproj
new file mode 100755
index 0000000..01928b8
--- /dev/null
+++ b/Project2/Project2/Project2.vcxproj
@@ -0,0 +1,164 @@
+[Project2.vcxproj: Visual Studio 2010 CUDA project file; the XML markup was lost in extraction.
+ Recoverable settings: Debug and Release configurations for Win32 and x64; ConfigurationType
+ Application with MultiByte character set; preprocessor definitions WIN32/WIN64, _DEBUG/NDEBUG,
+ _CONSOLE; links cudart.lib plus the standard Windows libraries; a post-build step that copies
+ "$(CudaToolkitBinDir)\cudart*.dll" to "$(OutDir)"; CudaCompile output set to
+ $(ProjectDir)%(Filename)%(Extension).obj.]
\ No newline at end of file
diff --git a/Project2/Project2/Project2.vcxproj.user b/Project2/Project2/Project2.vcxproj.user
new file mode 100755
index 0000000..695b5c7
--- /dev/null
+++ b/Project2/Project2/Project2.vcxproj.user
@@ -0,0 +1,3 @@
+[Project2.vcxproj.user: three-line per-user settings stub; the XML markup was lost in extraction.]
\ No newline at end of file
diff --git a/Project2/Project2/kernel.cu b/Project2/Project2/kernel.cu
new file mode 100755
index 0000000..bdeaeca
--- /dev/null
+++ b/Project2/Project2/kernel.cu
@@ -0,0 +1,173 @@
+// NOTE: the original include names and the <<<grid, block>>> launch arguments were lost
+// in extraction; the headers below and the launch configurations later in this file are
+// reconstructed from what the code requires and the dim3 variables in scope.
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <cstdio>
+#include <cmath>
+
+#include "kernel.h"
+
+#define BLOCK_SIZE 1024
+
+// PART 2
+__global__ void kernelScan(float* in, float* out, int size) {
+	out[0] = 0;
+	int x = threadIdx.x;
+	out[x + 1] = in[x];
+	__syncthreads();
+	for (int i = 1; i < size; i *= 2) {
+		// Bounds guard (adjusted so x + 1 - i is never negative); the in-place
+		// update can still race with reads of out[x + 1 - i] by other threads
+		// in the same iteration, since there is no double buffering.
+		if (x + 1 >= i) {
+			out[x + 1] += out[x + 1 - i];
+		}
+		__syncthreads();
+	}
+}
+
+// PART 3
+__global__ void sharedScan(float* in, float* out, int size) {
+	extern __shared__ float buffer[];
+
+	int x = threadIdx.x;
+	int offset = 1;
+	buffer[2 * x] = in[2 * x];
+	buffer[2 * x + 1] = in[2 * x + 1];
+	for (int d = (size + 1) / 2; d > 0; d /= 2) {
+		__syncthreads();
+		if (x < d) {
+			int a = offset * (2 * x + 1) - 1;
+			int b = offset * (2 * x + 2) - 1;
+			buffer[b] += buffer[a];
+		}
+		offset *= 2;
+		if (!x) {
+			buffer[size] = 0;
+		}
+		for (int d = 1; d < size + 1; d *= 2) {
+			offset /= 2;
+			__syncthreads();
+			if (x < d) {
+				int a = offset * (2 * x + 1) - 1;
+				int b = offset * (2 * x + 2) - 1;
+				float t = buffer[a];
+				buffer[a] = buffer[b];
+				buffer[b] += t;
+			}
+		}
+	}
+	out[2 * x] = buffer[2 * x];
+	out[2 * x + 1] = buffer[2 * x + 1];
+}
+
+float* scanKernel(float* in, int size) {
+	dim3 dimBlock(size + 1);
+	dim3 dimGrid(1, 1);
+
+	cudaEvent_t start, stop;
+	float time;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+
+	float *inD, *outD;
+	cudaMalloc(&inD, size * sizeof(float));
+	cudaMemcpy(inD, in, size * sizeof(float), cudaMemcpyHostToDevice);
+	cudaMalloc(&outD, (size + 1) * sizeof(float));
+	float* out = new float[size + 1];
+
+	// Running Part 2
+	cudaEventRecord(start, 0);
+	kernelScan<<<dimGrid, dimBlock>>>(inD, outD, size);
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time, start, stop);
+	printf("Naive: %f ms\t", time);
+	cudaMemcpy(out, outD, (size + 1) * sizeof(float), cudaMemcpyDeviceToHost);
+
+	// Running Part 3a
+	// No dynamic shared-memory size is passed in the launch below, so the extern
+	// buffer in sharedScan has zero length (consistent with the problem described
+	// in the README).
+	cudaEventRecord(start, 0);
+	sharedScan<<<dimGrid, dimBlock>>>(inD, outD, size);
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time, start, stop);
+	printf("Shared: %f ms\t", time);
+	//cudaMemcpy(out, outD, (size + 1) * sizeof(float), cudaMemcpyDeviceToHost);
+
+	// Running Part 3b
+	dim3 fullBlock(BLOCK_SIZE);
+	dim3 fullBlocksPerGrid((int)ceil(float(size) / float(BLOCK_SIZE)));
+	cudaEventRecord(start, 0);
+	sharedScan<<<fullBlocksPerGrid, fullBlock>>>(inD, outD, size);
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time, start, stop);
+	printf("Blocked: %f ms\n", time);
+	//cudaMemcpy(out, outD, (size + 1) * sizeof(float), cudaMemcpyDeviceToHost);
+
+	cudaFree(inD);
+	cudaFree(outD);
+	return out;
+}
+
+
+// PART 4
+__global__ void kernelScatter(float* in, float* out, int size) {
+	if (threadIdx.x < size) {
+		out[threadIdx.x] = in[threadIdx.x] != 0;
+	}
+}
+
+__global__ void streamCompactKernel(float* in, float* scat, float* scan, float* out, int size) {
+	int x = threadIdx.x;
+	if (x < size && scat[x] != 0)
+		out[int(scan[x])] = in[x];
+}
+
+float* scatterKernel(float* in, int size) {
+	dim3 dimBlock(size);
+	dim3 dimGrid(1, 1);
+
+	cudaEvent_t start, stop;
+	float time;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+
+	float *inD, *outD;
+	cudaMalloc(&inD, size * sizeof(float));
+	cudaMemcpy(inD, in, size * sizeof(float), cudaMemcpyHostToDevice);
+	cudaMalloc(&outD, size * sizeof(float));
+	float* out = new float[size];
+
+	cudaEventRecord(start, 0);
+	kernelScatter<<<dimGrid, dimBlock>>>(inD, outD, size);
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time, start, stop);
+	printf("Kernel: %f ms\n", time);
+
+	cudaMemcpy(out, outD, size * sizeof(float), cudaMemcpyDeviceToHost);
+	cudaFree(inD);
+	cudaFree(outD);
+	return out;
+}
+
+float* streamCompact(float* in, int size) {
+	dim3 dimBlock(BLOCK_SIZE);
+	dim3 dimGrid((int)ceil(float(size) / float(BLOCK_SIZE)));
+	float *inD, *outScat, *outScan, *outD;
+	cudaMalloc(&inD, size * sizeof(float));
+	cudaMemcpy(inD, in, size * sizeof(float), cudaMemcpyHostToDevice);
+	cudaMalloc(&outScat, size * sizeof(float));
+	cudaMalloc(&outScan, (size + 1) * sizeof(float));
+	cudaMalloc(&outD, size * sizeof(float));
+
+	kernelScatter<<<dimGrid, dimBlock>>>(inD, outScat, size);
+	kernelScan<<<dimGrid, dimBlock>>>(outScat, outScan, size);
+	// The scan total is stored as a float; read it back as a float and convert,
+	// rather than reinterpreting its bytes as an int.
+	float total;
+	cudaMemcpy(&total, &outScan[size], sizeof(float), cudaMemcpyDeviceToHost);
+	int s = (int)total;
+	s++;
+	float* out = new float[s];
+	streamCompactKernel<<<dimGrid, dimBlock>>>(inD, outScat, outScan, outD, s);
+
+	cudaMemcpy(out, outD, s * sizeof(float), cudaMemcpyDeviceToHost);
+	cudaFree(inD);
+	cudaFree(outScat);
+	cudaFree(outScan);
+	cudaFree(outD);
+	return out;
+}
diff --git a/Project2/Project2/kernel.h b/Project2/Project2/kernel.h
new file mode 100755
index 0000000..381b6a7
--- /dev/null
+++ b/Project2/Project2/kernel.h
@@ -0,0 +1,9 @@
+#pragma once
+#ifndef KERNEL_H
+#define KERNEL_H
+
+float* scanKernel(float* input, int size);
+float* scatterKernel(float* input, int size);
+float* streamCompact(float* input, int size);
+
+#endif
diff --git a/Project2/Project2/main.cpp b/Project2/Project2/main.cpp
new file mode 100755
index 0000000..523021d
--- /dev/null
+++ b/Project2/Project2/main.cpp
@@ -0,0 +1,88 @@
+// NOTE: the original include names were lost in extraction; these are the headers this file requires.
+#include <iostream>
+#include <conio.h>
+
+#include "kernel.h"
+#include "serial.h"
+
+#define SIZE 100
+#define PRINT
+
+using namespace std;
+
+int main() {
+	float input[SIZE];
+
+	for (int i = 0; i < SIZE; i++) {
+		input[i] = rand() % 10;
+		cout << input[i] << " ";
+	}
+	cout << endl << endl;
+
+	float* serScan = scan(input, SIZE);
+#ifdef PRINT
+	float* addr = serScan;
+	for (int i = 0; i < SIZE; i++) {
+		cout << *addr << " ";
+		addr++;
+	}
+	cout << endl << endl;
+#endif
+	delete[] serScan;
+
+	float* parScan = scanKernel(input, SIZE);
+#ifdef PRINT
+	addr = parScan;
+	for (int i = 0; i < SIZE; i++) {
+		cout << *addr << " ";
+		addr++;
+	}
+	cout << endl << endl;
+#endif
+	delete[] parScan;
+
+	float* serScatter = scatter(input, SIZE);
+#ifdef PRINT
+	addr = serScatter;
+	for (int i = 0; i < SIZE; i++) {
+		cout << *addr << " ";
+		addr++;
+	}
+	cout << endl << endl;
+#endif
+	delete[] serScatter;
+
+	float* parScatter = scatterKernel(input, SIZE);
+#ifdef PRINT
+	addr = parScatter;
+	for (int i = 0; i < SIZE; i++) {
+		cout << *addr << " ";
+		addr++;
+	}
+	cout << endl << endl;
+#endif
+	delete[] parScatter;
+
+	float* strCmp = streamCompact(input, SIZE);
+#ifdef PRINT
+	addr = strCmp;
+	for (int i = 0; i < SIZE; i++) {
+		cout << *addr << " ";
+		addr++;
+	}
+	cout << endl << endl;
+#endif
+	delete[] strCmp;
+
+	float* thr = serThrust(input, SIZE);
+#ifdef PRINT
+	addr = thr;
+	for (int i = 0; i < SIZE; i++) {
+		cout << *addr << " ";
+		addr++;
+	}
+	cout << endl << endl;
+#endif
+	delete[] thr;
+
+	_getch();
+}
diff --git a/Project2/Project2/serial.cpp b/Project2/Project2/serial.cpp
new file mode 100755
index 0000000..ab1286f
--- /dev/null
+++ b/Project2/Project2/serial.cpp
@@ -0,0 +1,57 @@
+// NOTE: the original include names were lost in extraction; these are the headers this file requires.
+#include <windows.h>
+#include <cstdio>
+#include <thrust/execution_policy.h>
+#include <thrust/copy.h>
+
+#include "serial.h"
+
+// PART 1
+float* scan(float* in, int size) {
+	LARGE_INTEGER begin, end;
+	LARGE_INTEGER frequency;
+	QueryPerformanceFrequency(&frequency);
+
+	float* out = new float[size + 1];
+	out[0] = 0;
+	QueryPerformanceCounter(&begin);
+	for (int i = 0; i < size; i++)
+		out[i+1] = out[i] + in[i];
+	QueryPerformanceCounter(&end);
+	printf("CPU: %f ms\t", (end.QuadPart - begin.QuadPart) * 1000.0 / frequency.QuadPart);
+	return out;
+}
+
+// PART 4
+float* scatter(float* in, int size) {
+	LARGE_INTEGER begin, end;
+	LARGE_INTEGER frequency;
+	QueryPerformanceFrequency(&frequency);
+
+	float* out = new float[size];
+	QueryPerformanceCounter(&begin);
+	for (int i = 0; i < size; i++)
+		out[i] = in[i] != 0;
+	QueryPerformanceCounter(&end);
+	printf("CPU: %f ms\t", (end.QuadPart - begin.QuadPart) * 1000.0 / frequency.QuadPart);
+	return out;
+}
+
+struct keep {
+	__host__ __device__ bool operator() (const int x) {
+		return x > 0;
+	}
+};
+
+float* serThrust(float* in, int size) {
+	LARGE_INTEGER begin, end;
+	LARGE_INTEGER 
frequency; + QueryPerformanceFrequency(&frequency); + + float* scat = scatter(in, size); + float* out = new float[size]; + QueryPerformanceCounter(&begin); + thrust::copy_if(thrust::host, in, in + size, scat, out, keep()); + QueryPerformanceCounter(&end); + printf("CPU: %f ms\t", (end.QuadPart - begin.QuadPart) * 1000.0 / frequency.QuadPart); + return out; +} diff --git a/Project2/Project2/serial.h b/Project2/Project2/serial.h new file mode 100755 index 0000000..350a0bc --- /dev/null +++ b/Project2/Project2/serial.h @@ -0,0 +1,9 @@ +#pragma once +#ifndef SERIAL_H +#define SERIAL_H + +float* scan(float* input, int size); +float* scatter(float* input, int size); +float* serThrust(float* input, int size); + +#endif diff --git a/README.md b/README.md index 6e02afa..0cd19f0 100644 --- a/README.md +++ b/README.md @@ -3,131 +3,22 @@ Project-2 A Study in Parallel Algorithms : Stream Compaction -# INTRODUCTION -Many of the algorithms you have learned thus far in your career have typically -been developed from a serial standpoint. When it comes to GPUs, we are mainly -looking at massively parallel work. Thus, it is necessary to reorient our -thinking. In this project, we will be implementing a couple different versions -of prefix sum. We will start with a simple single thread serial CPU version, -and then move to a naive GPU version. Each part of this homework is meant to -follow the logic of the previous parts, so please do not do this homework out of -order. - -This project will serve as a stream compaction library that you may use (and -will want to use) in your -future projects. For that reason, we suggest you create proper header and CUDA -files so that you can reuse this code later. You may want to create a separate -cpp file that contains your main function so that you can test the code you -write. - -# OVERVIEW -Stream compaction is broken down into two parts: (1) scan, and (2) scatter. - -## SCAN -Scan or prefix sum is the summation of the elements in an array such that the -resulting array is the summation of the terms before it. Prefix sum can either -be inclusive, meaning the current term is a summation of all the elements before -it and itself, or exclusive, meaning the current term is a summation of all -elements before it excluding itself. - -Inclusive: - -In : [ 3 4 6 7 9 10 ] - -Out : [ 3 7 13 20 29 39 ] - -Exclusive - -In : [ 3 4 6 7 9 10 ] - -Out : [ 0 3 7 13 20 29 ] - -Note that the resulting prefix sum will always be n + 1 elements if the input -array is of length n. Similarly, the first element of the exclusive prefix sum -will always be 0. In the following sections, all references to prefix sum will -be to the exclusive version of prefix sum. - -## SCATTER -The scatter section of stream compaction takes the results of the previous scan -in order to reorder the elements to form a compact array. - -For example, let's say we have the following array: -[ 0 0 3 4 0 6 6 7 0 1 ] - -We would only like to consider the non-zero elements in this zero, so we would -like to compact it into the following array: -[ 3 4 6 6 7 1 ] - -We can perform a transform on input array to transform it into a boolean array: - -In : [ 0 0 3 4 0 6 6 7 0 1 ] - -Out : [ 0 0 1 1 0 1 1 1 0 1 ] - -Performing a scan on the output, we get the following array : - -In : [ 0 0 1 1 0 1 1 1 0 1 ] - -Out : [ 0 0 0 1 2 2 3 4 5 5 ] - -Notice that the output array produces a corresponding index array that we can -use to create the resulting array for stream compaction. 
- -# PART 1 : REVIEW OF PREFIX SUM -Given the definition of exclusive prefix sum, please write a serial CPU version -of prefix sum. You may write this in the cpp file to separate this from the -CUDA code you will be writing in your .cu file. - -# PART 2 : NAIVE PREFIX SUM -We will now parallelize this the previous section's code. Recall from lecture -that we can parallelize this using a series of kernel calls. In this portion, -you are NOT allowed to use shared memory. - -### Questions -* Compare this version to the serial version of exclusive prefix scan. Please - include a table of how the runtimes compare on different lengths of arrays. -* Plot a graph of the comparison and write a short explanation of the phenomenon you - see here. - -# PART 3 : OPTIMIZING PREFIX SUM -In the previous section we did not take into account shared memory. In the -previous section, we kept everything in global memory, which is much slower than -shared memory. - -## PART 3a : Write prefix sum for a single block -Shared memory is accessible to threads of a block. Please write a version of -prefix sum that works on a single block. - -## PART 3b : Generalizing to arrays of any length. -Taking the previous portion, please write a version that generalizes prefix sum -to arbitrary length arrays, this includes arrays that will not fit on one block. - -### Questions -* Compare this version to the parallel prefix sum using global memory. -* Plot a graph of the comparison and write a short explanation of the phenomenon - you see here. - -# PART 4 : ADDING SCATTER -First create a serial version of scatter by expanding the serial version of -prefix sum. Then create a GPU version of scatter. Combine the function call -such that, given an array, you can call stream compact and it will compact the -array for you. Finally, write a version using thrust. - -### Questions -* Compare your version of stream compact to your version using thrust. How do - they compare? How might you optimize yours more, or how might thrust's stream - compact be optimized. - -# EXTRA CREDIT (+10) -For extra credit, please optimize your prefix sum for work parallelism and to -deal with bank conflicts. Information on this can be found in the GPU Gems -chapter listed in the references. - -# SUBMISSION -Please answer all the questions in each of the subsections above and write your -answers in the README by overwriting the README file. In future projects, we -expect your analysis to be similar to the one we have led you through in this -project. Like other projects, please open a pull request and email Harmony. - -# REFERENCES -"Parallel Prefix Sum (Scan) with CUDA." GPU Gems 3. +Prefix scan has a linear running time, and as the input size becomes larger, the +time it takes for the CPU to run increases linearly. In comparison, for the GPU, +because of the immense multithreading, the runtime is nearly constant. Thus even +drastically increasing the input size does not greatly affect the runtime. Scan and +scatter ran in virtually the same time on both the CPU and the GPU, which makes sense +because they run through the input data in the same manner.
+![Efficiency Comparison](https://github.com/leejcw/Project2-StreamCompaction/blob/master/chart.JPG)
+
+In Part 3, I was not quite able to get shared memory working. Using extern, the
+size of the dynamically allocated array was always zero, and I could not figure
+out why. When I hardcoded the shared-memory size based on the input size I knew
+I was providing, the runtime was exactly the same as with global memory, which
+leads me to think that either I was doing something wrong or the timers were
+inaccurate on the day I tested this (which also happened to be the last day).
+
+In Part 4, I was not able to get thrust working. Although I read the documentation
+and followed the examples provided online, I could not make sense of the error
+message on the console. Unfortunately, I therefore have no point of comparison
+between my implementation of stream compaction and thrust's.
diff --git a/chart.JPG b/chart.JPG
new file mode 100755
index 0000000..925b9d4
Binary files /dev/null and b/chart.JPG differ
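
A zero-sized `extern __shared__` array, as described in the Part 3 note above, is the usual symptom of launching a kernel without the optional third launch-configuration argument, which specifies the dynamic shared-memory size in bytes. Below is a minimal sketch of how that argument is normally supplied; the kernel and variable names are illustrative and not taken from the project code.

```cuda
#include <cuda_runtime.h>
#include <cstdio>

// Illustrative kernel: stages input through dynamically sized shared memory.
__global__ void stageThroughShared(const float* in, float* out, int n) {
    extern __shared__ float buffer[];   // length comes from the launch, not from here
    int i = threadIdx.x;
    buffer[i] = (i < n) ? in[i] : 0.0f;
    __syncthreads();
    if (i < n) out[i] = buffer[i];
}

int main() {
    const int n = 256;
    float host[n];
    for (int i = 0; i < n; ++i) host[i] = float(i);

    float *inD, *outD;
    cudaMalloc(&inD, n * sizeof(float));
    cudaMalloc(&outD, n * sizeof(float));
    cudaMemcpy(inD, host, n * sizeof(float), cudaMemcpyHostToDevice);

    // The third launch parameter is the dynamic shared-memory size in bytes;
    // omitting it leaves the extern buffer with zero length.
    stageThroughShared<<<1, n, n * sizeof(float)>>>(inD, outD, n);
    cudaDeviceSynchronize();

    cudaMemcpy(host, outD, n * sizeof(float), cudaMemcpyDeviceToHost);
    printf("out[%d] = %f\n", n - 1, host[n - 1]);

    cudaFree(inD);
    cudaFree(outD);
    return 0;
}
```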
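
For the Part 4 comparison that did not pan out, `thrust::copy_if` is the usual one-call way to express stream compaction in Thrust. A hedged sketch, assuming device vectors and a keep-nonzero predicate (the names are illustrative, not the project's):

```cuda
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <cstdio>

// Predicate: keep nonzero elements.
struct is_nonzero {
    __host__ __device__ bool operator()(float x) const { return x != 0.0f; }
};

int main() {
    float raw[10] = {0, 0, 3, 4, 0, 6, 6, 7, 0, 1};
    thrust::device_vector<float> in(raw, raw + 10);
    thrust::device_vector<float> out(in.size());

    // copy_if writes the surviving elements contiguously and returns an
    // iterator one past the last element written.
    thrust::device_vector<float>::iterator last =
        thrust::copy_if(in.begin(), in.end(), out.begin(), is_nonzero());
    out.resize(last - out.begin());

    for (size_t i = 0; i < out.size(); ++i)
        printf("%g ", (float)out[i]);
    printf("\n");   // expected: 3 4 6 6 7 1
    return 0;
}
```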