diff --git a/.gitignore b/.gitignore index b8bd026..6fb8ab5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +*.sdf +*.pdb +Debug/ +Release/ + # Compiled Object files *.slo *.lo diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.sln b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.sln new file mode 100644 index 0000000..5779f38 --- /dev/null +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.sln @@ -0,0 +1,28 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2013 +VisualStudioVersion = 12.0.30723.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CIS565_2014_Fall_StreamCompaction", "CIS565_2014_Fall_StreamCompaction\CIS565_2014_Fall_StreamCompaction.vcxproj", "{AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Debug|Win32.ActiveCfg = Debug|Win32 + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Debug|Win32.Build.0 = Debug|Win32 + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Debug|x64.ActiveCfg = Debug|x64 + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Debug|x64.Build.0 = Debug|x64 + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Release|Win32.ActiveCfg = Release|Win32 + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Release|Win32.Build.0 = Release|Win32 + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Release|x64.ActiveCfg = Release|x64 + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.v12.suo 
b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.v12.suo new file mode 100644 index 0000000..736e9e5 Binary files /dev/null and b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.v12.suo differ diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.vcxproj b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.vcxproj new file mode 100644 index 0000000..f9f0513 --- /dev/null +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction.vcxproj @@ -0,0 +1,165 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {AC55E4AA-BABB-4FAE-8DD6-FA7720EFAB54} + CIS565_2014_Fall_StreamCompaction + + + + Application + true + MultiByte + v120 + + + Application + true + MultiByte + v120 + + + Application + false + true + MultiByte + v120 + + + Application + false + true + MultiByte + v120 + + + + + + + + + + + + + + + + + + + + true + + + true + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + true + Console + cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + + + Level3 + Disabled + WIN32;WIN64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + true + Console + cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + 64 + compute_30,sm_30 + + + + + Level3 + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + 
+ true + true + true + Console + cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + + + Level3 + MaxSpeed + true + true + WIN32;WIN64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + true + true + true + Console + cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" +copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" + + + 64 + compute_30,sm_30 + + + + + + + + + + + + + \ No newline at end of file diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h new file mode 100644 index 0000000..1ccfec0 --- /dev/null +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/Macros.h @@ -0,0 +1,11 @@ +int arraySize = 512; +int scatterSize = 10; +//const __device__ int size = arraySize + 1; +int blockSize = 128; +int blkHost; +//const __device__ int blocks = (arraySize)/blockSize+1; +//typedef std::chrono::duration> microSecond; + +#define NUM_BANKS 32 +#define LOG_NUM_BANKS 5 +#define CONFLICT_FREE_OFFSET(n) (NUM_BANKS*((n) >> LOG_NUM_BANKS)+((n)+((n) >> LOG_NUM_BANKS))%NUM_BANKS) \ No newline at end of file diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg new file mode 100644 index 0000000..d789b9f --- /dev/null +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/config.cfg @@ -0,0 +1,4 @@ +#This is a configure file of the GPU +#The last line begin without '#' should have 
space separated numbers: +#arraysize_for_PrefixSum blockSize arraysize_for_Scattering +80000 128 80000 \ No newline at end of file diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu new file mode 100644 index 0000000..a7576d3 --- /dev/null +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/kernel.cu @@ -0,0 +1,768 @@ + +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include +#include +#include +#include + +//print and file +#include +#include +#include +#include + +//#include +#include +#include +#include + +#include "Macros.h" + + + +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////Helper functions//////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +void printResult(const int* in, const int& size){ + for (int i = 0; i < size; i++){ + printf("%d ", in[i]); + } + printf("\n"); +} + +void RandArray(int* arr, const int& size){ + for (int i = 0; i < size; i++){ + arr[i] = rand() % 201-100; + + } + +} + +void RandLotsZeros(int* arr, const int& size){ + for (int i = 0; i < size; i++){ + arr[i] = rand() % 2; + if (arr[i] != 0) arr[i] = rand() % 21 - 10; + } +} + +bool GPU_MemHelper(const int* src, int* &dst, const int& size, const cudaMemcpyKind& type = cudaMemcpyHostToDevice){ + cudaError_t cudaStatus; + if (type == cudaMemcpyHostToDevice) + cudaStatus = 
cudaMalloc((void**)&dst, size*sizeof(int)); + //if (!cudaStatus) return false; + cudaStatus = cudaMemcpy(dst, src, size*sizeof(int), type); + return cudaStatus; +} +template +bool verifyResult(const T& a, const T& b, const int& size){ + for (int i = 0; i < size; i++) + if (a[i] != b[i]) { + printf("The %dth is expected %d, but got %d!\n", i, a[i], b[i]); + //return false; + } + return true; +} + +template +void swap(T& a, T& b){ + T c = a; + a = b; + b = c; +} + +void printusage(){ + printf("This is the help of this program!\n"); + printf("You can use the configure.cfg to configure the program as well!\n"); + printf("If you use cmd parameters, you have three option:\n"); + printf("-b : blockSize\n-p : prefix sum array size\n-s : scattering array size\n"); +} +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////Serial Version On CPU/////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +void CPU_PrefixSumEx(int* out, const int* in, const int& size){ + out[0] = 0; + //for (int i = 0; i < INT_MAX>>4; i++){}; + for (int i = 1; i <= size; i++) + out[i] = in[i - 1]+out[i-1]; +} + +void CPU_Scatter(int* out, int & outsize, const int* in, const int& size){ + //out[0] = 0; + //for (int i = 0; i < INT_MAX>>4; i++){}; + //for (int i = 1; i <= size; i++) + //out[i] = (in[i - 1]!=0) + out[i - 1]; + for (int i = 0; i < size; i++) + if (*(in + i) == 0) continue; + else { + *(out+outsize) = *(in + i); + outsize++; + } +} 
+//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////Naive Parrallel///////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +__global__ void GPU_PrefixSumEx_Naive(int* out, const int* in, int i,int size=arraySize){ + int id = (blockIdx.x*blockDim.x)+threadIdx.x; + + if (id > i&&id>= 1; + if (thid == 0){ + multiBlock[blockIdx.x] = cin[size - 1]; + //out[arraySize] = cin[it-1]; + cin[size - 1] = 0; + } + int upbound = it; + while (it>1){ + it >>= 1; + lid >>= 1; + if (lid <= upbound){ + int rid = lid - it; + int tmp = cin[lid - 1]; + cin[lid - 1] += cin[rid - 1]; + cin[rid - 1] = tmp; + } + __syncthreads(); + } + + //swap(cin, cout); + out[id] = cin[threadIdx.x]; + + __syncthreads(); + } +} + +__global__ void GPU_final(int* out, int*in, int size){ + out[size] = out[size - 1] + in[size - 1]; +} + +__global__ void GPU_append(int *out, int *val, int loc,int pos=0){ + out[loc] = val[pos]; +} + +__global__ void GPU_add(int *out, int *src, int size = arraySize){ + if (blockIdx.x == 0) return; + //int blockid = (blockIdx.x*blockDim.x); + //int thid = threadIdx.x; + int id = (blockIdx.x*blockDim.x) + threadIdx.x; + if (id < size){ + out[id] += src[blockIdx.x]; + } +} +void rec_GPU_PO(int* out, int* src, int size = arraySize, int* multiBlock = NULL){ + int blocks = (size-1) / blockSize+1; + if (!multiBlock) + cudaMalloc((void**)&multiBlock, (blocks+1)*sizeof(int)); + GPU_PrefixSumEx_Optimized << < blocks, blockSize, 
blockSize*sizeof(int) >> >(out, src, multiBlock,blockSize); + cudaThreadSynchronize(); + if (blocks == 1){ + GPU_append << <1, 1 >> >(out, multiBlock, size); + return; + } + else{ + rec_GPU_PO(multiBlock, multiBlock, blocks); + GPU_add<<>>(out, multiBlock, size); + cudaThreadSynchronize(); + GPU_append << <1, 1 >> >(out, multiBlock, size, blocks); + } + + if (size == arraySize){ + GPU_final << <1, 1 >> >(out, src, size); + } + cudaFree(multiBlock); +} + + +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////Optimized Parrallel for Scatter///////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +__global__ void GPU_PrefixSumEx_Optimized_WithScatter(int* out, int* src, int* multiBlock, int size, int dataSize = scatterSize){ + extern __shared__ int cache[]; + int *cin = cache; + //int *cout = &cache[blockSize]; + int blockid = (blockIdx.x*blockDim.x); + int thid = threadIdx.x; + int id = (blockIdx.x*blockDim.x) + threadIdx.x; + + if (id < dataSize){ + cin[threadIdx.x] = (src[id]!=0); + //cin[2 * threadIdx.x+1] = src[blockid + 2 * threadIdx.x+1]; + __syncthreads(); + //swap(in, out); + int lid = (1 + thid) << 1; + int it = 1; + + for (it = 1; it < size; it *= 2){ + + if (lid <= size + 1){ + + int rid = lid - it; + cin[lid - 1] += cin[rid - 1]; + } + lid <<= 1; + __syncthreads(); + //swap(cin, cout); + } + //it >>= 1; + if (thid == 0){ + multiBlock[blockIdx.x] = cin[size - 1]; + //out[arraySize] = cin[it-1]; + cin[size - 1] = 0; + } + 
int upbound = it; + while (it>1){ + it >>= 1; + lid >>= 1; + if (lid <= upbound){ + int rid = lid - it; + int tmp = cin[lid - 1]; + cin[lid - 1] += cin[rid - 1]; + cin[rid - 1] = tmp; + } + __syncthreads(); + } + + //swap(cin, cout); + out[id] = cin[threadIdx.x]; + + __syncthreads(); + /*for (int i = 1; i < blocks; i <<= 1) + if (blockIdx.x >= i) cout[threadIdx.x] += multiBlock[blockIdx.x - i];*/ + } +} + +void rec_GPU_POS(int* out, int* src, int size = scatterSize, int* multiBlock = NULL){ + int blocks = (size - 1) / blockSize + 1; + if (!multiBlock) + cudaMalloc((void**)&multiBlock, (blocks + 1)*sizeof(int)); + GPU_PrefixSumEx_Optimized_WithScatter << < blocks, blockSize, blockSize*sizeof(int) >> >(out, src, multiBlock, blockSize); + cudaThreadSynchronize(); + if (blocks == 1){ + GPU_append << <1, 1 >> >(out, multiBlock, size); + return; + } + else{ + rec_GPU_PO(multiBlock, multiBlock, blocks); + GPU_add << > >(out, multiBlock, size); + cudaThreadSynchronize(); + GPU_append << <1, 1 >> >(out, multiBlock, size, blocks); + } + + if (size == scatterSize){ + GPU_final << <1, 1 >> >(out, src, size); + } + cudaFree(multiBlock); +} + +__global__ void GPU_scatter_cp(int* out, int* aux, int* in, int size = scatterSize){ + int id = (blockIdx.x*blockDim.x) + threadIdx.x; + if (id < size&&in[id] != 0){ + out[aux[id]] = in[id]; + } + __syncthreads(); +} +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////Optimized Parrallel with Bank Conflicts resolving//////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// 
+//////////////////////////////////////////////////////////////////////////////////////////////// +__global__ void GPU_PrefixSumEx_Optimized_BankConflicts(int* out, int* src, int* multiBlock, int size, int dataSize = arraySize){ + extern __shared__ int cache[]; + int *cin = cache; + //int *cout = &cache[blockSize]; + int blockid = (blockIdx.x*blockDim.x); + int thid = threadIdx.x; + int id = (blockIdx.x*blockDim.x) + threadIdx.x; + int offset = 1; + + if (id < dataSize){ + + cin[CONFLICT_FREE_OFFSET(thid)] = src[id]; + + __syncthreads(); + //swap(in, out); + int lid = (1 + thid) << 1; + int it = 1; + + for (it = 1; it < size; it *= 2){ + + if (lid <= size + 1){ + + int rid = lid - it; + cin[CONFLICT_FREE_OFFSET(lid - 1)] += cin[CONFLICT_FREE_OFFSET(rid - 1)]; + } + lid <<= 1; + __syncthreads(); + //swap(cin, cout); + } + //it >>= 1; + if (thid == 0){ + multiBlock[blockIdx.x] = cin[CONFLICT_FREE_OFFSET(size - 1)]; + //out[arraySize] = cin[it-1]; + cin[CONFLICT_FREE_OFFSET(size - 1)] = 0; + } + int upbound = it; + while (it>1){ + it >>= 1; + lid >>= 1; + if (lid <= upbound){ + int rid = lid - it; + int tmp = cin[CONFLICT_FREE_OFFSET(lid - 1)]; + cin[CONFLICT_FREE_OFFSET(lid - 1)] += cin[CONFLICT_FREE_OFFSET(rid - 1)]; + cin[CONFLICT_FREE_OFFSET(rid - 1)] = tmp; + } + __syncthreads(); + } + + //swap(cin, cout); + out[id] = cin[CONFLICT_FREE_OFFSET(thid)]; + + __syncthreads(); + + } +} + +void rec_GPU_POBR(int* out, int* src, int size = arraySize, int* multiBlock = NULL){ + int blocks = (size - 1) / blockSize + 1; + if (!multiBlock) + cudaMalloc((void**)&multiBlock, (blocks + 1)*sizeof(int)); + GPU_PrefixSumEx_Optimized_BankConflicts << < blocks, blockSize, blockSize*sizeof(int) >> >(out, src, multiBlock, blockSize); + cudaThreadSynchronize(); + if (blocks == 1){ + GPU_append << <1, 1 >> >(out, multiBlock, size); + return; + } + else{ + rec_GPU_POBR(multiBlock, multiBlock, blocks); + GPU_add << > >(out, multiBlock, size); + cudaThreadSynchronize(); + GPU_append << <1, 
1 >> >(out, multiBlock, size, blocks); + } + + if (size == arraySize){ + GPU_final << <1, 1 >> >(out, src, size); + } + cudaFree(multiBlock); +} + +__global__ void GPU_PrefixSumEx_Optimized_WithScatter_BankConflicts(int* out, int* src, int* multiBlock, int size, int dataSize = arraySize){ + extern __shared__ int cache[]; + int *cin = cache; + //int *cout = &cache[blockSize]; + int blockid = (blockIdx.x*blockDim.x); + int thid = threadIdx.x; + int id = (blockIdx.x*blockDim.x) + threadIdx.x; + int offset = 1; + + if (id < dataSize){ + + cin[CONFLICT_FREE_OFFSET(thid)] = (src[id]!=0); + + __syncthreads(); + //swap(in, out); + int lid = (1 + thid) << 1; + int it = 1; + + for (it = 1; it < size; it *= 2){ + + if (lid <= size + 1){ + + int rid = lid - it; + cin[CONFLICT_FREE_OFFSET(lid - 1)] += cin[CONFLICT_FREE_OFFSET(rid - 1)]; + } + lid <<= 1; + __syncthreads(); + //swap(cin, cout); + } + //it >>= 1; + if (thid == 0){ + multiBlock[blockIdx.x] = cin[CONFLICT_FREE_OFFSET(size - 1)]; + //out[arraySize] = cin[it-1]; + cin[CONFLICT_FREE_OFFSET(size - 1)] = 0; + } + int upbound = it; + while (it>1){ + it >>= 1; + lid >>= 1; + if (lid <= upbound){ + int rid = lid - it; + int tmp = cin[CONFLICT_FREE_OFFSET(lid - 1)]; + cin[CONFLICT_FREE_OFFSET(lid - 1)] += cin[CONFLICT_FREE_OFFSET(rid - 1)]; + cin[CONFLICT_FREE_OFFSET(rid - 1)] = tmp; + } + __syncthreads(); + } + + //swap(cin, cout); + out[id] = cin[CONFLICT_FREE_OFFSET(thid)]; + + __syncthreads(); + + } +} + +void rec_GPU_POSBR(int* out, int* src, int size = scatterSize, int* multiBlock = NULL){ + int blocks = (size - 1) / blockSize + 1; + if (!multiBlock) + cudaMalloc((void**)&multiBlock, (blocks + 1)*sizeof(int)); + GPU_PrefixSumEx_Optimized_WithScatter_BankConflicts << < blocks, blockSize, blockSize*sizeof(int) >> >(out, src, multiBlock, blockSize); + cudaThreadSynchronize(); + if (blocks == 1){ + GPU_append << <1, 1 >> >(out, multiBlock, size); + return; + } + else{ + rec_GPU_POBR(multiBlock, multiBlock, blocks); + 
GPU_add << > >(out, multiBlock, size); + cudaThreadSynchronize(); + GPU_append << <1, 1 >> >(out, multiBlock, size, blocks); + } + + if (size == scatterSize){ + GPU_final << <1, 1 >> >(out, src, size); + } + cudaFree(multiBlock); +} +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////Main functions////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////// +int main(int argc, char** argv) +{ + std::ifstream fin("config.cfg"); + char buff[80]; + while (fin.getline(buff, 80)){ + if (buff[0] == '#') continue; + std::istringstream is(buff); + is >> arraySize >> blockSize>>scatterSize; + } + fin.close(); + if (argc > 1){ + for (int i = 1; i < argc;){ + if (!strcmp(argv[i], "-b")){ + blockSize = atoi(argv[i + 1]); + i += 2; + } + else if (!strcmp(argv[i], "-p")){ + arraySize = atoi(argv[i + 1]); + i += 2; + } + else if (!strcmp(argv[i], "-s")){ + scatterSize = atoi(argv[i + 1]); + i += 2; + } + else{ + printf("Unrecognized Parameter: %s\n", argv[i]); + printusage(); + exit(-1); + } + } + } + std::ofstream fout("output.log", std::ios::app); + fout << blockSize << '\t' << arraySize << '\t' << scatterSize << '\t'; + blkHost = (int)ceil((arraySize) / (float)blockSize); + //const int arraySize = 6; + srand(time(NULL)); + int *a=new int[arraySize];// = { 3, 4, 6, 7, 9, 10 }; + RandArray(a, arraySize); + //const int b[arraySize] = { 10, 20, 30, 40, 50 }; + int *c=new int [arraySize+1]; + memset(c, 0, (arraySize + 
1)*sizeof(int)); + + //std::chrono::time_point start, end; + LARGE_INTEGER large_interger; + __int64 start, end; + double diff; + QueryPerformanceFrequency(&large_interger); + diff = large_interger.QuadPart; + ///////////////////////////////////////////// + //////////////CPU calling//////////////////// + ///////////////////////////////////////////// + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + + CPU_PrefixSumEx(c, a, arraySize); + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; + + printf("CPU_Version:\t %f ms:\n", 1000 * (end - start)/diff); + fout << 1000 * (end - start) / diff << '\t'; + //printResult(c, arraySize+1); + + ///////////////////////////////////////////// + //////////////GPU calling//////////////////// + ///////////////////////////////////////////// + cudaSetDevice(0); + + ///////////////////////////////////////////// + //////////////Optimized calling////////////// + ///////////////////////////////////////////// + { + cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); + int *src, *res; + int *host_res = new int[arraySize + 1]; + memset(host_res, 0, (arraySize + 1)*sizeof(int)); + GPU_MemHelper(a, src, arraySize); + + + + //GPU_MemHelper(host_res, buff, arraySize + 1); + GPU_MemHelper(host_res, res, arraySize + 1); + + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + + rec_GPU_POBR(res, src); + + + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; + GPU_MemHelper(res, host_res, arraySize + 1, cudaMemcpyDeviceToHost); + + if (verifyResult(c, host_res, arraySize + 1)) printf("GPU Second Step verifed OK!\n"); + printf("GPU_Optimized_Version:\t %f ms:\n", 1000 * (end - start) / diff); + fout << 1000 * (end - start) / diff << '\t'; + free(host_res); + cudaFree(res); + cudaFree(src); + cudaDeviceReset(); + } + + { + cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); + int *src, *res; + int *host_res = new int[arraySize 
+ 1]; + memset(host_res, 0, (arraySize + 1)*sizeof(int)); + GPU_MemHelper(a, src, arraySize); + + + + //GPU_MemHelper(host_res, buff, arraySize + 1); + GPU_MemHelper(host_res, res, arraySize + 1); + + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + + rec_GPU_PO(res, src); + + + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; + GPU_MemHelper(res, host_res, arraySize + 1, cudaMemcpyDeviceToHost); + + if (verifyResult(c, host_res, arraySize + 1)) printf("GPU Second Step verifed OK!\n"); + printf("GPU_Optimized_Bank_Conflicts_Resolved_Version:\t %f ms:\n", 1000 * (end - start) / diff); + fout << 1000 * (end - start) / diff << '\t'; + free(host_res); + cudaFree(res); + cudaFree(src); + cudaDeviceReset(); + } + + ///////////////////////////////////////////// + //////////////CPU scatter calling//////////// + ///////////////////////////////////////////// + int *Host_before_Scatter = new int[scatterSize]; + RandLotsZeros(Host_before_Scatter, scatterSize); + //printResult(Host_before_Scatter, scatterSize); + int *Host_after_Scatter = new int[scatterSize]; + int sizeb=0; + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + CPU_Scatter(Host_after_Scatter, sizeb, Host_before_Scatter, scatterSize); + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; + printf("CPU_Scatter:\t %f ms:\n", 1000 * (end - start) / diff); + + fout << 1000 * (end - start) / diff << '\t'; + //printResult(Host_after_Scatter, sizeb); + //printf("scatterSize-sizeb=%d\n", scatterSize - sizeb); + + ///////////////////////////////////////////// + //////////////GPU scatter calling//////////// + ///////////////////////////////////////////// + { + cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); + int* Dev_before_Scatter; + int* allzero = new int[scatterSize + 1]; + memset(allzero, 0, (scatterSize + 1)*sizeof(int)); + GPU_MemHelper(Host_before_Scatter, Dev_before_Scatter, scatterSize); + int* 
Dev_aux_Scatter; + GPU_MemHelper(allzero, Dev_aux_Scatter, scatterSize+1); + int* Dev_after_Scatter; + GPU_MemHelper(allzero, Dev_after_Scatter, scatterSize+1); + + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + + rec_GPU_POS(Dev_aux_Scatter, Dev_before_Scatter); + GPU_scatter_cp << <(int)ceil((scatterSize) / (float)blockSize) , blockSize >> >(Dev_after_Scatter, Dev_aux_Scatter, Dev_before_Scatter); + + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; + + int* host_res=new int[scatterSize]; + GPU_MemHelper(Dev_after_Scatter, host_res, scatterSize, cudaMemcpyDeviceToHost); + if (verifyResult(Host_after_Scatter, host_res, sizeb)) printf("GPU Scatter verifed OK!\n"); + printf("GPU_Scatter_Version:\t %f ms:\n", 1000 * (end - start) / diff); + fout << 1000 * (end - start) / diff << '\t'; + free(host_res); + cudaFree(Dev_after_Scatter); + cudaFree(Dev_aux_Scatter); + cudaFree(Dev_before_Scatter); + cudaDeviceReset(); + } + + { + cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); + int* Dev_before_Scatter; + int* allzero = new int[scatterSize + 1]; + memset(allzero, 0, (scatterSize + 1)*sizeof(int)); + GPU_MemHelper(Host_before_Scatter, Dev_before_Scatter, scatterSize); + int* Dev_aux_Scatter; + GPU_MemHelper(allzero, Dev_aux_Scatter, scatterSize + 1); + int* Dev_after_Scatter; + GPU_MemHelper(allzero, Dev_after_Scatter, scatterSize + 1); + + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + + rec_GPU_POSBR(Dev_aux_Scatter, Dev_before_Scatter); + GPU_scatter_cp << <(int)ceil((scatterSize) / (float)blockSize), blockSize >> >(Dev_after_Scatter, Dev_aux_Scatter, Dev_before_Scatter); + + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; + + int* host_res = new int[scatterSize]; + GPU_MemHelper(Dev_after_Scatter, host_res, scatterSize, cudaMemcpyDeviceToHost); + if (verifyResult(Host_after_Scatter, host_res, sizeb)) printf("GPU Scatter Bank Conflicts 
Resolved verifed OK!\n"); + printf("GPU Scatter Bank Conflicts Resolved Version:\t %f ms:\n", 1000 * (end - start) / diff); + fout << 1000 * (end - start) / diff << '\t'; + free(host_res); + cudaFree(Dev_after_Scatter); + cudaFree(Dev_aux_Scatter); + cudaFree(Dev_before_Scatter); + cudaDeviceReset(); + } + ///////////////////////////////////////////// + /////////GPU Thrust::scatter calling///////// + ///////////////////////////////////////////// + { + int* host_res = new int[scatterSize]; + struct notzero{ + __host__ __device__ bool operator()(const int x){ + return x != 0; + } + }; + + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + thrust::copy_if(Host_before_Scatter, Host_before_Scatter + scatterSize, host_res, notzero()); + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; + + if (verifyResult(Host_after_Scatter, host_res, sizeb)) printf("Thrust Scatter verifed OK!\n"); + printf("Thrust::GPU_Scatter_Version:\t %f ms:\n", 1000 * (end - start) / diff); + fout << 1000 * (end - start) / diff << '\n'; + free(host_res); + cudaDeviceReset(); + } + ///////////////////////////////////////////// + //////////////Naive calling////////////////// + ///////////////////////////////////////////// + /*{ + int *src, *res; + int *host_res = new int[arraySize + 1]; + memset(host_res, 0, (arraySize + 1)*sizeof(int)); + GPU_MemHelper(a, src, arraySize); + GPU_MemHelper(host_res, res, arraySize + 1); + QueryPerformanceCounter(&large_interger); + start = large_interger.QuadPart; + for (int i = 0; i <= arraySize; i++) + GPU_PrefixSumEx_Naive << < (int)ceil((arraySize + 1) / (float)blockSize), blockSize >> >(res, src, i); + QueryPerformanceCounter(&large_interger); + end = large_interger.QuadPart; + GPU_MemHelper(res, host_res, arraySize + 1, cudaMemcpyDeviceToHost); + if (verifyResult(c, host_res, arraySize + 1)) printf("GPU Naive verifed OK!\n"); + printf("GPU_Naive_Version:\t %f ms:\n", 1000 * (end - start) / diff); + 
free(host_res); + cudaFree(res); + cudaFree(src); + cudaDeviceReset(); + }*/ + + + + + + + ///////////////////////////////////////////// + //////////////GPU Reset////////////////////// + ///////////////////////////////////////////// + cudaDeviceReset(); + + return 0; +} + + + diff --git a/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/output.log b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/output.log new file mode 100644 index 0000000..b858342 --- /dev/null +++ b/CIS565_2014_Fall_StreamCompaction/CIS565_2014_Fall_StreamCompaction/output.log @@ -0,0 +1,6 @@ +Blocksize PrefixArraySize ScatterArraySize CPUPrefixTime GPUOPTPrefixTime GPUOPTPrefixBankConfResolvedTime CPUScatteringTime GPUScatteringTime GPUScatteringTimeBankConfResolvedTime GPU::ThrustScatteringTime +128 80000 80000 0.1779 0.919437 0.827493 0.349814 0.997696 0.926707 0.320306 +128 80000 80000 0.214678 1.07767 1.06141 0.335274 0.882659 1.1679 0.316457 +128 80000 80000 0.18517 1.15849 0.869402 0.339123 0.856145 0.874106 0.320734 +128 80000 80000 0.175762 1.253 1.02036 0.336557 0.874962 1.0473 0.31774 +128 80000 80000 0.175334 0.998551 0.9498 0.451593 0.83904 0.868547 0.316885 diff --git a/Performance.xlsx b/Performance.xlsx new file mode 100644 index 0000000..720c662 Binary files /dev/null and b/Performance.xlsx differ diff --git a/README.md b/README.md index 6e02afa..6c1e6f9 100644 --- a/README.md +++ b/README.md @@ -1,133 +1,8 @@ -Project-2 -========= +Questions and answers: +##You can use the performance.py to execute the program with ease. +1, Please see the enclosed performance.xlsx to see all the result and chart; + Based on chart in tab:GPU_Naive, we can see that the performance of this naive version is very bad. Much worse than the serial version. Because the computation threads of this naive version is very large and data transfer from the global memory to the SMs can consume large quantity of time. That is why the performance is so bad. 
-A Study in Parallel Algorithms : Stream Compaction +2, Also in the tab:GPU_Naive, we can see that the curve of optimized version is better than the naive version. That is because the latency of the shared memory is much shorter than global memory. But still, it does not exceeded the performance of CPU with little test data set. -# INTRODUCTION -Many of the algorithms you have learned thus far in your career have typically -been developed from a serial standpoint. When it comes to GPUs, we are mainly -looking at massively parallel work. Thus, it is necessary to reorient our -thinking. In this project, we will be implementing a couple different versions -of prefix sum. We will start with a simple single thread serial CPU version, -and then move to a naive GPU version. Each part of this homework is meant to -follow the logic of the previous parts, so please do not do this homework out of -order. - -This project will serve as a stream compaction library that you may use (and -will want to use) in your -future projects. For that reason, we suggest you create proper header and CUDA -files so that you can reuse this code later. You may want to create a separate -cpp file that contains your main function so that you can test the code you -write. - -# OVERVIEW -Stream compaction is broken down into two parts: (1) scan, and (2) scatter. - -## SCAN -Scan or prefix sum is the summation of the elements in an array such that the -resulting array is the summation of the terms before it. Prefix sum can either -be inclusive, meaning the current term is a summation of all the elements before -it and itself, or exclusive, meaning the current term is a summation of all -elements before it excluding itself. - -Inclusive: - -In : [ 3 4 6 7 9 10 ] - -Out : [ 3 7 13 20 29 39 ] - -Exclusive - -In : [ 3 4 6 7 9 10 ] - -Out : [ 0 3 7 13 20 29 ] - -Note that the resulting prefix sum will always be n + 1 elements if the input -array is of length n. 
Similarly, the first element of the exclusive prefix sum -will always be 0. In the following sections, all references to prefix sum will -be to the exclusive version of prefix sum. - -## SCATTER -The scatter section of stream compaction takes the results of the previous scan -in order to reorder the elements to form a compact array. - -For example, let's say we have the following array: -[ 0 0 3 4 0 6 6 7 0 1 ] - -We would only like to consider the non-zero elements in this zero, so we would -like to compact it into the following array: -[ 3 4 6 6 7 1 ] - -We can perform a transform on input array to transform it into a boolean array: - -In : [ 0 0 3 4 0 6 6 7 0 1 ] - -Out : [ 0 0 1 1 0 1 1 1 0 1 ] - -Performing a scan on the output, we get the following array : - -In : [ 0 0 1 1 0 1 1 1 0 1 ] - -Out : [ 0 0 0 1 2 2 3 4 5 5 ] - -Notice that the output array produces a corresponding index array that we can -use to create the resulting array for stream compaction. - -# PART 1 : REVIEW OF PREFIX SUM -Given the definition of exclusive prefix sum, please write a serial CPU version -of prefix sum. You may write this in the cpp file to separate this from the -CUDA code you will be writing in your .cu file. - -# PART 2 : NAIVE PREFIX SUM -We will now parallelize this the previous section's code. Recall from lecture -that we can parallelize this using a series of kernel calls. In this portion, -you are NOT allowed to use shared memory. - -### Questions -* Compare this version to the serial version of exclusive prefix scan. Please - include a table of how the runtimes compare on different lengths of arrays. -* Plot a graph of the comparison and write a short explanation of the phenomenon you - see here. - -# PART 3 : OPTIMIZING PREFIX SUM -In the previous section we did not take into account shared memory. In the -previous section, we kept everything in global memory, which is much slower than -shared memory. 
- -## PART 3a : Write prefix sum for a single block -Shared memory is accessible to threads of a block. Please write a version of -prefix sum that works on a single block. - -## PART 3b : Generalizing to arrays of any length. -Taking the previous portion, please write a version that generalizes prefix sum -to arbitrary length arrays, this includes arrays that will not fit on one block. - -### Questions -* Compare this version to the parallel prefix sum using global memory. -* Plot a graph of the comparison and write a short explanation of the phenomenon - you see here. - -# PART 4 : ADDING SCATTER -First create a serial version of scatter by expanding the serial version of -prefix sum. Then create a GPU version of scatter. Combine the function call -such that, given an array, you can call stream compact and it will compact the -array for you. Finally, write a version using thrust. - -### Questions -* Compare your version of stream compact to your version using thrust. How do - they compare? How might you optimize yours more, or how might thrust's stream - compact be optimized. - -# EXTRA CREDIT (+10) -For extra credit, please optimize your prefix sum for work parallelism and to -deal with bank conflicts. Information on this can be found in the GPU Gems -chapter listed in the references. - -# SUBMISSION -Please answer all the questions in each of the subsections above and write your
answers in the README by overwriting the README file. In future projects, we -expect your analysis to be similar to the one we have led you through in this -project. Like other projects, please open a pull request and email Harmony. - -# REFERENCES -"Parallel Prefix Sum (Scan) with CUDA." GPU Gems 3. +3, I am not really sure what thrust does internally. But one thing is clear from my performance statistics: the thrust performance is highly similar to my CPU version. In the ten-plus charts I plotted, the thrust version and the CPU version are almost always on the same line.
And my GPU version exceeded them with block size=128 and data size larger than about 1MB. However, the thrust seems allocate memory on device with no time. Because If I count the time used for data transfer, it even exceeded the time thrust required for complete the whole computation. \ No newline at end of file diff --git a/performance.py b/performance.py new file mode 100644 index 0000000..29ab152 --- /dev/null +++ b/performance.py @@ -0,0 +1,20 @@ +from os import* +import os +#os.chdir("D:\workspace\GitHub\CIS565CUDA\Project-2\Project2-StreamCompaction\CIS565_2014_Fall_StreamCompaction\x64\Release"); +from subprocess import call +blkSize=range(1,11) +dataSize=range(1,9) +for i in range(10): + blkSize[i]=2**blkSize[i] +for i in range(8): + dataSize[i]=10**dataSize[i]; +#print blkSize +#print dataSize +#ss=' '.join([".\CIS565_2014_Fall_StreamCompaction.exe", "-b", str(blkSize[0]),"-p",str(dataSize[0]),"-s", str(dataSize[0])]) +#print ss +#os.system(' '.join([".\CIS565_2014_Fall_StreamCompaction.exe"])); +for blk in blkSize: + for ds in dataSize: + print "Current: "+str(blk)+" "+str(ds) + os.system(' '.join([".\CIS565_2014_Fall_StreamCompaction.exe", "-b", str(blk),"-p",str(ds),"-s", str(ds)])) + \ No newline at end of file