diff --git a/README.md b/README.md index 0e38ddb..f1023e2 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,90 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Xinyu Lin +[Linkedin](https://www.linkedin.com/in/xinyu-lin-138352125/) +* Tested on: Windows 10, Intel(R) Core(TM) i7-6700HQ CPU@2.60GHz, 16GB, GTX960M(Private Computer) -### (TODO: Your README) +### Features -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +* CPU Scan & Stream Compaction +* Naive GPU Scan Algorithm +* Work-Efficient GPU Scan +* Work-Efficient GPU Stream Compaction +* Thrust Implementation +### Performance Analysis +* BlockSize over Scan methods(ArraySize: 256) +![](img/bsscan.png) + +* BlockSize over Compact methods(ArraySize: 256) +![](img/bscompact.png) + +* ArraySize over Scan methods(BlockSize: 512) +![](img/as256scan.png) +![](img/as512scan'.png) +![](img/as1024scan.png) + +* ArraySize over Compact methods(BlockSize: 512) +![](img/as256compact.png) +![](img/as512compact.png) +![](img/as1024compact.png) + +### Result +``` +**************** +** SCAN TESTS ** +**************** + [ 33 32 23 11 4 10 29 49 9 36 47 43 19 ... 20 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.001185ms (std::chrono Measured) + [ 0 33 65 88 99 103 113 142 191 200 236 283 326 ... 6332 6352 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.001185ms (std::chrono Measured) + [ 0 33 65 88 99 103 113 142 191 200 236 283 326 ... 6208 6251 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.240992ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.189888ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.262976ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.228768ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.001088ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.001088ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 0 2 2 0 2 3 2 0 0 3 1 0 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.001975ms (std::chrono Measured) + [ 1 2 2 2 3 2 3 1 3 2 1 2 1 ... 2 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.001976ms (std::chrono Measured) + [ 1 2 2 2 3 2 3 1 3 2 1 2 1 ... 2 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.008692ms (std::chrono Measured) + [ 1 2 2 2 3 2 3 1 3 2 1 2 1 ... 2 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.499232ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.43552ms (CUDA Measured) + passed +``` + + + diff --git a/img/as1024compact.png b/img/as1024compact.png new file mode 100644 index 0000000..d82518f Binary files /dev/null and b/img/as1024compact.png differ diff --git a/img/as1024scan.png b/img/as1024scan.png new file mode 100644 index 0000000..adb9952 Binary files /dev/null and b/img/as1024scan.png differ diff --git a/img/as256compact.png b/img/as256compact.png new file mode 100644 index 0000000..0e3a631 Binary files /dev/null and b/img/as256compact.png differ diff --git a/img/as256scan.png b/img/as256scan.png new file mode 100644 index 0000000..9667086 Binary files /dev/null and b/img/as256scan.png differ diff --git a/img/as512compact.png b/img/as512compact.png new file mode 100644 index 0000000..9dbb95e Binary files /dev/null and b/img/as512compact.png differ diff --git a/img/as512scan'.png b/img/as512scan'.png new file mode 100644 index 0000000..4d17317 Binary files /dev/null and b/img/as512scan'.png differ diff --git a/img/bscompact.png b/img/bscompact.png new file mode 100644 index 0000000..7cf1215 Binary files /dev/null and b/img/bscompact.png differ diff --git a/img/bsscan.png b/img/bsscan.png new file mode 100644 index 0000000..eb78182 Binary files /dev/null and b/img/bsscan.png differ diff --git a/src/main.cpp b/src/main.cpp index 1850161..60494e7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -79,6 +79,17 @@ int main(int argc, char* argv[]) { StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); + //std::cout << "my result: "<< std::endl; + //for (int i = 0; i < SIZE; ++i) + //{ + // std::cout << c[i] << " "; + //} + //std::cout << std::endl; + //std::cout << "correct result: " << std::endl; + //for (int i = 0; i < SIZE; ++i) + //{ + // std::cout << b[i] << " "; + //} printCmpResult(NPOT, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..c8709e7 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_50 ) diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 99a1b04..ca558d6 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -12,7 +12,7 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) - +#define blockSize 1024 /** * Check for CUDA errors; print and exit if there was a problem. */ diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..42fe477 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,15 +1,15 @@ #include #include "cpu.h" -#include "common.h" +#include "common.h" namespace StreamCompaction { namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** @@ -20,6 +20,13 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + // Serial Scan: Exclusive + int acc = 0; + for (int i = 0; i < n; i++) + { + odata[i] = acc; + acc = acc + idata[i]; + } timer().endCpuTimer(); } @@ -31,8 +38,19 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int flag = 0; + for (int i = 0; i < n; ++i) + { + if (idata[i] != 0) + { + odata[flag] = idata[i]; + flag++; + } + + } + timer().endCpuTimer(); - return -1; + return flag; } /** @@ -43,8 +61,46 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + // predicate + int acc = 0; + int* tempdata = new int[n]; + for (int i = 0; i < n; ++i) + { + if (idata[i] == 0) + { + tempdata[i] = 0; + + } + else + { + tempdata[i] = 1; + acc++; + } + } + // ======= tempdata[] = 1110011111001111 + // scan sum + int accSum = 0; + int* tempData2 = new int[n]; + for (int i = 0; i < n; i++) + { + tempData2[i] = accSum; + accSum += tempdata[i]; + + } + + // idata[] = 3120043215002222 + // ======= tempdata[] = 1110011111001111 + // tempData2[] = 0123334567899910.... + // scatter + for (int i = 0; i < n; ++i) + { + odata[tempData2[i]] = idata[i]; + } + + delete[] tempdata; + delete[] tempData2; timer().endCpuTimer(); - return -1; + return acc; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..5c32efd 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,23 +2,118 @@ #include #include "common.h" #include "efficient.h" - +#include namespace StreamCompaction { namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } + __global__ void kernelScanReduce(int n, int d, int* idata) + { + int thID = threadIdx.x + blockDim.x * blockIdx.x; + if (thID >= n) return; + int temp = 1 << d; + int temp2 = 1 << (d - 1); + if ((thID % temp) == 0) + { + idata[thID + temp - 1] = idata[thID + temp2 - 1] + idata[thID + temp - 1]; + } + } + // two array to get result + //__global__ void kernelScanReduce(int n, int d, int* odata, int* idata) + //{ + // int thID = threadIdx.x + blockDim.x * blockIdx.x; + // if (thID >= n) return; + // int temp = 1 << d; + // int temp2 = 1 << (d - 1); + // odata[thID] = idata[thID]; + // if ((thID % temp) == 0) + // { + // odata[thID + temp - 1] = idata[thID + temp2 - 1] + idata[thID + temp - 1]; + // } + //} + //__global__ void kernelScanDownSweep(int n, int d, int* odata, int* idata) + //{ + // int thID = threadIdx.x + blockDim.x * blockIdx.x; + // if (thID >= n) return; + // int tempdp1 = 1 << (d + 1); + // int tempd = 1 << d; + // odata[thID] = idata[thID]; + // if ((thID % tempdp1) == 0) + // { + // int t = idata[thID + tempd - 1]; + // odata[thID + tempd - 1] = idata[thID + tempdp1 - 1]; + // odata[thID + tempdp1 - 1] = t + idata[thID + tempdp1 - 1]; + // } + //} + + + __global__ void kernelScanDownSweep(int n, int d, int* idata) + { + int thID = threadIdx.x + blockDim.x * blockIdx.x; + if (thID >= n) return; + int tempdp1 = 1 << (d + 1); + int tempd = 1 << d; + if ((thID % tempdp1) == 0) + { + int t = idata[thID + tempd - 1]; + idata[thID + tempd - 1] = idata[thID + tempdp1 - 1]; + idata[thID + tempdp1 - 1] = t + idata[thID + tempdp1 - 1]; + } + } + + __global__ void kernelChangeN1(int *arr, int index, int identity) + { + arr[index] = identity; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ + + void myScan(int n, int temp, int* idata) + { + dim3 fullBlocksPerGrid((temp + blockSize - 1) / blockSize); + int myIdentity = 0; + for (int d = 1; d <= ilog2ceil(n); ++d) + { + kernelScanReduce << > > (temp, d, idata); + } + + kernelChangeN1 << < 1, 1 >> > (idata, temp - 1, myIdentity); + + for (int d = ilog2ceil(n) - 1; d >= 0; --d) + { + kernelScanDownSweep << > > (temp, d, idata); + } + } void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); // TODO + int temp = 1 << ilog2ceil(n); + + dim3 fullBlocksPerGrid((temp + blockSize - 1) / blockSize); + int* dev_In = NULL; + int* dev_Out = NULL; + cudaMalloc((void**)&dev_In, temp * sizeof(int)); + checkCUDAError("Malloc dev_In failed!"); + cudaMalloc((void**)&dev_Out, temp * sizeof(int)); + checkCUDAError("Malloc dev_Out failed!"); + cudaMemcpy(dev_In, idata, temp * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Memcpy from idata to dev_In failed!"); + myScan(n, temp, dev_In); + + std::swap(dev_Out, dev_In); + cudaMemcpy(odata, dev_Out, temp * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Memcoy from dev_Out to odata failed!"); + timer().endGpuTimer(); + + cudaFree(dev_In); + cudaFree(dev_Out); } /** @@ -30,11 +125,84 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ + + + __global__ void kernelPredicate(int n, int* odata, const int* idata) + { + int thID = threadIdx.x + blockDim.x * blockIdx.x; + if (thID >= n) return; + if (idata[thID] == 0) + { + odata[thID] = 0; + } + else + { + odata[thID] = 1; + } + + } + __global__ void kernelScatter(int n, int* odata, int* myBool ,int* address, int* idata) + { + int thID = threadIdx.x + blockDim.x * blockIdx.x; + if (thID >= n) return; + if (myBool[thID] == 1) + { + odata[address[thID]] = idata[thID]; + } + + } int compact(int n, int *odata, const int *idata) { timer().startGpuTimer(); // TODO - timer().endGpuTimer(); - return -1; + int temp = 1 << ilog2ceil(n); + dim3 fullBlocksPerGrid((temp + blockSize - 1) / blockSize); + int* dev_In = NULL; + int* dev_Out = NULL; + int* dev_Bool = NULL; + int* dev_Address = NULL; + + cudaMalloc((void**)&dev_In, temp * sizeof(int)); + checkCUDAError("compact: malloc dev_In failed!"); + cudaMalloc((void**)&dev_Out, temp * sizeof(int)); + checkCUDAError("compact: malloc dev_Out failed!"); + cudaMalloc((void**)&dev_Bool, temp * sizeof(int)); + checkCUDAError("compact: malloc dev_Bool failed!"); + cudaMalloc((void**)&dev_Address, temp * sizeof(int)); + checkCUDAError("compact: malloc dev_Address failed!"); + + cudaMemcpy(dev_In, idata, temp * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("compact: memcpy from idata to dev_In failed!"); + + kernelPredicate << < fullBlocksPerGrid, blockSize >> > (temp, dev_Bool, dev_In); + + cudaMemcpy(dev_Address, dev_Bool, temp * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("compact: memcoy from dev_Bool to dev_Address failed!"); + + myScan(n, temp, dev_Address); + + kernelScatter << > > (n, dev_Out, dev_Bool, dev_Address, dev_In); + + cudaMemcpy(odata, dev_Out, temp * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_In); + cudaFree(dev_Out); + cudaFree(dev_Bool); + cudaFree(dev_Address); + + int flag = 0; + for (int i = 0; i < n; ++i) + { + if (odata[i] != 0) + { + flag++; + } + else + { + break; + } + } + timer().endGpuTimer(); + return flag; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..55ede8c 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,24 +2,72 @@ #include #include "common.h" #include "naive.h" +#include + + namespace StreamCompaction { namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } // TODO: __global__ - /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ + + __global__ void kernelScan(int n, int d, int* odata, int* idata) + { + int thID = threadIdx.x + blockDim.x * blockIdx.x; + int temp = 1 << (d - 1); + + if (thID >= n) return; + if (thID >= temp) + { + odata[thID] = idata[thID - temp] + idata[thID]; + } + else + { + odata[thID] = idata[thID]; + } + + } void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); // TODO + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + int* dev_tempIn = NULL; + int* dev_tempOut = NULL; + + cudaMalloc((void**)&dev_tempIn, n * sizeof(int)); + checkCUDAError("Malloc memory to dev_tempIn failed!"); + cudaMalloc((void**)&dev_tempOut, n * sizeof(int)); + checkCUDAError("Malloc memory to dev_tempOut failed!"); + cudaMemcpy(dev_tempIn, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Memory copy from host to device failed!"); + + for (int d = 1; d <= ilog2ceil(n); ++d) + { + kernelScan << > > (n, d, dev_tempOut, dev_tempIn); + std::swap(dev_tempOut, dev_tempIn); + } + std::swap(dev_tempOut, dev_tempIn); + + odata[0] = 0; + cudaMemcpy(odata + 1, dev_tempOut, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Memory copy from device to host failed!"); + //for (int i = 0; i < n; ++i) + //{ + // std::cout << odata[i] << " "; + //} timer().endGpuTimer(); + + cudaFree(dev_tempIn); + cudaFree(dev_tempOut); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..d1083c9 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -8,20 +8,29 @@ namespace StreamCompaction { namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); + + //thrust::host_vector host_thrust_in = idata; + //thrust::host_vector host_thrust_out = odata; + + //thrust::host_vector host_thrust_in(n); + + //thrust::device_vector dev_thrust_in = host_thrust_in; // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::exclusive_scan(idata, idata + n, odata); timer().endGpuTimer(); } }