diff --git a/README.md b/README.md
index 0e38ddb..967b123 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,75 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Weiyu Du
+  * [LinkedIn](https://www.linkedin.com/in/weiyu-du/)
+* Tested on: CETS virtual lab MOR100B-05, Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz
 
-### (TODO: Your README)
+### Plots
+1) Time elapsed (ms) versus array size when n is a power of two (x axis: 2^8, 2^12, 2^16, 2^20)
+
+![](img/hw2_pow2.png)
+
+2) Time elapsed (ms) versus array size when n is not a power of two (x axis: 2^8, 2^12, 2^16, 2^20)
+
+![](img/hw2_nonpow2.png)
+
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+### Analysis
+When the array size is small, the CPU scan outperforms all of the GPU implementations, and the naive scan is the fastest of the GPU ones. Possible explanations: 1) For small arrays the difference in computation time is negligible, and memory access and transfer account for most of the elapsed time, which is why the GPU methods lose to the CPU. 2) The work-efficient scan has separate up-sweep and down-sweep stages; even though its asymptotic complexity matches the naive scan's, the extra constant factors matter when n is small (a host-side sketch of the up-sweep/down-sweep indexing is given below).
+However, as the array size increases, CPU performance deteriorates quickly and falls behind the work-efficient and thrust implementations. Among the GPU methods, thrust is the fastest, the work-efficient scan comes second, and the naive scan is the slowest. This is as expected: 1) The CPU scan is a sequential O(n) loop, while the GPU scans finish in O(log n) parallel steps, so GPU performance is less sensitive to growth in the array size. 2) The work-efficient scan works in place on a single device array, while the naive scan ping-pongs between two arrays; global-memory I/O is the bottleneck here, and the naive method's heavier memory traffic makes it even slower than the CPU at these sizes. 3) Thrust makes use of shared memory, while the naive and work-efficient scans both read and write global memory, and shared memory is much faster to access.
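+
+The work-efficient kernels follow the standard Blelloch up-sweep/down-sweep pattern. The host-side sketch below is illustrative only (the function name and the serial loops are not part of the CUDA code in this repo); it assumes n is a power of two and shows the same index arithmetic that `kernScan1` and `kernScan2` perform in parallel:
+
+```cpp
+#include <vector>
+
+// In-place exclusive scan of data, where data.size() is a power of two.
+void blellochScanSketch(std::vector<int>& data) {
+    int n = (int)data.size();
+    // Up-sweep (reduce): build partial sums in a balanced-tree pattern.
+    for (int d = 1; d < n; d <<= 1) {            // d = 2^level
+        for (int k = 0; k < n; k += 2 * d) {     // one "thread" per subtree
+            data[k + 2 * d - 1] += data[k + d - 1];
+        }
+    }
+    data[n - 1] = 0;                             // clear the root
+    // Down-sweep: push the partial sums back down to form an exclusive scan.
+    for (int d = n / 2; d >= 1; d >>= 1) {
+        for (int k = 0; k < n; k += 2 * d) {
+            int t = data[k + d - 1];
+            data[k + d - 1] = data[k + 2 * d - 1];
+            data[k + 2 * d - 1] += t;
+        }
+    }
+}
+```
+
+Each inner-loop iteration corresponds to one thread of `kernScan1`/`kernScan2`. The naive scan instead performs roughly n additions per pass over log2(n) passes, which is where its extra global-memory traffic comes from.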
+
+### Output
+Array size is 2^20.
+````
+
+****************
+** SCAN TESTS **
+****************
+    [ 19 36 40 30 35 35 17 8 28 32 41 40 15 ... 44 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 1.7577ms    (std::chrono Measured)
+    [ 0 19 55 95 125 160 195 212 220 248 280 321 361 ... 25698986 25699030 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 1.9503ms    (std::chrono Measured)
+    [ 0 19 55 95 125 160 195 212 220 248 280 321 361 ... 25698890 25698926 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 2.7335ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 2.73654ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 1.32346ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 1.30934ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.405888ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.328032ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 1 3 0 2 1 1 2 1 0 3 1 2 3 ... 0 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 4.1676ms    (std::chrono Measured)
+    [ 1 3 2 1 1 2 1 3 1 2 3 3 3 ... 1 2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 2.6659ms    (std::chrono Measured)
+    [ 1 3 2 1 1 2 1 3 1 2 3 3 3 ... 2 1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 10.0887ms    (std::chrono Measured)
+    [ 1 3 2 1 1 2 1 3 1 2 3 3 3 ... 1 2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 2.32755ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 2.18624ms    (CUDA Measured)
+    passed
+````
diff --git a/img/hw2_nonpow2.png b/img/hw2_nonpow2.png
new file mode 100644
index 0000000..7a81b38
Binary files /dev/null and b/img/hw2_nonpow2.png differ
diff --git a/img/hw2_pow2.png b/img/hw2_pow2.png
new file mode 100644
index 0000000..55b5a2e
Binary files /dev/null and b/img/hw2_pow2.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..e5353ec 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 20;//1000000;//1 << 8; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..716f58b 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,18 @@ namespace StreamCompaction {
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-            // TODO
+
+            int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (idx >= n) {
+                return;
+            }
+            if (idata[idx] == 0) {
+                bools[idx] = 0;
+            }
+            else {
+                bools[idx] = 1;
+            }
+            return;
         }
 
         /**
@@ -32,7 +43,14 @@ namespace StreamCompaction {
          */
         __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) {
-            // TODO
+
+            int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (idx >= n) {
+                return;
+            }
+            if (bools[idx] == 1) {
+                odata[indices[idx]] = idata[idx];
+            }
         }
     }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..2f94be2 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,7 +19,11 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
             timer().endCpuTimer();
         }
 
@@ -30,9 +34,16 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+
+            int ctr = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[ctr] = idata[i];
+                    ctr++;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return ctr;
         }
 
         /**
@@ -42,9 +53,39 @@ namespace StreamCompaction {
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+
+            int ctr = 0;
+            int* marker = new int[n];
+            int* scan_res = new int[n];
+
+            for (int i = 0; i < n; i++) {
+                scan_res[i] = 0;
+                marker[i] = 0;
+            }
+
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    marker[i] = 1;
+                }
+            }
+
+            for (int i = 1; i < n; i++) {
+                scan_res[i] = marker[i-1] + scan_res[i-1];
+            }
+
+            for (int i = 0; i < n; i++) {
+                if (marker[i] == 1) {
+                    odata[scan_res[i]] = idata[i];
+                    ctr++;
+                }
+            }
+
+            delete[] scan_res;
+            delete[] marker;
+
             timer().endCpuTimer();
-            return -1;
+
+            return ctr;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..3288a91 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,7 +2,7 @@
 #include
 #include "common.h"
 #include "efficient.h"
-
+#include
 namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
@@ -15,10 +15,68 @@ namespace StreamCompaction {
         /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
+        // Up-sweep (reduce) step: each thread adds the left child's partial sum
+        // into the right child of its 2^(d+1)-wide subtree.
+        __global__ void kernScan1(int n, int d, int* in) {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            int pow_d_1 = 1 << (d + 1);
+            int pow_d = 1 << d;
+            if (k >= n / pow_d_1) {
+                return;
+            }
+            k = k * pow_d_1;
+            in[k + pow_d_1 - 1] += in[k + pow_d - 1];
+            return;
+        }
+
+        // Down-sweep step: the left child takes the parent's value and the right
+        // child becomes parent + old left child, pushing prefix sums down the tree.
+        __global__ void kernScan2(int n, int d, int* in) {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            int pow_d_1 = 1 << (d + 1);
+            int pow_d = 1 << d;
+            if (k >= n / pow_d_1) {
+                return;
+            }
+            k = k * pow_d_1;
+            int t = in[k + pow_d - 1];
+            in[k + pow_d - 1] = in[k + pow_d_1 - 1];
+            in[k + pow_d_1 - 1] += t;
+            return;
+        }
+
+        // Zero out the padding tail [idx, roundup) so the tree-based scan sees a
+        // power-of-two-sized input.
+        __global__ void kernPadZero(int idx, int roundup, int* in) {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= idx && k < roundup) {
+                in[k] = 0;
+            }
+            return;
+        }
+
         void scan(int n, int *odata, const int *idata) {
+            int blockSize = 128;
+            int roundup_n = pow(2, ilog2ceil(n));
+
+            int* in;
+            cudaMalloc((void**)&in, roundup_n * sizeof(int));
+            cudaMemcpy(in, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
-            // TODO
+
+            dim3 blockPerGrid((roundup_n + blockSize - 1) / blockSize);
+            kernPadZero<<<blockPerGrid, blockSize>>>(n, roundup_n, in);
+            int num = 0;
+            for (int d = 0; d <= ilog2ceil(n) - 1; d++) {
+                num = roundup_n / pow(2, d + 1);
+                dim3 blockPerGridLoop1((num + blockSize - 1) / blockSize);
+                kernScan1<<<blockPerGridLoop1, blockSize>>>(roundup_n, d, in);
+            }
+            //kernPadZero<<<blockPerGrid, blockSize>>>(roundup_n - 1, roundup_n, in);
+            cudaMemset(in + roundup_n - 1, 0, sizeof(int));
+            for (int d = ilog2ceil(n) - 1; d >= 0; d--) {
+                num = roundup_n / (1 << (d + 1));
+                dim3 blockPerGridLoop2((num + blockSize - 1) / blockSize);
+                kernScan2<<<blockPerGridLoop2, blockSize>>>(roundup_n, d, in);
+            }
             timer().endGpuTimer();
+            cudaMemcpy(odata, in, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            cudaFree(in);
         }
 
        /**
@@ -31,10 +89,43 @@ namespace StreamCompaction {
         * @returns The number of elements remaining after compaction.
        */
        int compact(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-            return -1;
+            int blockSize = 128;
+            int roundup_n = pow(2, ilog2ceil(n));
+            int* in;
+            cudaMalloc((void**)&in, n * sizeof(int));
+            int* out;
+            cudaMalloc((void**)&out, n * sizeof(int));
+            int* scan_res;
+            cudaMalloc((void**)&scan_res, n * sizeof(int));
+            int* bools;
+            cudaMalloc((void**)&bools, n * sizeof(int));
+            cudaMemcpy(in, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            int ctr = 0;
+            //timer().startGpuTimer();  // timing is handled inside scan() below
+            dim3 blockPerGrid((n + blockSize - 1) / blockSize);
+            // Map each element to a 0/1 keep flag, exclusive-scan the flags to get
+            // output indices, then scatter the kept elements to those indices.
+            StreamCompaction::Common::kernMapToBoolean<<<blockPerGrid, blockSize>>>(n, bools, in);
+            scan(n, scan_res, bools);
+            StreamCompaction::Common::kernScatter<<<blockPerGrid, blockSize>>>(n, out, in, bools, scan_res);
+            //timer().endGpuTimer();
+            // The compacted length is the last scan value, plus one if the last
+            // element was kept.
+            int* bools_last = new int[1];
+            cudaMemcpy(bools_last, bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            int* scan_res_last = new int[1];
+            cudaMemcpy(scan_res_last, scan_res + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            if (bools_last[0] == 1) {
+                ctr = scan_res_last[0] + 1;
+            }
+            else {
+                ctr = scan_res_last[0];
+            }
+
+            cudaMemcpy(odata, out, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            cudaFree(in);
+            cudaFree(out);
+            cudaFree(scan_res);
+            cudaFree(bools);
+            delete[] bools_last;
+            delete[] scan_res_last;
+            return ctr;
        }
    }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..bbab361 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,7 +2,8 @@
 #include
 #include "common.h"
 #include "naive.h"
-
+#include
+#include
 namespace StreamCompaction {
    namespace Naive {
        using StreamCompaction::Common::PerformanceTimer;
@@ -11,15 +12,64 @@ namespace StreamCompaction {
            static PerformanceTimer timer;
            return timer;
        }
-        // TODO: __global__
+
+        // One Hillis-Steele (naive) scan step: each element adds the value `bar`
+        // positions to its left; after log2(n) passes this yields an inclusive scan.
+        __global__ void kernScan(int n, int bar, int *in, int *out) {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n) {
+                return;
+            }
+            if (k >= bar) {
+                out[k] = in[k - bar] + in[k];
+            }
+            else {
+                out[k] = in[k];
+            }
+
+            return;
+        }
+
+        // Shift right by one and insert a leading 0 to turn the inclusive scan
+        // into an exclusive scan.
+        __global__ void kernShift(int n, int *in, int *out) {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n) {
+                return;
+            }
+            if (k == 0) {
+                out[k] = 0;
+            }
+            else {
+                out[k] = in[k - 1];
+            }
+        }
 
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+
+            int* in;
+            cudaMalloc((void**)&in, n * sizeof(int));
+            int* out;
+            cudaMalloc((void**)&out, n * sizeof(int));
+            cudaMemcpy(in, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
            timer().startGpuTimer();
-            // TODO
+            int roundup_n = pow(2, ilog2ceil(n));
+            int blockSize = 128;
+            dim3 blockPerGrid((roundup_n + blockSize - 1) / blockSize);
+
+            for (int d = 1; d <= ilog2ceil(n); d++) {
+                kernScan<<<blockPerGrid, blockSize>>>(n, pow(2, d - 1), in, out);
+                std::swap(in, out);
+            }
+
+            kernShift<<<blockPerGrid, blockSize>>>(n, in, out);
            timer().endGpuTimer();
+
+            cudaMemcpy(odata, out, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            cudaFree(in);
+            cudaFree(out);
+
        }
    }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..53a0b39 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,18 @@ namespace StreamCompaction {
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+
+            int* in;
+            cudaMalloc((void**)&in, n * sizeof(int));
+            cudaMemcpy(in, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            thrust::device_vector<int> dev_in(in, in + n);
+            thrust::device_vector<int> dev_out(n);
            timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin());
            timer().endGpuTimer();
+            thrust::copy(dev_out.begin(), dev_out.end(), odata);
+            cudaFree(in);
        }
    }
 }