diff --git a/README.md b/README.md
index 0e38ddb..a5e9403 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,80 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Han Yan
+* Tested on: CETS Virtual Lab
 
-### (TODO: Your README)
+### Questions and Plots
+Plots for varying array size. The scan time is shown on a log-2 scale.
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+![](img/project2-p1.png)
+![](img/project2-p2.png)
+
+Analysis:
+* For all array sizes, the CPU approach takes less time than the GPU approaches, and among the GPU approaches the naive implementation is, unexpectedly, the faster one. In terms of trend, the CPU approach is more sensitive to increasing array size (its curve has the larger slope), whereas both GPU methods are less sensitive. The trend itself is expected: the CPU scan does O(n) sequential work, while the GPU scans finish in O(log n) parallel steps.
+
+* I expected both the naive and work-efficient scans to be faster than the simple CPU scan, but this is not the case for my implementation. I think the bottleneck for both GPU scans is global memory I/O: I store all arrays in device global memory, and both algorithms read and write global memory at every level of the iteration (a shared-memory sketch follows this list).
+
+* I also expected the work-efficient scan to be faster than the naive scan. One factor that potentially slows down the efficient scan is the `__syncthreads()` call at every level of the up-sweep and down-sweep; at most levels the majority of threads contribute no useful work, yet they still participate in the barrier.
+
+* In the thrust `exclusive_scan` implementation, I think Thrust first performs some memory copies and then does the computation.
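+
+One way to attack the global-memory bottleneck would be to stage each block's data in shared memory, so global memory is touched only once on load and once on store. The sketch below illustrates the idea for a single block; it mirrors the GPU Gems 3 snippet left commented out in `naive.cu`, and `kernSharedScan` is a hypothetical name, not part of the submitted code:
+
+```
+// Hypothetical single-block exclusive scan: every intermediate level lives
+// in shared memory; each element is read from and written to global memory
+// exactly once.
+__global__ void kernSharedScan(int n, int *odata, const int *idata) {
+    extern __shared__ int temp[];      // double buffer, 2 * n ints
+    int tid = threadIdx.x;
+    int pout = 0, pin = 1;
+    // exclusive scan: shift right by one and seed the first element with 0
+    temp[pout * n + tid] = (tid > 0) ? idata[tid - 1] : 0;
+    __syncthreads();
+    for (int offset = 1; offset < n; offset *= 2) {
+        pout = 1 - pout;               // swap double-buffer roles
+        pin = 1 - pout;
+        if (tid >= offset)
+            temp[pout * n + tid] = temp[pin * n + tid] + temp[pin * n + tid - offset];
+        else
+            temp[pout * n + tid] = temp[pin * n + tid];
+        __syncthreads();               // finish this level before the next reads it
+    }
+    odata[tid] = temp[pout * n + tid]; // the only global write
+}
+
+// Launch with one thread per element (n <= 1024) and 2*n ints of shared memory:
+// kernSharedScan<<<1, n, 2 * n * sizeof(int)>>>(n, dev_out, dev_in);
+```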
+
+### Test Program Output
+
+Array size = 1 << 8
+
+```
+****************
+** SCAN TESTS **
+****************
+    [ 22 1 25 15 7 27 27 23 12 1 49 11 46 ... 19 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.0005ms    (std::chrono Measured)
+    [ 0 22 23 48 63 70 97 124 147 159 160 209 220 ... 6133 6152 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.0005ms    (std::chrono Measured)
+    [ 0 22 23 48 63 70 97 124 147 159 160 209 220 ... 6088 6092 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.009216ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.008192ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.013312ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.012288ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.091968ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.053248ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 0 1 3 3 3 3 1 3 0 3 3 3 2 ... 1 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.0008ms    (std::chrono Measured)
+    [ 1 3 3 3 3 1 3 3 3 3 2 1 1 ... 3 1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.0012ms    (std::chrono Measured)
+    [ 1 3 3 3 3 1 3 3 3 3 2 1 1 ... 3 2 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 0.0077ms    (std::chrono Measured)
+    [ 1 3 3 3 3 1 3 3 3 3 2 1 1 ... 3 1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.017408ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.017408ms    (CUDA Measured)
+    passed
+```
diff --git a/img/project2-p1.png b/img/project2-p1.png
new file mode 100644
index 0000000..9332f03
Binary files /dev/null and b/img/project2-p1.png differ
diff --git a/img/project2-p2.png b/img/project2-p2.png
new file mode 100644
index 0000000..d6ab2c2
Binary files /dev/null and b/img/project2-p2.png differ
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..08f4b13 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,13 @@ namespace StreamCompaction {
          */
        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
            // TODO
+            int index = threadIdx.x;
+            if (idata[index] == 0) {
+                bools[index] = 0;
+            }
+            else {
+                bools[index] = 1;
+            }
        }
 
        /**
@@ -33,6 +40,10 @@ namespace StreamCompaction {
        __global__ void kernScatter(int n, int *odata, const int *idata,
                const int *bools, const int *indices) {
            // TODO
+            int index = threadIdx.x;
+            if (bools[index] == 1) {
+                odata[indices[index]] = idata[index];
+            }
        }
    }
}
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..7eed8d0 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -2,6 +2,7 @@
 #include "cpu.h"
 #include "common.h"
+#include <vector>
 
 namespace StreamCompaction {
    namespace CPU {
@@ -19,7 +20,13 @@ namespace StreamCompaction {
         */
        void scan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
-            // TODO
+            // TODO (exclusive)
+            if (n > 0) {
+                odata[0] = 0;
+                for (int i = 1; i < n; i++) {
+                    odata[i] = odata[i - 1] + idata[i - 1];
+                }
+            }
            timer().endCpuTimer();
        }
 
@@ -31,8 +38,15 @@ namespace StreamCompaction {
        int compactWithoutScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+            int idx = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[idx] = idata[i];
+                    idx++;
+                }
+            }
            timer().endCpuTimer();
-            return -1;
+            return idx;
        }
 
@@ -43,8 +57,32 @@ namespace StreamCompaction {
        int compactWithScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+            std::vector<int> tmp(n, 0);
+            std::vector<int> scan_result(n);
+            int count = 0;
+            // build tmp binary array
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    tmp[i] = 1;
+                    count++;
+                }
+            }
+            // scan
+            if (n > 0) {
+                scan_result[0] = 0;
+                for (int k = 1; k < n; k++) {
+                    scan_result[k] = scan_result[k - 1] + tmp[k - 1];
+                }
+            }
+            // scatter
+            for (int i = 0; i < n; i++) {
+                if (tmp[i] == 1) {
+                    int idx = scan_result[i];
+                    odata[idx] = idata[i];
+                }
+            }
            timer().endCpuTimer();
-            return -1;
+            return count;
        }
    }
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..cab5f33 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,13 +12,156 @@ namespace StreamCompaction {
            return timer;
        }
 
+        // GPU Gems 3 example
+        __global__ void prescan(float *g_odata, float *g_idata, int n) {
+            extern __shared__ float temp[];  // allocated on invocation
+            int thid = threadIdx.x;
+            int offset = 1;
+            temp[2 * thid] = g_idata[2 * thid];  // load input into shared memory
+            temp[2 * thid + 1] = g_idata[2 * thid + 1];
+            for (int d = n >> 1; d > 0; d >>= 1)  // build sum in place up the tree
+            {
+                __syncthreads();
+                if (thid < d) {
+                    int ai = offset * (2 * thid + 1) - 1;
+                    int bi = offset * (2 * thid + 2) - 1;
+                    temp[bi] += temp[ai];
+                }
+                offset *= 2;
+            }
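+            // Down-sweep: zero the last element, then at each level swap each
+            // node's left-child partial sum into place and accumulate, turning
+            // the reduction tree into an exclusive scan (GPU Gems 3, ch. 39).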
+            if (thid == 0) { temp[n - 1] = 0; }  // clear the last element
+            for (int d = 1; d < n; d *= 2)  // traverse down tree & build scan
+            {
+                offset >>= 1;
+                __syncthreads();
+                if (thid < d) {
+                    int ai = offset * (2 * thid + 1) - 1;
+                    int bi = offset * (2 * thid + 2) - 1;
+                    float t = temp[ai];
+                    temp[ai] = temp[bi];
+                    temp[bi] += t;
+                }
+            }
+            __syncthreads();
+            g_odata[2 * thid] = temp[2 * thid];  // write results to device memory
+            g_odata[2 * thid + 1] = temp[2 * thid + 1];
+        }
+
+        __global__ void kernelEfficientScan(int *g_odata, int *g_idata, int n, int N) {
+            int index = threadIdx.x;
+            int offset = 2;
+            g_odata[index] = g_idata[index];
+            // up-sweep
+            for (int d = N / 2; d >= 1; d >>= 1) {
+                __syncthreads();
+                if (index < d) {
+                    int a = n - 1 - (index * offset);
+                    int b = a - offset / 2;
+                    if (a >= 0 && b >= 0) {
+                        g_odata[a] += g_odata[b];
+                    }
+                }
+                offset *= 2;
+            }
+            // down-sweep
+            if (index == 0 && n > 0) {
+                g_odata[n - 1] = 0;
+            }
+            offset /= 2;
+            for (int d = 1; d <= N / 2; d *= 2) {
+                __syncthreads();
+                if (index < d) {
+                    int a = n - 1 - (index * offset);
+                    int b = a - offset / 2;
+                    if (a >= 0 && b >= 0) {
+                        int tmp = g_odata[b];
+                        g_odata[b] = g_odata[a];
+                        g_odata[a] += tmp;
+                    }
+                }
+                offset /= 2;
+            }
+        }
+
+        __global__ void kernelEfficientCompact(int *g_odata, int *g_idata, int *g_sdata, int *g_bdata, int n, int N) {
+            int index = threadIdx.x;
+            // Build binary array
+            if (g_idata[index] == 0) {
+                g_bdata[index] = 0;
+            }
+            else {
+                g_bdata[index] = 1;
+            }
+            // Efficient scan
+            __syncthreads();
+            int offset = 2;
+            g_sdata[index] = g_bdata[index];
+            // up-sweep
+            for (int d = N / 2; d >= 1; d >>= 1) {
+                __syncthreads();
+                if (index < d) {
+                    int a = n - 1 - (index * offset);
+                    int b = a - offset / 2;
+                    if (a >= 0 && b >= 0) {
+                        g_sdata[a] += g_sdata[b];
+                    }
+                }
+                offset *= 2;
+            }
+            // down-sweep
+            if (index == 0 && n > 0) {
+                g_sdata[n - 1] = 0;
+            }
+            offset /= 2;
+            for (int d = 1; d <= N / 2; d *= 2) {
+                __syncthreads();
+                if (index < d) {
+                    int a = n - 1 - (index * offset);
+                    int b = a - offset / 2;
+                    if (a >= 0 && b >= 0) {
+                        int tmp = g_sdata[b];
+                        g_sdata[b] = g_sdata[a];
+                        g_sdata[a] += tmp;
+                    }
+                }
+                offset /= 2;
+            }
+            // Scatter
+            __syncthreads();
+            if (g_bdata[index] == 1) {
+                int idx = g_sdata[index];
+                g_odata[idx] = g_idata[index];
+            }
+        }
+
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            int k = ilog2ceil(n);
+            int N = 1 << k;  // next power of two (integer form avoids pow() rounding)
+
+            int *g_odata;
+            int *g_idata;
+            cudaMalloc((void**)&g_idata, n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc g_idata failed!");
+            cudaMalloc((void**)&g_odata, n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc g_odata failed!");
+            cudaMemcpy(g_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
            timer().startGpuTimer();
            // TODO
+            kernelEfficientScan<<<1, n>>>(g_odata, g_idata, n, N);
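+            // Note: a single-block launch caps n at the device's maximum
+            // threads per block (1024 on current GPUs); larger inputs would
+            // need a multi-block scan that combines per-block sums.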
            timer().endGpuTimer();
+
+            // copy back output
+            cudaMemcpy(odata, g_odata, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("cudaMemcpy odata failed!");
+
+            cudaFree(g_odata);
+            cudaFree(g_idata);
        }
 
        /**
@@ -31,10 +174,46 @@ namespace StreamCompaction {
         * @returns The number of elements remaining after compaction.
         */
        int compact(int n, int *odata, const int *idata) {
+            int k = ilog2ceil(n);
+            int N = 1 << k;  // next power of two
+
+            int *g_odata;
+            int *g_idata;
+            int *g_bdata;
+            int *g_sdata;
+            cudaMalloc((void**)&g_idata, n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc g_idata failed!");
+            cudaMalloc((void**)&g_odata, n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc g_odata failed!");
+            cudaMalloc((void**)&g_bdata, n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc g_bdata failed!");
+            cudaMalloc((void**)&g_sdata, n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc g_sdata failed!");
+
+            cudaMemcpy(g_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
            timer().startGpuTimer();
            // TODO
+            //kernelEfficientCompact<<<1, n>>>(g_odata, g_idata, g_sdata, g_bdata, n, N);
+            Common::kernMapToBoolean<<<1, n>>>(n, g_bdata, g_idata);
+            kernelEfficientScan<<<1, n>>>(g_sdata, g_bdata, n, N);
+            Common::kernScatter<<<1, n>>>(n, g_odata, g_idata, g_bdata, g_sdata);
            timer().endGpuTimer();
-            return -1;
+
+            // copy back output; since the scan is exclusive, the compacted
+            // count is the last scan value plus the last boolean
+            int c1, c2;
+            cudaMemcpy(&c1, g_bdata + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&c2, g_sdata + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            int count = c1 + c2;
+            cudaMemcpy(odata, g_odata, sizeof(int) * count, cudaMemcpyDeviceToHost);
+
+            cudaFree(g_odata);
+            cudaFree(g_idata);
+            cudaFree(g_sdata);
+            cudaFree(g_bdata);
+
+            return count;
        }
    }
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..b0fb55a 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -12,14 +12,86 @@ namespace StreamCompaction {
            return timer;
        }
        // TODO: __global__
+        __global__ void kernelScan(int *g_odata, int *g_idata, int *A_data, int *B_data, int k) {
+            /*
+            // GPU Gems 3 shared-memory version, kept for reference:
+            extern __shared__ float temp[];  // allocated on invocation
+            int thid = threadIdx.x;
+            int pout = 0, pin = 1;  // Load input into shared memory.
+            // This is exclusive scan, so shift right by one
+            // and set first element to 0
+            temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;
+            __syncthreads();
+            for (int offset = 1; offset < n; offset *= 2) {
+                pout = 1 - pout;  // swap double buffer indices
+                pin = 1 - pout;
+                if (thid >= offset)
+                    temp[pout*n+thid] += temp[pin*n+thid - offset];
+                else temp[pout*n+thid] = temp[pin*n+thid];
+                __syncthreads();
+            }
+            g_odata[thid] = temp[pout*n+thid];  // write output
+            */
 
-        /**
+            int index = threadIdx.x;
+            A_data[index] = (index > 0) ? g_idata[index - 1] : 0;
+            __syncthreads();
+            int offset = 1;
+            for (int d = 1; d <= k; d++) {
+                if (index >= offset) {
+                    B_data[index] = A_data[index] + A_data[index - offset];
+                }
+                else {
+                    B_data[index] = A_data[index];
+                }
+                offset *= 2;
+                __syncthreads();
+                // swap the double-buffer pointers (every thread performs the
+                // same swap, so all threads agree on the buffer roles)
+                int *tmp = A_data;
+                A_data = B_data;
+                B_data = tmp;
+            }
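+            // After the final swap, A_data aliases the buffer written last,
+            // so it already holds the finished exclusive scan.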
+            g_odata[index] = A_data[index];
+        }
+
+
+        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            // allocate device arrays
+            int *A_data;
+            int *B_data;
+            cudaMalloc((void**)&A_data, n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc A_data failed!");
+            cudaMalloc((void**)&B_data, n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc B_data failed!");
+
+            int *g_odata;
+            int *g_idata;
+            cudaMalloc((void**)&g_idata, n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc g_idata failed!");
+            cudaMalloc((void**)&g_odata, n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc g_odata failed!");
+            // cudaMemcpy(g_odata, odata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            cudaMemcpy(g_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            int k = ilog2ceil(n);
            timer().startGpuTimer();
            // TODO
+            kernelScan<<<1, n>>>(g_odata, g_idata, A_data, B_data, k);
            timer().endGpuTimer();
+
+            // copy output
+            cudaMemcpy(odata, g_odata, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("cudaMemcpy odata failed!");
+            // free device arrays
+            cudaFree(A_data);
+            cudaFree(B_data);
+            cudaFree(g_odata);
+            cudaFree(g_idata);
        }
    }
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..bf3ae41 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,21 @@ namespace StreamCompaction {
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            thrust::host_vector<int> host_in(idata, idata + n);
+
+            thrust::device_vector<int> dev_in = host_in;
+            thrust::device_vector<int> dev_out(n);
+
            timer().startGpuTimer();
            // TODO use `thrust::exclusive_scan`
            // example: for device_vectors dv_in and dv_out:
            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin());
+
            timer().endGpuTimer();
+
+            int *dev_out_ptr = thrust::raw_pointer_cast(&dev_out[0]);
+            cudaMemcpy(odata, dev_out_ptr, sizeof(int) * n, cudaMemcpyDeviceToHost);
        }
    }
}