diff --git a/README.md b/README.md
index 0e38ddb..a75554b 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,85 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Alexander Chan
+* Tested on: Windows 10 Version 1803, i7-5820k @ 3.70 GHz 16GB, GTX 1080 @ 1620 MHz 8GB (Personal Computer)
 
-### (TODO: Your README)
+### README
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+* Implemented CPU scan
+* Implemented naive and work-efficient scan
+* Implemented stream compaction using the work-efficient scan
+
+## Performance Analysis
+
+Here are metrics for scan, covering the CPU, naive, work-efficient, and Thrust implementations.
+
+![](img/scan.png)
+
+The horizontal axis is the array size, which doubles with every tick; the vertical (time) axis therefore uses a log scale to make the relationship easier to read.
+
+The CPU scan shows an almost perfectly linear relationship between array size and time, which makes sense: a sequential scan performs O(N) additions. For smaller arrays the CPU scan is faster than all of the GPU implementations, likely because of the constant overhead of kernel invocations and because the GPU implementations only use slow global memory, while the CPU can exploit its cache, which excels at the scan's sequential reads and writes of small elements. The work-efficient implementation is slower than the naive one, probably because it launches twice as many kernels and uses more global memory. Thrust's implementation is slower but roughly constant across array sizes, which suggests Thrust is doing other work in addition to the scan itself.
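+
+For reference, the CPU implementation (`stream_compaction/cpu.cu`) is essentially one dependent loop; a minimal sketch of the exclusive scan it performs:
+
+```cpp
+// Sequential exclusive scan: each output depends on the previous partial sum,
+// so the work is O(N) and the access pattern is cache friendly.
+odata[0] = 0;
+for (int i = 1; i < n; ++i) {
+    odata[i] = odata[i - 1] + idata[i - 1];
+}
+```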
+
+Here are metrics for stream compaction. Once again, the horizontal axis indicates the array size, doubling with every tick, and the vertical axis is a log scale.
+
+![](img/stream-compact.png)
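+
+Both the CPU `compactWithScan` and the work-efficient GPU compaction follow the usual map/scan/scatter pattern. A rough sketch (the `flags` and `indices` names here are just illustrative):
+
+```cpp
+// 1. Map: mark every element to keep with a 1.
+for (int i = 0; i < n; ++i) { flags[i] = (idata[i] != 0) ? 1 : 0; }
+// 2. Exclusive scan of the flags gives each kept element its output index.
+scan(n, indices, flags);
+// 3. Scatter: write each kept element to its index.
+for (int i = 0; i < n; ++i) {
+    if (flags[i]) { odata[indices[i]] = idata[i]; }
+}
+```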
+
+## Output
+
+```
+
+****************
+** SCAN TESTS **
+****************
+    [ 38 4 37 27 41 44 46 5 2 8 23 9 12 ... 13 0 ]
+cpu scan, power-of-two
+0.032968
+    [ 0 38 42 79 106 147 191 237 242 244 252 275 284 ... 399406 399419 ]
+cpu scan, non-power-of-two
+0.03359
+    [ 0 38 42 79 106 147 191 237 242 244 252 275 284 ... 399313 399349 ]
+    passed
+naive scan, power-of-two
+0.041984
+    passed
+naive scan, non-power-of-two
+0.04096
+    passed
+work-efficient scan, power-of-two
+0.136192
+    passed
+work-efficient scan, non-power-of-two
+0.136192
+    a[8384] = 203041, b[8384] = 602399
+    FAIL VALUE
+thrust scan, power-of-two
+4.56704
+    passed
+thrust scan, non-power-of-two
+0.048096
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 2 0 1 0 0 1 0 2 1 2 3 1 1 ... 1 0 ]
+cpu compact without scan, power-of-two
+0.039811
+    [ 2 1 1 2 1 2 3 1 1 1 2 1 3 ... 2 1 ]
+    passed
+cpu compact without scan, non-power-of-two
+0.038878
+    [ 2 1 1 2 1 2 3 1 1 1 2 1 3 ... 1 2 ]
+    passed
+cpu compact with scan
+4.0355
+    [ 2 1 1 2 1 2 3 1 1 1 2 1 3 ... 2 1 ]
+    passed
+work-efficient compact, power-of-two
+3.71302
+    [ 2 1 1 2 1 2 3 1 1 1 2 1 3 ... 2 1 ]
+    passed
+work-efficient compact, non-power-of-two
+4.68685
+    [ 2 1 1 2 1 2 3 1 1 1 2 1 3 ... 1 2 ]
+    passed
+```
\ No newline at end of file
diff --git a/img/scan.png b/img/scan.png
new file mode 100644
index 0000000..46daa03
Binary files /dev/null and b/img/scan.png differ
diff --git a/img/stream-compact.png b/img/stream-compact.png
new file mode 100644
index 0000000..3c21ae3
Binary files /dev/null and b/img/stream-compact.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 1850161..3129c96 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,141 +13,135 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 14; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
 int *c = new int[SIZE];
 
 int main(int argc, char* argv[]) {
-    // Scan tests
-
-    printf("\n");
-    printf("****************\n");
-    printf("** SCAN TESTS **\n");
-    printf("****************\n");
-
-    genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    // initialize b using StreamCompaction::CPU::scan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
-    // At first all cases passed because b && c are all zeroes.
-    zeroArray(SIZE, b);
-    printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(SIZE, b, true);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(NPOT, b, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, power-of-two");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
-    onesArray(SIZE, c);
-    printDesc("1s array for finding bugs");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, non-power-of-two");
-    StreamCompaction::Naive::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, power-of-two");
-    StreamCompaction::Thrust::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, non-power-of-two");
-    StreamCompaction::Thrust::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    printf("\n");
-    printf("*****************************\n");
-    printf("** STREAM COMPACTION TESTS **\n");
-    printf("*****************************\n");
-
-    // Compaction tests
-
-    genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    int count, expectedCount, expectedNPOT;
-
-    // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
-    zeroArray(SIZE, b);
-    printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedCount = count;
-    printArray(count, b, true);
-    printCmpLenResult(count, expectedCount, b, b);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact without scan, non-power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedNPOT = count;
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    system("pause"); // stop Win32 console from closing on exit
+
+
+    printf("\n");
+    printf("****************\n");
+    printf("** SCAN TESTS **\n");
+    printf("****************\n");
+
+    genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
+    a[SIZE - 1] = 0;
+    printArray(SIZE, a, true);
+
+    // initialize b using StreamCompaction::CPU::scan you implement
+    // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
+    // At first all cases passed because b && c are all zeroes.
+    zeroArray(SIZE, b);
+    printDesc("cpu scan, power-of-two");
+    StreamCompaction::CPU::scan(SIZE, b, a);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    printArray(SIZE, b, true);
+
+    zeroArray(SIZE, c);
+    printDesc("cpu scan, non-power-of-two");
+    StreamCompaction::CPU::scan(NPOT, c, a);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    printArray(NPOT, b, true);
+    printCmpResult(NPOT, b, c);
+
+    zeroArray(SIZE, c);
+    printDesc("naive scan, power-of-two");
+    StreamCompaction::Naive::scan(SIZE, c, a);
+    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    //printArray(SIZE, c, true);
+    printCmpResult(SIZE, b, c);
+
+    zeroArray(SIZE, c);
+    printDesc("naive scan, non-power-of-two");
+    StreamCompaction::Naive::scan(NPOT, c, a);
+    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    //printArray(SIZE, c, true);
+    printCmpResult(NPOT, b, c);
+
+    zeroArray(SIZE, c);
+    printDesc("work-efficient scan, power-of-two");
+    StreamCompaction::Efficient::scan(SIZE, c, a);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    //printArray(SIZE, c, true);
+    printCmpResult(SIZE, b, c);
+
+    zeroArray(SIZE, c);
+    printDesc("work-efficient scan, non-power-of-two");
+    StreamCompaction::Efficient::scan(NPOT, c, a);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    //printArray(NPOT, c, true);
+    printCmpResult(NPOT, b, c);
+
+    zeroArray(SIZE, c);
+    printDesc("thrust scan, power-of-two");
+    StreamCompaction::Thrust::scan(SIZE, c, a);
+    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    //printArray(SIZE, c, true);
+    printCmpResult(SIZE, b, c);
+
+    zeroArray(SIZE, c);
+    printDesc("thrust scan, non-power-of-two");
+    StreamCompaction::Thrust::scan(NPOT, c, a);
+    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    //printArray(NPOT, c, true);
+    printCmpResult(NPOT, b, c);
+
+    printf("\n");
+    printf("*****************************\n");
+    printf("** STREAM COMPACTION TESTS **\n");
+    printf("*****************************\n");
+
+    // Compaction tests
+
+    genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case
+    a[SIZE - 1] = 0;
+    printArray(SIZE, a, true);
+
+    int count, expectedCount, expectedNPOT;
+
+    // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
+    // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
+    zeroArray(SIZE, b);
+    printDesc("cpu compact without scan, power-of-two");
+    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    expectedCount = count;
+    printArray(count, b, true);
+    printCmpLenResult(count, expectedCount, b, b);
+
+    zeroArray(SIZE, c);
+    printDesc("cpu compact without scan, non-power-of-two");
+    count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    expectedNPOT = count;
+    printArray(count, c, true);
+    printCmpLenResult(count, expectedNPOT, b, c);
+
+    zeroArray(SIZE, c);
+    printDesc("cpu compact with scan");
+    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    printArray(count, c, true);
+    printCmpLenResult(count, expectedCount, b, c);
+
+    zeroArray(SIZE, c);
+    printDesc("work-efficient compact, power-of-two");
+    count = StreamCompaction::Efficient::compact(SIZE, c, a);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printArray(count, c, true);
+    printCmpLenResult(count, expectedCount, b, c);
+
+    zeroArray(SIZE, c);
+    printDesc("work-efficient compact, non-power-of-two");
+    count = StreamCompaction::Efficient::compact(NPOT, c, a);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printArray(count, c, true);
+    printCmpLenResult(count, expectedNPOT, b, c);
+
+    system("pause"); // stop Win32 console from closing on exit
     delete[] a;
     delete[] b;
     delete[] c;
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
index 46337ab..37a3072 100644
--- a/src/testing_helpers.hpp
+++ b/src/testing_helpers.hpp
@@ -18,7 +18,7 @@ int cmpArrays(int n, T *a, T *b) {
 }
 
 void printDesc(const char *desc) {
-    printf("==== %s ====\n", desc);
+    printf("%s\n", desc);
 }
 
 template<typename T>
@@ -72,5 +72,7 @@ void printArray(int n, int *a, bool abridged = false) {
 
 template<typename T>
 void printElapsedTime(T time, std::string note = "") {
-    std::cout << " elapsed time: " << time << "ms " << note << std::endl;
+
+    std::cout << time << std::endl;
+    //std::cout << " elapsed time: " << time << "ms " << note << std::endl;
 }
\ No newline at end of file
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..4bb0dc2 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 05ce667..62dca77 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,15 +1,15 @@
 #include <cstdio>
 #include "cpu.h"
 
-#include "common.h"
+#include "common.h"
 
 namespace StreamCompaction {
     namespace CPU {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
 
         /**
@@ -18,9 +18,19 @@
          * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
-            // TODO
-            timer().endCpuTimer();
+            bool isTiming = true;
+            try {
+                timer().startCpuTimer();
+            } catch (std::exception &) {
+                isTiming = false;
+            }
+            odata[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
+            if (isTiming) {
+                timer().endCpuTimer();
+            }
         }
 
         /**
@@ -30,9 +40,15 @@
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int count = 0;
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] != 0) {
+                    odata[count] = idata[i];
+                    count++;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return count;
         }
 
         /**
@@ -42,9 +58,23 @@
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int *tmpdata = new int[n];
+            int *scandata = new int[n];
+            for (int i = 0; i < n; ++i) {
+                tmpdata[i] = idata[i] == 0 ? 0 : 1;
+            }
+            scan(n, scandata, tmpdata);
+            int count = 0;
+            for (int i = 0; i < n; ++i) {
+                if (tmpdata[i] != 0) {
+                    odata[scandata[i]] = idata[i];
+                    count++;
+                }
+            }
+            delete[] tmpdata;
+            delete[] scandata;
             timer().endCpuTimer();
-            return -1;
+            return count;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 36c5ef2..c0581a8 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -5,22 +5,104 @@
 
 namespace StreamCompaction {
     namespace Efficient {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
+
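+        // Up-sweep (reduce) phase of the work-efficient scan: at depth d, each
+        // active thread (k divisible by offset_1 = 2^(d+1)) adds the partial sum
+        // at k + offset_2 - 1 (offset_2 = 2^d) into k + offset_1 - 1, building
+        // partial sums up a balanced binary tree in place.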
+        __global__
+        void kernUpSweep(int n, int d, int *data, int offset_1, int offset_2) {
+            int k = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (k % offset_1 != 0) { return; }
+            if (k >= n) { return; }
+            data[k + offset_1 - 1] += data[k + offset_2 - 1];
+            if (k == n - 1) { data[k] = 0; }
+        }
+
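+        // Down-sweep phase: walking back down the tree from the root, each
+        // parent passes its value to its left child and stores (old left child
+        // + parent) in the right child, turning the partial-sum tree into an
+        // exclusive scan.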
+        __global__
+        void kernDownSweep(int n, int d, int *data, int offset_1, int offset_2) {
+            int k = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (k % offset_1 != 0) { return; }
+            if (k >= n) { return; }
+            int t = data[k + offset_2 - 1];
+            data[k + offset_2 - 1] = data[k + offset_1 - 1];
+            data[k + offset_1 - 1] += t;
+        }
+
+        __global__
+        void kernZeroCorrect(int n, int *data) {
+            int k = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (k >= n) { return; }
+            data[k] -= data[0];
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
+            int paddedSize = 1 << ilog2ceil(n);
+
+            int *idataPadded = new int[paddedSize];
+            for (int i = 0; i < paddedSize; ++i) {
+                idataPadded[i] = i < n ? idata[i] : 0;
+            }
+
+            int blockSize = 128;
+            dim3 blocksPerGrid((paddedSize + blockSize - 1) / blockSize);
+            dim3 threadsPerBlock(blockSize);
+
+            int *dv_data;
+            cudaMalloc((void **) &dv_data, paddedSize * sizeof(int));
+            checkCUDAError("cudaMalloc dv_data failed!");
+
+            cudaMemcpy(dv_data, idataPadded, paddedSize * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy to dv_data failed!");
+
+            bool end = true;
+            try {
+                timer().startGpuTimer();
+            } catch (std::exception &) {
+                end = false;
+            }
+
+            for (int i = 0; i < ilog2ceil(n); ++i) {
+                kernUpSweep<<<blocksPerGrid, threadsPerBlock>>>(paddedSize, i, dv_data, 1 << (i + 1), 1 << i);
+            }
+
+            // set root to 0
+            int z = 0;
+            cudaMemcpy(dv_data + n - 1, &z, sizeof(int), cudaMemcpyHostToDevice);
+
+            for (int i = ilog2ceil(n) - 1; i >= 0; i--) {
+                kernDownSweep<<<blocksPerGrid, threadsPerBlock>>>(paddedSize, i, dv_data, 1 << (i + 1), 1 << i);
+            }
+
+            if (end) { timer().endGpuTimer(); }
+            kernZeroCorrect<<<blocksPerGrid, threadsPerBlock>>>(paddedSize, dv_data);
+            cudaMemcpy(odata, dv_data, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            delete[] idataPadded;
+            cudaFree(dv_data);
         }
 
+        __global__
+        void kernMapToBoolean(int n, int *odata, int *idata) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx >= n) { return; }
+            odata[idx] = idata[idx] == 0 ? 0 : 1;
+        }
+
+        __global__
+        void kernScatter(int n, int *odata, int *bdata, int *scandata, int *idata) {
+            int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (idx >= n) { return; }
+            if (bdata[idx] == 1) {
+                odata[scandata[idx]] = idata[idx];
+            }
+        }
+
         /**
          * Performs stream compaction on idata, storing the result into odata.
         * All zeroes are discarded.
@@ -31,10 +113,60 @@
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+
+            int blockSize = 128;
+            dim3 blocksPerGrid((n + blockSize - 1) / blockSize);
+            dim3 threadsPerBlock(blockSize);
+
+            int *dv_bdata, *dv_scandata, *dv_idata, *dv_data;
+            cudaMalloc((void **) &dv_bdata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dv_bdata failed!");
+
+            cudaMalloc((void **) &dv_scandata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dv_scandata failed!");
+
+            cudaMalloc((void **) &dv_data, n * sizeof(int));
+            checkCUDAError("cudaMalloc dv_data failed!");
+
+            cudaMalloc((void **) &dv_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dv_idata failed!");
+
+            cudaMemcpy(dv_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy to dv_idata failed!");
+
+            int *cpu_bdata, *cpu_scandata;
+            cpu_bdata = new int[n];
+            cpu_scandata = new int[n];
+
             timer().startGpuTimer();
-            // TODO
+
+            kernMapToBoolean<<<blocksPerGrid, threadsPerBlock>>>(n, dv_bdata, dv_idata);
+
+            cudaMemcpy(cpu_bdata, dv_bdata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            int count = 0;
+            for (int i = 0; i < n; ++i) {
+                if (cpu_bdata[i] == 1) { count++; }
+            }
+
+            scan(n, cpu_scandata, cpu_bdata);
+
+            cudaMemcpy(dv_scandata, cpu_scandata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            kernScatter<<<blocksPerGrid, threadsPerBlock>>>(n, dv_data, dv_bdata, dv_scandata, dv_idata);
+
             timer().endGpuTimer();
-            return -1;
+
+            cudaMemcpy(odata, dv_data, count * sizeof(int), cudaMemcpyDeviceToHost);
+
+            delete[] cpu_bdata;
+            delete[] cpu_scandata;
+            cudaFree(dv_bdata);
+            cudaFree(dv_scandata);
+            cudaFree(dv_idata);
+            cudaFree(dv_data);
+
+            return count;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 9218f8e..0a37415 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -4,22 +4,60 @@
 #include "naive.h"
 
 namespace StreamCompaction {
-    namespace Naive {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
-        // TODO: __global__
+    namespace Naive {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer() {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
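+        // One pass of the naive (Hillis-Steele) scan at depth d: every element
+        // k >= 2^(d-1) adds in the element 2^(d-1) positions to its left. After
+        // ilog2ceil(n) passes the result is an inclusive scan; kernShiftRight
+        // then shifts it right by one to make it exclusive.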
+        __global__
+        void kernNaiveScanIteration(int n, int d, int *o, const int *i) {
+            int k = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (k >= n) { return; }
+            int offset = 1 << (d - 1);
+            o[k] = k >= offset ? i[k - offset] + i[k] : i[k];
+        }
+
+        __global__
+        void kernShiftRight(int n, int *o, int *i) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) { return; }
+            o[index] = index == 0 ? 0 : i[index - 1];
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
+            int blockSize = 128;
+            dim3 blocksPerGrid((n + blockSize - 1) / blockSize);
+            dim3 threadsPerBlock(blockSize);
+
+            int *dv_idata, *dv_odata;
+            cudaMalloc((void **) &dv_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dv_idata failed!");
+
+            cudaMalloc((void **) &dv_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dv_odata failed!");
+
+            cudaMemcpy(dv_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy to dv_idata failed!");
+
+            timer().startGpuTimer();
+
+            for (int d = 1; d <= ilog2ceil(n); ++d) {
+                kernNaiveScanIteration<<<blocksPerGrid, threadsPerBlock>>>(n, d, dv_odata, dv_idata);
+                std::swap(dv_idata, dv_odata);
+            }
+            kernShiftRight<<<blocksPerGrid, threadsPerBlock>>>(n, dv_odata, dv_idata);
+
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dv_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dv_idata);
+            cudaFree(dv_odata);
        }
    }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 36b732d..536e7cb 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -8,21 +8,31 @@
 
 namespace StreamCompaction {
     namespace Thrust {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
 
         /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
+            thrust::device_vector<int> dv_in(n);
+            thrust::copy(idata, idata + n, dv_in.begin());
+
+            thrust::device_vector<int> dv_out(n);
+
+            timer().startGpuTimer();
+
+            thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+            timer().endGpuTimer();
+
+            thrust::copy(dv_out.begin(), dv_out.end(), odata);
            // TODO use `thrust::exclusive_scan`
            // example: for device_vectors dv_in and dv_out:
            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
-            timer().endGpuTimer();
        }
    }
 }