diff --git a/README.md b/README.md
index 0e38ddb..d74a6f7 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,121 @@
 CUDA Stream Compaction
 ======================
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Jiarui Yan
+  * [LinkedIn](https://www.linkedin.com/in/jiarui-yan-a06bb5197?lipi=urn%3Ali%3Apage%3Ad_flagship3_profile_view_base_contact_details%3BvRlITiOMSt%2B9Mgg6SZFKDQ%3D%3D), [personal website](https://jiaruiyan.pb.online/), [twitter](https://twitter.com/JerryYan1997), etc.
+* Tested on: Windows 10 Home, i7-9700K @ 3.60GHz, 16GB DDR4 RAM, RTX 2070 SUPER with 8GB dedicated GPU memory (personal desktop)
 
-### (TODO: Your README)
+## Features
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+* CPU Scan & Stream Compaction
+* Naive GPU Scan Algorithm
+* Work-Efficient GPU Scan & Stream Compaction
+* Thrust Scan
+* Compact Threads (Part 5 Extra Credit -- +5)
+* Parallel Radix Sort (Part 6 Extra Credit -- +10)
+
+## Rough blocksize optimization for each implementation
+
+To determine a suitable blocksize for each implementation, I ran every scenario at several discrete blocksizes (16, 32, 64, 128, 256, and 512), collected the average running times, and plotted them:
+
+![Experiment 1](./img/Naive_GPU_scan_blocksize.PNG)
+
+![Experiment 2](./img/work_efficient_GPU_scan_blocksize.PNG)
+
+![Experiment 3](./img/efficient_compaction_blocksize.PNG)
+
+![Experiment 4](./img/parallel_radix_sort_blocksize.PNG)
+
+As the graphs above show, a blocksize of 32 works well in every scenario. I suspect this is because the input arrays are not long enough for larger blocks to pay off (the input size for the power-of-two scenarios is fixed at 256), and because 32 is exactly the number of threads in a warp, so no lanes are wasted. Consequently, I fix the blocksize at 32 for all of the analysis below; a sketch of this kind of sweep follows.
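+The snippet below is a minimal, self-contained sketch of such a blocksize sweep, not the harness actually used for the plots above (the real measurements go through the project's `PerformanceTimer`). The `kernTouch` kernel is a hypothetical stand-in that just does trivial per-element work; the points of interest are the round-up grid computation `(n + blockSize - 1) / blockSize` and the CUDA-event timing around each launch.
+
+```
+// Hypothetical blocksize sweep: times one trivial kernel launch per candidate size.
+#include <cstdio>
+#include <cuda_runtime.h>
+
+__global__ void kernTouch(int n, int *data) {
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i < n) data[i] = data[i] * 2 + 1; // trivial per-element work
+}
+
+int main() {
+    const int n = 1 << 20;
+    int *dev;
+    cudaMalloc(&dev, n * sizeof(int));
+    cudaMemset(dev, 0, n * sizeof(int));
+    const int sizes[] = { 16, 32, 64, 128, 256, 512 };
+    for (int blockSize : sizes) {
+        dim3 grid((n + blockSize - 1) / blockSize); // round up so all n elements are covered
+        cudaEvent_t start, stop;
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+        cudaEventRecord(start);
+        kernTouch<<<grid, blockSize>>>(n, dev);
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop); // wait for the kernel before reading the time
+        float ms;
+        cudaEventElapsedTime(&ms, start, stop);
+        printf("blockSize = %3d: %f ms\n", blockSize, ms);
+        cudaEventDestroy(start);
+        cudaEventDestroy(stop);
+    }
+    cudaFree(dev);
+    return 0;
+}
+```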
+## Performance comparison
+
+![Experiment 5](./img/scan_time_for_different_array_size.PNG)
+
+As the graph above shows, the GPU implementations only pull ahead of the CPU once the input array is long enough to amortize their launch and memory-transfer overhead.
+
+As for the Thrust implementation, I suspect it uses shared memory and other optimizations to improve performance, and avoids extra operations such as redundant memory copies.
+
+## Brief explanation of phenomena
+
+From my observation of the Nsight timeline, all of my implementations are bound by memory I/O: the gaps between kernels are noticeably larger than the compute portions themselves. The implementations also differ in how they launch work; for instance, the naive scan launches many CUDA kernels, while the other implementations launch far fewer.
+
+### Output of the test program
+
+Here is the output for a 2^20-element input array. For the parallel radix sort, see the third section of the output below.
+
+```
+****************
+** SCAN TESTS **
+****************
+    [   3  16  44  30   6   8  15  16  11  48  38  32  12 ...  46   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 3.6953ms    (std::chrono Measured)
+    [   0   3  19  63  93  99 107 122 138 149 197 235 267 ... 25674322 25674368 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 1.3371ms    (std::chrono Measured)
+    [   0   3  19  63  93  99 107 122 138 149 197 235 267 ... 25674263 25674285 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.999616ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.999424ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.227328ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.223232ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.245824ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.309088ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   1   2   0   0   2   0   3   0   3   0   0   2   0 ...   2   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 2.0501ms    (std::chrono Measured)
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 2.0319ms    (std::chrono Measured)
+    passed
+==== cpu compact with scan ====
+   elapsed time: 7.6485ms    (std::chrono Measured)
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.7328ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.733216ms    (CUDA Measured)
+    passed
+
+*****************************
+** PARALLEL RADIX SORT TESTS **
+*****************************
+    [   3  16  44  30   6   8  15  16  11  48  38  32  12 ...  46   0 ]
+==== parallel radix sort, power-of-two correct result ====
+    [   0   0   0   0   0   0   0   0   0   0   0   0   0 ...  49  49 ]
+==== parallel radix sort, non-power-of-two correct result ====
+    [   0   0   0   0   0   0   0   0   0   0   0   0   0 ...  49  49 ]
+==== parallel radix sort, power-of-two ====
+    [   0   0   0   0   0   0   0   0   0   0   0   0   0 ...  49  49 ]
+   elapsed time: 0.733216ms    (CUDA Measured)
+    passed
+==== parallel radix sort, non-power-of-two ====
+    [   0   0   0   0   0   0   0   0   0   0   0   0   0 ...  49  49 ]
+   elapsed time: 0.733216ms    (CUDA Measured)
+    passed
+```
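+
+### How each radix sort pass computes destinations
+
+For reference, the CPU sketch below mirrors the per-pass address computation that the GPU kernels (`compute_b_e`, `compute_t`, `compute_d`, `scatter`) implement; it is an illustration of the split-based approach, not the GPU code itself. For the current bit, `e` marks the elements whose bit is 0, `f` is the exclusive scan of `e`, and every element lands at `f[i]` if its bit is 0, or at `i - f[i] + totalFalses` if its bit is 1.
+
+```
+// CPU sketch of LSD radix sort built from split passes.
+#include <cstdio>
+#include <vector>
+
+void radix_pass(std::vector<int> &data, int pass) {
+    int n = (int)data.size();
+    std::vector<int> e(n), f(n), out(n);
+    for (int i = 0; i < n; ++i) e[i] = ((data[i] >> pass) & 1) ? 0 : 1; // 1 where bit is 0
+    f[0] = 0;                                                          // exclusive scan of e
+    for (int i = 1; i < n; ++i) f[i] = f[i - 1] + e[i - 1];
+    int totalFalses = f[n - 1] + e[n - 1];
+    for (int i = 0; i < n; ++i) {
+        int d = e[i] ? f[i] : (i - f[i] + totalFalses); // destination for this element
+        out[d] = data[i];
+    }
+    data = out;
+}
+
+int main() {
+    std::vector<int> v = { 3, 16, 44, 30, 6, 8, 15 };
+    for (int pass = 0; pass < 6; ++pass) radix_pass(v, pass); // 6 bits cover values < 64
+    for (int x : v) printf("%d ", x); // prints: 3 6 8 15 16 30 44
+    printf("\n");
+    return 0;
+}
+```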
diff --git a/img/Naive_GPU_scan_blocksize.PNG b/img/Naive_GPU_scan_blocksize.PNG
new file mode 100644
index 0000000..534559a
Binary files /dev/null and b/img/Naive_GPU_scan_blocksize.PNG differ
diff --git a/img/data.xlsx b/img/data.xlsx
new file mode 100644
index 0000000..b1b8659
Binary files /dev/null and b/img/data.xlsx differ
diff --git a/img/efficient_compaction_blocksize.PNG b/img/efficient_compaction_blocksize.PNG
new file mode 100644
index 0000000..fe70fed
Binary files /dev/null and b/img/efficient_compaction_blocksize.PNG differ
diff --git a/img/example-1.png b/img/example-1.png
deleted file mode 100644
index 28633a6..0000000
Binary files a/img/example-1.png and /dev/null differ
diff --git a/img/example-2.jpg b/img/example-2.jpg
deleted file mode 100644
index 984c2fd..0000000
Binary files a/img/example-2.jpg and /dev/null differ
diff --git a/img/figure-39-2.jpg b/img/figure-39-2.jpg
deleted file mode 100644
index bc9f9da..0000000
Binary files a/img/figure-39-2.jpg and /dev/null differ
diff --git a/img/figure-39-4.jpg b/img/figure-39-4.jpg
deleted file mode 100644
index 5888f20..0000000
Binary files a/img/figure-39-4.jpg and /dev/null differ
diff --git a/img/parallel_radix_sort_blocksize.PNG b/img/parallel_radix_sort_blocksize.PNG
new file mode 100644
index 0000000..da2e2e4
Binary files /dev/null and b/img/parallel_radix_sort_blocksize.PNG differ
diff --git a/img/scan trend.xlsx b/img/scan trend.xlsx
new file mode 100644
index 0000000..12aab12
Binary files /dev/null and b/img/scan trend.xlsx differ
diff --git a/img/scan_time_for_different_array_size.PNG b/img/scan_time_for_different_array_size.PNG
new file mode 100644
index 0000000..c214586
Binary files /dev/null and b/img/scan_time_for_different_array_size.PNG differ
diff --git a/img/work_efficient_GPU_scan_blocksize.PNG b/img/work_efficient_GPU_scan_blocksize.PNG
new file mode 100644
index 0000000..b8c9802
Binary files /dev/null and b/img/work_efficient_GPU_scan_blocksize.PNG differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..cba8347 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -11,13 +11,17 @@
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
 #include <stream_compaction/thrust.h>
+#include <stream_compaction/radix.h>
+#include <thrust/sort.h>
+#include <thrust/execution_policy.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 20; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
 int *c = new int[SIZE];
+int *d = new int[SIZE];
 
 int main(int argc, char* argv[]) {
     // Scan tests
 
@@ -47,19 +51,19 @@ int main(int argc, char* argv[]) {
     printArray(NPOT, b, true);
     printCmpResult(NPOT, b, c);
 
+    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
+    onesArray(SIZE, c);
+    printDesc("1s array for finding bugs");
+    StreamCompaction::Naive::scan(SIZE, c, c);
+    printArray(SIZE, c, true); */
+
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    // printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
-    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
-    onesArray(SIZE, c);
-    printDesc("1s array for finding bugs");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
-
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
 
@@ -71,14 +75,14 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    // printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    // printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
@@ -94,6 +98,7 @@ int main(int argc, char* argv[]) {
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
+    printf("\n");
 
     printf("*****************************\n");
 
@@ -115,7 +120,7 @@ int main(int argc, char* argv[]) {
     count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     expectedCount = count;
-    printArray(count, b, true);
+    // printArray(count, b, true);
     printCmpLenResult(count, expectedCount, b, b);
 
     zeroArray(SIZE, c);
@@ -123,14 +128,14 @@ int main(int argc, char* argv[]) {
     count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     expectedNPOT = count;
-    printArray(count, c, true);
+    // printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("cpu compact with scan");
     count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(count, c, true);
+    // printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
@@ -147,6 +152,42 @@ int main(int argc, char* argv[]) {
     //printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
+    printf("\n");
+    printf("*****************************\n");
+    printf("** PARALLEL RADIX SORT TESTS **\n");
+    printf("*****************************\n");
+
+    // Radix Sort Tests
+    genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
+    a[SIZE - 1] = 0;
+    printArray(SIZE, a, true);
+
+    zeroArray(SIZE, b);
+    printDesc("parallel radix sort, power-of-two correct result");
+    memcpy(b, a, SIZE * sizeof(int));
+    thrust::sort(thrust::host, b, b + SIZE);
+    printArray(SIZE, b, true);
+
+    printDesc("parallel radix sort, non-power-of-two correct result");
+    memcpy(d, a, NPOT * sizeof(int));
+    thrust::sort(thrust::host, d, d + NPOT);
+    printArray(NPOT, d, true);
+
+    zeroArray(SIZE, c);
+    printDesc("parallel radix sort, power-of-two");
+    StreamCompaction::RadixSort::radix_sort(SIZE, c, a);
+    printArray(SIZE, c, true);
+    printElapsedTime(StreamCompaction::RadixSort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printCmpResult(SIZE, b, c);
+
+    zeroArray(SIZE, c);
+    printDesc("parallel radix sort, non-power-of-two");
+    StreamCompaction::RadixSort::radix_sort(NPOT, c, a);
+    printArray(NPOT, c, true);
+    printElapsedTime(StreamCompaction::RadixSort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printCmpResult(NPOT, d, c);
 
     system("pause"); // stop Win32 console from closing on exit
     delete[] a;
     delete[] b;
+    delete[] d;
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index 567795b..eb71175 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -4,6 +4,7 @@ set(headers
     "naive.h"
     "efficient.h"
     "thrust.h"
+    "radix.h"
     )
 
 set(sources
@@ -12,6 +13,7 @@ set(sources
     "naive.cu"
     "efficient.cu"
     "thrust.cu"
+    "radix.cu"
     )
 
 list(SORT headers)
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..42d94f3 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,16 @@ namespace StreamCompaction {
          */
        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
            // TODO
+           int index = threadIdx.x + (blockIdx.x * blockDim.x);
+           if (index >= n) {
+               return;
+           }
+           // Flag non-zero elements with 1 so a scan can turn the flags into indices.
+           if (idata[index] == 0) {
+               bools[index] = 0;
+           }
+           else {
+               bools[index] = 1;
+           }
        }
 
        /**
@@ -33,6 +43,14 @@ namespace StreamCompaction {
        __global__ void kernScatter(int n, int *odata, const int *idata,
                const int *bools, const int *indices) {
            // TODO
+           int index = threadIdx.x + (blockIdx.x * blockDim.x);
+           if (index >= n) {
+               return;
+           }
+           if (bools[index] == 1) {
+               int final_index = indices[index];
+               odata[final_index] = idata[index];
+           }
        }
 
    }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..dad5e64 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -20,6 +20,11 @@ namespace StreamCompaction {
        void scan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+           // Exclusive prefix sum: odata[k] = sum of idata[0 .. k-1].
+           odata[0] = 0;
+           for (int k = 1; k < n; ++k) {
+               odata[k] = odata[k - 1] + idata[k - 1];
+           }
            timer().endCpuTimer();
        }
 
@@ -31,8 +36,15 @@ namespace StreamCompaction {
        int compactWithoutScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+           int o_idx = 0;
+           for (int i_idx = 0; i_idx < n; ++i_idx) {
+               if (idata[i_idx] != 0) {
+                   odata[o_idx] = idata[i_idx];
+                   ++o_idx;
+               }
+           }
            timer().endCpuTimer();
-           return -1;
+           return o_idx;
        }
 
        /**
@@ -43,8 +55,35 @@ namespace StreamCompaction {
        int compactWithScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+           int* temp_array = new int[n];
+           int* scan_array = new int[n];
+           // Compute temporary array of 0/1 flags:
+           for (int i_idx = 0; i_idx < n; ++i_idx) {
+               if (idata[i_idx] != 0) {
+                   temp_array[i_idx] = 1;
+               }
+               else {
+                   temp_array[i_idx] = 0;
+               }
+           }
+           // Exclusive scan:
+           scan_array[0] = 0;
+           for (int k = 1; k < n; ++k) {
+               scan_array[k] = scan_array[k - 1] + temp_array[k - 1];
+           }
+           // Scatter:
+           int o_counter = 0;
+           for (int i_idx = 0; i_idx < n; ++i_idx) {
+               if (temp_array[i_idx] == 1) {
+                   int o_idx = scan_array[i_idx];
+                   odata[o_idx] = idata[i_idx];
+                   ++o_counter;
+               }
+           }
            timer().endCpuTimer();
-           return -1;
+           delete[] temp_array;
+           delete[] scan_array;
+           return o_counter;
        }
    }
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..618d16a 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,6 +3,7 @@
 #include "common.h"
 #include "efficient.h"
 
+
 namespace StreamCompaction {
    namespace Efficient {
        using StreamCompaction::Common::PerformanceTimer;
@@ -12,13 +13,117 @@ namespace StreamCompaction {
            return timer;
        }
 
+        // Copy the input into a power-of-two-sized working array, zero-padding the tail.
+        __global__ void init_array(int* dev_array, const int* dev_temp_array, const int n, const int fit_size) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= fit_size) {
+                return;
+            }
+            if (index < n) {
+                dev_array[index] = dev_temp_array[index];
+            }
+            else {
+                dev_array[index] = 0;
+            }
+        }
+
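+        // The two kernels below implement the Blelloch work-efficient scan,
+        // one launch per depth d. Up-sweep builds partial sums in place:
+        //   x[k + 2^(d+1) - 1] += x[k + 2^d - 1]
+        // Down-sweep then pushes the sums back down the implicit tree; after
+        // the root is zeroed, the array holds an exclusive prefix sum.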
+        __global__ void up_sweep(int* dev_array, const int fit_size, const int d) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            // Each thread handles one node; nodes at depth d are 2^(d+1) apart.
+            int remap_index = index * (1 << (d + 1));
+            if (remap_index >= fit_size) {
+                return;
+            }
+            int two_pow_d_add_1 = 1 << (d + 1);
+            int two_pow_d = 1 << d;
+            dev_array[remap_index + two_pow_d_add_1 - 1] += dev_array[remap_index + two_pow_d - 1];
+        }
+
+        __global__ void down_sweep(int* dev_array, const int fit_size, const int d) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            int remap_index = index * (1 << (d + 1));
+            if (remap_index >= fit_size) {
+                return;
+            }
+            int two_pow_d_add_1 = 1 << (d + 1);
+            int two_pow_d = 1 << d;
+            int t = dev_array[remap_index + two_pow_d - 1];
+            dev_array[remap_index + two_pow_d - 1] = dev_array[remap_index + two_pow_d_add_1 - 1];
+            dev_array[remap_index + two_pow_d_add_1 - 1] += t;
+        }
+
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            int* dev_data_array;
+            int* dev_temp_data_array;
+
+            // Pad the working buffer up to the next power of two:
+            int n_ilog2 = ilog2ceil(n);
+            int fit_size = 1 << n_ilog2;
+
+            int oriSizeInBytes = n * sizeof(int);
+            int fitSizeInBytes = fit_size * sizeof(int);
+            int blockSize = 32;
+            dim3 fullBlocksPerGrid((fit_size / blockSize) + 1);
+
+            cudaMalloc((void**)&dev_data_array, fitSizeInBytes);
+            checkCUDAError("cudaMalloc dev_data_array failed!");
+            cudaMalloc((void**)&dev_temp_data_array, oriSizeInBytes);
+            checkCUDAError("cudaMalloc dev_temp_data_array failed!");
+
+            cudaMemcpy(dev_temp_data_array, idata, oriSizeInBytes, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy dev_temp_data_array failed!");
+
+            init_array <<<fullBlocksPerGrid, blockSize>>> (dev_data_array, dev_temp_data_array, n, fit_size);
+
            timer().startGpuTimer();
            // TODO
+            int d_max = ilog2ceil(n) - 1;
+            // Up-sweep:
+            for (int d = 0; d <= d_max; ++d) {
+                // Only fit_size / 2^(d+1) threads do work at this depth.
+                int threads_num_needed = fit_size / (1 << (d + 1));
+                dim3 up_sweep_blocks_per_grid((threads_num_needed / blockSize) + 1);
+                up_sweep <<<up_sweep_blocks_per_grid, blockSize>>> (dev_data_array, fit_size, d);
+            }
+
+            // Set the root x[fit_size - 1] = 0 before the down-sweep:
+            cudaMemset(dev_data_array + fit_size - 1, 0, sizeof(int));
+            // Down-sweep:
+            for (int d = d_max; d >= 0; --d) {
+                int threads_num_needed = fit_size / (1 << (d + 1));
+                dim3 down_sweep_blocks_per_grid((threads_num_needed / blockSize) + 1);
+                down_sweep <<<down_sweep_blocks_per_grid, blockSize>>> (dev_data_array, fit_size, d);
+            }
            timer().endGpuTimer();
+
+            // Copy to output data:
+            cudaMemcpy(odata, dev_data_array, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy odata failed!");
+
+            cudaFree(dev_data_array);
+            checkCUDAError("cudaFree(dev_data_array) failed!");
+            cudaFree(dev_temp_data_array);
+            checkCUDAError("cudaFree(dev_temp_data_array) failed!");
        }
 
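+        // compact() below chains three GPU steps: kernMapToBoolean flags
+        // non-zero elements, the scan above turns those flags into output
+        // indices, and kernScatter writes each surviving element to its index.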
        /**
@@ -31,10 +136,92 @@ namespace StreamCompaction {
         * @returns The number of elements remaining after compaction.
         */
        int compact(int n, int *odata, const int *idata) {
+            int* dev_data_array;
+            int* dev_zero_one_temp_array;
+            int* dev_idata_array;
+            int* dev_final_array;
+
+            int blockSize = 32;
+            dim3 fullBlocksPerGrid((n / blockSize) + 1);
+            int sizeInBytes = n * sizeof(int);
+
+            cudaMalloc((void**)&dev_zero_one_temp_array, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_zero_one_temp_array failed!");
+            cudaMalloc((void**)&dev_idata_array, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata_array failed!");
+
+            cudaMemcpy(dev_zero_one_temp_array, idata, sizeInBytes, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy dev_zero_one_temp_array failed!");
+            cudaMemcpy(dev_idata_array, idata, sizeInBytes, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy dev_idata_array failed!");
+
+            // Scan init:
+            int n_ilog2 = ilog2ceil(n);
+            int fit_size = 1 << n_ilog2;
+            int fitSizeInBytes = fit_size * sizeof(int);
+            dim3 scanFullBlocksPerGrid((fit_size / blockSize) + 1);
+
+            cudaMalloc((void**)&dev_data_array, fitSizeInBytes);
+            checkCUDAError("cudaMalloc dev_data_array failed!");
+
            timer().startGpuTimer();
            // TODO
+            // Transform the existing array into zeros and ones:
+            StreamCompaction::Common::kernMapToBoolean <<<fullBlocksPerGrid, blockSize>>> (n, dev_zero_one_temp_array, dev_idata_array);
+
+            // Scan (zero-padding the flags out to fit_size):
+            init_array <<<scanFullBlocksPerGrid, blockSize>>> (dev_data_array, dev_zero_one_temp_array, n, fit_size);
+
+            int d_max = ilog2ceil(n) - 1;
+            // Up-sweep:
+            for (int d = 0; d <= d_max; ++d) {
+                int threads_num_needed = fit_size / (1 << (d + 1));
+                dim3 up_sweep_blocks_per_grid((threads_num_needed / blockSize) + 1);
+                up_sweep <<<up_sweep_blocks_per_grid, blockSize>>> (dev_data_array, fit_size, d);
+            }
+            // x[fit_size - 1] = 0
+            cudaMemset(dev_data_array + fit_size - 1, 0, sizeof(int));
+            // Down-sweep:
+            for (int d = d_max; d >= 0; --d) {
+                int threads_num_needed = fit_size / (1 << (d + 1));
+                dim3 down_sweep_blocks_per_grid((threads_num_needed / blockSize) + 1);
+                down_sweep <<<down_sweep_blocks_per_grid, blockSize>>> (dev_data_array, fit_size, d);
+            }
+
+            // The last element of the padded scan holds the total number of kept elements:
+            int final_count = 0;
+            cudaMemcpy(&final_count, dev_data_array + fit_size - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            // Scatter:
+            cudaMalloc((void**)&dev_final_array, final_count * sizeof(int));
+            checkCUDAError("cudaMalloc dev_final_array failed!");
+            StreamCompaction::Common::kernScatter <<<fullBlocksPerGrid, blockSize>>> (n, dev_final_array, dev_idata_array, dev_zero_one_temp_array, dev_data_array);
            timer().endGpuTimer();
-            return -1;
+
+            cudaMemcpy(odata, dev_final_array, final_count * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy odata failed!");
+
+            cudaFree(dev_data_array);
+            cudaFree(dev_zero_one_temp_array);
+            cudaFree(dev_idata_array);
+            cudaFree(dev_final_array);
+            return final_count;
        }
    }
}
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..2890129 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -6,6 +6,12 @@ namespace StreamCompaction {
    namespace Efficient {
        StreamCompaction::Common::PerformanceTimer& timer();
 
+        __global__ void init_array(int* dev_array, const int* dev_temp_array, const int n, const int fit_size);
+
+        __global__ void up_sweep(int* dev_array, const int fit_size, const int d);
+
+        __global__ void down_sweep(int* dev_array, const int fit_size, const int d);
+
        void scan(int n, int *odata, const int *idata);
 
        int compact(int n, int *odata, const int *idata);
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..3532fd5 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -12,14 +12,74 @@ namespace StreamCompaction {
            return timer;
        }
        // TODO: __global__
+        // One step of the naive (Hillis-Steele) scan at depth d:
+        // each element adds in the value 2^(d-1) positions to its left.
+        __global__ void addition_process(int* dev_array1, int* dev_array2, const int d, const int n) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            int two_power_d_min_1 = 1 << (d - 1);
+
+            if (index >= two_power_d_min_1) {
+                dev_array2[index] = dev_array1[index - two_power_d_min_1] + dev_array1[index];
+            }
+            else {
+                dev_array2[index] = dev_array1[index];
+            }
+        }
+
+        // Shift the inclusive scan right by one (inserting a leading 0)
+        // to produce an exclusive scan.
+        __global__ void right_shift(int* dev_array1, int* dev_array2, const int n) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            if (index == 0) {
+                dev_array2[index] = 0;
+            }
+            else {
+                dev_array2[index] = dev_array1[index - 1];
+            }
+        }
+
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            int* dev_data_array1, * dev_data_array2;
+            // Init all the required data:
+            int sizeInBytes = n * sizeof(int);
+            int blockSize = 32;
+            dim3 fullBlocksPerGrid((n / blockSize) + 1);
+
+            cudaMalloc((void**)&dev_data_array1, sizeInBytes);
+            checkCUDAError("cudaMalloc dev_data_array1 failed!");
+            cudaMalloc((void**)&dev_data_array2, sizeInBytes);
+            checkCUDAError("cudaMalloc dev_data_array2 failed!");
+            cudaMemcpy(dev_data_array1, idata, sizeInBytes, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy dev_data_array1 failed!");
+            cudaMemcpy(dev_data_array2, idata, sizeInBytes, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy dev_data_array2 failed!");
+
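+            // Ping-pong buffers: each depth reads from one array and writes
+            // to the other, then the pointers swap, so no thread reads a
+            // value written during the same pass.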
            timer().startGpuTimer();
            // TODO
+            // Inclusive scan:
+            int d_max = ilog2ceil(n);
+            for (int d = 1; d <= d_max; ++d) {
+                addition_process <<< fullBlocksPerGrid, blockSize >>> (dev_data_array1, dev_data_array2, d, n);
+                checkCUDAError("Naive addition_process failed!");
+                int* temp = dev_data_array1;
+                dev_data_array1 = dev_data_array2;
+                dev_data_array2 = temp;
+            }
+            // Right shift to get an exclusive scan:
+            right_shift <<< fullBlocksPerGrid, blockSize >>> (dev_data_array1, dev_data_array2, n);
            timer().endGpuTimer();
+            cudaMemcpy(odata, dev_data_array2, sizeInBytes, cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy odata failed!");
+            cudaFree(dev_data_array1);
+            checkCUDAError("cudaFree(dev_data_array1) failed!");
+            cudaFree(dev_data_array2);
+            checkCUDAError("cudaFree(dev_data_array2) failed!");
        }
    }
}
diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h
index 37dcb06..606d8e1 100644
--- a/stream_compaction/naive.h
+++ b/stream_compaction/naive.h
@@ -7,5 +7,6 @@ namespace StreamCompaction {
        StreamCompaction::Common::PerformanceTimer& timer();
 
        void scan(int n, int *odata, const int *idata);
+        __global__ void addition_process(int* dev_data_array1, int* dev_data_array2, const int d, const int n);
    }
}
diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu
new file mode 100644
index 0000000..bd2048d
--- /dev/null
+++ b/stream_compaction/radix.cu
@@ -0,0 +1,170 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "efficient.h"
+#include "common.h"
+
+int* dev_radix_data_array, * dev_b_array, * dev_e_array, * dev_f_array, * dev_t_array, * dev_d_array, * dev_output_array;
+
+namespace StreamCompaction {
+    namespace RadixSort {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
+        // For the bit selected by `pass`: b marks elements whose bit is 1,
+        // e marks elements whose bit is 0.
+        __global__ void compute_b_e(const int n, int* data_array, int* b_array, int* e_array, const int pass) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            int curr_bit = 1 << pass;
+            int bit_res = curr_bit & data_array[index];
+            if (curr_bit == bit_res) {
+                // Current bit is 1:
+                b_array[index] = 1;
+                e_array[index] = 0;
+            }
+            else {
+                // Current bit is 0:
+                b_array[index] = 0;
+                e_array[index] = 1;
+            }
+        }
+
+        // t[i] = i - f[i] + totalFalses: destination for elements whose bit is 1.
+        __global__ void compute_t(const int n, const int total_false, int* f_array, int* t_array) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            t_array[index] = index - f_array[index] + total_false;
+        }
+
+        // d[i] = b[i] ? t[i] : f[i]: the final destination for every element.
+        __global__ void compute_d(const int n, int* f_array, int* t_array, int* b_array, int* d_array) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            if (b_array[index] > 0) {
+                d_array[index] = t_array[index];
+            }
+            else {
+                d_array[index] = f_array[index];
+            }
+        }
+
+        __global__ void scatter(const int n, int* data_array, int* d_array, int* output_array) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            int rearranged_index = d_array[index];
+            output_array[rearranged_index] = data_array[index];
+        }
+
+        void radix_sort(int n, int* odata, const int* idata) {
+            int blockSize = 32;
+            dim3 fullBlocksPerGrid((n / blockSize) + 1);
+            int sizeInBytes = n * sizeof(int);
+
+            // Scan init:
+            int n_ilog2 = ilog2ceil(n);
+            int fit_size = 1 << n_ilog2;
+            int fitSizeInBytes = fit_size * sizeof(int);
+            dim3 scanFullBlocksPerGrid((fit_size / blockSize) + 1);
+
+            cudaMalloc((void**)&dev_radix_data_array, sizeInBytes);
+            checkCUDAError("cudaMalloc dev_radix_data_array failed!");
+            cudaMalloc((void**)&dev_b_array, sizeInBytes);
+            checkCUDAError("cudaMalloc dev_b_array failed!");
+            cudaMalloc((void**)&dev_e_array, sizeInBytes);
+            checkCUDAError("cudaMalloc dev_e_array failed!");
+
+            cudaMalloc((void**)&dev_f_array, fitSizeInBytes);
+            checkCUDAError("cudaMalloc dev_f_array failed!");
+
+            cudaMalloc((void**)&dev_t_array, sizeInBytes);
+            checkCUDAError("cudaMalloc dev_t_array failed!");
+            cudaMalloc((void**)&dev_d_array, sizeInBytes);
+            checkCUDAError("cudaMalloc dev_d_array failed!");
+            cudaMalloc((void**)&dev_output_array, sizeInBytes);
+            checkCUDAError("cudaMalloc dev_output_array failed!");
+
+            cudaMemcpy(dev_radix_data_array, idata, sizeInBytes, cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy dev_radix_data_array failed!");
+
+            timer().startGpuTimer();
+            int bit_num = sizeof(int) * 8; // one pass per bit of a non-negative int
+            for (int pass = 0; pass < bit_num; ++pass) {
+                // Compute b and e for the current bit:
+                compute_b_e <<<fullBlocksPerGrid, blockSize>>> (n, dev_radix_data_array, dev_b_array, dev_e_array, pass);
+                // Compute f = exclusive scan of e, zero-padding e out to fit_size first.
+                // Add zeros at the end of the array.
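+                // f[i] is the destination index for elements whose current bit
+                // is 0, and e[n-1] + f[n-1] gives totalFalses for this pass.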
+                StreamCompaction::Efficient::init_array <<<scanFullBlocksPerGrid, blockSize>>> (dev_f_array, dev_e_array, n, fit_size);
+                int d_max = ilog2ceil(n) - 1;
+                // Up-sweep:
+                for (int d = 0; d <= d_max; ++d) {
+                    int threads_num_needed = fit_size / (1 << (d + 1));
+                    dim3 up_sweep_blocks_per_grid((threads_num_needed + blockSize - 1) / blockSize);
+                    StreamCompaction::Efficient::up_sweep <<<up_sweep_blocks_per_grid, blockSize>>> (dev_f_array, fit_size, d);
+                }
+                cudaMemset(dev_f_array + fit_size - 1, 0, sizeof(int));
+                // Down-sweep:
+                for (int d = d_max; d >= 0; --d) {
+                    int threads_num_needed = fit_size / (1 << (d + 1));
+                    dim3 down_sweep_blocks_per_grid((threads_num_needed + blockSize - 1) / blockSize);
+                    StreamCompaction::Efficient::down_sweep <<<down_sweep_blocks_per_grid, blockSize>>> (dev_f_array, fit_size, d);
+                }
+                int e_end, f_end;
+                cudaMemcpy(&e_end, dev_e_array + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+                checkCUDAError("cudaMemcpy dev_e_array failed!");
+                cudaMemcpy(&f_end, dev_f_array + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+                checkCUDAError("cudaMemcpy dev_f_array failed!");
+                int total_falses = e_end + f_end;
+                // Compute t:
+                compute_t <<<fullBlocksPerGrid, blockSize>>> (n, total_falses, dev_f_array, dev_t_array);
+                checkCUDAError("compute_t failed!");
+                // Compute d:
+                compute_d <<<fullBlocksPerGrid, blockSize>>> (n, dev_f_array, dev_t_array, dev_b_array, dev_d_array);
+                checkCUDAError("compute_d failed!");
+                // Scatter data for this pass:
+                scatter <<<fullBlocksPerGrid, blockSize>>> (n, dev_radix_data_array, dev_d_array, dev_output_array);
+                checkCUDAError("scatter failed!");
+                cudaMemcpy(dev_radix_data_array, dev_output_array, sizeInBytes, cudaMemcpyDeviceToDevice);
+                checkCUDAError("cudaMemcpy output to radix failed!");
+            }
+            timer().endGpuTimer();
+
+            // Copy to output data:
+            cudaMemcpy(odata, dev_output_array, sizeInBytes, cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy odata failed!");
+
+            cudaFree(dev_radix_data_array);
+            checkCUDAError("cudaFree(dev_radix_data_array) failed!");
+            cudaFree(dev_output_array);
+            checkCUDAError("cudaFree(dev_output_array) failed!");
+            cudaFree(dev_b_array);
+            checkCUDAError("cudaFree(dev_b_array) failed!");
+            cudaFree(dev_e_array);
+            checkCUDAError("cudaFree(dev_e_array) failed!");
+            cudaFree(dev_f_array);
+            checkCUDAError("cudaFree(dev_f_array) failed!");
+            cudaFree(dev_t_array);
+            checkCUDAError("cudaFree(dev_t_array) failed!");
+            cudaFree(dev_d_array);
+            checkCUDAError("cudaFree(dev_d_array) failed!");
+        }
+    }
+}
\ No newline at end of file
diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h
new file mode 100644
index 0000000..c222f1c
--- /dev/null
+++ b/stream_compaction/radix.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace RadixSort {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void radix_sort(int n, int* odata, const int* idata);
+    }
+}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..4726f6b 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -2,6 +2,7 @@
 #include <cuda_runtime.h>
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
+#include <thrust/copy.h>
 #include <thrust/scan.h>
 #include "common.h"
 #include "thrust.h"
@@ -18,11 +19,19 @@ namespace StreamCompaction {
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            thrust::host_vector<int> host_i(idata, idata + n);
+            thrust::device_vector<int> dev_i = host_i;
+            thrust::device_vector<int> dev_o(n);
+
            timer().startGpuTimer();
            // TODO use `thrust::exclusive_scan`
            // example: for device_vectors dv_in and dv_out:
            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dev_i.begin(), dev_i.end(), dev_o.begin());
            timer().endGpuTimer();
+
+            thrust::copy(dev_o.begin(), dev_o.end(), odata);
        }
    }
}