diff --git a/Project2 Performance Analysis.xlsx b/Project2 Performance Analysis.xlsx
new file mode 100644
index 0000000..fc00e9d
Binary files /dev/null and b/Project2 Performance Analysis.xlsx differ
diff --git a/README.md b/README.md
index 0e38ddb..6c5d81c 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,276 @@ CUDA Stream Compaction
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Angelina Risi
+ * [LinkedIn](https://www.linkedin.com/in/angelina-risi)
+* Tested on: Windows 10, i7-6700HQ @ 2.60GHz 8GB, GTX 960M 4096MB (Personal Laptop)
-### (TODO: Your README)
+## Project Description
+
+This project implements a variety of scan, compaction, and sort algorithms on the GPU, with comparison tests implemented on the CPU. The base requirements were to implement CPU scan and compact functions, a GPU naive scan, and GPU work-efficient scan and compact. I also created a wrapper function for the Thrust scan implementation on the GPU.
+In addition to these base requirements, I implemented all the defined extra credit assignments: Radix sort, using shared GPU memory in the scan implementation, avoiding shared memory bank conflicts, and improving the work-efficient implementation's efficiency over the CPU implementation.
+
+### Features
+
+* CPU Scan
+* CPU Compact
+ - With and without scanning
+* GPU Naive Scan
+* GPU "Work-Efficient" Scan
+ - Including Shared Memory implementation (Extra Credit) w/ Bank Conflict resolution
+* GPU Stream Compaction
+ - Using Work-efficient scan, but can be replaced with other scan algorithms
+* GPU Radix Sort (Extra Credit)
+
+
+## Extra Credit
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+### Radix Sort Implementation
+
+Radix sort is a method of sorting the values in an array from min to max using their binary representations. This is done by sorting on the least-significant bit (LSB) first, then iterating through bitwise sorts up to the most-significant bit (MSB).
+Before we can sort, we need to find the dataset's maximum value. By taking the ceiling of log2(max), we get the number of bits needed to represent the data, and therefore which bit is the MSB. This reduces the number of redundant sorting iterations from the number of bits in the data type to only as many bits as the data range actually requires. The maximum is found on the GPU using a parallel reduction that compares pairs of values. The code is reproduced below with extra commentary.
+
+```cpp
+// each thread compares a pair of integers from the input buffer
+// and selects the greater of the two
+__global__ void kernFindMax(int n, int offset1, int offset2, int* buff) {
+ int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+
+ // compute which index to compare
+ int access = index * offset2 - 1;
+ if (access >= n || n < 1 || access < 0) return;
+
+ // modify in place
+ if (buff[access] < buff[access - offset1]) {
+ buff[access] = buff[access - offset1];
+ }
+}
+```
+```cpp
+// The loop iterates deeper into the reduction until the final max value is sorted to the end
+// This essentially sweeps the max value up to the root of a balanced binary tree
+for (d = 1; d <= limit; d++) {
+ offset1 = pow(2, d - 1);
+ offset2 = pow(2, d);
+ fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize;
+ kernFindMax<<<fullBlocksPerGrid, blockSize>>>(size, offset1, offset2, max_arr);
+ checkCUDAError("Radix find max fail!"); // error checking
+}
+```
+
+To perform the sort itself efficiently, we generate a pair of boolean buffers indicating whether the currently tested bit at each index is 0 or 1. One buffer is the true buffer, called b_arr, and the other the false buffer, called f_arr. If the bit value is 1, b_arr[index] is set to 1 and f_arr[index] to 0, and vice versa. We save the last value of f_arr for later to compute the number of "falses" for indexing.
+
+```cpp
+__global__ void kernBoolMaps(int n, int k, int* input, int* b_arr, int* f_arr) {
+ int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+ if (index >= n) return;
+
+ // retrieve the kth bit from the input val
+ int bit = bitK(input[index], k);
+ // flip the bit
+ int fBit = flipBit(bit);
+
+ b_arr[index] = bit; // maps bit k into b_arr
+ f_arr[index] = fBit; // copy flipped value here for scan
+}
+```
+
+The f_arr is scanned using the work-efficient exclusive scan to generate the "false" indices, the locations to store the data values in the output array when b_arr[index] == 0. The "true" indices, t_arr, are generated as "index - f_arr[index] + totFalse". The total number of falses, totFalse, is the last value in the scanned f_arr plus the value we saved from f_arr before scanning. Using a GPU-implemented scatter function, we write the input values, sorted by the current bit, into the output buffer. To avoid extra intermediate buffers for each sort step, the input and output arrays are ping-ponged (their pointers are swapped) each step.
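+
+The t_arr computation is a short kernel, reproduced from radix.cu:
+
+```cpp
+__global__ void kernComputeT(int n, int totFalse, int *t_arr, int *f_arr) {
+ int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+ if (index >= n) return;
+
+ // each "true" element lands after all the falses, offset by how many trues precede it
+ t_arr[index] = index - f_arr[index] + totFalse;
+}
+```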
+
+```cpp
+__global__ void kernRadixScatter(int n, int *out, int *in, int *b_arr, int *f_arr, int *t_arr) {
+ int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+ if (index >= n) return;
+
+ // We compute the index to access by checking the boolean in b_arr
+ // If true, we use the index in t_arr (true indexing array)
+ // Else, we choose the index in f_arr (false indexing array)
+ // The index "access" is where in the output array the input goes to.
+ int access = b_arr[index] ? t_arr[index] : f_arr[index];
+ out[access] = in[index];
+}
+```
+
+Once the input array has been sorted for each bit, the output is correctly sorted in ascending order. This implementation is intended to work on integer values and currently operates on global device memory, which bottlenecks performance. An example of a small-array radix sort is depicted below:
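+
+A minimal usage sketch, mirroring the small-array test in src/main.cpp:
+
+```cpp
+int in[8] = { 4, 7, 2, 6, 3, 5, 1, 0 };
+int out[8];
+StreamCompaction::Radix::sort(8, out, in);
+// out now holds { 0, 1, 2, 3, 4, 5, 6, 7 }
+```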
+
+![Radix sort example](img/radix_example.PNG)
+
+### Shared Memory Work-Efficient Scan
+
+An alternative implementation of the work-efficient scan using shared memory to reduce latency is included. Each block stores an array shared among its threads to hold the intermediate values before outputting. By reducing global memory accesses in favor of faster shared memory, we can potentially increase throughput. Additionally, rather than padding the scan buffer to a power of two, we need only pad it to a multiple of the block size.
+Both the upsweep and downsweep are done in the same kernel, as they both need to use the shared memory cache. This means we cannot dynamically change the block and thread counts as we traverse the tree, as done in the global memory solution, and we must be careful to synchronize threads between write and read operations to prevent race conditions. Each block essentially performs a scan on a portion of the input data. The depth (required number of loop iterations) of this scan is log2(blockSize). While there is some overhead, described below, that makes the total depth comparable to log2(n), the reduced memory latency in this portion of the scan should provide an improvement over the global memory equivalent.
+To allow the blocks' partial solutions to be merged, while calculating an exclusive scan through the downsweep we save each block's root value (its total sum) out to a per-block sums array before zeroing it.
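+
+The corresponding excerpt from kernScanDataShared in shared_mem.cu:
+
+```cpp
+// last thread in the block saves the block total, then zeroes the root
+// so the downsweep produces an exclusive scan within the block
+if (tx == blockSize - 1 + CONFLICT_FREE_OFFSET(blockSize - 1)) {
+ sums[blockIdx.x] = sBuf[off_tx];
+ sBuf[off_tx] = 0;
+}
+__syncthreads();
+```
+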
+The blocks must add the root values of all previous blocks to their results to calculate the correct prefix sums for the full array. This requires a second scan over all the block sums. To avoid recursively scanning and stitching together blocks of data for large input data sets, the second scan uses the global memory work-efficient scan rather than another shared memory scan. This second data set is much smaller than the first, keeping the extra compute overhead minimal, since log2(n / blockSize) = log2(n) - log2(blockSize) is the depth of this scan.
+Once the block sums are scanned, the outputs of both scans are put through a kernel call that stitches the block data together by adding the appropriate sum to each of the original output's values. This requires only a simple parallel addition.
+
+```cpp
+__global__ void kernStitch(int n, int* in, int* sums) {
+ int bx = blockIdx.x;
+ int index = (blockDim.x * bx) + threadIdx.x;
+
+ if (bx == 0) return;
+ if (index >= n) return;
+ in[index] += sums[bx];
+}
+```
+#### Bank Conflict Avoidance
+
+This shared memory scan algorithm is further improved by applying offsets to the shared memory indices to reduce bank conflicts, events where multiple threads in a warp access addresses in the same shared memory bank and the accesses must be serialized. This is done with macros that compute an offset for each shared memory index based on the assumed number of memory banks. These are taken from the example code in GPU Gems 3 Ch. 39 linked in the instructions. The offset macro is reproduced below, as is example code of its use from my scan function.
+
+```cpp
+// for reducing bank conflicts
+#define NUM_BANKS 32 // Number of memory banks assumed on SM
+#define LOG_NUM_BANKS 5 // log2(NUM_BANKS)
+#define CONFLICT_FREE_OFFSET(n) \
+ ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS))
+ // Offset added to each shared memory index so that more threads access through different banks
+ // and fewer must wait in line for the same memory bus (thus less latency)
+```
+```cpp
+// Upsweep
+for (offset = 1; offset < blockSize; offset *=2) { // this offset is for calculating the original indices
+ access = (2 * offset * (tx + 1)) - 1; // index of shared memory to access
+ a2 = access - offset; // secondary access index
+
+ a2 += CONFLICT_FREE_OFFSET(a2); // add safe offset to access index
+ access += CONFLICT_FREE_OFFSET(access); // add safe offset to access index
+
+ if (access < blockSize) sBuf[access] += sBuf[a2]; // manipulate data at offset indices
+ __syncthreads(); // avoid mem issues
+}
+```
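+
+For reference, the matching downsweep loop from shared_mem.cu applies the same offsets while performing the swap-and-add traversal:
+
+```cpp
+// Downsweep: traverse back down the tree, swapping and accumulating
+for (offset = blockSize; offset >= 1; offset /= 2) {
+ access = (2 * offset * (tx + 1)) - 1;
+ a2 = access - offset;
+ a2 += CONFLICT_FREE_OFFSET(a2);
+ access += CONFLICT_FREE_OFFSET(access);
+ if (access < blockSize) {
+ temp = sBuf[a2]; // store left child
+ sBuf[a2] = sBuf[access]; // swap
+ sBuf[access] += temp; // add
+ }
+ __syncthreads(); // avoid mem issues
+}
+```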
+
+## Test Output
+
+```
+****************
+** SCAN TESTS **
+****************
+ [ 15 38 45 12 38 26 6 23 30 2 34 33 7 ... 1 0 ]
+==== cpu scan, power-of-two ====
+ elapsed time: 0.077432ms (std::chrono Measured)
+ [ 0 15 53 98 110 148 174 180 203 233 235 269 302 ... 809545 809546 ]
+==== cpu scan, non-power-of-two ====
+ elapsed time: 0.058864ms (std::chrono Measured)
+ [ 0 15 53 98 110 148 174 180 203 233 235 269 302 ... 809458 809482 ]
+ passed
+==== naive scan, power-of-two ====
+ elapsed time: 0.099264ms (CUDA Measured)
+ [ 0 15 53 98 110 148 174 180 203 233 235 269 302 ... 809545 809546 ]
+ passed
+==== naive scan, non-power-of-two ====
+ elapsed time: 0.098816ms (CUDA Measured)
+ passed
+==== work-efficient scan, power-of-two ====
+ elapsed time: 0.164352ms (CUDA Measured)
+ [ 0 15 53 98 110 148 174 180 203 233 235 269 302 ... 809545 809546 ]
+ passed
+==== work-efficient scan, non-power-of-two ====
+ elapsed time: 0.156096ms (CUDA Measured)
+ [ 0 15 53 98 110 148 174 180 203 233 235 269 302 ... 809458 809482 ]
+ passed
+==== thrust scan, power-of-two ====
+ elapsed time: 0.57184ms (CUDA Measured)
+ passed
+==== thrust scan, non-power-of-two ====
+ elapsed time: 0.29728ms (CUDA Measured)
+ passed
+==== Find max, power-of-two ====
+ elapsed time: 0.059936ms (CUDA Measured)
+max = 49
+==== Find max, non-power-of-two ====
+ elapsed time: 0.05984ms (CUDA Measured)
+max = 49
+==== Radix sort, power-of-two ====
+ elapsed time: 2.60947ms (CUDA Measured)
+ [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ]
+==== Radix sort, non-power-of-two ====
+ elapsed time: 2.29824ms (CUDA Measured)
+ [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ]
+==== Radix example sort ====
+Test input array:
+ [ 4 7 2 6 3 5 1 0 ]
+ elapsed time: 0.464576ms (CUDA Measured)
+Sorted Output:
+ [ 0 1 2 3 4 5 6 7 ]
+==== Shared Memory Efficient Scan, power-of-two ====
+ elapsed time: 0.0904ms (CUDA Measured)
+ [ 0 15 53 98 110 148 174 180 203 233 235 269 302 ... 809545 809546 ]
+ passed
+==== Shared Memory Efficient Scan, non-power-of-two ====
+ elapsed time: 0.11568ms (CUDA Measured)
+ [ 0 15 53 98 110 148 174 180 203 233 235 269 302 ... 809458 809482 ]
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 1 0 3 3 2 3 1 0 3 0 0 3 2 ... 2 0 ]
+==== cpu compact without scan, power-of-two ====
+ elapsed time: 0.116543ms (std::chrono Measured)
+ [ 1 3 3 2 3 1 3 3 2 1 3 3 2 ... 1 2 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ elapsed time: 0.132346ms (std::chrono Measured)
+ [ 1 3 3 2 3 1 3 3 2 1 3 3 2 ... 2 2 ]
+ passed
+==== cpu compact with scan ====
+ elapsed time: 0.390321ms (std::chrono Measured)
+ [ 1 3 3 2 3 1 3 3 2 1 3 3 2 ... 1 2 ]
+ passed
+==== work-efficient compact, power-of-two ====
+ elapsed time: 0.171744ms (CUDA Measured)
+ passed
+==== work-efficient compact, non-power-of-two ====
+ elapsed time: 0.192384ms (CUDA Measured)
+ passed
+==== Shared Memory work-efficient compact, power-of-two ====
+ elapsed time: 0.234592ms (CUDA Measured)
+ passed
+==== Shared Memory work-efficient compact, non-power-of-two ====
+ elapsed time: 0.236352ms (CUDA Measured)
+ passed
+```
+
+
+## Performance Analysis
+
+To get a rough estimate of each method's performance, a loop was added to the main function to generate an array, run each scan and compact algorithm, and measure the time each takes to run. This loop repeats 100 times, and the average timing data is output afterwards. By running this analysis for different block sizes and data set sizes, we can approximate performance in each case. The radix sort performance is also tested, but is not meant to be compared directly with the others since it performs a very different function.
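+
+The timing loop in src/main.cpp follows this pattern (condensed here; the same accumulate-and-average scheme is repeated for every implementation):
+
+```cpp
+float time_N_S_POT = 0.0f; // accumulated naive power-of-two scan time
+for (int i = 0; i < 100; i++) {
+ genArray(SIZE - 1, a, 50); // fresh random input each iteration
+ a[SIZE - 1] = 0;
+
+ zeroArray(SIZE, b);
+ StreamCompaction::Naive::scan(SIZE, b, a);
+ time_N_S_POT += StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation();
+ // ... same pattern for the other scan, compact, and sort implementations
+}
+printf("Naive POT: %f\n", time_N_S_POT / 100.0f);
+```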
+
+### Varying Block Size
+
+The performance was first tested on a set of data with 2^15 (32,768) integer values (minus 3 for the non-power-of-two test). The block size was varied in powers of 2 from 32 (the reasonable minimum, the warp size) to 1024 (the max threads per block on this GPU). This performance test was used to decide on an optimal block size for testing these algorithms against the CPU scan and compact algorithms. We do not test Thrust in this manner as we only wrap its implementation.
+
+![Naive scan block size comparison](img/naive_blocksize.PNG) ![Work-efficient scan block size comparison](img/we_scan_blocksize.PNG)
+
+![Shared memory scan block size comparison](img/sm_scan_blocksize.PNG) ![Work-efficient compact block size comparison](img/we_compact_blocksize.PNG)
+
+The average time to execute varies slightly between runs, but we can approximate the optimal block size by which size gives the shortest execution times on average. For some, the difference is minimal, making it difficult to choose, but the final selection is as follows:
+
+* Naive Scan: 256
+* Work-Efficient Scan: 64
+* Shared Memory Scan: 512
+
+The larger block size for the shared memory scan is likely more efficient due to both sharing memory across more threads and having fewer total blocks to stitch together, reducing the latency from the secondary scan on block sums. In contrast, the work-efficient scan is most efficient at smaller block sizes. This may be due to the simplicity of the kernels and the decreasing number of active blocks per iteration allowing better throughput with smaller thread counts per block.
+
+![Radix sort block size comparison](img/radix_blocksize.PNG)
+
+The Radix sort appears most efficient at 128 or 256 threads per block. Since the average of the power-of-two and non-power-of-two speeds is better at a block size of 128, this value is chosen.
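+
+For reference, these choices correspond to the `#define blockSize` values set in each translation unit:
+
+```cpp
+// naive.cu
+#define blockSize 256
+// efficient.cu
+#define blockSize 64
+// shared_mem.cu
+#define blockSize 512
+// radix.cu
+#define blockSize 128
+```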
+
+
+### Varying Data Set Sizes
+
+Once the algorithms' block sizes were optimized, they could be tested for varying data set sizes. The data size was swept through powers of two from 2^6 (64) to 2^22 (4,194,304) for completeness in examining small to large data sets.
+
+![Scan performance comparison (1)](img/scan_comp1.PNG) ![Scan performance comparison (2)](img/scan_comp2.PNG)
+
+The scan algorithms were compared first. The plots demonstrate that the CPU implementation is actually significantly faster at first, but is overtaken by all the GPU implementations around an array size of 2^18. This is because the CPU scan's runtime scales directly with array size, while the GPU implementations mitigate this with parallelism. Interesting to note is a sudden jump in Thrust scan time at 2^15, but much slower time scaling otherwise; this is assumed to be due to how Thrust optimizes scan processing or allocates its arrays. The work-efficient scan is actually slower than the naive scan, but the shared memory work-efficient scan is faster, eventually overtaking the naive scan at larger array sizes.
+
+![Compaction performance comparison (1)](img/compact_comp1.PNG) ![Compaction performance comparison (2)](img/compact_comp2.PNG)
+
+Compaction was implemented on the CPU with and without scanning, and on the GPU with the work-efficient scan. The CPU compact without scanning requires less computation and is thus very fast. The GPU manages to become faster than the CPU at an array size of 2^19, when the array is large enough that iterating over each index serially is slower than the parallel GPU map, scan, and scatter algorithm.
+
+These analyses show that unless properly optimized, the CPU can be faster than the GPU on small enough data sets with simple enough computations. Global memory access latency, idle threads, inefficient branching, etc. in the GPU implementations can cause great decreases in throughput, and with more time and effort these could be further reduced. The comparison with the Thrust implementation shows there are more optimizations to be applied. Still, even naive implementations are useful on large enough data sets. For example, many modern laptop screens have 1920 x 1080 resolution, over 2 million pixels, making parallel GPU computations significantly more efficient at that scale.
+
+All performance data recorded can be found in the Performance Analysis excel file [here](https://github.com/risia/Project2-Stream-Compaction/blob/master/Project2%20Performance%20Analysis.xlsx).
diff --git a/img/compact_comp1.PNG b/img/compact_comp1.PNG
new file mode 100644
index 0000000..56462ae
Binary files /dev/null and b/img/compact_comp1.PNG differ
diff --git a/img/compact_comp2.PNG b/img/compact_comp2.PNG
new file mode 100644
index 0000000..4f623dd
Binary files /dev/null and b/img/compact_comp2.PNG differ
diff --git a/img/naive_blocksize.PNG b/img/naive_blocksize.PNG
new file mode 100644
index 0000000..aba994a
Binary files /dev/null and b/img/naive_blocksize.PNG differ
diff --git a/img/radix_blocksize.PNG b/img/radix_blocksize.PNG
new file mode 100644
index 0000000..f5d898f
Binary files /dev/null and b/img/radix_blocksize.PNG differ
diff --git a/img/radix_example.PNG b/img/radix_example.PNG
new file mode 100644
index 0000000..bd18f27
Binary files /dev/null and b/img/radix_example.PNG differ
diff --git a/img/scan_comp1.PNG b/img/scan_comp1.PNG
new file mode 100644
index 0000000..90fc408
Binary files /dev/null and b/img/scan_comp1.PNG differ
diff --git a/img/scan_comp2.PNG b/img/scan_comp2.PNG
new file mode 100644
index 0000000..22ae463
Binary files /dev/null and b/img/scan_comp2.PNG differ
diff --git a/img/sm_scan_blocksize.PNG b/img/sm_scan_blocksize.PNG
new file mode 100644
index 0000000..79b8394
Binary files /dev/null and b/img/sm_scan_blocksize.PNG differ
diff --git a/img/we_compact_blocksize.PNG b/img/we_compact_blocksize.PNG
new file mode 100644
index 0000000..5a1cc95
Binary files /dev/null and b/img/we_compact_blocksize.PNG differ
diff --git a/img/we_scan_blocksize.PNG b/img/we_scan_blocksize.PNG
new file mode 100644
index 0000000..5785979
Binary files /dev/null and b/img/we_scan_blocksize.PNG differ
diff --git a/src/main.cpp b/src/main.cpp
index 1850161..673330a 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -11,9 +11,11 @@
#include <stream_compaction/naive.h>
#include <stream_compaction/efficient.h>
#include <stream_compaction/thrust.h>
+#include <stream_compaction/radix.h>
+#include <stream_compaction/shared_mem.h>
#include "testing_helpers.hpp"
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 15; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
@@ -51,7 +53,7 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
+ printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -71,14 +73,14 @@ int main(int argc, char* argv[]) {
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
+ printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(NPOT, c, true);
+ printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
zeroArray(SIZE, c);
@@ -95,6 +97,67 @@ int main(int argc, char* argv[]) {
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
+ zeroArray(SIZE, c);
+ printDesc("Find max, power-of-two");
+ int max = StreamCompaction::Radix::max(SIZE, a);
+ printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ printf("max = %i\n", max);
+
+ zeroArray(SIZE, c);
+ printDesc("Find max, non-power-of-two");
+ max = StreamCompaction::Radix::max(NPOT, a);
+ printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ printf("max = %i\n", max);
+
+ zeroArray(SIZE, c);
+ //int radix_tst[8] = { 4, 7, 2, 6, 3, 5, 1, 0 };
+ printDesc("Radix sort, power-of-two");
+ StreamCompaction::Radix::sort(SIZE, c, a);
+ printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ printArray(SIZE, c, true);
+
+ zeroArray(SIZE, c);
+ printDesc("Radix sort, non-power-of-two");
+ StreamCompaction::Radix::sort(NPOT, c, a);
+ printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ printArray(NPOT, c, true);
+
+ zeroArray(SIZE, c);
+ int radix_tst[8] = { 4, 7, 2, 6, 3, 5, 1, 0 };
+ printDesc("Radix example sort");
+ printf("Test input array:\n");
+ printArray(8, radix_tst, true);
+ StreamCompaction::Radix::sort(8, c, radix_tst);
+ printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ printf("Sorted Output:\n");
+ printArray(8, c, true);
+
+ zeroArray(SIZE, c);
+ printDesc("Shared Memory Efficient Scan, power-of-two");
+ StreamCompaction::SharedMem::scan(SIZE, c, a);
+ printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ printArray(SIZE, c, true);
+ printCmpResult(SIZE, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("Shared Memory Efficient Scan, non-power-of-two");
+ StreamCompaction::SharedMem::scan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ printArray(NPOT, c, true);
+ printCmpResult(NPOT, b, c);
+
+ //zeroArray(SIZE, c);
+ //printDesc("Shared Memory Efficient Scan, power-of-two");
+ //StreamCompaction::SharedMem::scan(8, c, radix_tst);
+ //printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(8, c, true);
+
+ //zeroArray(SIZE, c);
+ //printDesc("Shared Memory Efficient Scan, non-power-of-two");
+ //StreamCompaction::SharedMem::scan(7, c, radix_tst);
+ //printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(7, c, true);
+
printf("\n");
printf("*****************************\n");
printf("** STREAM COMPACTION TESTS **\n");
@@ -147,6 +210,161 @@ int main(int argc, char* argv[]) {
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);
+ zeroArray(SIZE, c);
+ printDesc("Shared Memory work-efficient compact, power-of-two");
+ count = StreamCompaction::SharedMem::compact(SIZE, c, a);
+ printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(count, c, true);
+ printCmpLenResult(count, expectedCount, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("Shared Memory work-efficient compact, non-power-of-two");
+ count = StreamCompaction::SharedMem::compact(NPOT, c, a);
+ printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(count, c, true);
+ printCmpLenResult(count, expectedNPOT, b, c);
+
+
+ // loop 100 tests to get avgs
+ // make time variables
+ float time_N_S_POT = 0.0f; // naive pow 2 scan
+ float time_N_S_NPOT = 0.0f; // naive not pow 2 scan
+ float time_WE_S_POT = 0.0f; // work-efficient pow 2 scan
+ float time_WE_S_NPOT = 0.0f; // work-efficient not pow 2 scan
+ float time_WE_C_POT = 0.0f; // work-efficient pow 2 compact
+ float time_WE_C_NPOT = 0.0f; // work-efficient not pow 2 compact
+ float time_SM_S_POT = 0.0f; // shared memory pow 2 scan
+ float time_SM_S_NPOT = 0.0f; // shared memory not pow 2 scan
+ float time_T_S_POT = 0.0f; // thrust pow 2 scan
+ float time_T_S_NPOT = 0.0f; // thrust not pow 2 scan
+ float time_R_S_POT = 0.0f; // radix pow 2 sort
+ float time_R_S_NPOT = 0.0f; // radix not pow 2 sort
+ float time_CPU_S_POT = 0.0f;
+ float time_CPU_S_NPOT = 0.0f;
+ float time_CPU_C_S = 0.0f;
+ float time_CPU_C_NS = 0.0f;
+ float time_CPU_C_S_NPOT = 0.0f;
+ float time_CPU_C_NS_NPOT = 0.0f;
+
+ for (int i = 0; i < 100; i++) {
+ // gen array
+ genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
+ a[SIZE - 1] = 0;
+
+ // cpu scan POT
+ zeroArray(SIZE, b);
+ StreamCompaction::CPU::scan(SIZE, b, a);
+ time_CPU_S_POT += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+
+ // cpu scan NPOT
+ zeroArray(SIZE, b);
+ StreamCompaction::CPU::scan(NPOT, b, a);
+ time_CPU_S_NPOT += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+
+ // cpu compact w/o scan
+ zeroArray(SIZE, b);
+ StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+ time_CPU_C_NS += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+
+ // cpu compact w/o scan NPOT
+ zeroArray(SIZE, b);
+ StreamCompaction::CPU::compactWithoutScan(NPOT, b, a);
+ time_CPU_C_NS_NPOT += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+
+ // cpu compact w/ scan
+ zeroArray(SIZE, b);
+ StreamCompaction::CPU::compactWithScan(SIZE, b, a);
+ time_CPU_C_S += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+
+ // cpu compact w/ scan NPOT
+ zeroArray(SIZE, b);
+ StreamCompaction::CPU::compactWithScan(NPOT, b, a);
+ time_CPU_C_S_NPOT += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+
+ // Naive scan POT
+ zeroArray(SIZE, b);
+ StreamCompaction::Naive::scan(SIZE, b, a);
+ time_N_S_POT += StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation();
+
+ // Naive scan N_POT
+ zeroArray(SIZE, b);
+ StreamCompaction::Naive::scan(NPOT, b, a);
+ time_N_S_NPOT += StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation();
+
+ // WE scan POT
+ zeroArray(SIZE, b);
+ StreamCompaction::Efficient::scan(SIZE, b, a);
+ time_WE_S_POT += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+
+ // WE scan N_POT
+ zeroArray(SIZE, b);
+ StreamCompaction::Efficient::scan(NPOT, b, a);
+ time_WE_S_NPOT += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+
+ // WE compact POT
+ zeroArray(SIZE, b);
+ StreamCompaction::Efficient::compact(SIZE, b, a);
+ time_WE_C_POT += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+
+ // WE compact N_POT
+ zeroArray(SIZE, b);
+ StreamCompaction::Efficient::compact(NPOT, b, a);
+ time_WE_C_NPOT += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+
+ // SM scan POT
+ zeroArray(SIZE, b);
+ StreamCompaction::SharedMem::scan(SIZE, b, a);
+ time_SM_S_POT += StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation();
+
+ // SM scan N_POT
+ zeroArray(SIZE, b);
+ StreamCompaction::SharedMem::scan(NPOT, b, a);
+ time_SM_S_NPOT += StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation();
+
+ // Thrust scan POT
+ zeroArray(SIZE, b);
+ StreamCompaction::Thrust::scan(SIZE, b, a);
+ time_T_S_POT += StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation();
+
+ // Thrust scan N_POT
+ zeroArray(SIZE, b);
+ StreamCompaction::Thrust::scan(NPOT, b, a);
+ time_T_S_NPOT += StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation();
+
+ // Radix sort POT
+ zeroArray(SIZE, b);
+ StreamCompaction::Radix::sort(SIZE, b, a);
+ time_R_S_POT += StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation();
+
+ // Radix sort N_POT
+ zeroArray(SIZE, b);
+ StreamCompaction::Radix::sort(NPOT, b, a);
+ time_R_S_NPOT += StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation();
+
+ }
+
+ // print avg times
+ printf("CPU Scan POT: %f\n", time_CPU_S_POT / 100.0f);
+ printf("CPU Scan NPOT: %f\n", time_CPU_S_NPOT / 100.0f);
+ printf("CPU Compact POT: %f\n", time_CPU_C_NS / 100.0f);
+ printf("CPU Scan Compact NPOT: %f\n", time_CPU_C_S_NPOT / 100.0f);
+ printf("CPU Compact NPOT: %f\n", time_CPU_C_NS_NPOT / 100.0f);
+ printf("CPU Scan Compact POT: %f\n", time_CPU_C_S / 100.0f);
+ printf("Naive POT: %f\n", time_N_S_POT / 100.0f);
+ printf("Naive NPOT: %f\n", time_N_S_NPOT / 100.0f);
+ printf("WE Scan POT: %f\n", time_WE_S_POT / 100.0f);
+ printf("WE Scan NPOT: %f\n", time_WE_S_NPOT / 100.0f);
+ printf("WE Comp POT: %f\n", time_WE_C_POT / 100.0f);
+ printf("WE Comp NPOT: %f\n", time_WE_C_NPOT / 100.0f);
+ printf("SM Scan POT: %f\n", time_SM_S_POT / 100.0f);
+ printf("SM Scan NPOT: %f\n", time_SM_S_NPOT / 100.0f);
+ printf("Thrust Scan POT: %f\n", time_T_S_POT / 100.0f);
+ printf("Thrust Scan NPOT: %f\n", time_T_S_NPOT / 100.0f);
+ printf("Radix POT: %f\n", time_R_S_POT / 100.0f);
+ printf("Radix NPOT: %f\n", time_R_S_NPOT / 100.0f);
+
+
+
system("pause"); // stop Win32 console from closing on exit
delete[] a;
delete[] b;
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
index 46337ab..d6cc4e3 100644
--- a/src/testing_helpers.hpp
+++ b/src/testing_helpers.hpp
@@ -1,8 +1,8 @@
#pragma once
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
#include
#include
@@ -69,8 +69,8 @@ void printArray(int n, int *a, bool abridged = false) {
printf("]\n");
}
-template <typename T>
-void printElapsedTime(T time, std::string note = "")
-{
- std::cout << " elapsed time: " << time << "ms " << note << std::endl;
+template <typename T>
+void printElapsedTime(T time, std::string note = "")
+{
+ std::cout << " elapsed time: " << time << "ms " << note << std::endl;
}
\ No newline at end of file
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..76a0f9d 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -9,9 +9,13 @@ set(SOURCE_FILES
"efficient.cu"
"thrust.h"
"thrust.cu"
+ "radix.h"
+ "radix.cu"
+ "shared_mem.h"
+ "shared_mem.cu"
)
cuda_add_library(stream_compaction
${SOURCE_FILES}
- OPTIONS -arch=sm_20
+ OPTIONS -arch=sm_50
)
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 8fc0211..c99513b 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,10 @@ namespace StreamCompaction {
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
- // TODO
+ int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+ if (index >= n) return;
+ if (idata[index] != 0) bools[index] = 1;
+ else bools[index] = 0;
}
/**
@@ -32,7 +35,9 @@ namespace StreamCompaction {
*/
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
- // TODO
+ int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+ if (index >= n) return;
+ if (bools[index] == 1) odata[indices[index]] = idata[index];
}
}
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 05ce667..dbf4b59 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,15 +1,15 @@
#include <cstdio>
#include "cpu.h"
-#include "common.h"
+#include "common.h"
namespace StreamCompaction {
namespace CPU {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
/**
@@ -19,7 +19,13 @@ namespace StreamCompaction {
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+
+ if (n < 1) { timer().endCpuTimer(); return; } // no data; stop the timer before bailing out
+ odata[0] = 0; // Exclusive scan
+ for (int i = 1; i < n; i++) {
+ odata[i] = idata[i-1] + odata[i-1];
+ }
+
timer().endCpuTimer();
}
@@ -30,9 +36,19 @@ namespace StreamCompaction {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+
+ if (n < 1) { timer().endCpuTimer(); return -1; } // no data; stop the timer before bailing out
+ int nElem = 0; // remaining elements after compact
+
+ for (int i = 0; i < n; i++) {
+ if (idata[i] != 0) {
+ odata[nElem] = idata[i];
+ nElem++;
+ }
+ }
+
timer().endCpuTimer();
- return -1;
+ return nElem;
}
/**
@@ -41,10 +57,44 @@ namespace StreamCompaction {
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
+
+ int *map_data = (int*)(malloc(n * sizeof(int)));
+ int *scan_data = (int*)(malloc(n * sizeof(int)));
+
timer().startCpuTimer();
- // TODO
+
+ //map
+ int i;
+ for (i = 0; i < n; i++) {
+ if (idata[i] != 0) {
+ map_data[i] = 1;
+ }
+ else map_data[i] = 0;
+ }
+
+ // scan
+ scan_data[0] = 0; // Exclusive scan
+ for (int i = 1; i < n; i++) {
+ scan_data[i] = map_data[i - 1] + scan_data[i - 1];
+ }
+
+ int r_val = -1; // last output index written; stays -1 if nothing is kept, so r_val++ yields 0
+
+ // scatter
+ for (i = 0; i < n; i++) {
+ if (map_data[i] == 1) {
+ r_val = scan_data[i];
+ odata[r_val] = idata[i];
+ }
+ }
+ r_val++;
+
timer().endCpuTimer();
- return -1;
+
+ free(map_data);
+ free(scan_data);
+
+ return r_val;
}
}
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 36c5ef2..297bb94 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,22 +3,101 @@
#include "common.h"
#include "efficient.h"
+#define blockSize 64
+
namespace StreamCompaction {
namespace Efficient {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
+ __global__ void kernScanDataUpSweep(int n, int offset1, int offset2, int* buff) {
+ int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+
+ int access = index * offset2 - 1;
+ if (access >= n || n < 1 || access < 0) return;
+
+ buff[access] += buff[access - offset1];
+ }
+
+
+ __global__ void kernScanDataDownSweep(int n, int offset1, int offset2, int* buff) {
+ int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+
+ int access = index * offset2 - 1;
+ if (access >= n || n < 1 || access < 0) return;
+
+ int temp = buff[access - offset1];
+ buff[access - offset1] = buff[access];
+ buff[access] += temp;
+ }
+
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
- // TODO
- timer().endGpuTimer();
+
+ int limit = ilog2ceil(n);
+ int size = pow(2, limit);
+
+
+ dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);
+
+ // allocate memory
+ int* dev_buf;
+ cudaMalloc((void**)&dev_buf, size * sizeof(int));
+ checkCUDAError("w-e scan malloc fail!");
+
+ // copy input data to device
+ cudaMemset(dev_buf + n, 0, (size - n) * sizeof(int));
+ cudaMemcpy(dev_buf, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAError("initializing w-e scan data buff fail!");
+
+ timer().startGpuTimer();
+
+ int d;
+ int offset1;
+ int offset2;
+
+ int threads;
+
+ // UpSweep
+ for (d = 1; d <= limit; d++) {
+ offset1 = pow(2, d - 1);
+ offset2 = pow(2, d);
+
+ threads = (size / offset2);
+ fullBlocksPerGrid.x = (threads / blockSize) + 1;
+
+ kernScanDataUpSweep<<<fullBlocksPerGrid, blockSize>>>(size, offset1, offset2, dev_buf);
+ checkCUDAError("upsweep fail!");
+ }
+
+ // DownSweep
+ cudaMemset(dev_buf + n - 1, 0, (size - n + 1)* sizeof(int));
+ for (d = limit; d >= 1; d--) {
+ offset1 = pow(2, d - 1);
+ offset2 = pow(2, d);
+
+ threads = (size / offset2);
+ fullBlocksPerGrid.x = (threads / blockSize) + 1;
+
+ kernScanDataDownSweep<<<fullBlocksPerGrid, blockSize>>>(size, offset1, offset2, dev_buf);
+ checkCUDAError("downsweep fail!");
+ }
+
+
+ timer().endGpuTimer();
+
+ // copy output data to host
+ cudaMemcpy(odata, dev_buf, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("copying output data fail!");
+
+ // cleanup
+ cudaFree(dev_buf);
}
/**
@@ -31,10 +110,86 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
+
+ int* dev_map; // bool mapping
+ int* dev_scan; // scanned data
+ int* dev_out; // compacted data to output
+ int* dev_in; // input data
+
+ int limit = ilog2ceil(n);
+ int size = pow(2, limit);
+
+ // allocate memory
+ cudaMalloc((void**)&dev_in, n * sizeof(int));
+ cudaMalloc((void**)&dev_map, n * sizeof(int));
+ cudaMalloc((void**)&dev_out, n * sizeof(int));
+ cudaMalloc((void**)&dev_scan, size * sizeof(int));
+ checkCUDAError("w-e compact malloc fail!");
+
+ cudaMemset(dev_scan + n, 0, (size - n) * sizeof(int)); // zero extra mem
+ cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); // copy input data
+ checkCUDAError("initializing w-e compact data buffs fail!");
+
+ dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
timer().startGpuTimer();
- // TODO
+ // map
+ StreamCompaction::Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, dev_map, dev_in);
+ cudaMemcpy(dev_scan, dev_map, n * sizeof(int), cudaMemcpyDeviceToDevice); // copy bool data to scan
+ checkCUDAError("w-e compact bool mapping fail!");
+
+ // scan
+
+ int d;
+ int offset1;
+ int offset2;
+
+ // UpSweep
+ for (d = 1; d <= limit; d++) {
+ offset1 = pow(2, d - 1);
+ offset2 = pow(2, d);
+ fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize;
+ kernScanDataUpSweep<<<fullBlocksPerGrid, blockSize>>>(size, offset1, offset2, dev_scan);
+ checkCUDAError("w-e compact upsweep fail!");
+ }
+
+ // DownSweep
+ cudaMemset(dev_scan + n - 1, 0, (size - n + 1) * sizeof(int));
+ for (d = limit; d >= 1; d--) {
+ offset1 = pow(2, d - 1);
+ offset2 = pow(2, d);
+ fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize;
+ kernScanDataDownSweep<<<fullBlocksPerGrid, blockSize>>>(size, offset1, offset2, dev_scan);
+ checkCUDAError("w-e compact downsweep fail!");
+ }
+
+ // scatter
+ fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize);
+ StreamCompaction::Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_out, dev_in, dev_map, dev_scan);
+ checkCUDAError("w-e compact scatter fail!");
+
timer().endGpuTimer();
- return -1;
+
+ // copy output to host
+ cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("w-e compact output copy fail!");
+
+ // calc # of elements for return
+ int map_val;
+ int r_val;
+ cudaMemcpy(&r_val, dev_scan + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ cudaMemcpy(&map_val, dev_map + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("w-e compact calc # elem fail!");
+
+ r_val += map_val;
+
+ // cleanup
+ cudaFree(dev_in);
+ cudaFree(dev_map);
+ cudaFree(dev_out);
+ cudaFree(dev_scan);
+
+ return r_val;
}
}
}
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..3578061 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -6,6 +6,10 @@ namespace StreamCompaction {
namespace Efficient {
StreamCompaction::Common::PerformanceTimer& timer();
+ __global__ void kernScanDataUpSweep(int n, int offset1, int offset2, int* buff);
+
+ __global__ void kernScanDataDownSweep(int n, int offset1, int offset2, int* buff);
+
void scan(int n, int *odata, const int *idata);
int compact(int n, int *odata, const int *idata);
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 9218f8e..09cb5ff 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,23 +3,75 @@
#include "common.h"
#include "naive.h"
+#define blockSize 256
+
namespace StreamCompaction {
namespace Naive {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
// TODO: __global__
+ __global__ void kernScanDataNaive(int n, int offset, int* out, const int *in) {
+
+ int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+ if (index >= n || n < 1) return; // >= guards the out-of-bounds write at index == n
+
+ if (index >= offset) {
+ out[index] = in[index] + in[index - offset];
+ }
+ else {
+ out[index] = in[index];
+ }
+ }
+
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+ dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+ // allocate memory
+ int* dev_out;
+ int* dev_in;
+ int* swap;
+
+ cudaMalloc((void**)&dev_out, n * sizeof(int));
+ cudaMalloc((void**)&dev_in, n * sizeof(int));
+ checkCUDAError("naive scan malloc fail!");
+
+ // copy input data to device
+ cudaMemset(dev_in, 0, sizeof(int));
+ cudaMemcpy(dev_in + 1, idata, (n - 1) * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAError("naive input copy fail!");
+
timer().startGpuTimer();
- // TODO
+
+ int d;
+ int offset;
+ for (d = 1; d <= ilog2ceil(n); d++) {
+ offset = pow(2, d - 1);
+ kernScanDataNaive<<<fullBlocksPerGrid, blockSize>>>(n, offset, dev_out, dev_in);
+ checkCUDAError("naive scan iteration fail!");
+ // swap buffers
+ swap = dev_in;
+ dev_in = dev_out;
+ dev_out = swap;
+
+ }
+
timer().endGpuTimer();
+
+ // copy output data to host
+ cudaMemcpy(odata, dev_in, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("naive copy output fail!");
+
+ // cleanup
+ cudaFree(dev_in);
+ cudaFree(dev_out);
}
}
}
diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu
new file mode 100644
index 0000000..c0efccf
--- /dev/null
+++ b/stream_compaction/radix.cu
@@ -0,0 +1,223 @@
+#include "radix.h"
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#define blockSize 128
+
+// macros for bit checks and toggles
+// define macro to get nth bit of int
+#define bitK(num, k) ((num >> k) & 1)
+// flip bit
+#define flipBit(bit) (bit ^ 1)
+
+namespace StreamCompaction {
+ namespace Radix {
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
+ }
+
+ __global__ void kernFindMax(int n, int offset1, int offset2, int* buff) {
+ int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+
+ int access = index * offset2 - 1;
+ if (access >= n || n < 1 || access < 0) return;
+
+ // modify in place
+ if (buff[access] < buff[access - offset1]) {
+ buff[access] = buff[access - offset1];
+ }
+
+ }
+
+ __global__ void kernBoolMaps(int n, int k, int* input, int* b_arr, int* f_arr) {
+ int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+ if (index >= n) return;
+
+ int bit = bitK(input[index], k);
+ int fBit = flipBit(bit);
+
+ b_arr[index] = bit; // maps bit k in b_arr
+ f_arr[index] = fBit; // copy flipped value here for scan
+ }
+
+ __global__ void kernComputeT(int n, int totFalse, int *t_arr, int *f_arr) {
+ int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+ if (index >= n) return;
+
+ t_arr[index] = index - f_arr[index] + totFalse;
+ }
+
+ __global__ void kernRadixScatter(int n, int *out, int *in, int *b_arr, int *f_arr, int *t_arr) {
+ int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+ if (index >= n) return;
+
+ int access = b_arr[index] ? t_arr[index] : f_arr[index];
+ out[access] = in[index];
+ }
+
+ /*
+ Performs Radix Sort on input data using Work-Efficient Scan
+ */
+
+ int max(int n, int *idata) {
+ int limit = ilog2ceil(n);
+ int size = pow(2, limit);
+
+ dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);
+
+ int d;
+ int offset1;
+ int offset2;
+
+ int max;
+ int *max_arr;
+ cudaMalloc((void**)&max_arr, size * sizeof(int));
+ cudaMemcpy(max_arr, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ cudaMemset(max_arr + n, 0, (size - n) * sizeof(int));
+
+ timer().startGpuTimer();
+
+ // find max of data
+ for (d = 1; d <= limit; d++) {
+ offset1 = pow(2, d - 1);
+ offset2 = pow(2, d);
+ fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize;
+ kernFindMax<<<fullBlocksPerGrid, blockSize>>>(size, offset1, offset2, max_arr);
+ checkCUDAError("Radix find max fail!");
+ }
+
+ timer().endGpuTimer();
+
+ cudaMemcpy(&max, max_arr + size - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+ cudaFree(max_arr);
+ return max;
+
+
+ }
+
+ void sort(int n, int *odata, const int *idata) {
+
+ int limit = ilog2ceil(n);
+ int size = pow(2, limit);
+
+ dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize);
+
+ int d;
+ int offset1;
+ int offset2;
+
+ int max;
+ int totFalse;
+ int temp;
+
+ // alloc. memory
+ int *b_arr;
+ //int *e_arr; // e_arr sorted in f_arr, do not need
+ int *f_arr;
+ int *t_arr;
+
+ int *dev_in;
+ int *dev_out;
+
+ int *swap;
+
+ cudaMalloc((void**)&b_arr, n * sizeof(int));
+ //cudaMalloc((void**)&e_arr, n * sizeof(int));
+ cudaMalloc((void**)&f_arr, size * sizeof(int)); // sized to power of 2 for scan
+ cudaMalloc((void**)&t_arr, n * sizeof(int));
+
+ cudaMalloc((void**)&dev_in, n * sizeof(int));
+ cudaMalloc((void**)&dev_out, n * sizeof(int));
+
+ // copy input
+ cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ cudaMemcpy(f_arr, dev_in, n * sizeof(int), cudaMemcpyDeviceToDevice);
+ cudaMemset(f_arr + n, 0, (size - n) * sizeof(int));
+ checkCUDAError("Radix mem init fail!");
+
+ timer().startGpuTimer();
+
+ // find max of data
+ for (d = 1; d <= limit; d++) {
+ offset1 = pow(2, d - 1);
+ offset2 = pow(2, d);
+ fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize;
+ kernFindMax<<<fullBlocksPerGrid, blockSize>>>(size, offset1, offset2, f_arr);
+ checkCUDAError("Radix find max fail!");
+ }
+
+ // copy max value
+ cudaMemcpy(&max, f_arr + size - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ // zero extra mem
+ cudaMemset(f_arr + n, 0, (size - n) * sizeof(int));
+
+ for (int k = 0; k < ilog2ceil(max); k++) {
+
+ // map arrays b & e
+ fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize);
+ kernBoolMaps<<<fullBlocksPerGrid, blockSize>>>(n, k, dev_in, b_arr, f_arr);
+ checkCUDAError("Radix bool maps fail!");
+
+ // get whether last number's bit is false
+ cudaMemcpy(&totFalse, f_arr + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+ // exclusive scan f_arr
+
+ // UpSweep
+ for (d = 1; d <= limit; d++) {
+ offset1 = pow(2, d - 1);
+ offset2 = pow(2, d);
+ fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize;
+ StreamCompaction::Efficient::kernScanDataUpSweep<<<fullBlocksPerGrid, blockSize>>>(size, offset1, offset2, f_arr);
+ checkCUDAError("Radix upsweep fail!");
+ }
+
+ // DownSweep
+ cudaMemset(f_arr + n - 1, 0, (size - n + 1) * sizeof(int));
+ for (d = limit; d >= 1; d--) {
+ offset1 = pow(2, d - 1);
+ offset2 = pow(2, d);
+ fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize;
+ StreamCompaction::Efficient::kernScanDataDownSweep<<<fullBlocksPerGrid, blockSize>>>(size, offset1, offset2, f_arr);
+ checkCUDAError("Radix downsweep fail!");
+ }
+
+ // total Falses
+ cudaMemcpy(&temp, f_arr + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ totFalse += temp;
+
+ // Compute t_arr
+ fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize);
+ kernComputeT<<<fullBlocksPerGrid, blockSize>>>(n, totFalse, t_arr, f_arr);
+ checkCUDAError("Radix t_arr compute fail!");
+
+ // scatter
+ kernRadixScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_out, dev_in, b_arr, f_arr, t_arr);
+ checkCUDAError("Radix scatter fail!");
+
+ swap = dev_out;
+ dev_out = dev_in;
+ dev_in = swap;
+ }
+
+ timer().endGpuTimer();
+
+ // copy output data to host
+ cudaMemcpy(odata, dev_in, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("Radix output copy fail!");
+
+ // cleanup
+ cudaFree(dev_in);
+ cudaFree(dev_out);
+ cudaFree(b_arr);
+ cudaFree(f_arr);
+ cudaFree(t_arr);
+ }
+
+ }
+
+}
\ No newline at end of file
diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h
new file mode 100644
index 0000000..19b46c6
--- /dev/null
+++ b/stream_compaction/radix.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "common.h"
+#include "efficient.h"
+
+namespace StreamCompaction {
+ namespace Radix {
+ StreamCompaction::Common::PerformanceTimer& timer();
+
+ void sort(int n, int *odata, const int *idata);
+
+ int max(int n, int* idata);
+ }
+}
diff --git a/stream_compaction/shared_mem.cu b/stream_compaction/shared_mem.cu
new file mode 100644
index 0000000..cb7bbd7
--- /dev/null
+++ b/stream_compaction/shared_mem.cu
@@ -0,0 +1,272 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "efficient.h"
+#include "shared_mem.h"
+
+#define blockSize 512
+
+// for reducing bank conflicts
+#define NUM_BANKS 32
+#define LOG_NUM_BANKS 5
+#define CONFLICT_FREE_OFFSET(n) \
+ ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS))
+
+namespace StreamCompaction {
+ namespace SharedMem {
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
+ }
+ __global__ void kernScanDataShared(int n, int* in, int* out, int* sums) {
+ // init shared mem for block, could improve latency
+ __shared__ int sBuf[blockSize];
+
+ int tx = threadIdx.x;
+ int index = (blockDim.x * blockIdx.x) + tx;
+
+ int off_tx = tx + CONFLICT_FREE_OFFSET(tx);
+
+ // copy used vals to shared mem
+ sBuf[off_tx] = (index < n) ? in[index] : 0;
+
+ __syncthreads(); // avoid mem issues
+
+ int offset; // step size
+ int access; // shared buffer access index
+ int a2;
+
+ // Upsweep
+ for (offset = 1; offset < blockSize; offset *=2) {
+ access = (2 * offset * (tx + 1)) - 1;
+ a2 = access - offset;
+ a2 += CONFLICT_FREE_OFFSET(a2);
+ access += CONFLICT_FREE_OFFSET(access);
+ if (access < blockSize) sBuf[access] += sBuf[a2];
+ __syncthreads(); // avoid mem issues
+ }
+
+ // prepare array for downsweep
+ if (tx == blockSize - 1 + CONFLICT_FREE_OFFSET(blockSize - 1)) {
+ sums[blockIdx.x] = sBuf[off_tx];
+ sBuf[off_tx] = 0;
+ }
+ __syncthreads();
+ if (index >= n - 1) sBuf[off_tx] = 0;
+ __syncthreads(); // avoid mem issues
+
+ // Downsweep (inclusive)
+ // do exclusive downsweep
+ int temp;
+
+ for (offset = blockSize; offset >= 1; offset /= 2) {
+ access = (2 * offset * (tx + 1)) - 1;
+ a2 = access - offset;
+ a2 += CONFLICT_FREE_OFFSET(a2);
+ access += CONFLICT_FREE_OFFSET(access);
+ if (access < blockSize) {
+ temp = sBuf[a2]; // store left child
+ sBuf[a2] = sBuf[access]; // swap
+ sBuf[access] += temp; // add
+ }
+ __syncthreads(); // avoid mem issues
+ }
+
+ // write to dev memory
+ if (index < n) {
+ out[index] = sBuf[off_tx];
+ }
+ }
+
+ __global__ void kernStitch(int n, int* in, int* sums) {
+ int bx = blockIdx.x;
+ int index = (blockDim.x * bx) + threadIdx.x;
+
+ if (bx == 0) return;
+ if (index >= n) return;
+ in[index] += sums[bx];
+ }
+
+ /**
+ * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+ */
+ void scan(int n, int *odata, const int *idata) {
+
+ int num_blocks = 1 + (n - 1)/ blockSize;
+ int limit = ilog2ceil(num_blocks);
+ int sum_size = pow(2, limit);
+
+ dim3 fullBlocksPerGrid(num_blocks);
+
+ int* dev_out; // data to output
+ int* dev_in; // input data
+ int* dev_sums; // sums, from first blockwise scan
+
+ cudaMalloc((void**)&dev_in, n * sizeof(int));
+ cudaMalloc((void**)&dev_out, n * sizeof(int));
+ cudaMalloc((void**)&dev_sums, sum_size * sizeof(int));
+
+ // copy input data to device
+ cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+ cudaMemset(dev_out, 0, n * sizeof(int));
+ checkCUDAError("initializing shared mem scan data buff fail!");
+
+ timer().startGpuTimer();
+
+ // scan blocks of data
+ kernScanDataShared<<<fullBlocksPerGrid, blockSize>>>(n, dev_in, dev_out, dev_sums);
+ checkCUDAError("shared mem scan fail!");
+
+ // scan block sums
+ int d;
+ int offset1;
+ int offset2;
+
+ // UpSweep
+ for (d = 1; d <= limit; d++) {
+ offset1 = pow(2, d - 1);
+ offset2 = pow(2, d);
+ fullBlocksPerGrid.x = ((sum_size / offset2) + blockSize) / blockSize;
+ StreamCompaction::Efficient::kernScanDataUpSweep<<<fullBlocksPerGrid, blockSize>>>(sum_size, offset1, offset2, dev_sums);
+ checkCUDAError("shared mem scan upsweep fail!");
+ }
+
+ // DownSweep
+ cudaMemset(dev_sums + num_blocks - 1, 0, (sum_size - num_blocks + 1) * sizeof(int));
+ for (d = limit; d >= 1; d--) {
+ offset1 = pow(2, d - 1);
+ offset2 = pow(2, d);
+ fullBlocksPerGrid.x = ((sum_size / offset2) + blockSize) / blockSize;
+ StreamCompaction::Efficient::kernScanDataDownSweep<<<fullBlocksPerGrid, blockSize>>>(sum_size, offset1, offset2, dev_sums);
+ checkCUDAError("shared mem scan downsweep fail!");
+ }
+
+ fullBlocksPerGrid.x = num_blocks;
+ kernStitch<<<fullBlocksPerGrid, blockSize>>>(n, dev_out, dev_sums);
+ checkCUDAError("shared mem scan stitch fail!");
+
+
+ timer().endGpuTimer();
+
+ // copy out data
+ cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("shared mem scan output copy fail!");
+
+ cudaFree(dev_out);
+ cudaFree(dev_in);
+ cudaFree(dev_sums);
+ }
+
+ /**
+ * Performs stream compaction on idata, storing the result into odata.
+ * All zeroes are discarded.
+ *
+ * @param n The number of elements in idata.
+ * @param odata The array into which to store elements.
+ * @param idata The array of elements to compact.
+ * @returns The number of elements remaining after compaction.
+ */
+ int compact(int n, int *odata, const int *idata) {
+
+ int* dev_map; // bool mapping
+ int* dev_scan; // scanned data
+ int* dev_out; // compacted data to output
+ int* dev_in; // input data
+
+ int* dev_sums;
+
+ int num_blocks = 1 + (n - 1) / blockSize;
+ int limit = ilog2ceil(num_blocks);
+ int sum_size = pow(2, limit);
+
+ dim3 fullBlocksPerGrid(num_blocks);
+
+ // allocate memory
+ cudaMalloc((void**)&dev_in, n * sizeof(int));
+ cudaMalloc((void**)&dev_map, n * sizeof(int));
+ cudaMalloc((void**)&dev_out, n * sizeof(int));
+ cudaMalloc((void**)&dev_scan, n * sizeof(int));
+
+ cudaMalloc((void**)&dev_sums, sum_size * sizeof(int));
+ checkCUDAError("shared mem compact malloc fail!");
+
+ cudaMemset(dev_scan, 0, n * sizeof(int));
+ checkCUDAError("initializing shared mem scan data buff fail!");
+
+ cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); // copy input data
+ checkCUDAError("initializing w-e compact data buffs fail!");
+
+ timer().startGpuTimer();
+ // map
+ fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize);
+ StreamCompaction::Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, dev_map, dev_in);
+ checkCUDAError("shared mem compact bool mapping fail!");
+
+ // scan the map
+ fullBlocksPerGrid.x = num_blocks;
+ kernScanDataShared<<<fullBlocksPerGrid, blockSize>>>(n, dev_map, dev_scan, dev_sums);
+ checkCUDAError("shared mem scan fail!");
+
+ int r_val;
+ cudaMemcpy(&r_val, dev_sums + num_blocks - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+ // scan sums from blocks
+ int d;
+ int offset1;
+ int offset2;
+
+ // UpSweep
+ for (d = 1; d <= limit; d++) {
+ offset1 = pow(2, d - 1);
+ offset2 = pow(2, d);
+ fullBlocksPerGrid.x = ((sum_size / offset2) + blockSize) / blockSize;
+ StreamCompaction::Efficient::kernScanDataUpSweep<<<fullBlocksPerGrid, blockSize>>>(sum_size, offset1, offset2, dev_sums);
+ checkCUDAError("shared mem compact upsweep fail!");
+ }
+
+ // DownSweep
+ cudaMemset(dev_sums + num_blocks - 1, 0, (sum_size - num_blocks + 1) * sizeof(int));
+ for (d = limit; d >= 1; d--) {
+ offset1 = pow(2, d - 1);
+ offset2 = pow(2, d);
+ fullBlocksPerGrid.x = ((sum_size / offset2) + blockSize) / blockSize;
+ StreamCompaction::Efficient::kernScanDataDownSweep<<<fullBlocksPerGrid, blockSize>>>(sum_size, offset1, offset2, dev_sums);
+ checkCUDAError("shared mem compact downsweep fail!");
+ }
+
+ fullBlocksPerGrid.x = num_blocks;
+ kernStitch<<<fullBlocksPerGrid, blockSize>>>(n, dev_scan, dev_sums);
+ checkCUDAError("shared mem scan stitch fail!");
+
+ // scatter
+ fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize);
+ StreamCompaction::Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_out, dev_in, dev_map, dev_scan);
+ checkCUDAError("shared mem compact scatter fail!");
+
+ timer().endGpuTimer();
+
+ // copy output to host
+ cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("shared mem compact output copy fail!");
+
+ // calc # of elements for return
+ int r_val2;
+ cudaMemcpy(&r_val2, dev_sums + num_blocks -1, sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("shared mem compact calc # elem fail!");
+
+ // cleanup
+ cudaFree(dev_in);
+ cudaFree(dev_map);
+ cudaFree(dev_out);
+ cudaFree(dev_scan);
+ cudaFree(dev_sums);
+
+ return r_val + r_val2;
+ }
+ }
+}
diff --git a/stream_compaction/shared_mem.h b/stream_compaction/shared_mem.h
new file mode 100644
index 0000000..9fee8a2
--- /dev/null
+++ b/stream_compaction/shared_mem.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+ namespace SharedMem {
+ StreamCompaction::Common::PerformanceTimer& timer();
+
+ __global__ void kernScanDataShared(int n, int* in, int* out, int* sums);
+
+ void scan(int n, int *odata, const int *idata);
+
+ int compact(int n, int *odata, const int *idata);
+ }
+}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 36b732d..998efcf 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -8,21 +8,30 @@
namespace StreamCompaction {
namespace Thrust {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+ thrust::host_vector<int> host_thrust_in(idata, idata + n);
+
+ thrust::device_vector<int> dev_thrust_in(n);
+ thrust::device_vector<int> dev_thrust_out(n);
+
+ thrust::copy(host_thrust_in.begin(), host_thrust_in.end(), dev_thrust_in.begin());
+
timer().startGpuTimer();
- // TODO use `thrust::exclusive_scan`
- // example: for device_vectors dv_in and dv_out:
- // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+ thrust::exclusive_scan(dev_thrust_in.begin(), dev_thrust_in.end(), dev_thrust_out.begin());
+
timer().endGpuTimer();
+
+ thrust::copy(dev_thrust_out.begin(), dev_thrust_out.end(), odata);
}
}
}