diff --git a/README.md b/README.md index 0e38ddb..9e533fc 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,211 @@ CUDA Stream Compaction -====================== +==================== -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +**University of Pennsylvania, CIS 565: GPU Programming and Architecture** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +**Anantha Srinivas** +[LinkedIn](https://www.linkedin.com/in/anantha-srinivas-00198958/), [Twitter](https://twitter.com/an2tha) -### (TODO: Your README) +**Tested on:** +* Windows 10, i7-8700 @ 3.20GHz 16GB, GTX 1080 8097MB (Personal) +* Built for Visual Studio 2017 using the v140 toolkit -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +--- +Introduction +--- + +Stream Compaction is a technique used to remove specific unwanted elements from a continous array of elements. + +![Figure 1: Stream Compaction](img/compaction.jpg) + +Given an input of elements and a list of removable items as shown in the above figure [1], we need to generate a new array of output that contains only the desired elements. + +Stream compaction has many uses included optimizing ray collections in Path Tracers or compressing sparse matrices. Running Stream compaction on a CPU for a large number of elements ( > 2^8) might not be efficient. We will try to parallelize this algorithm to achieve a better peformance. + +Implementation +--- +Stream Compaction on the GPU can roughly be broken down into three steps. + +1. `Initialization` +2. `Scan` + 1. `UpSweep` + 2. `DownSweep` +3. `Scatter` + +An example of the steps that occur in this algorithm. +![Figure 2: Stream compaction example](img/example_sc.jpg) + +Performance Analysis +--- + +Test Conditions: +* Windows 10, i7-8700 @ 3.20GHz 16GB, GTX 1080 8097MB (Personal) +* Running in Release mode. +* NVIDIA Vertical sync is turned off. + +Overall I found that the performance of the various techniques and hardware to be in this order: + +`Thrust < Work-efficient GPU scan < Naive GPU scan < CPU scan` + +**However** for some test cases I found that my implementation of Work efficient GPU scan was performing way better than Thrust's implementation. The following is a table describing the timings run with the mentioned test conditions (and block size of 128). + +| | 2^8 | 2^12 | 2^16 | 2^20 | 2^24 | +| --- | --- | --- | --- | --- | --- | +|CPU Power of 2 | 0.000642ms| 0.005453ms| 0.090786ms| 3.98564ms| 62.7343ms| +|CPU Non-Power of 2 | 0.00032ms| 0.005775ms| 0.088862ms| 1.41794ms| 22.8898ms| +|Naive GPU Power of 2 | 0.086016ms| 0.074752ms| 0.118784ms| 0.867328ms| 16.2294ms| +|Naive GPU Non-Power of 2 | 0.09216ms| 0.0768ms| 0.124928ms| 0.89088ms| 15.8597ms| +|WE Scan Power of 2 | 0.057344ms | 0.08704ms| 0.18944ms| 1.60768ms| 25.089ms| +|WE Scan Non-Power of 2 | 0.0512ms| 0.082944ms| 0.139264m| 0.842752ms| 12.2982ms| +|Thrust Power of 2 | 0.057344ms| 0.164864ms | 0.169984ms| 0.234496ms| 0.763904ms| +|Thrust Non-Power of 2 | 0.060416ms| 0.305152ms| 0.15872ms| 0.224256ms | 0.792576m| + + + + +![Figure 3: Performance graph (block 128)](img/scan_b128.png) +![Figure 4: Performance graph (block 128)](img/stream_b128.png) + +Further timing data can be found under: `./results` + +Further questions +--- + +I found that the thrust implementation was slower than mine for a block size of `128` and an array size of `2^8`. I think that this may be due to the fact that Thrust performs some sort of caching and optimizing for large array size. For a small array size this optimization may actually be an overhead. + + + +Here is some results from the test cases. +Further data can be found under `./results` + + +For Block Size of 128 +``` +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 256 + [ 35 36 45 46 43 2 36 26 41 5 1 22 33 ... 47 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.000642ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 0.000321ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.06656ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.06656ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.058368ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.052224ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.058368ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.08704ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.055296ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 1 1 1 2 2 3 2 3 1 0 2 2 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.000642ms (std::chrono Measured) + [ 1 1 1 2 2 3 2 3 1 2 2 3 2 ... 3 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.000963ms (std::chrono Measured) + [ 1 1 1 2 2 3 2 3 1 2 2 3 2 ... 2 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.001283ms (std::chrono Measured) + [ 1 1 1 2 2 3 2 3 1 2 2 3 2 ... 3 3 ] + passed +==== work-efficient compact, power-of-two ==== + 0 1 1 1 2 2 3 2 3 1 0 2 2 ... 3 0 ] + elapsed time: 0.191488ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 0 1 1 1 2 2 3 2 3 1 0 2 2 ... 2 3 ] + elapsed time: 0.14848ms (CUDA Measured) + passed +``` + + +For Block Size of 512 +``` +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 16777216 + [ 37 9 40 17 14 33 23 4 25 41 26 43 37 ... 14 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 59.1057ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 22.8725ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 16.3615ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 15.8372ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 28.3566ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 12.8952ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.88576ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.794624ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.828416ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 2 3 1 3 3 2 1 3 1 0 3 2 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 34.9035ms (std::chrono Measured) + [ 2 2 3 1 3 3 2 1 3 1 3 2 1 ... 2 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 34.8583ms (std::chrono Measured) + [ 2 2 3 1 3 3 2 1 3 1 3 2 1 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 134.979ms (std::chrono Measured) + [ 2 2 3 1 3 3 2 1 3 1 3 2 1 ... 2 1 ] + passed +==== work-efficient compact, power-of-two ==== + 2 2 3 1 3 3 2 1 3 1 0 3 2 ... 1 0 ] + elapsed time: 59.0572ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 2 2 3 1 3 3 2 1 3 1 0 3 2 ... 0 2 ] + elapsed time: 45.6591ms (CUDA Measured) + passed +``` + + + + + +References +--- + +[1] - [GPU Gems Chapter 39](https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html) diff --git a/img/New Microsoft PowerPoint Presentation.pptx b/img/New Microsoft PowerPoint Presentation.pptx new file mode 100644 index 0000000..c716660 Binary files /dev/null and b/img/New Microsoft PowerPoint Presentation.pptx differ diff --git a/img/compaction.jpg b/img/compaction.jpg new file mode 100644 index 0000000..726a7df Binary files /dev/null and b/img/compaction.jpg differ diff --git a/img/example.png b/img/example.png new file mode 100644 index 0000000..19a8007 Binary files /dev/null and b/img/example.png differ diff --git a/img/example_sc.jpg b/img/example_sc.jpg new file mode 100644 index 0000000..0225e47 Binary files /dev/null and b/img/example_sc.jpg differ diff --git a/img/scan_b128.png b/img/scan_b128.png new file mode 100644 index 0000000..cb6be37 Binary files /dev/null and b/img/scan_b128.png differ diff --git a/img/stream_b128.png b/img/stream_b128.png new file mode 100644 index 0000000..88cdbec Binary files /dev/null and b/img/stream_b128.png differ diff --git a/img/wtf.PNG b/img/wtf.PNG new file mode 100644 index 0000000..8fb9c2a Binary files /dev/null and b/img/wtf.PNG differ diff --git a/results/GraphMaker.xlsx b/results/GraphMaker.xlsx new file mode 100644 index 0000000..0842954 Binary files /dev/null and b/results/GraphMaker.xlsx differ diff --git a/results/timings_block_128_array_2_12.txt b/results/timings_block_128_array_2_12.txt new file mode 100644 index 0000000..2d4288b --- /dev/null +++ b/results/timings_block_128_array_2_12.txt @@ -0,0 +1,56 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 4096 + [ 11 49 31 25 36 47 33 23 5 22 11 46 13 ... 37 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.005453ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 0.005775ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.074752ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.0768ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.08704ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.082944ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.226304ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.164864ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.305152ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 3 3 3 0 1 3 3 1 2 3 0 1 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.008662ms (std::chrono Measured) + [ 1 3 3 3 1 3 3 1 2 3 1 3 2 ... 3 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.009624ms (std::chrono Measured) + [ 1 3 3 3 1 3 3 1 2 3 1 3 2 ... 1 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.020532ms (std::chrono Measured) + [ 1 3 3 3 1 3 3 1 2 3 1 3 2 ... 3 3 ] + passed +==== work-efficient compact, power-of-two ==== + 1 3 3 3 0 1 3 3 1 2 3 0 1 ... 3 0 ] + elapsed time: 0.186368ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 1 3 3 3 0 1 3 3 1 2 3 0 1 ... 0 2 ] + elapsed time: 0.1792ms (CUDA Measured) + passed \ No newline at end of file diff --git a/results/timings_block_128_array_2_16.txt b/results/timings_block_128_array_2_16.txt new file mode 100644 index 0000000..5e063c3 --- /dev/null +++ b/results/timings_block_128_array_2_16.txt @@ -0,0 +1,56 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 65536 + [ 30 47 10 32 37 18 7 42 19 0 28 34 44 ... 28 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.090786ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 0.088862ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.118784ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.124928ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.18944ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.139264ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.167936ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.169984ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.15872ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 0 2 1 0 1 2 1 0 0 2 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.143078ms (std::chrono Measured) + [ 2 3 2 1 1 2 1 2 2 1 2 2 1 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.179649ms (std::chrono Measured) + [ 2 3 2 1 1 2 1 2 2 1 2 2 1 ... 2 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.38432ms (std::chrono Measured) + [ 2 3 2 1 1 2 1 2 2 1 2 2 1 ... 1 1 ] + passed +==== work-efficient compact, power-of-two ==== + 2 3 0 2 1 0 1 2 1 0 0 2 2 ... 0 0 ] + elapsed time: 0.4352ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 2 3 0 2 1 0 1 2 1 0 0 2 2 ... 2 1 ] + elapsed time: 0.393216ms (CUDA Measured) + passed \ No newline at end of file diff --git a/results/timings_block_128_array_2_20.txt b/results/timings_block_128_array_2_20.txt new file mode 100644 index 0000000..e96d185 --- /dev/null +++ b/results/timings_block_128_array_2_20.txt @@ -0,0 +1,56 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 1048576 + [ 19 13 6 30 41 42 22 0 1 11 49 39 47 ... 6 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 3.98564ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 1.41794ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.867328ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.89088ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 1.60768ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.842752ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.200704ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.234496ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.224256ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 3 2 2 3 2 0 0 1 1 1 1 1 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 2.41114ms (std::chrono Measured) + [ 3 3 2 2 3 2 1 1 1 1 1 3 3 ... 3 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 2.57571ms (std::chrono Measured) + [ 3 3 2 2 3 2 1 1 1 1 1 3 3 ... 3 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 8.84481ms (std::chrono Measured) + [ 3 3 2 2 3 2 1 1 1 1 1 3 3 ... 3 2 ] + passed +==== work-efficient compact, power-of-two ==== + 3 3 2 2 3 2 0 0 1 1 1 1 1 ... 0 0 ] + elapsed time: 4.02432ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 3 3 2 2 3 2 0 0 1 1 1 1 1 ... 3 2 ] + elapsed time: 3.2215ms (CUDA Measured) + passed \ No newline at end of file diff --git a/results/timings_block_128_array_2_24.txt b/results/timings_block_128_array_2_24.txt new file mode 100644 index 0000000..debf8e0 --- /dev/null +++ b/results/timings_block_128_array_2_24.txt @@ -0,0 +1,56 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 16777216 + [ 4 27 22 35 26 33 9 45 19 1 3 34 34 ... 43 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 62.7343ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 22.8898ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 16.2294ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 15.8597ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 25.089ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 12.2982ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.811008ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.763904ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.792576ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 2 2 0 0 0 0 0 2 3 3 2 3 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 34.8778ms (std::chrono Measured) + [ 2 2 2 3 3 2 3 3 2 2 2 1 3 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 34.6655ms (std::chrono Measured) + [ 2 2 2 3 3 2 3 3 2 2 2 1 3 ... 3 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 131.577ms (std::chrono Measured) + [ 2 2 2 3 3 2 3 3 2 2 2 1 3 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== + 0 2 2 0 0 0 0 0 2 3 3 2 3 ... 2 0 ] + elapsed time: 58.5677ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 0 2 2 0 0 0 0 0 2 3 3 2 3 ... 3 2 ] + elapsed time: 45.0959ms (CUDA Measured) + passed \ No newline at end of file diff --git a/results/timings_block_128_array_2_8.txt b/results/timings_block_128_array_2_8.txt new file mode 100644 index 0000000..c1faa77 --- /dev/null +++ b/results/timings_block_128_array_2_8.txt @@ -0,0 +1,56 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 256 + [ 0 2 18 19 23 34 30 35 48 13 48 36 42 ... 26 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.000642ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 0.00032ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.086016ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.09216ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.057344ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.0512ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.058368ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.057344ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.060416ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 2 2 3 1 0 0 1 2 3 2 2 0 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.000641ms (std::chrono Measured) + [ 2 2 3 1 1 2 3 2 2 3 2 3 3 ... 3 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.000962ms (std::chrono Measured) + [ 2 2 3 1 1 2 3 2 2 3 2 3 3 ... 2 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.001283ms (std::chrono Measured) + [ 2 2 3 1 1 2 3 2 2 3 2 3 3 ... 3 2 ] + passed +==== work-efficient compact, power-of-two ==== + 0 2 2 3 1 0 0 1 2 3 2 2 0 ... 2 0 ] + elapsed time: 0.205824ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 0 2 2 3 1 0 0 1 2 3 2 2 0 ... 2 1 ] + elapsed time: 0.16384ms (CUDA Measured) + passed \ No newline at end of file diff --git a/results/timings_block_256_array_2_12.txt b/results/timings_block_256_array_2_12.txt new file mode 100644 index 0000000..5aaef2b --- /dev/null +++ b/results/timings_block_256_array_2_12.txt @@ -0,0 +1,56 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 4096 + [ 20 33 11 0 28 43 6 20 9 45 6 17 20 ... 12 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.005774ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 0.006095ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.075776ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.0768ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.088064ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.074752ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.159744ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.151552ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.197632ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 3 1 3 1 1 2 1 0 3 2 3 1 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.008982ms (std::chrono Measured) + [ 1 3 1 3 1 1 2 1 3 2 3 1 2 ... 2 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.009303ms (std::chrono Measured) + [ 1 3 1 3 1 1 2 1 3 2 3 1 2 ... 2 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.021815ms (std::chrono Measured) + [ 1 3 1 3 1 1 2 1 3 2 3 1 2 ... 2 1 ] + passed +==== work-efficient compact, power-of-two ==== + 1 3 1 3 1 1 2 1 0 3 2 3 1 ... 0 0 ] + elapsed time: 0.242688ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 1 3 1 3 1 1 2 1 0 3 2 3 1 ... 2 1 ] + elapsed time: 0.186368ms (CUDA Measured) + passed \ No newline at end of file diff --git a/results/timings_block_256_array_2_16.txt b/results/timings_block_256_array_2_16.txt new file mode 100644 index 0000000..7abc546 --- /dev/null +++ b/results/timings_block_256_array_2_16.txt @@ -0,0 +1,56 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 65536 + [ 29 46 36 37 32 20 12 47 5 34 44 10 4 ... 31 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0879ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0879ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.105472ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.103424ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.187392ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.13824ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.19968ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.13824ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.159744ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 2 0 3 0 0 2 1 1 2 0 0 2 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.137944ms (std::chrono Measured) + [ 1 2 3 2 1 1 2 2 3 1 2 3 2 ... 2 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.135378ms (std::chrono Measured) + [ 1 2 3 2 1 1 2 2 3 1 2 3 2 ... 1 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.364109ms (std::chrono Measured) + [ 1 2 3 2 1 1 2 2 3 1 2 3 2 ... 2 3 ] + passed +==== work-efficient compact, power-of-two ==== + 1 2 0 3 0 0 2 1 1 2 0 0 2 ... 3 0 ] + elapsed time: 0.503808ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 1 2 0 3 0 0 2 1 1 2 0 0 2 ... 0 2 ] + elapsed time: 0.311296ms (CUDA Measured) + passed \ No newline at end of file diff --git a/results/timings_block_256_array_2_20.txt b/results/timings_block_256_array_2_20.txt new file mode 100644 index 0000000..db4b4ef --- /dev/null +++ b/results/timings_block_256_array_2_20.txt @@ -0,0 +1,56 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 1048576 + [ 41 8 26 6 1 42 12 15 46 43 33 47 8 ... 47 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 3.92821ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 1.44296ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.896ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.96256ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 1.60051ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.841728ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.24576ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.252928ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.287744ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 2 0 2 0 2 3 0 1 1 1 1 3 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 2.20968ms (std::chrono Measured) + [ 2 2 2 2 3 1 1 1 1 3 2 3 3 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 2.17054ms (std::chrono Measured) + [ 2 2 2 2 3 1 1 1 1 3 2 3 3 ... 2 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 8.86182ms (std::chrono Measured) + [ 2 2 2 2 3 1 1 1 1 3 2 3 3 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== + 2 2 0 2 0 2 3 0 1 1 1 1 3 ... 2 0 ] + elapsed time: 3.87482ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 2 2 0 2 0 2 3 0 1 1 1 1 3 ... 2 3 ] + elapsed time: 3.11501ms (CUDA Measured) + passed \ No newline at end of file diff --git a/results/timings_block_256_array_2_24.txt b/results/timings_block_256_array_2_24.txt new file mode 100644 index 0000000..8ffce76 --- /dev/null +++ b/results/timings_block_256_array_2_24.txt @@ -0,0 +1,56 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 16777216 + [ 40 44 44 8 30 7 13 24 43 26 15 8 11 ... 32 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 62.6503ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 23.3402ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 15.9805ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 15.9724ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 26.947ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 12.3228ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.791552ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.825344ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.846848ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 0 2 3 2 3 2 3 3 0 3 2 2 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 35.0713ms (std::chrono Measured) + [ 3 2 3 2 3 2 3 3 3 2 2 1 2 ... 1 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 35.1117ms (std::chrono Measured) + [ 3 2 3 2 3 2 3 3 3 2 2 1 2 ... 2 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 128.475ms (std::chrono Measured) + [ 3 2 3 2 3 2 3 3 3 2 2 1 2 ... 1 2 ] + passed +==== work-efficient compact, power-of-two ==== + 3 0 2 3 2 3 2 3 3 0 3 2 2 ... 2 0 ] + elapsed time: 59.2128ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 3 0 2 3 2 3 2 3 3 0 3 2 2 ... 2 3 ] + elapsed time: 45.1881ms (CUDA Measured) + passed \ No newline at end of file diff --git a/results/timings_block_256_array_2_8.txt b/results/timings_block_256_array_2_8.txt new file mode 100644 index 0000000..74f454b --- /dev/null +++ b/results/timings_block_256_array_2_8.txt @@ -0,0 +1,56 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 256 + [ 35 36 45 46 43 2 36 26 41 5 1 22 33 ... 47 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.000642ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 0.000321ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.06656ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.06656ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.058368ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.052224ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.058368ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.08704ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.055296ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 1 1 1 2 2 3 2 3 1 0 2 2 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.000642ms (std::chrono Measured) + [ 1 1 1 2 2 3 2 3 1 2 2 3 2 ... 3 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.000963ms (std::chrono Measured) + [ 1 1 1 2 2 3 2 3 1 2 2 3 2 ... 2 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.001283ms (std::chrono Measured) + [ 1 1 1 2 2 3 2 3 1 2 2 3 2 ... 3 3 ] + passed +==== work-efficient compact, power-of-two ==== + 0 1 1 1 2 2 3 2 3 1 0 2 2 ... 3 0 ] + elapsed time: 0.191488ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 0 1 1 1 2 2 3 2 3 1 0 2 2 ... 2 3 ] + elapsed time: 0.14848ms (CUDA Measured) + passed \ No newline at end of file diff --git a/results/timings_block_512_array_2_12.txt b/results/timings_block_512_array_2_12.txt new file mode 100644 index 0000000..b4df031 --- /dev/null +++ b/results/timings_block_512_array_2_12.txt @@ -0,0 +1,56 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 4096 + [ 0 41 29 16 40 14 37 24 18 33 2 31 47 ... 47 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.005454ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 0.005454ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.083968ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.082944ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.090112ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.082944ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.197632ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.169984ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.164864ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 1 1 2 2 2 1 2 2 3 2 1 1 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.008982ms (std::chrono Measured) + [ 2 1 1 2 2 2 1 2 2 3 2 1 1 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.008983ms (std::chrono Measured) + [ 2 1 1 2 2 2 1 2 2 3 2 1 1 ... 1 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.020531ms (std::chrono Measured) + [ 2 1 1 2 2 2 1 2 2 3 2 1 1 ... 1 1 ] + passed +==== work-efficient compact, power-of-two ==== + 2 1 1 2 2 2 1 2 2 3 2 1 1 ... 1 0 ] + elapsed time: 0.186368ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 2 1 1 2 2 2 1 2 2 3 2 1 1 ... 1 2 ] + elapsed time: 0.182272ms (CUDA Measured) + passed \ No newline at end of file diff --git a/results/timings_block_512_array_2_16.txt b/results/timings_block_512_array_2_16.txt new file mode 100644 index 0000000..f812152 --- /dev/null +++ b/results/timings_block_512_array_2_16.txt @@ -0,0 +1,56 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 65536 + [ 3 8 21 10 36 29 48 44 23 1 12 23 7 ... 35 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.088862ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 0.089183ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.265216ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.130048ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.188416ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.13824ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.16896ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.1792ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.171008ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 0 1 0 3 0 0 1 3 1 2 1 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.14404ms (std::chrono Measured) + [ 1 3 1 3 1 2 1 2 2 3 2 1 1 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.143398ms (std::chrono Measured) + [ 1 3 1 3 1 2 1 2 2 3 2 1 1 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.445593ms (std::chrono Measured) + [ 1 3 1 3 1 2 1 2 2 3 2 1 1 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== + 0 0 1 0 3 0 0 1 3 1 2 1 2 ... 0 0 ] + elapsed time: 0.37888ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 0 0 1 0 3 0 0 1 3 1 2 1 2 ... 2 2 ] + elapsed time: 0.331776ms (CUDA Measured) + passed \ No newline at end of file diff --git a/results/timings_block_512_array_2_20.txt b/results/timings_block_512_array_2_20.txt new file mode 100644 index 0000000..46e855d --- /dev/null +++ b/results/timings_block_512_array_2_20.txt @@ -0,0 +1,56 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 1048576 + [ 5 42 36 7 29 21 29 44 15 36 45 28 31 ... 49 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 4.12807ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 1.42468ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.919552ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.899072ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 1.6937ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.88064ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.251904ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.374784ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.226304ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 2 0 1 1 1 3 0 1 0 3 2 1 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 2.19941ms (std::chrono Measured) + [ 1 2 1 1 1 3 1 3 2 1 1 2 2 ... 1 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 2.30111ms (std::chrono Measured) + [ 1 2 1 1 1 3 1 3 2 1 1 2 2 ... 3 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 8.74184ms (std::chrono Measured) + [ 1 2 1 1 1 3 1 3 2 1 1 2 2 ... 1 3 ] + passed +==== work-efficient compact, power-of-two ==== + 1 2 0 1 1 1 3 0 1 0 3 2 1 ... 3 0 ] + elapsed time: 3.97312ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 1 2 0 1 1 1 3 0 1 0 3 2 1 ... 1 0 ] + elapsed time: 3.46726ms (CUDA Measured) + passed \ No newline at end of file diff --git a/results/timings_block_512_array_2_24.txt b/results/timings_block_512_array_2_24.txt new file mode 100644 index 0000000..a003373 --- /dev/null +++ b/results/timings_block_512_array_2_24.txt @@ -0,0 +1,56 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 16777216 + [ 37 9 40 17 14 33 23 4 25 41 26 43 37 ... 14 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 59.1057ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 22.8725ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 16.3615ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 15.8372ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 28.3566ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 12.8952ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.88576ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.794624ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.828416ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 2 3 1 3 3 2 1 3 1 0 3 2 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 34.9035ms (std::chrono Measured) + [ 2 2 3 1 3 3 2 1 3 1 3 2 1 ... 2 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 34.8583ms (std::chrono Measured) + [ 2 2 3 1 3 3 2 1 3 1 3 2 1 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 134.979ms (std::chrono Measured) + [ 2 2 3 1 3 3 2 1 3 1 3 2 1 ... 2 1 ] + passed +==== work-efficient compact, power-of-two ==== + 2 2 3 1 3 3 2 1 3 1 0 3 2 ... 1 0 ] + elapsed time: 59.0572ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 2 2 3 1 3 3 2 1 3 1 0 3 2 ... 0 2 ] + elapsed time: 45.6591ms (CUDA Measured) + passed \ No newline at end of file diff --git a/results/timings_block_512_array_2_8.txt b/results/timings_block_512_array_2_8.txt new file mode 100644 index 0000000..cb13962 --- /dev/null +++ b/results/timings_block_512_array_2_8.txt @@ -0,0 +1,57 @@ +**************** +** SCAN TESTS ** +**************** +ARRAY SIZE : 256 + [ 47 43 42 18 28 11 22 40 38 14 26 24 12 ... 34 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.000321ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 0.000642ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 0.0768ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.0768ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.062464ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.05632ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.090112ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.086016ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.06144ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 1 2 0 0 3 0 0 2 2 0 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.000641ms (std::chrono Measured) + [ 1 1 2 3 2 2 2 1 1 2 3 1 2 ... 1 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.000642ms (std::chrono Measured) + [ 1 1 2 3 2 2 2 1 1 2 3 1 2 ... 3 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.001283ms (std::chrono Measured) + [ 1 1 2 3 2 2 2 1 1 2 3 1 2 ... 1 2 ] + passed +==== work-efficient compact, power-of-two ==== + 1 1 2 0 0 3 0 0 2 2 0 0 2 ... 0 0 ] + elapsed time: 0.349184ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + 1 1 2 0 0 3 0 0 2 2 0 0 2 ... 0 1 ] + elapsed time: 0.156672ms (CUDA Measured) + a[0] = 1, b[0] = 0 + FAIL VALUE \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 1850161..2787185 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -12,21 +12,36 @@ #include #include #include "testing_helpers.hpp" +#include -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; - -int main(int argc, char* argv[]) { +int main(int argc, char* argv[]) +{ // Scan tests + int numArguments = argc; + + const unsigned long SIZE = [&] + { + if(numArguments > 1) + { + std::string arg = argv[1]; std::size_t pos; + return (1 << std::stoi(arg, &pos)); + } + return (1 << 8); + }(); + + //const int SIZE = 1 << 8; // feel free to change the size of array + const unsigned long NPOT = SIZE - 3; // Non-Power-Of-Two + int *a = new int[SIZE]; + int *b = new int[SIZE]; + int *c = new int[SIZE]; printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); printf("****************\n"); + printf("ARRAY SIZE : %lu\n", SIZE); + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); @@ -38,13 +53,13 @@ int main(int argc, char* argv[]) { printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); + //printArray(SIZE, b, true); zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); + //printArray(NPOT, b, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -88,6 +103,13 @@ int main(int argc, char* argv[]) { //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); + zeroArray(SIZE, c); + printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 46337ab..4431984 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -51,8 +51,8 @@ void onesArray(int n, int *a) { void genArray(int n, int *a, int maxval) { srand(time(nullptr)); - - for (int i = 0; i < n; i++) { + + for (int i = 0; i < n; i++) { a[i] = rand() % maxval; } } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..48e2f35 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_60 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..fa95ca7 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -22,8 +22,10 @@ namespace StreamCompaction { * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. */ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + __global__ void kernMapToBoolean(int n, int *bools, const int *idata) + { + const int index = (blockIdx.x * blockDim.x) + threadIdx.x; + bools[index] = idata[index] == 0 ? 0 : 1; } /** @@ -31,8 +33,15 @@ namespace StreamCompaction { * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. */ __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO + const int *idata, const int *bools, const int *indices) + { + const int index = (blockIdx.x * blockDim.x) + threadIdx.x; + int outIndex; + if (bools[index] != 0) + { + outIndex = indices[index]; + odata[outIndex] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..92b4665 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,17 +1,30 @@ #include #include "cpu.h" -#include "common.h" +#include "common.h" namespace StreamCompaction { namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } + void printArray(int n, int *a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... "); + } + printf("%3d ", a[i]); + } + printf("]\n"); + } + /** * CPU scan (prefix sum). * For performance analysis, this is supposed to be a simple for loop. @@ -19,7 +32,14 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + odata[0] = 0; + + for(int i = 1; i < n; ++i) + { + odata[i] = odata[i - 1] + idata[i - 1]; + } + timer().endCpuTimer(); } @@ -30,9 +50,18 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int outIndex = 0; + for (int i = 0; i < n; ++i) + { + if (idata[i] != 0) + { + odata[outIndex++] = idata[i]; + } + } + timer().endCpuTimer(); - return -1; + return (outIndex); } /** @@ -41,10 +70,44 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + + // Keeping allocations outside of timer + auto *tempValidator = new int[n]; + auto *scanArray = new int[n]; + timer().startCpuTimer(); - // TODO + + // 1. Compute temporary array + for (int i = 0; i < n; ++i) + { + tempValidator[i] = (idata[i] != 0 ? 1 : 0); + } + + // 2. Perform exclusive scan + scanArray[0] = 0; + for (int i = 1; i < n; ++i) + { + scanArray[i] = scanArray[i - 1] + tempValidator[i - 1]; + } + + + // 3. Scatter + int outIndex = 0; + for(int i = 0; i < n; ++i) + { + if(tempValidator[i] != 0) + { + outIndex = scanArray[i]; + odata[outIndex] = idata[i]; + } + } + timer().endCpuTimer(); - return -1; + + delete[] tempValidator; + delete[] scanArray; + + return (outIndex + 1); } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..c268ae9 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -5,20 +5,163 @@ namespace StreamCompaction { namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } + // Global variables + int* device_iData; + int* device_oData; + int* device_bools; + int* device_sortedbools; + +#define blockSize 512 + + void printArray(int n, const int *a, bool abridged = false) { + + if(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... "); + } + printf("%3d ", a[i]); + } + printf("]\n"); + } + + + void PrintCUDAArray(int n, const int* cudaArrayPtr) + { + cudaDeviceSynchronize(); + + int* temp = new int[n]; + cudaMemcpy(temp, cudaArrayPtr, n * sizeof(n), cudaMemcpyDeviceToHost); + + printArray(n, temp, true); + + delete[] temp; + } + + + + /** + * Kernel to perform a Work efficient scan on a integer array + */ + __global__ void kernUpSweep(int n, int two_d, int* outputData) + { + const int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + const int two_d_1 = two_d * 2; + + if(index % two_d_1 != 0) + { + return; + } + + const int oldIndex = index + two_d - 1; + const int newIndex = index + two_d_1 - 1; + + const int currData = outputData[newIndex]; + + outputData[newIndex] = newIndex != (n - 1) ? currData + outputData[oldIndex] : 0; + } + + /** + * Kernel to perform a Work efficient scan on a integer array + */ + __global__ void kernDownSweep(int n, int two_d, int* outputData) + { + const int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + const int two_d_1 = two_d * 2; + + if (index % two_d_1 != 0) + { + return; + } + + const int oldIndex = index + two_d - 1; + const int newIndex = index + two_d_1 - 1; + + const int dataAtNewIndex = outputData[newIndex]; + + const int t = outputData[oldIndex]; + outputData[oldIndex] = dataAtNewIndex; + outputData[newIndex] = t + outputData[newIndex]; + } + + /** + * Shifts the whole array to the right by one in parallel + */ + __global__ void kernMakeExclusive(int n, int* outputData, int* inputData) + { + const int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + outputData[index] = index != 0 ? inputData[index - 1] : 0; + } + + + inline int RoundToPower2(int n) + { + int start = 2; + + while(start <= n) + { + start *= 2; + } + return start; + } + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + void scan(int n, int *odata, const int *idata) + { + const int numElements = RoundToPower2(n); + const int numTotalBytes = numElements * sizeof(int); + const int numActualBytes = n * sizeof(int); + + // 1. Allocate the memory in device + cudaMalloc(reinterpret_cast(&device_oData), numTotalBytes); + cudaMemcpy(device_oData, idata, numActualBytes, cudaMemcpyHostToDevice); + + cudaDeviceSynchronize(); + + timer().startGpuTimer(); + + // 2. Compute Block count + dim3 num_blocks((numElements + blockSize - 1) / blockSize); + + // 3. Call the kernel + const int log_n = ilog2ceil(numElements); + // 3a. UpSweep + int power_2 = 1; + for(int d = 0; d < log_n; ++d) + { + power_2 = (1 << d); + kernUpSweep << < num_blocks, blockSize >> > (numElements, power_2, device_oData); + } + + // 3b. DownSweep + for (int d = log_n - 1; d >= 0; --d) + { + power_2 = (1 << d); + kernDownSweep << < num_blocks, blockSize >> > (numElements, power_2, device_oData); + } + + timer().endGpuTimer(); + + cudaDeviceSynchronize(); + cudaMemcpy(odata, device_oData, numActualBytes, cudaMemcpyDeviceToHost); + + // 4. Free up any gpu memory + cudaFree(device_iData); + cudaFree(device_oData); } /** @@ -30,11 +173,82 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; + int compact(int n, int *odata, const int *idata) + { + const int numElements = RoundToPower2(n); + const int numTotalBytes = numElements * sizeof(int); + const int numActualBytes = n * sizeof(int); + + int* tempHolder = new int[n]; + + printArray(n, idata, true); + + // 1. Allocate the memory in device + cudaMalloc(reinterpret_cast(&device_iData), numTotalBytes); + cudaMalloc(reinterpret_cast(&device_oData), numTotalBytes); + cudaMalloc(reinterpret_cast(&device_bools), numTotalBytes); + cudaMalloc(reinterpret_cast(&device_sortedbools), numTotalBytes); + + cudaMemcpy(device_iData, idata, numActualBytes, cudaMemcpyHostToDevice); + + cudaDeviceSynchronize(); + + timer().startGpuTimer(); + + // 2. Compute Block count + dim3 num_blocks((numElements + blockSize - 1) / blockSize); + + // 3. Call the kernel + + // 3a. Map to bools + StreamCompaction::Common::kernMapToBoolean << > > (numElements, device_bools, device_iData); + cudaMemcpy(device_sortedbools, device_bools, numActualBytes, cudaMemcpyHostToHost); + + // 3b. UpSweep + const int log_n = ilog2ceil(numElements); + int power_2 = 1; + for (int d = 0; d < log_n; ++d) + { + power_2 = (1 << d); + kernUpSweep << < num_blocks, blockSize >> > (numElements, power_2, device_sortedbools); + } + + // 3c. DownSweep + for (int d = log_n - 1; d >= 0; --d) + { + power_2 = (1 << d); + kernDownSweep << < num_blocks, blockSize >> > (numElements, power_2, device_sortedbools); + } + + // 3d. Compact + StreamCompaction::Common::kernScatter << > > (numElements, device_oData, device_iData, device_bools, device_sortedbools); + + // 4. Manually copy from the GPU the bools and check the number of valida values to return + cudaDeviceSynchronize(); + cudaMemcpy(tempHolder, device_bools, numActualBytes, cudaMemcpyDeviceToHost); + int count = 0; + for(int i = 0; i < n; ++i) + { + if(tempHolder[i] != 0) + { + count++; + } + } + + timer().endGpuTimer(); + + // Copy over the results + cudaDeviceSynchronize(); + cudaMemcpy(odata, device_oData, numActualBytes, cudaMemcpyDeviceToHost); + + // 5. Free up any gpu memory + cudaFree(device_iData); + cudaFree(device_oData); + cudaFree(device_bools); + cudaFree(device_sortedbools); + delete[] tempHolder; + + return (count); } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..57a023b 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -5,21 +5,101 @@ namespace StreamCompaction { namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } - // TODO: __global__ + + // Global variables + int *device_iData; + int *device_oData; + +#define blockSize 512 + + void printArray(int n, int *a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... "); + } + printf("%3d ", a[i]); + } + printf("]\n"); + } + + /** + * Kernel to perform a Naive scan on a integer array + */ + __global__ void kernScan(int n, int power, int* outputData, int* inputData) + { + const int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + // Fetch it once + const auto curr_data = inputData[index]; + + if(index >= power) + { + outputData[index] = inputData[index - power] + curr_data; + } + else + { + outputData[index] = curr_data; + } + } + + /** + * Shifts the whole array to the right by one in parallel + */ + __global__ void kernMakeExclusive(int n, int* outputData, int* inputData) + { + const int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + outputData[index] = index != 0 ? inputData[index - 1] : 0; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + void scan(int n, int *odata, const int *idata) + { + // 1. Allocate the memory in device + cudaMalloc(reinterpret_cast(&device_iData), n * (sizeof(int))); + cudaMalloc(reinterpret_cast(&device_oData), n * (sizeof(int))); + cudaMemcpy(device_iData, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + + timer().startGpuTimer(); + + // 2. Compute Block count + dim3 num_blocks((n + blockSize - 1) / blockSize); + + // 3. Call the kernel + const auto logn = ilog2ceil(n); + for (auto i = 1; i <= logn; ++i) + { + const auto power = 1 << (i - 1); + kernScan << < num_blocks, blockSize >> > (n, power, device_oData, device_iData); + + // Swap + const auto temp = device_iData; + device_iData = device_oData; + device_oData = temp; + } + + // Make it exclusive as we need that for stream compaction later on + kernMakeExclusive <<< num_blocks, blockSize >> > (n, device_oData, device_iData); + + cudaDeviceSynchronize(); + timer().endGpuTimer(); + + cudaMemcpy(odata, device_oData, sizeof(int) * n, cudaMemcpyDeviceToHost); + + // 4. Free up any gpu memory + cudaFree(device_iData); + cudaFree(device_oData); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..1c4a2a6 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -8,21 +8,47 @@ namespace StreamCompaction { namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } + + thrust::device_ptr dev_thrustInputData; + thrust::device_ptr dev_thrustOutputData; + int* device_iData; + int* device_oData; + +#define blockSize 512 + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); + void scan(int n, int *odata, const int *idata) + { + const int numTotalBytes = n * sizeof(int); + + cudaMalloc(reinterpret_cast(&device_iData), numTotalBytes); + cudaMalloc(reinterpret_cast(&device_oData), numTotalBytes); + + cudaMemcpy(device_iData, idata, numTotalBytes, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + + dev_thrustInputData = thrust::device_ptr(device_iData); + dev_thrustOutputData = thrust::device_ptr(device_oData); + + timer().startGpuTimer(); + + thrust::exclusive_scan(dev_thrustInputData, n + dev_thrustInputData, dev_thrustOutputData); + + timer().endGpuTimer(); + + cudaDeviceSynchronize(); + cudaMemcpy(odata, device_oData, numTotalBytes, cudaMemcpyDeviceToHost); + + cudaFree(device_iData); + cudaFree(device_oData); } } }