Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
60eeba1
CPU, Naive GPU, and Efficient GPU scan/compact
risia Sep 11, 2018
00e94b3
Added Thrust implementation and improved Work-Efficient Performance
risia Sep 12, 2018
2930bec
Backing up Radix Sort, not yet tested but compiles
risia Sep 13, 2018
e300543
Radix debugged and working on small data set, added cuda error checks…
risia Sep 13, 2018
b2cbce1
Radix tested on large dataset, appears functional
risia Sep 13, 2018
6b81341
Added timing info for finding max val
risia Sep 13, 2018
4eead15
Removed debug code
risia Sep 13, 2018
2f1b74b
Radix sort implementation description in readme
risia Sep 13, 2018
3d4f2f2
Added findMax code
risia Sep 13, 2018
3292d5b
Small Radix sort example
risia Sep 13, 2018
829a49e
More code and images to illustrate radix sort
risia Sep 13, 2018
0bb3468
Update README.md
risia Sep 13, 2018
cafbb78
Shared Memory Work-Efficient Scan added
risia Sep 13, 2018
e7b4274
Merge branch 'master' of https://github.com/risia/Project2-Stream-Com…
risia Sep 13, 2018
016817c
Shared Memory Compact added + tests
risia Sep 13, 2018
54e3f55
removed debug printf
risia Sep 13, 2018
2ad8ef6
Short description of Shared Memory implementation
risia Sep 13, 2018
38d401a
Fixed shared memory scan
risia Sep 13, 2018
2e0a00d
Merge branch 'master' of https://github.com/risia/Project2-Stream-Com…
risia Sep 13, 2018
3605d5c
Update README.md
risia Sep 13, 2018
88e6511
Bank Conflict reduction added to shared mem implementation
risia Sep 13, 2018
5adafa6
Merge branch 'master' of https://github.com/risia/Project2-Stream-Com…
risia Sep 13, 2018
b643ac3
Update README.md
risia Sep 13, 2018
776b074
project description
risia Sep 13, 2018
67029ae
Optimized stitching of shared mem scan blocks
risia Sep 13, 2018
3882da2
Fixes so everything works for large arrays
risia Sep 13, 2018
9831715
Removed debug printf
risia Sep 13, 2018
df3dd11
Added test for average time to run each scan/compact for perf. analys…
risia Sep 14, 2018
3d26700
More E.C. documentation
risia Sep 14, 2018
f2b4e85
changed some code spacing
risia Sep 14, 2018
f9a8cf1
began performance analysis writeup
risia Sep 14, 2018
cdd7ac8
Speed vs blocksize images
risia Sep 19, 2018
dd3eb86
block size analysis
risia Sep 19, 2018
f2271e6
more performance analysis plots
risia Sep 19, 2018
212a68b
performance analysis
risia Sep 19, 2018
45c7ea8
Update README.md
risia Sep 19, 2018
1d8ca97
Update README.md
risia Sep 19, 2018
b38fb67
Update README.md
risia Sep 19, 2018
98fd233
Update README.md
risia Sep 19, 2018
cd68a7d
Update README.md
risia Sep 19, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added Project2 Performance Analysis.xlsx
Binary file not shown.
276 changes: 270 additions & 6 deletions README.md

Large diffs are not rendered by default.

Binary file added img/compact_comp1.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/compact_comp2.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/naive_blocksize.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/radix_blocksize.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/radix_example.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/scan_comp1.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/scan_comp2.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/sm_scan_blocksize.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/we_compact_blocksize.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/we_scan_blocksize.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
226 changes: 222 additions & 4 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@
#include <stream_compaction/naive.h>
#include <stream_compaction/efficient.h>
#include <stream_compaction/thrust.h>
#include <stream_compaction/radix.h>
#include <stream_compaction/shared_mem.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 15; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
Expand Down Expand Up @@ -51,7 +53,7 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
Expand All @@ -71,14 +73,14 @@ int main(int argc, char* argv[]) {
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
Expand All @@ -95,6 +97,67 @@ int main(int argc, char* argv[]) {
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("Find max, power-of-two");
int max = StreamCompaction::Radix::max(SIZE, a);
printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printf("max = %i\n", max);

zeroArray(SIZE, c);
printDesc("Find max, non-power-of-two");
max = StreamCompaction::Radix::max(NPOT, a);
printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printf("max = %i\n", max);

zeroArray(SIZE, c);
//int radix_tst[8] = { 4, 7, 2, 6, 3, 5, 1, 0 };
printDesc("Radix sort, power-of-two");
StreamCompaction::Radix::sort(SIZE, c, a);
printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printArray(SIZE, c, true);

zeroArray(SIZE, c);
printDesc("Radix sort, non-power-of-two");
StreamCompaction::Radix::sort(NPOT, c, a);
printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printArray(NPOT, c, true);

zeroArray(SIZE, c);
int radix_tst[8] = { 4, 7, 2, 6, 3, 5, 1, 0 };
printDesc("Radix example sort");
printf("Test input array:\n");
printArray(8, radix_tst, true);
StreamCompaction::Radix::sort(8, c, radix_tst);
printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printf("Sorted Output:\n");
printArray(8, c, true);

zeroArray(SIZE, c);
printDesc("Shared Memory Efficient Scan, power-of-two");
StreamCompaction::SharedMem::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("Shared Memory Efficient Scan, non-power-of-two");
StreamCompaction::SharedMem::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

//zeroArray(SIZE, c);
//printDesc("Shared Memory Efficient Scan, power-of-two");
//StreamCompaction::SharedMem::scan(8, c, radix_tst);
//printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(8, c, true);

//zeroArray(SIZE, c);
//printDesc("Shared Memory Efficient Scan, non-power-of-two");
//StreamCompaction::SharedMem::scan(7, c, radix_tst);
//printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(7, c, true);

printf("\n");
printf("*****************************\n");
printf("** STREAM COMPACTION TESTS **\n");
Expand Down Expand Up @@ -147,6 +210,161 @@ int main(int argc, char* argv[]) {
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

zeroArray(SIZE, c);
printDesc("Shared Memory work-efficient compact, power-of-two");
count = StreamCompaction::SharedMem::compact(SIZE, c, a);
printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("Shared Memory work-efficient compact, non-power-of-two");
count = StreamCompaction::SharedMem::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);


// loop 100 tests to get avgs
// make time variables
float time_N_S_POT = 0.0f; // naive pow 2 scan
float time_N_S_NPOT = 0.0f; // naive not pow 2 scan
float time_WE_S_POT = 0.0f; //
float time_WE_S_NPOT = 0.0f; //
float time_WE_C_POT = 0.0f; //
float time_WE_C_NPOT = 0.0f; //
float time_SM_S_POT = 0.0f; //
float time_SM_S_NPOT = 0.0f; //
float time_T_S_POT = 0.0f; //
float time_T_S_NPOT = 0.0f; //
float time_R_S_POT = 0.0f; //
float time_R_S_NPOT = 0.0f; //
float time_CPU_S_POT = 0.0f;
float time_CPU_S_NPOT = 0.0f;
float time_CPU_C_S = 0.0f;
float time_CPU_C_NS = 0.0f;
float time_CPU_C_S_NPOT = 0.0f;
float time_CPU_C_NS_NPOT = 0.0f;

for (int i = 0; i < 100; i++) {
// gen array
genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;

// cpu scan POT
zeroArray(SIZE, b);
StreamCompaction::CPU::scan(SIZE, b, a);
time_CPU_S_POT += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();

// cpu scan POT
zeroArray(SIZE, b);
StreamCompaction::CPU::scan(NPOT, b, a);
time_CPU_S_NPOT += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();

// cpu compact w/o scan
zeroArray(SIZE, b);
StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
time_CPU_C_NS += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();

// cpu compact w/o scan
zeroArray(SIZE, b);
StreamCompaction::CPU::compactWithoutScan(NPOT, b, a);
time_CPU_C_NS_NPOT += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();

// cpu compact w/ scan
zeroArray(SIZE, b);
StreamCompaction::CPU::compactWithScan(SIZE, b, a);
time_CPU_C_S += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();

// cpu compact w/ scan
zeroArray(SIZE, b);
StreamCompaction::CPU::compactWithScan(NPOT, b, a);
time_CPU_C_S_NPOT += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();

// Naive scan POT
zeroArray(SIZE, b);
StreamCompaction::Naive::scan(SIZE, b, a);
time_N_S_POT += StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation();

// Naive scan N_POT
zeroArray(SIZE, b);
StreamCompaction::Naive::scan(NPOT, b, a);
time_N_S_NPOT += StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation();

// WE scan POT
zeroArray(SIZE, b);
StreamCompaction::Efficient::scan(SIZE, b, a);
time_WE_S_POT += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();

// WE scan N_POT
zeroArray(SIZE, b);
StreamCompaction::Efficient::scan(NPOT, b, a);
time_WE_S_NPOT += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();

// WE compact POT
zeroArray(SIZE, b);
StreamCompaction::Efficient::compact(SIZE, b, a);
time_WE_C_POT += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();

// WE compact N_POT
zeroArray(SIZE, b);
StreamCompaction::Efficient::compact(NPOT, b, a);
time_WE_C_NPOT += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();

// SM scan POT
zeroArray(SIZE, b);
StreamCompaction::SharedMem::scan(SIZE, b, a);
time_SM_S_POT += StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation();

// SM scan N_POT
zeroArray(SIZE, b);
StreamCompaction::SharedMem::scan(NPOT, b, a);
time_SM_S_NPOT += StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation();

// Thrust scan POT
zeroArray(SIZE, b);
StreamCompaction::Thrust::scan(SIZE, b, a);
time_T_S_POT += StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation();

// Thrust scan N_POT
zeroArray(SIZE, b);
StreamCompaction::Thrust::scan(NPOT, b, a);
time_T_S_NPOT += StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation();

// Radix sort POT
zeroArray(SIZE, b);
StreamCompaction::Radix::sort(SIZE, b, a);
time_R_S_POT += StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation();

// Radic sort N_POT
zeroArray(SIZE, b);
StreamCompaction::Radix::sort(NPOT, b, a);
time_R_S_NPOT += StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation();

}

// print avg times
printf("CPU Scan POT: %f\n", time_CPU_S_POT / 100.0f);
printf("CPU Scan NPOT: %f\n", time_CPU_S_NPOT / 100.0f);
printf("CPU Compact POT: %f\n", time_CPU_C_NS / 100.0f);
printf("CPU Scan Compact NPOT: %f\n", time_CPU_C_S_NPOT / 100.0f);
printf("CPU Compact NPOT: %f\n", time_CPU_C_NS_NPOT / 100.0f);
printf("CPU Scan Compact POT: %f\n", time_CPU_C_S / 100.0f);
printf("Naive POT: %f\n", time_N_S_POT / 100.0f);
printf("Naive NPOT: %f\n", time_N_S_NPOT / 100.0f);
printf("WE Scan POT: %f\n", time_WE_S_POT / 100.0f);
printf("WE Scan NPOT: %f\n", time_WE_S_NPOT / 100.0f);
printf("WE Comp POT: %f\n", time_WE_C_POT / 100.0f);
printf("WE Comp NPOT: %f\n", time_WE_C_NPOT / 100.0f);
printf("SM Scan POT: %f\n", time_SM_S_POT / 100.0f);
printf("SM Scan NPOT: %f\n", time_SM_S_NPOT / 100.0f);
printf("Thrust Scan POT: %f\n", time_T_S_POT / 100.0f);
printf("Thrust Scan NPOT: %f\n", time_T_S_NPOT / 100.0f);
printf("Radix POT: %f\n", time_R_S_POT / 100.0f);
printf("Radix NPOT: %f\n", time_R_S_NPOT / 100.0f);



system("pause"); // stop Win32 console from closing on exit
delete[] a;
delete[] b;
Expand Down
14 changes: 7 additions & 7 deletions src/testing_helpers.hpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#pragma once

#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <ctime>

Expand Down Expand Up @@ -69,8 +69,8 @@ void printArray(int n, int *a, bool abridged = false) {
printf("]\n");
}

template<typename T>
void printElapsedTime(T time, std::string note = "")
{
std::cout << " elapsed time: " << time << "ms " << note << std::endl;
template<typename T>
void printElapsedTime(T time, std::string note = "")
{
std::cout << " elapsed time: " << time << "ms " << note << std::endl;
}
6 changes: 5 additions & 1 deletion stream_compaction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,13 @@ set(SOURCE_FILES
"efficient.cu"
"thrust.h"
"thrust.cu"
"radix.h"
"radix.cu"
"shared_mem.h"
"shared_mem.cu"
)

cuda_add_library(stream_compaction
${SOURCE_FILES}
OPTIONS -arch=sm_20
OPTIONS -arch=sm_50
)
9 changes: 7 additions & 2 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ namespace StreamCompaction {
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
int index = (blockDim.x * blockIdx.x) + threadIdx.x;
if (index >= n) return;
if (idata[index] != 0) bools[index] = 1;
else bools[index] = 0;
}

/**
Expand All @@ -32,7 +35,9 @@ namespace StreamCompaction {
*/
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
int index = (blockDim.x * blockIdx.x) + threadIdx.x;
if (index >= n) return;
if (bools[index] == 1) odata[indices[index]] = idata[index];
}

}
Expand Down
Loading