diff --git a/README.md b/README.md
index 0e38ddb..ade0b8e 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,122 @@ CUDA Stream Compaction
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Shenyue Chen
+ * [LinkedIn](https://www.linkedin.com/in/shenyue-chen-5b2728119/), [personal website](http://github.com/EvsChen)
+* Tested on: Windows 10, Intel Xeon Platinum 8259CL @ 2.50GHz 16GB, Tesla T4 (AWS g4dn-xlarge)
+
+### Features
+* Implementation of cpu scan, cpu compact, naive scan, work-efficient scan and work-efficient compact
+* Optimization of the work-efficient scan algorithm by launching only necessary number of threads in `upSweep` and `downSweep`
+ * Map thread index to the actual index by `interval * idx + interval - 1`, where interval is `1 << iteration`
+ * For example, for N = 8, 4 threads is launched in the first iteration, 2 launched in the secondm etc.
+
+### Performance analysis
+Block size: 128
+
+Lengh of array: from (2^10 - 2^21)
+
+**All the time measured in this section is the average of 100 tests, to avoid caching of functions**
+
+As N increases, the time for cpu algorithms increases in a exponential manner while the GPU algorithms increases much slower.
+
+
+
+
+
+
+
+
+
+For the GPU algorithms only, the naive algorithm performs the best when N is small. But thrust scan turns out to be the best when N becomes larger.
+
+
+
+
+In my experiments, there are no obvious difference for the NPOT version of the work efficient scan algorithm.
+
+
+
+
+Similar things happen for the thrust scan.
+
+
+
+
+
+
+
+### Sample output
+I tested each of the algorithm for 100 times and include some additional information.
+```
+****************
+** SCAN TESTS **
+****************
+ [ 41 23 25 1 5 46 28 37 30 42 42 25 35 ... 38 0 ]
+==== cpu scan, power-of-two ====
+ Time record is [11.006, 11.431, 11.774, 13.613, 14.736, 11.656, 11.874, 11.659, 11.823, 12.829, 11.737, 11.989, 11.872, ... 25.619]
+ elapsed time: 13.311ms (std::chrono Measured)
+ [ 0 41 64 89 90 95 141 169 206 236 278 320 345 ... 51331714 51331752 ]
+==== cpu scan, non-power-of-two ====
+ Time record is [6.4447, 6.2375, 20.608, 8.2194, 4.5156, 5.2029, 4.3476, 3.7569, 3.7672, 3.9463, 3.8041, 6.7647, 8.9613, ... 3.7673]
+ elapsed time: 4.7256ms (std::chrono Measured)
+ [ 0 41 64 89 90 95 141 169 206 236 278 320 345 ... 51331652 51331692 ]
+ passed
+==== naive scan, power-of-two ====
+ Time record is [1.6997, 1.6947, 1.6957, 1.6964, 1.6937, 1.6972, 1.6957, 1.6955, 1.6977, 1.6956, 1.6957, 1.697, 1.6957, ... 1.5053]
+ elapsed time: 1.5804ms (CUDA Measured)
+ passed
+==== naive scan, non-power-of-two ====
+ Time record is [1.505, 1.5154, 1.5114, 1.5073, 1.5134, 1.5131, 1.5095, 1.5173, 1.5183, 1.5173, 1.5181, 1.5193, 1.5177, ... 1.5286]
+ elapsed time: 1.5617ms (CUDA Measured)
+ passed
+==== work-efficient scan, power-of-two ====
+ Time record is [1.0465, 0.9728, 0.92182, 0.9257, 0.93184, 2.3247, 0.92374, 0.9345, 0.92896, 0.92176, 0.92269, 0.92266, 0.93555, ... 0.9264]
+ elapsed time: 0.97037ms (CUDA Measured)
+ passed
+==== work-efficient scan, non-power-of-two ====
+ Time record is [0.93405, 0.92058, 0.92365, 0.91706, 0.93229, 0.92541, 0.9175, 0.93056, 0.9216, 0.9176, 0.91802, 0.93424, 0.91955, ... 0.91955]
+ elapsed time: 0.92851ms (CUDA Measured)
+ passed
+==== thrust scan, power-of-two ====
+ Time record is [0.27674, 0.29424, 0.37693, 0.28387, 0.26618, 0.26726, 0.29562, 0.27222, 0.27443, 0.29901, 0.31325, 0.30925, 0.27082, ... 0.26301]
+ elapsed time: 0.30972ms (CUDA Measured)
+ passed
+==== thrust scan, non-power-of-two ====
+ Time record is [0.2639, 0.36099, 0.28035, 0.26765, 0.39936, 0.30883, 0.29104, 0.27306, 0.26934, 0.2631, 0.29914, 0.26224, 0.2863, ... 0.29773]
+ elapsed time: 0.31379ms (CUDA Measured)
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 0 1 1 3 0 1 0 2 3 3 1 1 1 ... 3 0 ]
+==== cpu compact without scan, power-of-two ====
+ Time record is [7.5451, 5.9954, 5.9785, 5.7061, 6.2471, 6.2241, 5.9236, 5.7947, 6.4508, 5.8406, 5.8629, 5.7438, 5.7671, ... 5.7358]
+ elapsed time: 6.2666ms (std::chrono Measured)
+ [ 1 1 3 1 2 3 3 1 1 1 3 1 2 ... 1 3 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ Time record is [7.3508, 8.2913, 5.9938, 5.9339, 5.8238, 5.7023, 5.9023, 5.8208, 6.926, 6.5035, 5.7348, 6.8928, 7.7841, ... 5.8204]
+ elapsed time: 6.2747ms (std::chrono Measured)
+ [ 1 1 3 1 2 3 3 1 1 1 3 1 2 ... 2 3 ]
+ passed
+==== cpu compact with scan ====
+ Time record is [28.751, 24.819, 23.735, 23.725, 23.829, 23.911, 23.968, 26.001, 24.981, 24.231, 23.606, 24.335, 23.51, ... 26.916]
+ elapsed time: 24.973ms (std::chrono Measured)
+ [ 1 1 3 1 2 3 3 1 1 1 3 1 2 ... 1 3 ]
+ passed
+==== work-efficient compact, power-of-two ====
+ Time record is [1.9505, 1.6153, 1.6267, 1.6086, 1.6013, 1.6535, 1.6727, 1.7992, 1.7735, 2.1627, 1.7832, 1.7375, 1.7575, ... 1.682]
+ elapsed time: 1.8556ms (CUDA Measured)
+ passed
+==== work-efficient compact, non-power-of-two ====
+ Time record is [1.6855, 1.9108, 1.7044, 1.6872, 1.7303, 1.9761, 1.7651, 1.9041, 1.6835, 1.6977, 1.7791, 1.6812, 1.7178, ... 1.6495]
+ elapsed time: 1.7805ms (CUDA Measured)
+ passed
+Result for n = 2097152 is :
+ Time record is [13.311, 4.7256, 1.5804, 1.5617, 0.97037, 0.92851, 0.30972, 0.31379, 6.2666, 6.2747, 24.973, 1.8556, 1.7805, ... 0]
+```
-### (TODO: Your README)
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/doc/analysis.py b/doc/analysis.py
new file mode 100644
index 0000000..ce7ab35
--- /dev/null
+++ b/doc/analysis.py
@@ -0,0 +1,75 @@
+import csv
+import matplotlib.pyplot as plt
+
+data = []
+with open('lab2.csv', 'r') as csvfile:
+ spamreader = csv.reader(csvfile, delimiter=',')
+ for row in spamreader:
+ drow = []
+ for data_str in row:
+ drow.append(float(data_str))
+ data.append(drow)
+
+start = 10
+end = 21
+
+cpu_scan = []
+naive_scan = []
+efficient_scan = []
+efficient_scan_npot = []
+thrust_scan = []
+thrust_scan_npot = []
+cpu_compact = []
+efficient_compact = []
+
+x = []
+
+# import pdb; pdb.set_trace()
+
+for i in range(start, end + 1):
+ x.append(i)
+ row = data[i - 10]
+ cpu_scan.append(row[0])
+ naive_scan.append(row[2])
+ efficient_scan.append(row[4])
+ efficient_scan_npot.append(row[6])
+ thrust_scan.append(row[6])
+ thrust_scan_npot.append(row[7])
+ cpu_compact.append(row[8])
+ efficient_compact.append(row[11])
+
+fig, ax = plt.subplots()
+
+### Scan
+# plt.plot(x, cpu_scan, label="cpu_scan")
+# plt.plot(x, naive_scan, label="naive_scan")
+# plt.plot(x, efficient_scan, label="work_efficient")
+# plt.plot(x, thrust_scan, label="thrust_scane")
+# plt.title("Scan time")
+
+### NPOT
+# plt.plot(x, efficient_scan, label="work-efficient scan")
+# plt.plot(x, efficient_scan_npot, label="work-efficient scan NPOT")
+# plt.title("Scan time")
+
+
+### Thrust NPOT
+# plt.plot(x, thrust_scan, label="thrust scan")
+# plt.plot(x, thrust_scan_npot, label="thrust scan NPOT")
+# plt.title("Scan time")
+
+### Compact
+# plt.plot(x, cpu_compact, label="cpu compact")
+# plt.plot(x, efficient_compact, label="work efficient compact")
+# plt.title("Compact time")
+
+### GPU
+plt.plot(x, naive_scan, label="naive_scan")
+plt.plot(x, efficient_scan, label="work_efficient")
+plt.plot(x, thrust_scan, label="thrust_scan")
+plt.title("GPU Scan time")
+
+plt.xlabel("Number (in base 2)")
+plt.ylabel("Time (ms)")
+plt.legend()
+plt.show()
\ No newline at end of file
diff --git a/doc/compact.png b/doc/compact.png
new file mode 100644
index 0000000..36a6aae
Binary files /dev/null and b/doc/compact.png differ
diff --git a/doc/gpu_scan.png b/doc/gpu_scan.png
new file mode 100644
index 0000000..9a8c82a
Binary files /dev/null and b/doc/gpu_scan.png differ
diff --git a/doc/lab2.csv b/doc/lab2.csv
new file mode 100644
index 0000000..bf95092
--- /dev/null
+++ b/doc/lab2.csv
@@ -0,0 +1,12 @@
+0.001885, 0.001899, 0.036519, 0.03666, 0.10034, 0.12692, 0.19843, 0.18431, 0.002746, 0.004293, 0.007446, 0.2877, 0.32142
+0.005164, 0.003933, 0.05982, 0.064216, 0.16541, 0.13125, 0.15975, 0.1403, 0.003983, 0.005303, 0.010544, 0.30586, 0.37939
+0.009546, 0.007338, 0.046396, 0.046294, 0.13717, 0.1225, 0.14251, 0.12847, 0.020095, 0.008572, 0.045547, 0.3119, 0.32465
+0.009546, 0.007338, 0.046396, 0.046294, 0.13717, 0.1225, 0.14251, 0.12847, 0.020095, 0.008572, 0.045547, 0.3119, 0.32465
+0.032347, 0.03035, 0.057651, 0.057629, 0.14566, 0.1432, 0.17856, 0.17249, 0.062693, 0.068325, 0.141, 0.39171, 0.40263
+0.072447, 0.066555, 0.065433, 0.065277, 0.16558, 0.17479, 0.16487, 0.20362, 0.12599, 0.11522, 0.27981, 0.34057, 0.50167
+0.13301, 0.1447, 0.087247, 0.087203, 0.16367, 0.19217, 0.13575, 0.18643, 0.28578, 0.25115, 0.55336, 0.44775, 0.34398
+0.28354, 0.33327, 0.12242, 0.11939, 0.19577, 0.18088, 0.1958, 0.15035, 0.70451, 0.6561, 1.6623, 0.51467, 0.52659
+1.4489, 0.57192, 0.20105, 0.18198, 0.21742, 0.25338, 0.64729, 0.28838, 0.79568, 0.85801, 3.0992, 0.55925, 0.56684
+3.1769, 1.1265, 0.35451, 0.3327, 0.31406, 0.34869, 0.29619, 0.31233, 1.602, 1.6526, 6.3687, 0.68551, 0.66141
+6.3527, 2.0997, 0.9041, 0.8269, 0.34781, 0.3672, 0.28274, 0.28195, 3.196, 3.239, 12.361, 0.89818, 0.90519
+13.311, 4.7256, 1.5804, 1.5617, 0.97037, 0.92851, 0.30972, 0.31379, 6.2666, 6.2747, 24.973, 1.8556, 1.7805
\ No newline at end of file
diff --git a/doc/scan_time.png b/doc/scan_time.png
new file mode 100644
index 0000000..1ea81e8
Binary files /dev/null and b/doc/scan_time.png differ
diff --git a/doc/scan_time_npot.png b/doc/scan_time_npot.png
new file mode 100644
index 0000000..5401296
Binary files /dev/null and b/doc/scan_time_npot.png differ
diff --git a/doc/thrust_scan_time_npot.png b/doc/thrust_scan_time_npot.png
new file mode 100644
index 0000000..c3bcdf4
Binary files /dev/null and b/doc/thrust_scan_time_npot.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..989423f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,12 +13,40 @@
#include
#include "testing_helpers.hpp"
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 10; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
int *c = new int[SIZE];
+const int repeatTime = 100;
+float *record = new float[repeatTime];
+
+float getTimeAvg(float *src) {
+ float t = 0.f;
+ for (int i = 0; i < repeatTime; i++) {
+ t += src[i];
+ }
+ return t / repeatTime;
+}
+
+void printTime(float *src) {
+ std::cout << " Time record is [";
+ std::cout.precision(5);
+ for (int i = 0; i < repeatTime; i++) {
+ if (repeatTime > 16 && i == 13) {
+ std::cout << "... ";
+ i = repeatTime - 2;
+ continue;
+ }
+ std::cout << src[i];
+ if (i != repeatTime - 1) {
+ std::cout << ", ";
+ }
+ }
+ std::cout << "]" << std::endl;
+}
+
int main(int argc, char* argv[]) {
// Scan tests
@@ -30,28 +58,45 @@ int main(int argc, char* argv[]) {
genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);
+ float res[13];
+
// initialize b using StreamCompaction::CPU::scan you implement
// We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
// At first all cases passed because b && c are all zeroes.
- zeroArray(SIZE, b);
printDesc("cpu scan, power-of-two");
- StreamCompaction::CPU::scan(SIZE, b, a);
- printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ for (int i = 0; i < repeatTime; i++) {
+ zeroArray(SIZE, b);
+ StreamCompaction::CPU::scan(SIZE, b, a);
+ record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+ }
+ printTime(record);
+ res[0] = getTimeAvg(record);
+ printElapsedTime(res[0], "(std::chrono Measured)");
printArray(SIZE, b, true);
- zeroArray(SIZE, c);
+
printDesc("cpu scan, non-power-of-two");
- StreamCompaction::CPU::scan(NPOT, c, a);
- printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ for (int i = 0; i < repeatTime; i++) {
+ zeroArray(SIZE, c);
+ StreamCompaction::CPU::scan(NPOT, c, a);
+ record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+ }
+ printTime(record);
+ res[1] = getTimeAvg(record);
+ printElapsedTime(getTimeAvg(record), "(std::chrono Measured)");
printArray(NPOT, b, true);
printCmpResult(NPOT, b, c);
- zeroArray(SIZE, c);
printDesc("naive scan, power-of-two");
- StreamCompaction::Naive::scan(SIZE, c, a);
- printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
+ for (int i = 0; i < repeatTime; i++) {
+ zeroArray(SIZE, c);
+ StreamCompaction::Naive::scan(SIZE, c, a);
+ record[i] = StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation();
+ }
+ printTime(record);
+ res[2] = getTimeAvg(record);
+ printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
printCmpResult(SIZE, b, c);
/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -59,39 +104,62 @@ int main(int argc, char* argv[]) {
printDesc("1s array for finding bugs");
StreamCompaction::Naive::scan(SIZE, c, a);
printArray(SIZE, c, true); */
-
- zeroArray(SIZE, c);
+
printDesc("naive scan, non-power-of-two");
- StreamCompaction::Naive::scan(NPOT, c, a);
- printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
+ for (int i = 0; i < repeatTime; i++) {
+ zeroArray(SIZE, c);
+ StreamCompaction::Naive::scan(NPOT, c, a);
+ record[i] = StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation();
+ }
+ printTime(record);
+ res[3] = getTimeAvg(record);
+ printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
printCmpResult(NPOT, b, c);
- zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
- StreamCompaction::Efficient::scan(SIZE, c, a);
- printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ for (int i = 0; i < repeatTime; i++) {
+ zeroArray(SIZE, c);
+ StreamCompaction::Efficient::scan(SIZE, c, a);
+ record[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+ }
+ printTime(record);
+ res[4] = getTimeAvg(record);
+ printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
- zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
- StreamCompaction::Efficient::scan(NPOT, c, a);
- printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(NPOT, c, true);
+ for (int i = 0; i < repeatTime; i++) {
+ zeroArray(SIZE, c);
+ StreamCompaction::Efficient::scan(NPOT, c, a);
+ record[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+ }
+ printTime(record);
+ res[5] = getTimeAvg(record);
+ printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
printCmpResult(NPOT, b, c);
- zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
- StreamCompaction::Thrust::scan(SIZE, c, a);
- printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ for (int i = 0; i < repeatTime; i++) {
+ zeroArray(SIZE, c);
+ StreamCompaction::Thrust::scan(SIZE, c, a);
+ record[i] = StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation();
+ }
+ printTime(record);
+ res[6] = getTimeAvg(record);
+ printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
- zeroArray(SIZE, c);
printDesc("thrust scan, non-power-of-two");
- StreamCompaction::Thrust::scan(NPOT, c, a);
- printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ for (int i = 0; i < repeatTime; i++) {
+ zeroArray(SIZE, c);
+ StreamCompaction::Thrust::scan(NPOT, c, a);
+ record[i] = StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation();
+ }
+ printTime(record);
+ res[7] = getTimeAvg(record);
+ printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
@@ -110,43 +178,71 @@ int main(int argc, char* argv[]) {
// initialize b using StreamCompaction::CPU::compactWithoutScan you implement
// We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
- zeroArray(SIZE, b);
printDesc("cpu compact without scan, power-of-two");
- count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
- printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ for (int i = 0; i < repeatTime; i++) {
+ zeroArray(SIZE, b);
+ count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+ record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+ }
+ printTime(record);
+ res[8] = getTimeAvg(record);
+ printElapsedTime(getTimeAvg(record), "(std::chrono Measured)");
expectedCount = count;
printArray(count, b, true);
printCmpLenResult(count, expectedCount, b, b);
- zeroArray(SIZE, c);
printDesc("cpu compact without scan, non-power-of-two");
- count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
- printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ for (int i = 0; i < repeatTime; i++) {
+ zeroArray(SIZE, c);
+ count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
+ record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+ }
+ printTime(record);
+ res[9] = getTimeAvg(record);
+ printElapsedTime(getTimeAvg(record), "(std::chrono Measured)");
expectedNPOT = count;
printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);
- zeroArray(SIZE, c);
printDesc("cpu compact with scan");
- count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
- printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ for (int i = 0; i < repeatTime; i++) {
+ zeroArray(SIZE, c);
+ count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+ record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+ }
+ printTime(record);
+ res[10] = getTimeAvg(record);
+ printElapsedTime(getTimeAvg(record), "(std::chrono Measured)");
printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);
- zeroArray(SIZE, c);
printDesc("work-efficient compact, power-of-two");
- count = StreamCompaction::Efficient::compact(SIZE, c, a);
- printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ for (int i = 0; i < repeatTime; i++) {
+ zeroArray(SIZE, c);
+ count = StreamCompaction::Efficient::compact(SIZE, c, a);
+ record[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+ }
+ printTime(record);
+ res[11] = getTimeAvg(record);
+ printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
//printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);
- zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
- count = StreamCompaction::Efficient::compact(NPOT, c, a);
- printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ for (int i = 0; i < repeatTime; i++) {
+ zeroArray(SIZE, c);
+ count = StreamCompaction::Efficient::compact(NPOT, c, a);
+ record[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+ }
+ printTime(record);
+ res[12] = getTimeAvg(record);
+ printElapsedTime(getTimeAvg(record), "(CUDA Measured)");
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);
+ std::cout << "Result for n = " << SIZE << " is :" << std::endl;
+ printTime(res);
+
system("pause"); // stop Win32 console from closing on exit
delete[] a;
delete[] b;
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..7a373c6 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,9 @@ namespace StreamCompaction {
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
- // TODO
+ int idx = threadIdx.x + (blockDim.x * blockIdx.x);
+ if (idx >= n) return;
+ bools[idx] = idata[idx] == 0 ? 0 : 1;
}
/**
@@ -32,7 +34,11 @@ namespace StreamCompaction {
*/
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
- // TODO
+ int idx = threadIdx.x + (blockDim.x * blockIdx.x);
+ if (idx >= n) return;
+ if (bools[idx] == 1) {
+ odata[indices[idx]] = idata[idx];
+ }
}
}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index d2c1fed..aa7d58c 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -11,7 +11,14 @@
#include
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#ifdef _DEBUG
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#else
+#define checkCUDAError(msg)
+#endif // _DEBUG
+
+
+#define blockSize 128
/**
* Check for CUDA errors; print and exit if there was a problem.
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..2172a8a 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,7 +19,10 @@ namespace StreamCompaction {
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+ odata[0] = 0;
+ for (int i = 1; i < n; i++) {
+ odata[i] = odata[i - 1] + idata[i - 1];
+ }
timer().endCpuTimer();
}
@@ -30,9 +33,14 @@ namespace StreamCompaction {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+ int j = 0;
+ for (int i = 0; i < n; i++) {
+ if (idata[i] != 0) {
+ odata[j++] = idata[i];
+ }
+ }
timer().endCpuTimer();
- return -1;
+ return j;
}
/**
@@ -41,10 +49,27 @@ namespace StreamCompaction {
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
+ int *temp = new int[n],
+ *tempSum = new int[n];
timer().startCpuTimer();
- // TODO
+ for (int i = 0; i < n; i++) {
+ temp[i] = idata[i] == 0 ? 0 : 1;
+ }
+ tempSum[0] = 0;
+ for (int i = 1; i < n; i++) {
+ tempSum[i] = tempSum[i - 1] + temp[i - 1];
+ }
+ int cnt = 0;
+ for (int i = 0; i < n; i++) {
+ if (temp[i] == 1) {
+ odata[tempSum[i]] = idata[i];
+ cnt++;
+ }
+ }
timer().endCpuTimer();
- return -1;
+ delete[] temp;
+ delete[] tempSum;
+ return cnt;
}
}
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..4878221 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,13 +12,73 @@ namespace StreamCompaction {
return timer;
}
+ __global__ void upSweep(int numThreads, int *data, int d) {
+ int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+ if (idx >= numThreads) return;
+ int interval = 1 << d;
+ int mapped = interval * idx + interval - 1;
+ data[mapped] += data[mapped - (interval >> 1)];
+ }
+
+ __global__ void downSweep(int numThreads, int *data, int d) {
+ int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+ if (idx >= numThreads) return;
+ int interval = 1 << d;
+ int node = interval * idx + interval - 1;
+ int left = node - (interval >> 1);
+ int temp = data[left];
+ data[left] = data[node];
+ data[node] += temp;
+ }
+
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
- void scan(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
- // TODO
- timer().endGpuTimer();
+ void scan(int n, int *dev_odata, const int *dev_idata, bool callFromMain) {
+ int iterations = ilog2ceil(n);
+ int nextN = 1 << iterations;
+ int *dev_idata_temp;
+ cudaMalloc((void **) &dev_idata_temp, nextN * sizeof(int));
+ checkCUDAError("SCAN: cudaMalloc dev_idata_temp failed");
+ cudaMemset(dev_idata_temp, 0, nextN *sizeof(int));
+ checkCUDAError("SCAN: cudaMemset dev_idata_temp failed");
+ if (callFromMain) {
+ cudaMemcpy(dev_idata_temp, dev_idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+ timer().startGpuTimer();
+ }
+ else {
+ cudaMemcpy(dev_idata_temp, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+ }
+ checkCUDAError("SCAN: cudaMemcpy dev_idata_temp failed");
+
+ // Up-sweep
+ for (int d = 1; d <= iterations; d++) {
+ int numThreads = 1 << (iterations - d);
+ dim3 blocks((numThreads + blockSize - 1) / blockSize);
+ upSweep<<>>(numThreads, dev_idata_temp, d);
+ checkCUDAError("SCAN: upSweep failed");
+ }
+
+ // Down-sweep
+ // Set the "root" to 0
+ cudaMemset(&dev_idata_temp[nextN - 1], 0, sizeof(int));
+ for (int d = iterations; d >= 1; d--) {
+ int numThreads = 1 << (iterations - d);
+ dim3 blocks((numThreads + blockSize - 1) / blockSize);
+ downSweep<<>>(numThreads, dev_idata_temp, d);
+ checkCUDAError("SCAN: downSweep failed");
+ }
+
+ if (callFromMain) {
+ timer().endGpuTimer();
+ cudaMemcpy(dev_odata, dev_idata_temp, sizeof(int) * n, cudaMemcpyDeviceToHost);
+ }
+ else {
+ cudaMemcpy(dev_odata, dev_idata_temp, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+ }
+ checkCUDAError("SCAN: cudaMemcpy dev_odata failed");
+
+ cudaFree(dev_idata_temp);
}
/**
@@ -31,10 +91,42 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
- // TODO
- timer().endGpuTimer();
- return -1;
+ int *bools, *indices, *dev_idata, *dev_odata;
+ cudaMalloc((void**) &bools, sizeof(int) * n);
+ checkCUDAError("COMPACT: cudaMalloc bools failed");
+ cudaMalloc((void**) &indices, sizeof(int) * n);
+ checkCUDAError("COMPACT: cudaMalloc indices failed");
+ cudaMalloc((void**) &dev_idata, sizeof(int) * n);
+ checkCUDAError("COMPACT: cudaMalloc dev_idata failed");
+ cudaMalloc((void**) &dev_odata, sizeof(int) * n);
+ checkCUDAError("COMPACT: cudaMalloc dev_odata failed");
+ cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+ checkCUDAError("COMPACT: cudaMalloc idata->dev_idata failed");
+
+ timer().startGpuTimer();
+
+ dim3 blocks((n + blockSize - 1) / blockSize);
+ Common::kernMapToBoolean<<>>(n, bools, dev_idata);
+ checkCUDAError("COMPACT: kernMapToBoolean failed");
+ scan(n, indices, bools, false);
+ Common::kernScatter<<>>(n, dev_odata, dev_idata, bools, indices);
+ checkCUDAError("COMPACT: kernScatter failed");
+
+ timer().endGpuTimer();
+
+ int cnt, lastBool;
+ cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost);
+ checkCUDAError("COMPACT: cudaMemcpy dev_odata->odata failed");
+ // Copy the count back
+ cudaMemcpy(&cnt, &indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("COMPACT: cudaMemcpy indices->cnt failed");
+ cudaMemcpy(&lastBool, &bools[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("COMPACT: cudaMemcpy bools->lastBool failed");
+ cudaFree(bools);
+ cudaFree(indices);
+ cudaFree(dev_idata);
+ cudaFree(dev_odata);
+ return lastBool ? cnt + 1 : cnt;
}
}
}
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..5ba93b0 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -6,7 +6,7 @@ namespace StreamCompaction {
namespace Efficient {
StreamCompaction::Common::PerformanceTimer& timer();
- void scan(int n, int *odata, const int *idata);
+ void scan(int n, int *odata, const int *idata, bool useTimer = true);
int compact(int n, int *odata, const int *idata);
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..9fd83e9 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -11,15 +11,45 @@ namespace StreamCompaction {
static PerformanceTimer timer;
return timer;
}
- // TODO: __global__
+ __global__ void addPrev(int n, int *idata, int *odata, int d) {
+ int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+ if (idx >= n) return;
+ int base = 1 << (d - 1);
+ odata[idx] = idx >= base ? idata[idx - base] + idata[idx] : idata[idx];
+ }
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+ int *dev_idata, *dev_odata;
+ cudaMalloc((void **) &dev_idata, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_idata failed");
+ cudaMalloc((void **) &dev_odata, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_odata failed");
+ cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+ checkCUDAError("cudaMemcpy dev_idata failed");
+
timer().startGpuTimer();
- // TODO
+ int iterations = ilog2ceil(n);
+
+ dim3 blocks((n + blockSize - 1) / blockSize);
+ for (int d = 1; d <= iterations; d++) {
+ if (d % 2 == 1) {
+ addPrev << > > (n, dev_idata, dev_odata, d);
+ }
+ else {
+ addPrev << > > (n, dev_odata, dev_idata, d);
+ }
+ checkCUDAError("addPrev failed");
+ }
+
timer().endGpuTimer();
+ odata[0] = 0;
+ cudaMemcpy(odata + 1, (iterations % 2 == 1) ? dev_odata : dev_idata, sizeof(int) * (n - 1), cudaMemcpyDeviceToHost);
+ checkCUDAError("cudaMemcpy odata failed");
+ cudaFree(dev_idata);
+ cudaFree(dev_odata);
}
}
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..cb3bf9d 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,13 @@ namespace StreamCompaction {
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+ thrust::host_vector host_idata(idata, idata + n);
+ thrust::device_vector dev_idata = host_idata;
+ thrust::device_vector dev_odata(n);
timer().startGpuTimer();
- // TODO use `thrust::exclusive_scan`
- // example: for device_vectors dv_in and dv_out:
- // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+ thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin());
timer().endGpuTimer();
+ thrust::copy(dev_odata.begin(), dev_odata.end(), odata);
}
}
}