diff --git a/README.md b/README.md index 0e38ddb..ade0b8e 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,122 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Shenyue Chen + * [LinkedIn](https://www.linkedin.com/in/shenyue-chen-5b2728119/), [personal website](http://github.com/EvsChen) +* Tested on: Windows 10, Intel Xeon Platinum 8259CL @ 2.50GHz 16GB, Tesla T4 (AWS g4dn-xlarge) + +### Features +* Implementation of cpu scan, cpu compact, naive scan, work-efficient scan and work-efficient compact +* Optimization of the work-efficient scan algorithm by launching only necessary number of threads in `upSweep` and `downSweep` + * Map thread index to the actual index by `interval * idx + interval - 1`, where interval is `1 << iteration` + * For example, for N = 8, 4 threads is launched in the first iteration, 2 launched in the secondm etc. + +### Performance analysis +Block size: 128 + +Lengh of array: from (2^10 - 2^21) + +**All the time measured in this section is the average of 100 tests, to avoid caching of functions** + +As N increases, the time for cpu algorithms increases in a exponential manner while the GPU algorithms increases much slower. + +

+ +

+ +

+ +

+ +For the GPU algorithms only, the naive algorithm performs the best when N is small. But thrust scan turns out to be the best when N becomes larger. +

+ +

+ +In my experiments, there are no obvious difference for the NPOT version of the work efficient scan algorithm. +

+ +

+ +Similar things happen for the thrust scan. +

+ +

+ + + + +### Sample output +I tested each of the algorithm for 100 times and include some additional information. +``` +**************** +** SCAN TESTS ** +**************** + [ 41 23 25 1 5 46 28 37 30 42 42 25 35 ... 38 0 ] +==== cpu scan, power-of-two ==== + Time record is [11.006, 11.431, 11.774, 13.613, 14.736, 11.656, 11.874, 11.659, 11.823, 12.829, 11.737, 11.989, 11.872, ... 25.619] + elapsed time: 13.311ms (std::chrono Measured) + [ 0 41 64 89 90 95 141 169 206 236 278 320 345 ... 51331714 51331752 ] +==== cpu scan, non-power-of-two ==== + Time record is [6.4447, 6.2375, 20.608, 8.2194, 4.5156, 5.2029, 4.3476, 3.7569, 3.7672, 3.9463, 3.8041, 6.7647, 8.9613, ... 3.7673] + elapsed time: 4.7256ms (std::chrono Measured) + [ 0 41 64 89 90 95 141 169 206 236 278 320 345 ... 51331652 51331692 ] + passed +==== naive scan, power-of-two ==== + Time record is [1.6997, 1.6947, 1.6957, 1.6964, 1.6937, 1.6972, 1.6957, 1.6955, 1.6977, 1.6956, 1.6957, 1.697, 1.6957, ... 1.5053] + elapsed time: 1.5804ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + Time record is [1.505, 1.5154, 1.5114, 1.5073, 1.5134, 1.5131, 1.5095, 1.5173, 1.5183, 1.5173, 1.5181, 1.5193, 1.5177, ... 1.5286] + elapsed time: 1.5617ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + Time record is [1.0465, 0.9728, 0.92182, 0.9257, 0.93184, 2.3247, 0.92374, 0.9345, 0.92896, 0.92176, 0.92269, 0.92266, 0.93555, ... 0.9264] + elapsed time: 0.97037ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + Time record is [0.93405, 0.92058, 0.92365, 0.91706, 0.93229, 0.92541, 0.9175, 0.93056, 0.9216, 0.9176, 0.91802, 0.93424, 0.91955, ... 0.91955] + elapsed time: 0.92851ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + Time record is [0.27674, 0.29424, 0.37693, 0.28387, 0.26618, 0.26726, 0.29562, 0.27222, 0.27443, 0.29901, 0.31325, 0.30925, 0.27082, ... 0.26301] + elapsed time: 0.30972ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + Time record is [0.2639, 0.36099, 0.28035, 0.26765, 0.39936, 0.30883, 0.29104, 0.27306, 0.26934, 0.2631, 0.29914, 0.26224, 0.2863, ... 0.29773] + elapsed time: 0.31379ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 1 1 3 0 1 0 2 3 3 1 1 1 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + Time record is [7.5451, 5.9954, 5.9785, 5.7061, 6.2471, 6.2241, 5.9236, 5.7947, 6.4508, 5.8406, 5.8629, 5.7438, 5.7671, ... 5.7358] + elapsed time: 6.2666ms (std::chrono Measured) + [ 1 1 3 1 2 3 3 1 1 1 3 1 2 ... 1 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + Time record is [7.3508, 8.2913, 5.9938, 5.9339, 5.8238, 5.7023, 5.9023, 5.8208, 6.926, 6.5035, 5.7348, 6.8928, 7.7841, ... 5.8204] + elapsed time: 6.2747ms (std::chrono Measured) + [ 1 1 3 1 2 3 3 1 1 1 3 1 2 ... 2 3 ] + passed +==== cpu compact with scan ==== + Time record is [28.751, 24.819, 23.735, 23.725, 23.829, 23.911, 23.968, 26.001, 24.981, 24.231, 23.606, 24.335, 23.51, ... 26.916] + elapsed time: 24.973ms (std::chrono Measured) + [ 1 1 3 1 2 3 3 1 1 1 3 1 2 ... 1 3 ] + passed +==== work-efficient compact, power-of-two ==== + Time record is [1.9505, 1.6153, 1.6267, 1.6086, 1.6013, 1.6535, 1.6727, 1.7992, 1.7735, 2.1627, 1.7832, 1.7375, 1.7575, ... 1.682] + elapsed time: 1.8556ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + Time record is [1.6855, 1.9108, 1.7044, 1.6872, 1.7303, 1.9761, 1.7651, 1.9041, 1.6835, 1.6977, 1.7791, 1.6812, 1.7178, ... 1.6495] + elapsed time: 1.7805ms (CUDA Measured) + passed +Result for n = 2097152 is : + Time record is [13.311, 4.7256, 1.5804, 1.5617, 0.97037, 0.92851, 0.30972, 0.31379, 6.2666, 6.2747, 24.973, 1.8556, 1.7805, ... 0] +``` -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/doc/analysis.py b/doc/analysis.py new file mode 100644 index 0000000..ce7ab35 --- /dev/null +++ b/doc/analysis.py @@ -0,0 +1,75 @@ +import csv +import matplotlib.pyplot as plt + +data = [] +with open('lab2.csv', 'r') as csvfile: + spamreader = csv.reader(csvfile, delimiter=',') + for row in spamreader: + drow = [] + for data_str in row: + drow.append(float(data_str)) + data.append(drow) + +start = 10 +end = 21 + +cpu_scan = [] +naive_scan = [] +efficient_scan = [] +efficient_scan_npot = [] +thrust_scan = [] +thrust_scan_npot = [] +cpu_compact = [] +efficient_compact = [] + +x = [] + +# import pdb; pdb.set_trace() + +for i in range(start, end + 1): + x.append(i) + row = data[i - 10] + cpu_scan.append(row[0]) + naive_scan.append(row[2]) + efficient_scan.append(row[4]) + efficient_scan_npot.append(row[6]) + thrust_scan.append(row[6]) + thrust_scan_npot.append(row[7]) + cpu_compact.append(row[8]) + efficient_compact.append(row[11]) + +fig, ax = plt.subplots() + +### Scan +# plt.plot(x, cpu_scan, label="cpu_scan") +# plt.plot(x, naive_scan, label="naive_scan") +# plt.plot(x, efficient_scan, label="work_efficient") +# plt.plot(x, thrust_scan, label="thrust_scane") +# plt.title("Scan time") + +### NPOT +# plt.plot(x, efficient_scan, label="work-efficient scan") +# plt.plot(x, efficient_scan_npot, label="work-efficient scan NPOT") +# plt.title("Scan time") + + +### Thrust NPOT +# plt.plot(x, thrust_scan, label="thrust scan") +# plt.plot(x, thrust_scan_npot, label="thrust scan NPOT") +# plt.title("Scan time") + +### Compact +# plt.plot(x, cpu_compact, label="cpu compact") +# plt.plot(x, efficient_compact, label="work efficient compact") +# plt.title("Compact time") + +### GPU +plt.plot(x, naive_scan, label="naive_scan") +plt.plot(x, efficient_scan, label="work_efficient") +plt.plot(x, thrust_scan, label="thrust_scan") +plt.title("GPU Scan time") + +plt.xlabel("Number (in base 2)") +plt.ylabel("Time (ms)") +plt.legend() +plt.show() \ No newline at end of file diff --git a/doc/compact.png b/doc/compact.png new file mode 100644 index 0000000..36a6aae Binary files /dev/null and b/doc/compact.png differ diff --git a/doc/gpu_scan.png b/doc/gpu_scan.png new file mode 100644 index 0000000..9a8c82a Binary files /dev/null and b/doc/gpu_scan.png differ diff --git a/doc/lab2.csv b/doc/lab2.csv new file mode 100644 index 0000000..bf95092 --- /dev/null +++ b/doc/lab2.csv @@ -0,0 +1,12 @@ +0.001885, 0.001899, 0.036519, 0.03666, 0.10034, 0.12692, 0.19843, 0.18431, 0.002746, 0.004293, 0.007446, 0.2877, 0.32142 +0.005164, 0.003933, 0.05982, 0.064216, 0.16541, 0.13125, 0.15975, 0.1403, 0.003983, 0.005303, 0.010544, 0.30586, 0.37939 +0.009546, 0.007338, 0.046396, 0.046294, 0.13717, 0.1225, 0.14251, 0.12847, 0.020095, 0.008572, 0.045547, 0.3119, 0.32465 +0.009546, 0.007338, 0.046396, 0.046294, 0.13717, 0.1225, 0.14251, 0.12847, 0.020095, 0.008572, 0.045547, 0.3119, 0.32465 +0.032347, 0.03035, 0.057651, 0.057629, 0.14566, 0.1432, 0.17856, 0.17249, 0.062693, 0.068325, 0.141, 0.39171, 0.40263 +0.072447, 0.066555, 0.065433, 0.065277, 0.16558, 0.17479, 0.16487, 0.20362, 0.12599, 0.11522, 0.27981, 0.34057, 0.50167 +0.13301, 0.1447, 0.087247, 0.087203, 0.16367, 0.19217, 0.13575, 0.18643, 0.28578, 0.25115, 0.55336, 0.44775, 0.34398 +0.28354, 0.33327, 0.12242, 0.11939, 0.19577, 0.18088, 0.1958, 0.15035, 0.70451, 0.6561, 1.6623, 0.51467, 0.52659 +1.4489, 0.57192, 0.20105, 0.18198, 0.21742, 0.25338, 0.64729, 0.28838, 0.79568, 0.85801, 3.0992, 0.55925, 0.56684 +3.1769, 1.1265, 0.35451, 0.3327, 0.31406, 0.34869, 0.29619, 0.31233, 1.602, 1.6526, 6.3687, 0.68551, 0.66141 +6.3527, 2.0997, 0.9041, 0.8269, 0.34781, 0.3672, 0.28274, 0.28195, 3.196, 3.239, 12.361, 0.89818, 0.90519 +13.311, 4.7256, 1.5804, 1.5617, 0.97037, 0.92851, 0.30972, 0.31379, 6.2666, 6.2747, 24.973, 1.8556, 1.7805 \ No newline at end of file diff --git a/doc/scan_time.png b/doc/scan_time.png new file mode 100644 index 0000000..1ea81e8 Binary files /dev/null and b/doc/scan_time.png differ diff --git a/doc/scan_time_npot.png b/doc/scan_time_npot.png new file mode 100644 index 0000000..5401296 Binary files /dev/null and b/doc/scan_time_npot.png differ diff --git a/doc/thrust_scan_time_npot.png b/doc/thrust_scan_time_npot.png new file mode 100644 index 0000000..c3bcdf4 Binary files /dev/null and b/doc/thrust_scan_time_npot.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..989423f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,12 +13,40 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 10; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; +const int repeatTime = 100; +float *record = new float[repeatTime]; + +float getTimeAvg(float *src) { + float t = 0.f; + for (int i = 0; i < repeatTime; i++) { + t += src[i]; + } + return t / repeatTime; +} + +void printTime(float *src) { + std::cout << " Time record is ["; + std::cout.precision(5); + for (int i = 0; i < repeatTime; i++) { + if (repeatTime > 16 && i == 13) { + std::cout << "... "; + i = repeatTime - 2; + continue; + } + std::cout << src[i]; + if (i != repeatTime - 1) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; +} + int main(int argc, char* argv[]) { // Scan tests @@ -30,28 +58,45 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); + float res[13]; + // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. - zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + for (int i = 0; i < repeatTime; i++) { + zeroArray(SIZE, b); + StreamCompaction::CPU::scan(SIZE, b, a); + record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + } + printTime(record); + res[0] = getTimeAvg(record); + printElapsedTime(res[0], "(std::chrono Measured)"); printArray(SIZE, b, true); - zeroArray(SIZE, c); + printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + for (int i = 0; i < repeatTime; i++) { + zeroArray(SIZE, c); + StreamCompaction::CPU::scan(NPOT, c, a); + record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + } + printTime(record); + res[1] = getTimeAvg(record); + printElapsedTime(getTimeAvg(record), "(std::chrono Measured)"); printArray(NPOT, b, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + for (int i = 0; i < repeatTime; i++) { + zeroArray(SIZE, c); + StreamCompaction::Naive::scan(SIZE, c, a); + record[i] = StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(); + } + printTime(record); + res[2] = getTimeAvg(record); + printElapsedTime(getTimeAvg(record), "(CUDA Measured)"); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -59,39 +104,62 @@ int main(int argc, char* argv[]) { printDesc("1s array for finding bugs"); StreamCompaction::Naive::scan(SIZE, c, a); printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); + printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + for (int i = 0; i < repeatTime; i++) { + zeroArray(SIZE, c); + StreamCompaction::Naive::scan(NPOT, c, a); + record[i] = StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(); + } + printTime(record); + res[3] = getTimeAvg(record); + printElapsedTime(getTimeAvg(record), "(CUDA Measured)"); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + for (int i = 0; i < repeatTime; i++) { + zeroArray(SIZE, c); + StreamCompaction::Efficient::scan(SIZE, c, a); + record[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + } + printTime(record); + res[4] = getTimeAvg(record); + printElapsedTime(getTimeAvg(record), "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + for (int i = 0; i < repeatTime; i++) { + zeroArray(SIZE, c); + StreamCompaction::Efficient::scan(NPOT, c, a); + record[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + } + printTime(record); + res[5] = getTimeAvg(record); + printElapsedTime(getTimeAvg(record), "(CUDA Measured)"); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + for (int i = 0; i < repeatTime; i++) { + zeroArray(SIZE, c); + StreamCompaction::Thrust::scan(SIZE, c, a); + record[i] = StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(); + } + printTime(record); + res[6] = getTimeAvg(record); + printElapsedTime(getTimeAvg(record), "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + for (int i = 0; i < repeatTime; i++) { + zeroArray(SIZE, c); + StreamCompaction::Thrust::scan(NPOT, c, a); + record[i] = StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(); + } + printTime(record); + res[7] = getTimeAvg(record); + printElapsedTime(getTimeAvg(record), "(CUDA Measured)"); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); @@ -110,43 +178,71 @@ int main(int argc, char* argv[]) { // initialize b using StreamCompaction::CPU::compactWithoutScan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + for (int i = 0; i < repeatTime; i++) { + zeroArray(SIZE, b); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + } + printTime(record); + res[8] = getTimeAvg(record); + printElapsedTime(getTimeAvg(record), "(std::chrono Measured)"); expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); - zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + for (int i = 0; i < repeatTime; i++) { + zeroArray(SIZE, c); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + } + printTime(record); + res[9] = getTimeAvg(record); + printElapsedTime(getTimeAvg(record), "(std::chrono Measured)"); expectedNPOT = count; printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); - zeroArray(SIZE, c); printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + for (int i = 0; i < repeatTime; i++) { + zeroArray(SIZE, c); + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + record[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + } + printTime(record); + res[10] = getTimeAvg(record); + printElapsedTime(getTimeAvg(record), "(std::chrono Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + for (int i = 0; i < repeatTime; i++) { + zeroArray(SIZE, c); + count = StreamCompaction::Efficient::compact(SIZE, c, a); + record[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + } + printTime(record); + res[11] = getTimeAvg(record); + printElapsedTime(getTimeAvg(record), "(CUDA Measured)"); //printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + for (int i = 0; i < repeatTime; i++) { + zeroArray(SIZE, c); + count = StreamCompaction::Efficient::compact(NPOT, c, a); + record[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + } + printTime(record); + res[12] = getTimeAvg(record); + printElapsedTime(getTimeAvg(record), "(CUDA Measured)"); //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + std::cout << "Result for n = " << SIZE << " is :" << std::endl; + printTime(res); + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..7a373c6 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,9 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int idx = threadIdx.x + (blockDim.x * blockIdx.x); + if (idx >= n) return; + bools[idx] = idata[idx] == 0 ? 0 : 1; } /** @@ -32,7 +34,11 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int idx = threadIdx.x + (blockDim.x * blockIdx.x); + if (idx >= n) return; + if (bools[idx] == 1) { + odata[indices[idx]] = idata[idx]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..aa7d58c 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -11,7 +11,14 @@ #include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) +#ifdef _DEBUG #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#else +#define checkCUDAError(msg) +#endif // _DEBUG + + +#define blockSize 128 /** * Check for CUDA errors; print and exit if there was a problem. diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..2172a8a 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,10 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -30,9 +33,14 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int j = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[j++] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return j; } /** @@ -41,10 +49,27 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + int *temp = new int[n], + *tempSum = new int[n]; timer().startCpuTimer(); - // TODO + for (int i = 0; i < n; i++) { + temp[i] = idata[i] == 0 ? 0 : 1; + } + tempSum[0] = 0; + for (int i = 1; i < n; i++) { + tempSum[i] = tempSum[i - 1] + temp[i - 1]; + } + int cnt = 0; + for (int i = 0; i < n; i++) { + if (temp[i] == 1) { + odata[tempSum[i]] = idata[i]; + cnt++; + } + } timer().endCpuTimer(); - return -1; + delete[] temp; + delete[] tempSum; + return cnt; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..4878221 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,13 +12,73 @@ namespace StreamCompaction { return timer; } + __global__ void upSweep(int numThreads, int *data, int d) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= numThreads) return; + int interval = 1 << d; + int mapped = interval * idx + interval - 1; + data[mapped] += data[mapped - (interval >> 1)]; + } + + __global__ void downSweep(int numThreads, int *data, int d) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= numThreads) return; + int interval = 1 << d; + int node = interval * idx + interval - 1; + int left = node - (interval >> 1); + int temp = data[left]; + data[left] = data[node]; + data[node] += temp; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + void scan(int n, int *dev_odata, const int *dev_idata, bool callFromMain) { + int iterations = ilog2ceil(n); + int nextN = 1 << iterations; + int *dev_idata_temp; + cudaMalloc((void **) &dev_idata_temp, nextN * sizeof(int)); + checkCUDAError("SCAN: cudaMalloc dev_idata_temp failed"); + cudaMemset(dev_idata_temp, 0, nextN *sizeof(int)); + checkCUDAError("SCAN: cudaMemset dev_idata_temp failed"); + if (callFromMain) { + cudaMemcpy(dev_idata_temp, dev_idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); + } + else { + cudaMemcpy(dev_idata_temp, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToDevice); + } + checkCUDAError("SCAN: cudaMemcpy dev_idata_temp failed"); + + // Up-sweep + for (int d = 1; d <= iterations; d++) { + int numThreads = 1 << (iterations - d); + dim3 blocks((numThreads + blockSize - 1) / blockSize); + upSweep<<>>(numThreads, dev_idata_temp, d); + checkCUDAError("SCAN: upSweep failed"); + } + + // Down-sweep + // Set the "root" to 0 + cudaMemset(&dev_idata_temp[nextN - 1], 0, sizeof(int)); + for (int d = iterations; d >= 1; d--) { + int numThreads = 1 << (iterations - d); + dim3 blocks((numThreads + blockSize - 1) / blockSize); + downSweep<<>>(numThreads, dev_idata_temp, d); + checkCUDAError("SCAN: downSweep failed"); + } + + if (callFromMain) { + timer().endGpuTimer(); + cudaMemcpy(dev_odata, dev_idata_temp, sizeof(int) * n, cudaMemcpyDeviceToHost); + } + else { + cudaMemcpy(dev_odata, dev_idata_temp, sizeof(int) * n, cudaMemcpyDeviceToDevice); + } + checkCUDAError("SCAN: cudaMemcpy dev_odata failed"); + + cudaFree(dev_idata_temp); } /** @@ -31,10 +91,42 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; + int *bools, *indices, *dev_idata, *dev_odata; + cudaMalloc((void**) &bools, sizeof(int) * n); + checkCUDAError("COMPACT: cudaMalloc bools failed"); + cudaMalloc((void**) &indices, sizeof(int) * n); + checkCUDAError("COMPACT: cudaMalloc indices failed"); + cudaMalloc((void**) &dev_idata, sizeof(int) * n); + checkCUDAError("COMPACT: cudaMalloc dev_idata failed"); + cudaMalloc((void**) &dev_odata, sizeof(int) * n); + checkCUDAError("COMPACT: cudaMalloc dev_odata failed"); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("COMPACT: cudaMalloc idata->dev_idata failed"); + + timer().startGpuTimer(); + + dim3 blocks((n + blockSize - 1) / blockSize); + Common::kernMapToBoolean<<>>(n, bools, dev_idata); + checkCUDAError("COMPACT: kernMapToBoolean failed"); + scan(n, indices, bools, false); + Common::kernScatter<<>>(n, dev_odata, dev_idata, bools, indices); + checkCUDAError("COMPACT: kernScatter failed"); + + timer().endGpuTimer(); + + int cnt, lastBool; + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("COMPACT: cudaMemcpy dev_odata->odata failed"); + // Copy the count back + cudaMemcpy(&cnt, &indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("COMPACT: cudaMemcpy indices->cnt failed"); + cudaMemcpy(&lastBool, &bools[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("COMPACT: cudaMemcpy bools->lastBool failed"); + cudaFree(bools); + cudaFree(indices); + cudaFree(dev_idata); + cudaFree(dev_odata); + return lastBool ? cnt + 1 : cnt; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..5ba93b0 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,7 +6,7 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, bool useTimer = true); int compact(int n, int *odata, const int *idata); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..9fd83e9 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,45 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + __global__ void addPrev(int n, int *idata, int *odata, int d) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) return; + int base = 1 << (d - 1); + odata[idx] = idx >= base ? idata[idx - base] + idata[idx] : idata[idx]; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int *dev_idata, *dev_odata; + cudaMalloc((void **) &dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed"); + cudaMalloc((void **) &dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed"); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_idata failed"); + timer().startGpuTimer(); - // TODO + int iterations = ilog2ceil(n); + + dim3 blocks((n + blockSize - 1) / blockSize); + for (int d = 1; d <= iterations; d++) { + if (d % 2 == 1) { + addPrev << > > (n, dev_idata, dev_odata, d); + } + else { + addPrev << > > (n, dev_odata, dev_idata, d); + } + checkCUDAError("addPrev failed"); + } + timer().endGpuTimer(); + odata[0] = 0; + cudaMemcpy(odata + 1, (iterations % 2 == 1) ? dev_odata : dev_idata, sizeof(int) * (n - 1), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy odata failed"); + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..cb3bf9d 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,13 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::host_vector host_idata(idata, idata + n); + thrust::device_vector dev_idata = host_idata; + thrust::device_vector dev_odata(n); timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); timer().endGpuTimer(); + thrust::copy(dev_odata.begin(), dev_odata.end(), odata); } } }