diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md index 4503fac..cb1b35b 100644 --- a/Project2-Character-Recognition/README.md +++ b/Project2-Character-Recognition/README.md @@ -3,12 +3,17 @@ CUDA Character Recognition **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Joshua Nadel + * https://www.linkedin.com/in/joshua-nadel-379382136/, http://www.joshnadel.com/ +* Tested on: Windows 10, i7-6700HQ @ 2.60GHz 16GB, GTX 970M (Personal laptop) -### (TODO: Your README) +### Character Recognition -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +![](img/output.png) + +The program outputs the total error before training and total error after training. As you can see, total error decreases after optimizing the network's weights on training data. + +My implementation is hard-coded at 3 layers. I cannot test framerates at other network complexities. + +I am limited to a data set of 101 by 101 images. I have no other training or testing data to compare performance on image size with. diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt index 7446175..556196a 100644 --- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt @@ -7,5 +7,5 @@ set(SOURCE_FILES cuda_add_library(character_recognition ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_52 ) diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index 5a3ed7f..198e5a1 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -2,6 +2,14 @@ #include #include "common.h" #include "mlp.h" +#include + +#define DIM 101 +#define LABELS 52 + +#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) + +#define blockSize 128 namespace CharacterRecognition { using Common::PerformanceTimer; @@ -10,18 +18,199 @@ namespace CharacterRecognition { static PerformanceTimer timer; return timer; } + + float *dev_input; + float *dev_hidden; + float *dev_output; + float *dev_w_kj; + float *dev_w_ki; + + int inputDims = DIM * DIM; + int hiddenDims = inputDims; + int outputDims = LABELS; + + __global__ void backprop(float *inputLr, float *hiddenLr, float *outputLr, int n_input, int n_hidden, int n_output, float *weightsIH, float *weightsHO, float *d_weightsIH, float *d_weightsHO, float label) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n_input) { + return; + } + + for (int i = 0; i < n_hidden; i++) { + float totalError = 0.0f; + float gradientProd = 1.0f; + float weightProd = 1.0f; + for (int j = 0; j < n_output; j++) { + float expected = label == j ? 1 : 0; + float error = expected - outputLr[j]; + + int weight2Index = i + j * n_output; + d_weightsHO[weight2Index] += hiddenLr[i] * outputLr[j] * (1 - outputLr[j]) * -error; + totalError += error; + gradientProd *= outputLr[j] * (1 - outputLr[j]); + weightProd *= weightsHO[weight2Index]; + } + int weight1Index = index + i * n_hidden; + d_weightsIH[weight1Index] += inputLr[index] * hiddenLr[i] * (1 - hiddenLr[i]) * -totalError * gradientProd * weightProd; + } + } + + __global__ void zeroBuffer(float *buffer, int n) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + buffer[index] = 0; + } + + __global__ void addTwoBuffers(float *addTo, float *addFrom, float lambda, int n) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + addTo[index] += addFrom[index] * lambda; + } - // TODO: __global__ - - /** - * Example of use case (follow how you did it in stream compaction) - */ - /*void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - */ + void train(float lambda) { + dim3 fullBlocksPerGrid((inputDims + blockSize - 1) / blockSize); + dim3 fullBlocksPerGridInToHid((inputDims * hiddenDims + blockSize - 1) / blockSize); + dim3 fullBlocksPerGridHidToOut((hiddenDims * outputDims + blockSize - 1) / blockSize); + + float totalError = 0; + + float *dev_d_w_kj; + float *dev_d_w_ki; + cudaMalloc((void**)&dev_d_w_kj, inputDims * hiddenDims * sizeof(float)); + checkCUDAErrorWithLine("cudaMalloc dev_d_w_kj failed!"); + cudaMalloc((void**)&dev_d_w_ki, hiddenDims * outputDims * sizeof(float)); + checkCUDAErrorWithLine("cudaMalloc dev_d_w_ki failed!"); + + for (int i = 0; i < LABELS; i++) { + + std::string filename = "../data-set/"; + std::string number = std::to_string(i + 1); + if (number.length() == 1) { + number = std::string("0").append(number); + } + filename.append(number); + filename.append("info.txt"); + FILE * image = std::fopen(filename.c_str(), "r"); + int label; + int dimensions; + fscanf(image, "%d", &label); + fscanf(image, "%d", &dimensions); + float *colors = new float[dimensions]; + for (int j = 0; j < dimensions; j++) { + int color; + fscanf(image, "%d", &color); + colors[j] = color; + } + + float *output = new float[outputDims]; + evaluate(colors, output); + backprop << > > (dev_input, dev_hidden, dev_output, inputDims, hiddenDims, outputDims, dev_w_kj, dev_w_ki, dev_d_w_kj, dev_d_w_ki, i); + for (int j = 0; j < outputDims; j++) { + float expected = i == j ? 1 : 0; + float error = expected - output[j]; + totalError += error * error; + } + delete[] colors; + delete[] output; + } + totalError /= 2.f; + + addTwoBuffers << > > (dev_w_kj, dev_d_w_kj, -lambda * totalError, inputDims * hiddenDims); + addTwoBuffers << > > (dev_w_ki, dev_d_w_ki, -lambda * totalError, hiddenDims * outputDims); + + cudaFree(dev_d_w_kj); + cudaFree(dev_d_w_ki); + + printf("Total error is %f\n", totalError); + } + + __global__ void kernComputeLayer(float *inputLr, float *outputLr, int n_input, int n_output, float *weights) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n_output) { + return; + } + + // Weighted sum of previous layer inputs + for (int i = 0; i < n_input; i++) { + int weightIndex = i + index * n_input; + outputLr[index] += inputLr[i] * weights[weightIndex]; + } + + // Activation function + outputLr[index] = 1 / (1 + expf(-outputLr[index])); + } + + void printArray(const float *array, int n) { + printf("["); + for (int i = 0; i < n; i++) { + printf("%f, ", array[i]); + } + printf("]\n"); + } + + void evaluate(float *input, float *output) { + dim3 fullBlocksPerGridInToHid((inputDims * hiddenDims + blockSize - 1) / blockSize); + dim3 fullBlocksPerGridHidToOut((hiddenDims * outputDims + blockSize - 1) / blockSize); + dim3 fullBlocksPerGrid((inputDims + blockSize - 1) / blockSize); + + zeroBuffer << > > (dev_input, inputDims); + zeroBuffer << > > (dev_hidden, hiddenDims); + zeroBuffer << > > (dev_output, outputDims); + + cudaMemcpy(dev_input, input, sizeof(float) * inputDims, cudaMemcpyHostToDevice); + + kernComputeLayer << > > (dev_input, dev_hidden, inputDims, hiddenDims, dev_w_kj); + kernComputeLayer << > > (dev_hidden, dev_output, hiddenDims, outputDims, dev_w_ki); + + cudaMemcpy(output, dev_output, sizeof(float) * outputDims, cudaMemcpyDeviceToHost); + } + + void init() { + dim3 fullBlocksPerGridInToHid((inputDims * hiddenDims + blockSize - 1) / blockSize); + dim3 fullBlocksPerGridHidToOut((hiddenDims * outputDims + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_input, inputDims * sizeof(float)); + checkCUDAErrorWithLine("cudaMalloc dev_input failed!"); + + cudaMalloc((void**)&dev_hidden, hiddenDims * sizeof(float)); + checkCUDAErrorWithLine("cudaMalloc dev_hidden failed!"); + + cudaMalloc((void**)&dev_output, outputDims * sizeof(float)); + checkCUDAErrorWithLine("cudaMalloc dev_output failed!"); + + cudaMalloc((void**)&dev_w_kj, inputDims * hiddenDims * sizeof(float)); + checkCUDAErrorWithLine("cudaMalloc dev_w_kj failed!"); + + cudaMalloc((void**)&dev_w_ki, hiddenDims * outputDims * sizeof(float)); + checkCUDAErrorWithLine("cudaMalloc dev_w_ki failed!"); + + float *weights1 = new float[inputDims * hiddenDims]; + for (int i = 0; i < inputDims * hiddenDims; i++) { + float r = ((double)rand() / (RAND_MAX)); + weights1[i] = r * 2.0f - 1.0f; + } + float *weights2 = new float[hiddenDims * outputDims]; + for (int i = 0; i < hiddenDims * outputDims; i++) { + float r = ((double)rand() / (RAND_MAX)); + weights2[i] = r * 2.0f - 1.0f; + } + cudaMemcpy(dev_w_kj, weights1, sizeof(float) * inputDims * hiddenDims, cudaMemcpyHostToDevice); + cudaMemcpy(dev_w_ki, weights2, sizeof(float) * hiddenDims * outputDims, cudaMemcpyHostToDevice); + + delete[] weights1; + delete[] weights2; + } - // TODO: implement required elements for MLP sections 1 and 2 here + void end() { + cudaFree(dev_input); + cudaFree(dev_hidden); + cudaFree(dev_output); + cudaFree(dev_w_kj); + cudaFree(dev_w_ki); + } } diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h index 2096228..2a79d18 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.h +++ b/Project2-Character-Recognition/character_recognition/mlp.h @@ -5,5 +5,8 @@ namespace CharacterRecognition { Common::PerformanceTimer& timer(); - // TODO: implement required elements for MLP sections 1 and 2 here + void init(); + void train(float lambda); + void evaluate(float *input, float *output); + void end(); } diff --git a/Project2-Character-Recognition/img/output.png b/Project2-Character-Recognition/img/output.png new file mode 100644 index 0000000..033d17c Binary files /dev/null and b/Project2-Character-Recognition/img/output.png differ diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 11dd534..34780f7 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -11,142 +11,57 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; - -int main(int argc, char* argv[]) { - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); +void printArray(const float *array, int n) { + printf("["); + for (int i = 0; i < n; i++) { + printf("%f, ", array[i]); + } + printf("]\n"); +} - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); +void testImage(int i, float *output) { + std::string filename = "../data-set/"; + std::string number = std::to_string(i + 1); + if (number.length() == 1) { + number = std::string("0").append(number); + } + filename.append(number); + filename.append("info.txt"); + FILE * image = std::fopen(filename.c_str(), "r"); + int label; + int dimensions; + fscanf(image, "%d", &label); + fscanf(image, "%d", &dimensions); + float *colors = new float[dimensions]; + for (int j = 0; j < dimensions; j++) { + int color; + fscanf(image, "%d", &color); + colors[j] = color; + } + + CharacterRecognition::evaluate(colors, output); + fclose(image); + delete[] colors; +} - // Compaction tests +int main(int argc, char* argv[]) { + float *output = new float[52]; - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); + CharacterRecognition::init(); - int count, expectedCount, expectedNPOT; + testImage(0, output); + //printArray(output, 52); - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); + CharacterRecognition::train(0.2f); - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + testImage(0, output); + //printArray(output, 52); - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); + CharacterRecognition::train(0.2f); - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); + //printArray(output, 52); - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + delete[] output; - system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; + CharacterRecognition::end(); } diff --git a/Project2-Stream-Compaction/README.md b/Project2-Stream-Compaction/README.md index 0e38ddb..0d61941 100644 --- a/Project2-Stream-Compaction/README.md +++ b/Project2-Stream-Compaction/README.md @@ -3,12 +3,85 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Joshua Nadel + * https://www.linkedin.com/in/joshua-nadel-379382136/, http://www.joshnadel.com/ +* Tested on: Windows 10, i7-6700HQ @ 2.60GHz 16GB, GTX 970M (Personal laptop) -### (TODO: Your README) +### Scan / Stream Compaction -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +This project demonstrates the ability of the GPU to quickly complete algorithms that are, in serial, quite slow. It contains both serial and parallel implementations of scan and stream compaction algorithms, and uses timers to compare their performances on large quantities of data. +The list of features includes: +* Serial scan implementation on the CPU +* Serial compact implementation on the CPU +* Naive scan implementation on the GPU +* Work-efficient scan implementation on the GPU +* Work-efficient compact implementation on the GPU +* Wrapper for thrust's scan implementation to compare performance +Tested at block size of 128. + +![](img/timeOverLength.png) + +Less runtime means a more optimal implementation + +I do not know how the thrust implementation manages to be so efficient at such large array sizes. In fact, it seems to become increasingly efficient with array length. + +It is interesting to note that the efficient implementation is consistently slower than the naive implementation. This is likely due to memory access order. + +Predictably, the serial CPU version increases in runtime proportionally to the increase in array size. + +The program output reads: +``` +**************** +** SCAN TESTS ** +**************** + [ 25 18 37 41 29 8 10 48 21 22 19 18 6 ... 39 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0588ms (std::chrono Measured) + [ 0 25 43 80 121 150 158 168 216 237 259 278 296 ... 25536 25575 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0569ms (std::chrono Measured) + [ 0 25 43 80 121 150 158 168 216 237 259 278 296 ... 25478 25497 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.10656ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.105216ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.116672ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.13984ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.119264ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.071648ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 2 1 0 2 1 1 0 1 0 1 0 1 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0038ms (std::chrono Measured) + [ 2 2 1 2 1 1 1 1 1 3 2 1 2 ... 2 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0032ms (std::chrono Measured) + [ 2 2 1 2 1 1 1 1 1 3 2 1 2 ... 1 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.069ms (std::chrono Measured) + [ 2 2 1 2 1 1 1 1 1 3 2 1 2 ... 2 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.199904ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.1856ms (CUDA Measured) + passed +Press any key to continue . . .``` \ No newline at end of file diff --git a/Project2-Stream-Compaction/img/timeOverLength.png b/Project2-Stream-Compaction/img/timeOverLength.png new file mode 100644 index 0000000..85b8946 Binary files /dev/null and b/Project2-Stream-Compaction/img/timeOverLength.png differ diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp index d016553..3569b2e 100644 --- a/Project2-Stream-Compaction/src/main.cpp +++ b/Project2-Stream-Compaction/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 10; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt index cdbef77..6ad13f6 100644 --- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt +++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_52 ) diff --git a/Project2-Stream-Compaction/stream_compaction/common.cu b/Project2-Stream-Compaction/stream_compaction/common.cu index 2ed6d63..8f9d82f 100644 --- a/Project2-Stream-Compaction/stream_compaction/common.cu +++ b/Project2-Stream-Compaction/stream_compaction/common.cu @@ -23,7 +23,17 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index > n) { + return; + } + + if (idata[index] == 0) { + bools[index] = 0; + } + else { + bools[index] = 1; + } } /** @@ -32,7 +42,14 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index > n) { + return; + } + + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } } } diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu index a2d3e6c..cb00e33 100644 --- a/Project2-Stream-Compaction/stream_compaction/cpu.cu +++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu @@ -19,7 +19,13 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + for (int i = 0; i < n; i++) { + int sum = 0; + for (int j = 0; j < i; j++) { + sum += idata[j]; + } + odata[i] = sum; + } timer().endCpuTimer(); } @@ -30,9 +36,16 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int j = 0; + for (int i = 0; i < n; i++) { + int toPut = idata[i]; + if (toPut != 0) { + odata[j] = toPut; + j++; + } + } timer().endCpuTimer(); - return -1; + return j; } /** @@ -42,9 +55,42 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int *mappedArray = new int[n]; + for (int i = 0; i < n; i++) { + mappedArray[i] = 0; + } + for (int i = 0; i < n; i++) { + if (idata[i] == 0) mappedArray[i] = 0; + else mappedArray[i] = 1; + } + int *scannedArray = new int[n]; + for (int i = 0; i < n; i++) { + scannedArray[i] = 0; + } + scanNoTimer(n, scannedArray, mappedArray); + int count = 0; + for (int i = 0; i < n; i++) { + int toPut = idata[i]; + int indexToPut = scannedArray[i]; + if (toPut != 0) { + odata[indexToPut] = toPut; + count++; + } + } timer().endCpuTimer(); - return -1; + delete[] mappedArray; + delete[] scannedArray; + return count; } + + void scanNoTimer(int n, int *odata, const int *idata) { + for (int i = 0; i < n; i++) { + int sum = 0; + for (int j = 0; j < i; j++) { + sum += idata[j]; + } + odata[i] = sum; + } + } } } diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.h b/Project2-Stream-Compaction/stream_compaction/cpu.h index 236ce11..79aeaba 100644 --- a/Project2-Stream-Compaction/stream_compaction/cpu.h +++ b/Project2-Stream-Compaction/stream_compaction/cpu.h @@ -11,5 +11,7 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); + + void scanNoTimer(int n, int *odata, const int *idata); } } diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu index 2db346e..57fe766 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.cu +++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu @@ -3,6 +3,10 @@ #include "common.h" #include "efficient.h" +#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) + +#define blockSize 128 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,15 +16,104 @@ namespace StreamCompaction { return timer; } + int *dev_data; + + __global__ void upSweep(int *data, int n, int d) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int stride = powf(2, d + 1); + if (index >= n || index % stride != 0) { + return; + } + + int index2 = index + powf(2, d) - 1; + data[index + stride - 1] += data[index2]; + } + + __global__ void downSweep(int *data, int n, int d) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int stride = powf(2, d + 1); + if (index >= n || index % stride != 0) { + return; + } + + int index2 = index + powf(2, d) - 1; + int index3 = index + powf(2, d + 1) - 1; + int t = data[index2]; + data[index2] = data[index3]; + data[index3] += t; + } + + __global__ void copyBuffer(const int *source, int *dest, int n) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + dest[index] = source[index]; + } + + __global__ void kern0LastElement(int *data, int n) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index != n - 1) { + return; + } + + data[index] = 0; + } + + __global__ void kernReduction(int *data, int n, int d) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int stride = powf(2, d + 1); + if (index >= n || index % stride != 0) { + return; + } + + int index2 = index + powf(2, d) - 1; + data[index + stride - 1] += data[index2]; + } + + void printArray(const int *array, int n) { + printf("["); + for (int i = 0; i < n; i++) { + printf("%d, ", array[i]); + } + printf("]\n"); + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + int n2 = pow(2, ceil(log2(n))); + + dim3 fullBlocksPerGrid((n2 + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_data, n2 * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_data failed!"); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + for (int d = 0; d < log2(n2); d++) { + upSweep << > > (dev_data, n2, d); + checkCUDAErrorWithLine("Up sweep failed!"); + } + kern0LastElement << > > (dev_data, n2); + for (int d = log2(n2) - 1; d >= 0; d--) { + downSweep << > > (dev_data, n2, d); + checkCUDAErrorWithLine("Down sweep failed!"); + } + + timer().endGpuTimer(); + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data); } + int *dev_bools; + int *dev_idata; + int *dev_odata; + int *dev_scanned; + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -31,10 +124,51 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int n2 = pow(2, ceil(log2(n))); + + dim3 fullBlocksPerGrid((n2 + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_bools, n2 * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_bools failed!"); + cudaMalloc((void**)&dev_idata, n2 * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, n2 * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_odata failed!"); + cudaMalloc((void**)&dev_scanned, n2 * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_scanned failed!"); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + Common::kernMapToBoolean << > > (n2, dev_bools, dev_idata); + cudaMemcpy(dev_scanned, dev_bools, sizeof(int) * n2, cudaMemcpyDeviceToDevice); + for (int d = 0; d < log2(n2); d++) { + upSweep << > > (dev_scanned, n2, d); + checkCUDAErrorWithLine("Up sweep failed!"); + } + kern0LastElement << > > (dev_scanned, n2); + for (int d = log2(n2) - 1; d >= 0; d--) { + downSweep << > > (dev_scanned, n2, d); + checkCUDAErrorWithLine("Down sweep failed!"); + } + Common::kernScatter << > > (n2, dev_odata, dev_idata, dev_bools, dev_scanned); + for (int d = 0; d < log2(n2); d++) { + kernReduction << > > (dev_bools, n2, d); + checkCUDAErrorWithLine("Reduction failed!"); + } timer().endGpuTimer(); - return -1; + + int *summedBools = new int[n2]; + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaMemcpy(summedBools, dev_bools, sizeof(int) * n2, cudaMemcpyDeviceToHost); + int toReturn = summedBools[n2 - 1]; + + cudaFree(dev_bools); + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_scanned); + delete[] summedBools; + + return toReturn; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu index 4308876..2c7795c 100644 --- a/Project2-Stream-Compaction/stream_compaction/naive.cu +++ b/Project2-Stream-Compaction/stream_compaction/naive.cu @@ -3,6 +3,10 @@ #include "common.h" #include "naive.h" +#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) + +#define blockSize 128 + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -11,15 +15,84 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + int *dev_odata; + int *dev_idata; + + __global__ void scanHelper(int *odata, int *idata, int n, int d) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + if (index >= powf(2, d - 1)) { + int dataIndex = index - powf(2, d - 1); + odata[index] = idata[dataIndex] + idata[index]; + } + else { + odata[index] = idata[index]; + } + } + + __global__ void copyBuffer(const int *source, int *dest, int n) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + dest[index] = source[index]; + } + + __global__ void shiftBuffer(const int *source, int *dest, int n) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index > n) { + return; + } + + if (index == 0) { + dest[index] = 0; + } + else { + dest[index] = source[index - 1]; + } + } + + void printArray(const int *array, int n) { + printf("["); + for (int i = 0; i < n; i++) { + printf("%d, ", array[i]); + } + printf("]\n"); + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int n2 = pow(2, ceil(log2(n))); + + dim3 fullBlocksPerGrid((n2 + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_idata, n2 * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, n2 * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_odata failed!"); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, odata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + + for (int d = 1; d <= log2(n2); d++) { + scanHelper << > > (dev_odata, dev_idata, n2, d); + checkCUDAErrorWithLine("Scan helper failed!"); + copyBuffer << > > (dev_odata, dev_idata, n2); + checkCUDAErrorWithLine("Copy buffer failed!"); + } + shiftBuffer << > > (dev_idata, dev_odata, n2); + timer().endGpuTimer(); + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu index 1def45e..cb858b5 100644 --- a/Project2-Stream-Compaction/stream_compaction/thrust.cu +++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu @@ -18,11 +18,17 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::host_vector h_in(idata, idata + n); + thrust::host_vector h_out(odata, odata + n); + thrust::device_vector dv_in = h_in; + thrust::device_vector dv_out = h_out; timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } } diff --git a/README.md b/README.md index 3a0b2fe..f260cf6 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,15 @@ CUDA Number Algorithms **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Joshua Nadel + * https://www.linkedin.com/in/joshua-nadel-379382136/, http://www.joshnadel.com/ +* Tested on: Windows 10, i7-6700HQ @ 2.60GHz 16GB, GTX 970M (Personal laptop) -### (TODO: Your README) +### Number Algorithms -Link to the readmes of the other two subprojects. +[Stream Compaction](Project2-Stream-Compaction/README.md) + +[Character Recognition](Project2-Character-Recognition/README.md) Add anything else you think is relevant up to this point. (Remember, this is public, so don't put anything here that you don't want to share with the world.)