diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md
index 4503fac..83f9979 100644
--- a/Project2-Character-Recognition/README.md
+++ b/Project2-Character-Recognition/README.md
@@ -3,12 +3,28 @@ CUDA Character Recognition
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Tabatha Hickman
+  * LinkedIn: https://www.linkedin.com/in/tabatha-hickman-335987140/
+* Tested on: Windows 10 Pro, i7-5600U CPU @ 2.60GHz 16GB, GeForce 840M (personal computer)
-### (TODO: Your README)
+## Neural Network Implementation
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+This project's purpose was to build a neural network that does its computation on the GPU. I implemented a multi-layer perceptron with one hidden layer, so there are three layers in total (input, hidden, output). The network is evaluated by feeding values forward from one layer to the next: each node in a layer takes the sum of every node in the previous layer multiplied by the weight on the connection between those two nodes, then passes that sum through an activation function. In this case the activation function is ```f(x) = 1/(1+e^-x)``` (a small CPU-side sketch of this step appears at the end of this README).
+
+We want to find the set of weights that makes the network's outputs as accurate as possible, which we do in a training phase. We start with random values for the weights. Then, given inputs and their corresponding target outputs, we run the inputs through the network, compare the outputs with the targets, and compute the resulting error. Through backpropagation we then update each weight based on that error so that the next pass produces more accurate output.
+
+Once the network has been trained adequately, we can run new inputs through it and see whether it produces good results.
+
+Using provided weights for a working XOR neural network, I was able to verify that my code correctly feeds forward and builds the network. I was also able to train my own fairly accurate weights for XOR (with a target error of 0.01):
+
+```
+Ran 13101 iterations of training
+ (0, 0) expected: 0.000000, result 0.071486
+ (0, 1) expected: 1.000000, result 0.930205
+ (1, 0) expected: 1.000000, result 0.923021
+ (1, 1) expected: 0.000000, result 0.063928
+```
+
+Unfortunately, I had a lot of trouble extending this to character recognition. Training does not seem to work: the error is huge and does not improve with further iterations. While attempting to debug this I started getting "CUDA grid launch failed" errors; looking this up, I found it is related to the Windows TDR (Timeout Detection and Recovery) limit when debugging, but I could not find where to change that setting.
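+To make the feed-forward step concrete, here is a small CPU-side sketch of the same math (illustrative only, not the CUDA kernels in mlp.cu; the function and variable names are made up for this example). Each output node takes a weighted sum of the previous layer and applies the sigmoid, using the same flat row-major weight layout as the kernels (wtMat[idx * oDim + i]):
+
+```
+#include <cmath>
+#include <vector>
+
+// Evaluate one fully-connected layer: out[i] = f(sum_k w[k * oDim + i] * in[k]),
+// where f is the sigmoid activation f(x) = 1 / (1 + e^-x).
+std::vector<float> feedForwardLayer(const std::vector<float>& in,
+                                    const std::vector<float>& w, int oDim) {
+    std::vector<float> out(oDim, 0.0f);
+    for (int i = 0; i < oDim; i++) {
+        float sum = 0.0f;
+        for (size_t k = 0; k < in.size(); k++) {
+            sum += w[k * oDim + i] * in[k];   // weighted sum over the previous layer
+        }
+        out[i] = 1.0f / (1.0f + std::exp(-sum));   // sigmoid activation
+    }
+    return out;
+}
+```
+
+For reference, the error reported during training is half the sum of squared differences between targets and outputs for each sample, accumulated over the four XOR cases per iteration.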
+ diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt index 7446175..01edd01 100644 --- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt @@ -7,5 +7,5 @@ set(SOURCE_FILES cuda_add_library(character_recognition ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_30 ) diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index 5a3ed7f..c5ae05a 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -1,5 +1,6 @@ #include #include +#include #include "common.h" #include "mlp.h" @@ -23,5 +24,254 @@ namespace CharacterRecognition { } */ + #define blockSize 128 + + __host__ __device__ unsigned int hash(unsigned int a) { + a = (a + 0x7ed55d16) + (a << 12); + a = (a ^ 0xc761c23c) ^ (a >> 19); + a = (a + 0x165667b1) + (a << 5); + a = (a + 0xd3a2646c) ^ (a << 9); + a = (a + 0xfd7046c5) + (a << 3); + a = (a ^ 0xb55a4f09) ^ (a >> 16); + return a; + } + + __host__ __device__ float genRandom(float time, int index) { + thrust::default_random_engine rng(hash((int)(index * time))); + thrust::uniform_real_distribution unitDistrib(-1, 1); + + return (float)unitDistrib(rng); + } + + __global__ void kernInitRandomWeights(int N, float* wtMat, float scale) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + float rand = genRandom(N, index); + wtMat[index] = scale * rand; + } + } + + __global__ void kernInitZero(int N, float* data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) + { + data[index] = 0; + } + } + + __global__ void kernSumWeights(int iDim, int oDim, float* wtMat, float* idata, float* odata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= oDim) { return; } + + for (int idx = 0; idx < iDim; idx++) + { + int wtIdx = idx * oDim + index; + odata[index] += wtMat[wtIdx] * idata[idx]; + } + } + + __global__ void kernActivationFxn(int N, float* idata, float* odata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { return; } + + float x = idata[index]; + float e = exp(-x); + odata[index] = 1.0f / (1.0f + e); + } + + __global__ void kernCalcErrors(int N, float* target, float* output, float* odata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { return; } + + odata[index] = target[index] - output[index]; + } + + __global__ void kernEditWeightsji(int N, int iDim, float lambda, float* hidden, float* errors, float* outputSums, + float* partialErr, float* wtMat) + { + // for hidden to output weights: + // delta = lambda * value of hidden node * (target - output) * derivative of f(x) (where x is the sum before it went in f(x) or is just the output??) 
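+        // (x here is the pre-activation sum, i.e. the value fed into f; since the sigmoid satisfies
+        // f'(x) = f(x) * (1 - f(x)), the derivative can equally be computed from the activated output.)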
+ // derivative of f = f * (1-f) + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { return; } + + int i = index % iDim; + int j = index / iDim; + + float x = outputSums[i]; + float fx = 1.0f / (1.0f + exp(-x)); + partialErr[i] = errors[i] * fx * (1 - fx); + float deltaW = lambda * hidden[j] * partialErr[i]; + + wtMat[index] += deltaW; + } + + __global__ void kernEditWeightskj(int N, int jDim, int iDim, float lambda, float* input, float* hiddenSums, + float* partialErr, float* wji, + float* wtMat) + { + // for hidden to output weights: + // delta = lambda * value of input node * derivative of f(x) * + // derivative of f = f * (1-f) + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { return; } + + int j = index % jDim; + int k = index / jDim; + + float sumPropErrs = 0; + for (int i = 0; i < iDim; i++) + { + sumPropErrs += partialErr[i] * wji[j + i * jDim]; + } + + float x = hiddenSums[j]; + float fx = 1.0f / (1.0f + exp(-x)); + float deltaW = lambda * input[k] * sumPropErrs * fx * (1 - fx); + + wtMat[index] += deltaW; + } + + void makeWeightMat(int n, float* data) + { + float* dev_data; + cudaMalloc((void**)&dev_data, n * sizeof(float)); + + kernInitRandomWeights << > > (n, dev_data, 30); + + cudaMemcpy(data, dev_data, n * sizeof(float), cudaMemcpyDeviceToHost); + cudaFree(dev_data); + } + // TODO: implement required elements for MLP sections 1 and 2 here + float mlpTrain(int i, int j, int k, float* odata, float* idata, float* wkj, float* wji, float* target) + { + float *dev_input, *dev_hidden, *dev_output; + float *dev_hiddenSums, *dev_outputSums; + float *dev_wkj, *dev_wji; + float *dev_target, *dev_errors, *dev_partialErr, *dev_tempwji; + + cudaMalloc((void**)&dev_input, k * sizeof(float)); + cudaMalloc((void**)&dev_hidden, j * sizeof(float)); + cudaMalloc((void**)&dev_output, i * sizeof(float)); + cudaMemcpy(dev_input, idata, k * sizeof(float), cudaMemcpyHostToDevice); + + cudaMalloc((void**)&dev_hiddenSums, j * sizeof(float)); + cudaMalloc((void**)&dev_outputSums, i * sizeof(float)); + + cudaMalloc((void**)&dev_wkj, k * j * sizeof(float)); + cudaMalloc((void**)&dev_wji, j * i * sizeof(float)); + cudaMemcpy(dev_wkj, wkj, k * j * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(dev_wji, wji, j * i * sizeof(float), cudaMemcpyHostToDevice); + + cudaMalloc((void**)&dev_target, i * sizeof(float)); + cudaMalloc((void**)&dev_errors, i * sizeof(float)); + cudaMalloc((void**)&dev_partialErr, i * sizeof(float)); + cudaMalloc((void**)&dev_tempwji, i * j * sizeof(float)); + cudaMemcpy(dev_target, target, i * sizeof(float), cudaMemcpyHostToDevice); + + // initialize non input buffers to zeros + kernInitZero << > > (j, dev_hidden); + kernInitZero << > > (i, dev_output); + + // input -> hidden + kernSumWeights << > > (k, j, dev_wkj, dev_input, dev_hiddenSums); + kernActivationFxn << > > (j, dev_hiddenSums, dev_hidden); + + // hidden -> output + kernSumWeights << > > (j, i, dev_wji, dev_hidden, dev_outputSums); + kernActivationFxn << > > (i, dev_outputSums, dev_output); + + // calculate error, lambda + kernCalcErrors << > > (i, dev_target, dev_output, dev_errors); + + float* errs = new float[i]; + cudaMemcpy(errs, dev_errors, i * sizeof(float), cudaMemcpyDeviceToHost); + float sumErr = 0; + for (int e = 0; e < i; e++) + { + sumErr += (errs[e]*errs[e]); + } + sumErr /= 2.0f; + float lambda = sumErr; + + // update weights + cudaMemcpy(dev_tempwji, dev_wji, j * i * sizeof(float), cudaMemcpyDeviceToDevice); + kernEditWeightsji << > > (j*i, i, lambda, 
dev_hidden, dev_errors, dev_output, + dev_partialErr, dev_wji); + kernEditWeightskj << > > (k*j, j, i, lambda, dev_input, dev_hidden, dev_partialErr, + dev_tempwji, dev_wkj); + + cudaMemcpy(odata, dev_output, i * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(wkj, dev_wkj, k * j * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(wji, dev_wji, j * i * sizeof(float), cudaMemcpyDeviceToHost); + + cudaFree(dev_input); + cudaFree(dev_hidden); + cudaFree(dev_output); + + cudaFree(dev_hiddenSums); + cudaFree(dev_outputSums); + + cudaFree(dev_wkj); + cudaFree(dev_wji); + + cudaFree(dev_target); + cudaFree(dev_errors); + cudaFree(dev_partialErr); + cudaFree(dev_tempwji); + + return sumErr; + } + + void mlpRun(int i, int j, int k, float* odata, float* idata, float* wkj, float* wji) + { + float *dev_input, *dev_hidden, *dev_output; + float *dev_hiddenSums, *dev_outputSums; + float *dev_wkj, *dev_wji; + + cudaMalloc((void**)&dev_input, k * sizeof(float)); + cudaMalloc((void**)&dev_hidden, j * sizeof(float)); + cudaMalloc((void**)&dev_output, i * sizeof(float)); + cudaMemcpy(dev_input, idata, k * sizeof(float), cudaMemcpyHostToDevice); + + cudaMalloc((void**)&dev_hiddenSums, j * sizeof(float)); + cudaMalloc((void**)&dev_outputSums, i * sizeof(float)); + + cudaMalloc((void**)&dev_wkj, k * j * sizeof(float)); + cudaMalloc((void**)&dev_wji, j * i * sizeof(float)); + cudaMemcpy(dev_wkj, wkj, k * j * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(dev_wji, wji, j * i * sizeof(float), cudaMemcpyHostToDevice); + + // initialize non input buffers to zeros + kernInitZero << > > (j, dev_hidden); + kernInitZero << > > (i, dev_output); + + // input -> hidden + kernSumWeights << > > (k, j, dev_wkj, dev_input, dev_hiddenSums); + kernActivationFxn << > > (j, dev_hiddenSums, dev_hidden); + + // hidden -> output + kernSumWeights << > > (j, i, dev_wji, dev_hidden, dev_outputSums); + kernActivationFxn << > > (i, dev_outputSums, dev_output); + + cudaMemcpy(odata, dev_output, i * sizeof(float), cudaMemcpyDeviceToHost); + + cudaFree(dev_input); + cudaFree(dev_hidden); + cudaFree(dev_output); + + cudaFree(dev_hiddenSums); + cudaFree(dev_outputSums); + + cudaFree(dev_wkj); + cudaFree(dev_wji); + } } diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h index 2096228..52b75ac 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.h +++ b/Project2-Character-Recognition/character_recognition/mlp.h @@ -5,5 +5,10 @@ namespace CharacterRecognition { Common::PerformanceTimer& timer(); + void makeWeightMat(int n, float* data); + // TODO: implement required elements for MLP sections 1 and 2 here + float mlpTrain(int i, int j, int k, float* odata, float* idata, float* wkj, float* wji, float* target); + + void mlpRun(int i, int j, int k, float* odata, float* idata, float* wkj, float* wji); } diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 11dd534..75c0cc3 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -7,146 +7,167 @@ */ #include +#include +#include #include #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; +void readFromFile(int idx, float* inputArr) +{ + std::string fileName = std::to_string(idx) + "info.txt"; + if (idx < 10) { 
fileName = std::to_string(0) + fileName; } + fileName = "../data-set/" + fileName; + + std::ifstream infile(fileName); + + int n1, n2, count; + float x; + count = 0; + + if (!(infile >> n1 >> n2)) { printf("Error reading first two lines of file %s\n", fileName); } + + while (infile >> x && count < 10201) + { + inputArr[count] = x; + count++; + } +} int main(int argc, char* argv[]) { - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // 
Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + // XOR TESTING + printf("XOR TESTING\n"); + + float *xorInput1 = new float[3]; + xorInput1[0] = 0; + xorInput1[1] = 0; + xorInput1[2] = 1; //bias + + float *xorTarget1 = new float[1]; + xorTarget1[0] = 0; + + float *xorInput2 = new float[3]; + xorInput2[0] = 0; + xorInput2[1] = 1; + xorInput2[2] = 1; + + float *xorTarget2 = new float[1]; + xorTarget2[0] = 1; + + float *xorInput3 = new float[3]; + xorInput3[0] = 1; + xorInput3[1] = 0; + xorInput3[2] = 1; + + float *xorTarget3 = new float[1]; + xorTarget3[0] = 1; + + float *xorInput4 = new float[3]; + xorInput4[0] = 1; + xorInput4[1] = 1; + xorInput4[2] = 1; + + float *xorTarget4 = new float[1]; + xorTarget4[0] = 0; + + float *wkj = new float[9]; + float *wji = new float[3]; + CharacterRecognition::makeWeightMat(9, wkj); + CharacterRecognition::makeWeightMat(3, wji); + // testing values from spreadsheet, + // make sure to change j and k to 2 to get rid of bias + /*wkj[0] = 10.1; + wkj[1] = 0.9; + wkj[2] = 20; + wkj[3] = 0.87; + wji[0] = 41; + wji[1] = -54;*/ + + + float *xorOutput = new float[1]; + + int i = 1; + int j = 3; + int k = 3; + + //training + float tgtError = 0.01f; + float currError = 100000.0f; + int count = 0; + while (currError > tgtError && count < 15000) + { + currError = 0; + currError += CharacterRecognition::mlpTrain(i, j, k, xorOutput, xorInput1, wkj, wji, xorTarget1); + currError += CharacterRecognition::mlpTrain(i, j, k, xorOutput, xorInput2, wkj, wji, 
xorTarget2); + currError += CharacterRecognition::mlpTrain(i, j, k, xorOutput, xorInput3, wkj, wji, xorTarget3); + currError += CharacterRecognition::mlpTrain(i, j, k, xorOutput, xorInput4, wkj, wji, xorTarget4); + count++; + } + + //test + printf("Ran %d iterations of training\n", count); + CharacterRecognition::mlpRun(i, j, k, xorOutput, xorInput1, wkj, wji); + printf(" (0, 0) expected: %f, result %f\n", xorTarget1[0], xorOutput[0]); + CharacterRecognition::mlpRun(i, j, k, xorOutput, xorInput2, wkj, wji); + printf(" (0, 1) expected: %f, result %f\n", xorTarget2[0], xorOutput[0]); + CharacterRecognition::mlpRun(i, j, k, xorOutput, xorInput3, wkj, wji); + printf(" (1, 0) expected: %f, result %f\n", xorTarget3[0], xorOutput[0]); + CharacterRecognition::mlpRun(i, j, k, xorOutput, xorInput4, wkj, wji); + printf(" (1, 1) expected: %f, result %f\n", xorTarget4[0], xorOutput[0]); + + + // CHAR RECOG TESTING + printf("CHAR RECOG TESTING\n"); + + i = 1; + j = 10202; + k = 10202; // +1 for bias + + float *CRwkj = new float[k*j]; + float *CRwji = new float[j*i]; + CharacterRecognition::makeWeightMat(k*j, wkj); + CharacterRecognition::makeWeightMat(j*i, wji); + + float *CRoutput = new float[i]; + + tgtError = 0.01f; + currError = 100000.0f; + count = 0; + while (currError > tgtError && count < 10) + { + currError = 0; + for (int f = 0; f < 52; f++) + { + float* tgt = new float[i]; + tgt[0] = f + 1; + + float* input = new float[k]; + readFromFile(f + 1, input); + input[k-1] = 1; + + currError += CharacterRecognition::mlpTrain(i, j, k, CRoutput, input, CRwkj, CRwji, tgt); + + delete[] input; + delete[] tgt; + } + printf("After %d iterations, error = %f\n", count, currError); + count++; + } system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; + delete[] xorInput1; + delete[] xorTarget1; + delete[] xorInput2; + delete[] xorTarget2; + delete[] xorInput3; + delete[] xorTarget3; + delete[] xorInput4; + delete[] xorTarget4; + delete[] xorOutput; + delete[] wkj; + delete[] wji; + + delete[] CRoutput; + delete[] CRwkj; + delete[] CRwji; } diff --git a/Project2-Character-Recognition/src/testing_helpers.hpp b/Project2-Character-Recognition/src/testing_helpers.hpp index b28a8d2..dab93be 100644 --- a/Project2-Character-Recognition/src/testing_helpers.hpp +++ b/Project2-Character-Recognition/src/testing_helpers.hpp @@ -37,7 +37,7 @@ void printCmpLenResult(int n, int expN, T *a, T *b) { cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); } -void zeroArray(int n, int *a) { +void zeroArray(int n, float *a) { for (int i = 0; i < n; i++) { a[i] = 0; } diff --git a/Project2-Stream-Compaction/README.md b/Project2-Stream-Compaction/README.md index 0e38ddb..ddf8108 100644 --- a/Project2-Stream-Compaction/README.md +++ b/Project2-Stream-Compaction/README.md @@ -3,12 +3,99 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Tabatha Hickman + * LinkedIn:https://www.linkedin.com/in/tabatha-hickman-335987140/ +* Tested on: Windows 10 Pro, i7-5600U CPU @ 2.60GHz 16GB, GeForce 840M (personal computer) -### (TODO: Your README) +## Stream Compaction -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) 
+### Approaches
+
+Stream compaction is the process of removing null or otherwise useless values from an array. Naively, you could traverse the elements one by one and copy out only the ones you want to keep, but there are other methods that reduce the computation. One of them uses a prefix-sum scan: if we create a mirror array of boolean values (0 or 1) marking whether each element is non-null, then run an exclusive scan over it, then for every element that belongs in the compacted output, its value in the scan result is exactly its index in the output array (a short sketch of this idea appears after the performance analysis below).
+
+So the question becomes how best to perform the prefix-sum scan. I implemented several approaches:
+
+* CPU naive approach: a simple for loop that keeps a running sum, visiting each element in turn and adding its value to the sum.
+* GPU naive approach: we reduce the work to roughly n log2(n) additions by looping over each power of 2 from 1 up to the first one greater than or equal to the array size, and in each pass summing pairs of elements that are that power of 2 apart. As these partial sums accumulate, the final total is obtained by adding the sum at the halfway index to the sum at the last index.
+* GPU work-efficient approach: this approach treats the array as a binary tree and sums pairs iteratively until the root holds the total; this is the upsweep phase. It leaves only the final index with a correct sum and neglects the others, so a downsweep phase follows: from the root down to the leaves, each pair of nodes swaps values, with the lower-indexed node adding its value into the right-hand node's running sum. By the time we reach the leaves, every index holds the correct prefix-sum output.
+* GPU Thrust approach: simply invokes the scan function of the Thrust library.
+
+### Performance Analysis
+
+Optimal block sizes for each scan approach: I tested different block sizes to find the optimal one for each GPU implementation and got these results:
+
+![](img/blockSizeChart.JPG)
+
+* Naive: 128
+* Work-Efficient: 128
+
+Comparison of all scan approaches:
+
+Here is a chart of the results for all approaches at varying array sizes.
+
+![](img/arraySizeChartFull.JPG)
+
+And here is a zoomed-in version of the chart so the comparisons can be seen in more detail.
+
+![](img/arraySizeChartZoom.JPG)
+
+The CPU and Thrust implementations appear fairly stable until the size reaches about 2^16 or 2^17. Thrust starts out as one of the slowest at the smallest array size, which leads me to believe much of its cost is the overhead of setting up and calling into the library, largely unrelated to array size until the arrays get very large. Since the CPU implementation should be O(n), I was surprised the slope stayed so shallow as the array size increased by powers of 2; however, the CPU version does only one addition per element, so the low overhead of a plain for loop may be what makes it so much faster.
+
+As for the naive and work-efficient GPU implementations, both seem to follow the same exponential curve, with the work-efficient version consistently about double the time of the naive one. Both are far worse than the Thrust or CPU implementations, and the gap only becomes clearer as the array size increases. I think these two implementations lose only because the task at hand is so simple: the overhead of launching multiple kernels, running multiple passes, and in some cases swapping buffers outweighs the computation saved. As for the difference between naive and work-efficient, my guess is that the extra kernel work is what doubles the time: the two versions do almost the same calculations, but the work-efficient one splits them into an upsweep and a downsweep that must run in sequence, so it takes about twice as long.
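+As mentioned above, here is a small CPU-side sketch of the scan-based compaction idea (illustrative only; a serial version of the same steps the GPU path performs, with made-up names): map each element to a 0/1 flag, exclusive-scan the flags, and scatter each surviving element to the index the scan produced.
+
+```
+#include <vector>
+
+// Compact 'in' by removing zeros, using an exclusive scan of a 0/1 flag array.
+std::vector<int> compactWithScanSketch(const std::vector<int>& in) {
+    int n = (int)in.size();
+    std::vector<int> flags(n), indices(n);
+    for (int i = 0; i < n; i++) flags[i] = (in[i] != 0) ? 1 : 0;
+
+    int sum = 0;                          // exclusive prefix sum of the flags
+    for (int i = 0; i < n; i++) { indices[i] = sum; sum += flags[i]; }
+
+    std::vector<int> out(sum);            // 'sum' is the number of survivors
+    for (int i = 0; i < n; i++) {
+        if (flags[i]) out[indices[i]] = in[i];   // scatter to the scanned index
+    }
+    return out;
+}
+```
+
+For example, in = {2, 0, 1, 0, 3} gives flags {1, 0, 1, 0, 1}, exclusive scan {0, 1, 1, 2, 2}, and output {2, 1, 3}.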
+Output of test program: (2^8 array size)
+
+```
+****************
+** SCAN TESTS **
+****************
+    [  11  10  31  32  43   9  48  31  19   2  32  39  22 ...  12   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.000789ms    (std::chrono Measured)
+    [   0  11  21  52  84 127 136 184 215 234 236 268 307 ... 5946 5958 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.000395ms    (std::chrono Measured)
+    [   0  11  21  52  84 127 136 184 215 234 236 268 307 ... 5870 5891 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.052672ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.05264ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.090496ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.091072ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.095808ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.103232ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   0   1   1   1   2   2   2   3   0   3   3   3 ...   2   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.001579ms    (std::chrono Measured)
+    [   2   1   1   1   2   2   2   3   3   3   3   1   2 ...   1   2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.001579ms    (std::chrono Measured)
+    [   2   1   1   1   2   2   2   3   3   3   3   1   2 ...   2   2 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 5.07698ms    (std::chrono Measured)
+    [   2   1   1   1   2   2   2   3   3   3   3   1   2 ...
1 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.08976ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.089792ms (CUDA Measured) + passed +``` diff --git a/Project2-Stream-Compaction/img/arraySizeChartFull.JPG b/Project2-Stream-Compaction/img/arraySizeChartFull.JPG new file mode 100644 index 0000000..34d1a89 Binary files /dev/null and b/Project2-Stream-Compaction/img/arraySizeChartFull.JPG differ diff --git a/Project2-Stream-Compaction/img/arraySizeChartZoom.JPG b/Project2-Stream-Compaction/img/arraySizeChartZoom.JPG new file mode 100644 index 0000000..1cccadb Binary files /dev/null and b/Project2-Stream-Compaction/img/arraySizeChartZoom.JPG differ diff --git a/Project2-Stream-Compaction/img/blockSizeChart.JPG b/Project2-Stream-Compaction/img/blockSizeChart.JPG new file mode 100644 index 0000000..9aab80c Binary files /dev/null and b/Project2-Stream-Compaction/img/blockSizeChart.JPG differ diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp index d016553..1058dc6 100644 --- a/Project2-Stream-Compaction/src/main.cpp +++ b/Project2-Stream-Compaction/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 14; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt index cdbef77..6444fc7 100644 --- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt +++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_37 ) diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu index a2d3e6c..dac6329 100644 --- a/Project2-Stream-Compaction/stream_compaction/cpu.cu +++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu @@ -18,9 +18,18 @@ namespace StreamCompaction { * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. 
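+         * The implementation below computes an exclusive scan with a single running sum:
+         * odata[i] ends up holding the sum of idata[0..i-1].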
*/ void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); + bool standalone = true; + try { timer().startCpuTimer(); } + catch (std::exception) { standalone = false; } + + int sum = 0; + for (int i = 0; i < n; i++) + { + odata[i] = sum; + sum += idata[i]; + } + + if(standalone){ timer().endCpuTimer(); } } /** @@ -30,9 +39,19 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int idxInOut = 0; + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[idxInOut] = idata[i]; + idxInOut++; + } + } + timer().endCpuTimer(); - return -1; + return idxInOut; } /** @@ -42,9 +61,31 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int* temp = new int[n]; + int* tempScan = new int[n]; + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + temp[i] = 1; + else + temp[i] = 0; + } + + scan(n, tempScan, temp); + + int num = 0; + for (int i = 0; i < n; i++) + { + if (temp[i] == 1) + { + odata[tempScan[i]] = idata[i]; + num++; + } + } + timer().endCpuTimer(); - return -1; + return num; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu index 2db346e..8e090a8 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.cu +++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu @@ -12,13 +12,110 @@ namespace StreamCompaction { return timer; } + #define blockSize 128 + + __global__ void kernMapToBoolean(int N, int* arr, int* boolArr) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + boolArr[index] = arr[index]; + if (boolArr[index] != 0) + { + boolArr[index] = 1; + } + } + + __global__ void kernUpSweep(int N, int d, int* arr) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + int powDPlus = 1 << (d+1); + int powD = 1 << d; + + if (index % powDPlus == 0) + { + arr[index + powDPlus - 1] += arr[index + powD - 1]; + } + if (index == N - 1) + { + arr[index] = 0; + } + } + + __global__ void kernDownSweep(int N, int d, int* arr) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + int powDPlus = 1 << (d + 1); + int powD = 1 << d; + + if (index % powDPlus == 0) + { + int temp = arr[index + powD - 1]; + arr[index + powD - 1] = arr[index + powDPlus - 1]; + arr[index + powDPlus - 1] += temp; + } + } + + __global__ void kernInclusiveToExclusive(int N, int* arr) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + arr[index] -= arr[0]; + } + + __global__ void kernScatter(int N, int* idata, int* boolArr, int* scanArr, int* odata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + if (boolArr[index] == 1) + { + int idx = scanArr[index]; + odata[idx] = idata[index]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
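+         * The implementation below pads the input to the next power of two, runs the upsweep and
+         * downsweep passes on the device, and copies the exclusive-scan result back to the host.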
*/ void scan(int n, int *odata, const int *idata) { + int pow2Length = 1 << ilog2ceil(n); + int* idataPow2 = new int[pow2Length]; + memcpy(idataPow2, idata, n * sizeof(int)); + for (int i = n; i < pow2Length; i++) + { + idataPow2[i] = 0; + } + + int* dev_arr; + + cudaMalloc((void**)&dev_arr, pow2Length * sizeof(int)); + cudaMemcpy(dev_arr, idataPow2, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + for (int i = 0; i < ilog2ceil(n); i++) + { + kernUpSweep << > > (pow2Length, i, dev_arr); + } + + for (int j = ilog2ceil(n)-1; j >= 0; j--) + { + kernDownSweep << > > (pow2Length, j, dev_arr); + } + + kernInclusiveToExclusive << > > (pow2Length, dev_arr); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_arr, sizeof(int) * n, cudaMemcpyDeviceToHost); + + cudaFree(dev_arr); } /** @@ -31,10 +128,56 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; + int* dev_boolArr; + int* dev_scanArr; + int* dev_idata; + int* dev_odata; + + int* host_boolArr = new int[n]; + int* host_scanArr = new int[n]; + + cudaMalloc((void**)&dev_boolArr, n * sizeof(int)); + cudaMalloc((void**)&dev_scanArr, n * sizeof(int)); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + bool standalone = true; + try { timer().startCpuTimer(); } + catch (std::exception) { standalone = false; } + + kernMapToBoolean << > > (n, dev_idata, dev_boolArr); + cudaMemcpy(host_boolArr, dev_boolArr, sizeof(int) * n, cudaMemcpyDeviceToHost); + + scan(n, host_scanArr, host_boolArr); + cudaMemcpy(dev_scanArr, host_scanArr, sizeof(int) * n, cudaMemcpyHostToDevice); + + kernScatter << > > (n, dev_idata, dev_boolArr, dev_scanArr, dev_odata); + + if (standalone) { timer().endCpuTimer(); } + + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + + int num = 0; + for (int i = n-1; i >= 0; i--) + { + if (host_boolArr[i] != 0) + { + num = host_scanArr[i] + 1; + break; + } + } + + cudaFree(dev_boolArr); + cudaFree(dev_scanArr); + cudaFree(dev_idata); + cudaFree(dev_odata); + + free(host_boolArr); + free(host_scanArr); + + return num; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu index 4308876..f42ecc3 100644 --- a/Project2-Stream-Compaction/stream_compaction/naive.cu +++ b/Project2-Stream-Compaction/stream_compaction/naive.cu @@ -11,15 +11,67 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + #define blockSize 128 + + __global__ void kernNaiveScan(int N, int d, int* read, int* write) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + int start = pow(float(2), float(d - 1)); + if (index >= start) + { + write[index] = read[index - start] + read[index]; + } + else + { + write[index] = read[index]; + } + } + + __global__ void kernInclusiveToExclusive(int N, int* read, int* write) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + if (index == 0) + { + write[index] = 0; + } + else + { + write[index] = read[index-1]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
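+         * The implementation below ping-pongs between two device buffers: pass d adds in the element
+         * 2^(d-1) positions back, and a final kernel shifts the inclusive result right by one to make it exclusive.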
*/ void scan(int n, int *odata, const int *idata) { + int* dev_read; + int* dev_write; + + cudaMalloc((void**)&dev_read, n * sizeof(int)); + cudaMalloc((void**)&dev_write, n * sizeof(int)); + + cudaMemcpy(dev_read, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + for (int i = 1; i <= ilog2ceil(n); i++) + { + kernNaiveScan << > > (n, i, dev_read, dev_write); + std::swap(dev_read, dev_write); + } + kernInclusiveToExclusive << > > (n, dev_read, dev_write); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_write, sizeof(int) * n, cudaMemcpyDeviceToHost); + + cudaFree(dev_read); + cudaFree(dev_write); } } } diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu index 1def45e..6183414 100644 --- a/Project2-Stream-Compaction/stream_compaction/thrust.cu +++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu @@ -18,11 +18,18 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::host_vector host_idata(n); + thrust::copy(idata, idata + n, host_idata.begin()); + + thrust::device_vector dv_in = host_idata; + + thrust::device_vector dv_out(n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } } diff --git a/README.md b/README.md index 3a0b2fe..265ca67 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,14 @@ CUDA Number Algorithms **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Tabatha Hickman + * LinkedIn:https://www.linkedin.com/in/tabatha-hickman-335987140/ +* Tested on: Windows 10 Pro, i7-5600U CPU @ 2.60GHz 16GB, GeForce 840M (personal computer) -### (TODO: Your README) +See these readmes for more information on the two subprojects: -Link to the readmes of the other two subprojects. +* [Stream Compaction](./Project2-Stream-Compaction/README.md) +* [Character Recognition](./Project2-Character-Recognition/README.md) -Add anything else you think is relevant up to this point. -(Remember, this is public, so don't put anything here that you don't want to share with the world.) +Note: changed the sm value in CMakeLists of both stream_compaction and character_recognition to higher value so the project would build.