diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md
index 4503fac..d4eebdb 100644
--- a/Project2-Character-Recognition/README.md
+++ b/Project2-Character-Recognition/README.md
@@ -1,14 +1,43 @@
 CUDA Character Recognition
 ======================
-
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+Dhruv Karthik: [LinkedIn](https://www.linkedin.com/in/dhruv_karthik/)
+
+Tested on: Windows 10 Home, Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz, 16GB, RTX 2070 - Compute Capability 7.5
+____________________________________________________________________________________
+![Developer](https://img.shields.io/badge/Developer-Dhruv-0f97ff.svg?style=flat) ![CUDA 10.1](https://img.shields.io/badge/CUDA-10.1-yellow.svg) ![Built](https://img.shields.io/appveyor/ci/gruntjs/grunt.svg) ![Issues](https://img.shields.io/badge/issues-none-green.svg)
+____________________________________________________________________________________
+## Outcome
+### XOR Convergence
+![](img/chareg.PNG)
+
+## Analysis
+
+**Background**: As illustrated in the image above, I was able to train an XOR MLP via backpropagation. You can visually watch backpropagation work by setting the learning rate to 1 and observing the softmax probabilities shift wildly on each training iteration. I trained the network with Binary Cross Entropy Loss; the network structure is described in the Additional Implementation Features section below.
+
+**Loss**: The loss varies from run to run due to random initialization, but my best loss on the XOR problem was **0.005005**.
+
+## Additional Implementation Features
+### Variable MLP Builder & Batched Updates
+
+Define any MLP very easily as follows:
+```C++
+//Network Structure
+int numSamples = 1;
+int inputDim = 2;
+int numLayers = 1;
+int hiddenDim[1] = {5};
+int outputDim = 2;
+```
+Notice ```numSamples```. This sets the batch size of the neural network, enabling batched gradient descent as opposed to the stochastic (single-sample) gradient descent of the base implementation. Supporting it required implementing an ```AffineLayer``` class, constructing matrices out of these layers, and handling backpropagation for variable batch sizes.
+### Variable Input Sizes and Biases
+
+This is a consequence of the previous feature: the network accepts arbitrarily sized inputs and outputs via the ```inputDim``` and ```outputDim``` variables. I also had the option of including biases.
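+
+For concreteness, this is roughly how those structure variables drive layer construction (a minimal sketch distilled from ```XORTest``` in ```mlp.cu```; ```AffineLayer(inputDim, outputDim, numSamples)``` is the constructor this repo uses):
+```C++
+std::vector<AffineLayer*> layers;
+layers.push_back(new AffineLayer(inputDim, hiddenDim[0], numSamples));
+for (int l = 1; l < numLayers; ++l) {
+    layers.push_back(new AffineLayer(hiddenDim[l - 1], hiddenDim[l], numSamples));
+}
+layers.push_back(new AffineLayer(hiddenDim[numLayers - 1], outputDim, numSamples));
+layers.back()->setSigmoid(false); //keep the final layer linear; the softmax lives in the loss
+```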
-### (TODO: Your README)
+## Tragic Historical Significance of the XOR Problem
+Neural networks are not new. In 1958, [Frank Rosenblatt](https://en.wikipedia.org/wiki/Frank_Rosenblatt) proposed a hypothetical model of the brain's nervous system and coined it the *perceptron*. Essentially, this model fit a line to a dataset. However, as seen below, you can't fit a line to an XOR function.
-
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+![](img/goodperceptron.png)
+The perceptron got a ton of hype in the 60's, but two authors then published a [book](https://mitpress.mit.edu/books/perceptrons) emphasizing why perceptrons fall short, most famously that they cannot fit the XOR function. This book single-handedly resulted in the first of three AI Winters.
+If it weren't for that book, the students of CIS 565 in 2010 would also have been implementing MLPs in CUDA!
diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt
index 7446175..c5e28b0 100644
--- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt
+++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt
@@ -7,5 +7,5 @@ set(SOURCE_FILES
 
 cuda_add_library(character_recognition
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_75
     )
diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu
index 5a3ed7f..663cc5d 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.cu
+++ b/Project2-Character-Recognition/character_recognition/mlp.cu
@@ -2,6 +2,12 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "mlp.h"
+#include <thrust/random.h>
+
+#define blockSize 512
+#define NUM_ITERS 50
+#define LEARNING_RATE 0.1
+#define FULLBATCH 0
 
 namespace CharacterRecognition {
     using Common::PerformanceTimer;
@@ -10,18 +16,397 @@ namespace CharacterRecognition {
        static PerformanceTimer timer;
        return timer;
    }
-
-    // TODO: __global__
-
-    /**
-     * Example of use case (follow how you did it in stream compaction)
-     */
-    /*void scan(int n, int *odata, const int *idata) {
-        timer().startGpuTimer();
-        // TODO
-        timer().endGpuTimer();
-    }
-    */
-    // TODO: implement required elements for MLP sections 1 and 2 here
+    void printFloatArray(float *x, int n) {
+        printf("    [ ");
+        for (int i = 0; i < n; i++) {
+            printf("%f ", x[i]);
+        }
+        printf("]\n");
+    }
+    __host__ __device__ unsigned int hash(unsigned int a) {
+        a = (a + 0x7ed55d16) + (a << 12);
+        a = (a ^ 0xc761c23c) ^ (a >> 19);
+        a = (a + 0x165667b1) + (a << 5);
+        a = (a + 0xd3a2646c) ^ (a << 9);
+        a = (a + 0xfd7046c5) + (a << 3);
+        a = (a ^ 0xb55a4f09) ^ (a >> 16);
+        return a;
+    }
+    __global__ void kernInitWeightsBias(float *W, float *b, int inputDim, int outputDim){
+        //Random Weight Initialization & Zero Bias Initialization
+        int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (index >= inputDim * outputDim) {
+            return;
+        }
+        thrust::default_random_engine rng(hash((int)(index * inputDim * outputDim + W[0])));
+        thrust::uniform_real_distribution<float> dist(0.0, 1.0);
+        W[index] = dist(rng);
+        //W[index] = 0.1 * index;
+        //zero every bias entry exactly once (indexing by row both missed and overran b when inputDim != outputDim)
+        if (index < outputDim) {
+            b[index] = 0;
+        }
+    }
+
+    __global__ void kernAffineForward(float *W, float *b, float *in, float *out, int inputDim, int outputDim, int numSamples, bool sigmoid) {
+        /*
+        W: Shape inputDim x outputDim
+        b: Shape outputDim
+        in: Shape numSamples x inputDim
+        out: Shape numSamples x outputDim
+        */
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+        int row = index / outputDim;
+        int col = index % outputDim;
+        float val = 0;
+        if (row < numSamples && col < outputDim) {
+            for (int i = 0; i < inputDim; i++) {
+                val += in[row * inputDim + i] * W[i * outputDim + col];
+            }
+            val += b[col];
+            out[row * outputDim + col] = sigmoid ? 1.0/(1+expf(-1.0*val)) : val;
+        }
+    }
+
+    __device__ float applySigmoid(float x) {
+        return 1 / (1 + expf(-x));
+    }
+
+    __device__ float dSigmoid(float x) {
+        //derivative of the sigmoid in terms of its output: x is the activation s, and s' = s * (1 - s)
+        return x * (1 - x);
+    }
+
+    __global__ void kern_dSigmoid(float *dout, float *out, float *doutLinear, int numSamples, int outputDim) {
+        //Backprop through the sigmoid across the entire numSamples x outputDim matrix:
+        //doutLinear = dout * s * (1 - s), where s = out is the activation cached by the forward pass
+        int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (index >= numSamples * outputDim) {
+            return;
+        }
+        doutLinear[index] = dout[index] * dSigmoid(out[index]);
+    }
+
+    __global__ void kern_dIn(float *doutLinear, float *W, float *din, int inputDim, int outputDim, int numSamples) {
+        /* Effectively calculates matmul(doutLinear, W.T)
+        doutLinear: numSamples x outputDim - each element is dL/dY where Y = XW + b
+        W: inputDim x outputDim
+        din: numSamples x inputDim - each element is dL/din_(i,j)
+        */
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+        int row = index / inputDim;
+        int col = index % inputDim;
+        float val = 0;
+        if (row < numSamples && col < inputDim) {
+            for (int i = 0; i < outputDim; i++) {
+                val += doutLinear[row * outputDim + i] * W[col * outputDim + i];
+            }
+            din[row * inputDim + col] = val;
+        }
+    }
+
+    __global__ void kern_dW(float *W, float *b, float *doutLinear, float *in, int inputDim, int outputDim, int numSamples, float lr) {
+        /* Effectively calculates matmul(in.T, doutLinear) and applies an update
+        W: inputDim x outputDim (we do gradient descent here)
+        b: outputDim (we do gradient descent here too)
+        doutLinear: numSamples x outputDim - each element is dL/dY where Y = XW + b
+        in: numSamples x inputDim
+        lr: learning rate
+        */
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+        int row = index / outputDim;
+        int col = index % outputDim;
+        float val = 0;
+        float dbval = 0;
+        float currW = W[row * outputDim + col];
+        float currb = b[col];
+        float doutLinearIdx = 0;
+        if (row < inputDim && col < outputDim) {
+            for (int i = 0; i < numSamples; i++) {
+                doutLinearIdx = doutLinear[i * outputDim + col];
+                val += in[i * inputDim + row] * doutLinearIdx;
+                dbval += doutLinearIdx;
+            }
+            W[row * outputDim + col] = currW - lr * (val);
+            //every row-thread of a given col computes the same dbval, so this racing write is benign
+            b[col] = currb - lr * (dbval);
+        }
+    }
+
+    __global__ void kernStableSoftmax(float *pred, float *pred2, float *target, float *sums, int numSamples, int outputDim) {
+        //Row-wise softmax of pred2 (numSamples x outputDim) into pred: exponentiate, then normalize by the row sum
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+        int row = index / outputDim;
+        float rowval = 0.0;
+        if (index < numSamples * outputDim) {
+            for (int i = 0; i < outputDim; i++) {
+                rowval += expf(pred2[row * outputDim + i]);
+            }
+            sums[row] = rowval;
+            pred[index] = expf(pred2[index]) / rowval;
+        }
+    }
+
+    __global__ void kernSums(float *pred, float *sums, int numSamples, int outputDim) {
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+        int row = index / outputDim;
+        float rowval = 0.0;
+        if (index < numSamples * outputDim) {
+            for (int i = 0; i < outputDim; i++) {
+                rowval += pred[row * outputDim + i];
+            }
+            sums[row] = rowval;
+        }
+    }
+
+    //AffineLayer
+    AffineLayer::AffineLayer(int idim, int odim, int ns): numSamples(ns), inputDim(idim), outputDim(odim), sigmoid(true), eval(false), doneFwd(false){
+        //Malloc weights, biases, and the cached input/output activations
+        cudaMalloc((void**)&W, idim * odim * sizeof(float));
+        checkCUDAError("cuda Malloc W failed");
+        cudaMalloc((void**)&b, odim * sizeof(float));
+        checkCUDAError("cuda Malloc b failed");
+        cudaMalloc((void**)&dev_in, inputDim * numSamples * sizeof(float));
+        checkCUDAError("cuda Malloc dev_in failed");
+        cudaMalloc((void**)&dev_out, outputDim * numSamples * sizeof(float));
+        checkCUDAError("cuda Malloc dev_out failed");
+
+        //Call Initializer Kernels
+        dim3 fullBlocksPerGrid((inputDim * outputDim + blockSize - 1) / blockSize);
+        kernInitWeightsBias<<<fullBlocksPerGrid, blockSize>>>(W, b, inputDim, outputDim);
+    }
+
+    void AffineLayer::setSigmoid(bool state) {
+        sigmoid = state;
+    }
+    void AffineLayer::setEval(bool state) {
+        eval = state;
+    }
+
+    float* AffineLayer::forward(float *in, int ns) {
+        /*Uses W & b to perform the forward pass of an affine layer.
+        Assumes numSamples and eval are set; caches dev_in and dev_out (on the GPU) for backprop.
+        */
+        //Memcpy the *in information into dev_in
+        cudaMemcpy(dev_in, in, inputDim * numSamples * sizeof(float), cudaMemcpyHostToDevice);
+
+        //Call Affine Forward Kernel
+        int numBlocks = (numSamples * outputDim + blockSize - 1) / blockSize;
+        kernAffineForward<<<numBlocks, blockSize>>>(W, b, dev_in, dev_out, inputDim, outputDim, numSamples, sigmoid);
+
+        //Memcpy the result from dev_out back to the host
+        float *out = new float[outputDim * numSamples];
+        cudaMemcpy(out, dev_out, outputDim * numSamples * sizeof(float), cudaMemcpyDeviceToHost);
+
+        printf("SCORES\n");
+        printFloatArray(out, outputDim*numSamples);
+
+        //dont free dev_in or dev_out because you'll need them for backprop
+        return out;
+    }
+
+    float* AffineLayer::backward(float *dout, float lr){
+        /* Does backprop and one gradient update for W & b, and returns din
+        dout: upstream gradient coming in (numSamples x outputDim)
+        lr: learning rate
+        */
+        //Malloc the incoming gradient matrix and the outgoing one
+        float *dev_dout, *dev_din, *dev_doutLinear;
+        cudaMalloc((void**)&dev_dout, outputDim * numSamples * sizeof(float));
+        checkCUDAError("cuda Malloc dev_dout failed");
+        cudaMalloc((void**)&dev_din, inputDim * numSamples * sizeof(float));
+        checkCUDAError("cuda Malloc dev_din failed");
+
+        //Memcpy the *dout information into dev_dout
+        cudaMemcpy(dev_dout, dout, outputDim * numSamples * sizeof(float), cudaMemcpyHostToDevice);
+        checkCUDAError("cuda Memcpy dout failed");
+
+        //Make 3 diff grid layouts
+        dim3 weightBiasGrid((inputDim * outputDim + blockSize - 1) / blockSize);
+        dim3 outputGrid((numSamples * outputDim + blockSize - 1) / blockSize);
+        dim3 inputGrid((numSamples * inputDim + blockSize - 1) / blockSize);
+
+        if (sigmoid) {
+            cudaMalloc((void**)&dev_doutLinear, outputDim * numSamples * sizeof(float));
+            checkCUDAError("cuda Malloc dev_doutLinear failed");
+            //Backprop through the sigmoid using the activation cached by forward()
+            kern_dSigmoid<<<outputGrid, blockSize>>>(dev_dout, dev_out, dev_doutLinear, numSamples, outputDim);
+            cudaFree(dev_dout);
+        }
+        else {
+            dev_doutLinear = dev_dout;
+        }
+
+        //Use W to compute dIn
+        kern_dIn<<<inputGrid, blockSize>>>(dev_doutLinear, W, dev_din, inputDim, outputDim, numSamples);
+
+        //Update dW and db
+        kern_dW<<<weightBiasGrid, blockSize>>>(W, b, dev_doutLinear, dev_in, inputDim, outputDim, numSamples, lr);
+
+        //DEBUG STUFF
+        float *myW = new float[inputDim * outputDim];
+        cudaMemcpy(myW, W, inputDim * outputDim * sizeof(float), cudaMemcpyDeviceToHost);
+        printf("BACKPROP:W STARTS\n");
+        printFloatArray(myW, inputDim * outputDim);
+        printf("BACKPROP:W ENDS\n");
+        delete[] myW;
+
+        float *myb = new float[outputDim];
+        cudaMemcpy(myb, b, outputDim * sizeof(float), cudaMemcpyDeviceToHost);
+        printf("BACKPROP:b STARTS\n");
+        printFloatArray(myb, outputDim);
+        printf("BACKPROP:b ENDS\n");
+        delete[] myb;
+
+        //Memcpy back the din info
+        float *din = new float[inputDim * numSamples];
+        cudaMemcpy(din, dev_din, inputDim * numSamples * sizeof(float), cudaMemcpyDeviceToHost);
+        checkCUDAError("cuda Memcpy din failed");
+
+        //Free Mems
+        cudaFree(dev_doutLinear);
+        cudaFree(dev_din);
+        return din;
+    }
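+
+    /* Added for sanity-checking (a host-side sketch; hypothetical helper, not called anywhere):
+       mirrors kern_dIn (din = doutLinear . W^T) and the gradient part of kern_dW (dW = in^T . doutLinear),
+       so kernel output can be diffed against a CPU reference. Shapes follow the kernel docstrings. */
+    void cpu_affineBackwardReference(const float *doutLinear, const float *W, const float *in,
+            float *din, float *dW, int inputDim, int outputDim, int numSamples) {
+        for (int s = 0; s < numSamples; ++s) {
+            for (int i = 0; i < inputDim; ++i) {
+                float val = 0.f;
+                for (int o = 0; o < outputDim; ++o) {
+                    val += doutLinear[s * outputDim + o] * W[i * outputDim + o]; //din = doutLinear . W^T
+                }
+                din[s * inputDim + i] = val;
+            }
+        }
+        for (int i = 0; i < inputDim; ++i) {
+            for (int o = 0; o < outputDim; ++o) {
+                float val = 0.f;
+                for (int s = 0; s < numSamples; ++s) {
+                    val += in[s * inputDim + i] * doutLinear[s * outputDim + o]; //dW = in^T . doutLinear
+                }
+                dW[i * outputDim + o] = val;
+            }
+        }
+    }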
+
+    void cpu_softmax(float *pred, int numSamples, int outputDim) {
+        float rowSum = 0;
+        for (int i = 0; i < numSamples; ++i) {
+            rowSum = 0;
+            for (int j = 0; j < outputDim; ++j) {
+                pred[i * outputDim + j] = exp(pred[i * outputDim + j]);
+                rowSum += pred[i * outputDim + j];
+            }
+            for (int k = 0; k < outputDim; ++k){
+                pred[i * outputDim + k] /= rowSum;
+            }
+        }
+        printf("PRED-PROBABILITIES\n");
+        printFloatArray(pred, numSamples * outputDim);
+    }
+
+    float cpu_crossEntropy(float *pred, float *target, int numSamples, int outputDim, float* dout){
+        float* log_likelihood = new float[numSamples];
+        float llsum = 0;
+        //start dout off as a copy of the softmax probabilities
+        for (int i = 0; i < numSamples; ++i) {
+            for (int c = 0; c < outputDim; ++c) {
+                float ting = pred[i * outputDim + c];
+                dout[i * outputDim + c] = ting;
+            }
+        }
+        printf("DOUT\n");
+        printFloatArray(dout, numSamples * outputDim);
+
+        //negative log likelihood of the target class per sample; dout becomes (softmax - onehot) / numSamples
+        for (int i = 0; i < numSamples; ++i) {
+            int offset = (int)target[i]; //target holds class indices as floats
+            float ting = pred[i * outputDim + offset];
+            log_likelihood[i] = -log(ting);
+            llsum += -log(ting);
+            dout[i * outputDim + offset] -= 1;
+            for (int c = 0; c < outputDim; ++c) {
+                dout[i * outputDim + c] /= numSamples;
+            }
+        }
+        delete[] log_likelihood;
+        return llsum / numSamples;
+    }
+
+    float softmax_loss(float *pred, float *target, float *dout, int numSamples, int outputDim) {
+        /* Returns a float representing the loss, and updates dout
+        pred: Shape numSamples x outputDim
+        target: Shape numSamples
+        dout: gradient of the loss w.r.t. the pre-softmax scores
+        */
+
+        //Apply Softmax to pred
+        cpu_softmax(pred, numSamples, outputDim);
+
+        float loss = cpu_crossEntropy(pred, target, numSamples, outputDim, dout);
+        return loss;
+    }
+
+    void getXORSample(int idx, float *x, float *target) {
+        if (FULLBATCH) {
+            //full-batch mode assumes numSamples == 4
+            x[0] = 0;
+            x[1] = 0;
+            target[0] = 0;
+            x[2] = 0;
+            x[3] = 1;
+            target[1] = 1;
+            x[4] = 1;
+            x[5] = 0;
+            target[2] = 1;
+            x[6] = 1;
+            x[7] = 1;
+            target[3] = 0;
+            return;
+        }
+        if (idx % 4 == 0) {
+            x[0] = 1;
+            x[1] = 1;
+            target[0] = 0;
+        }
+        else if (idx % 3 == 0) {
+            x[0] = 0;
+            x[1] = 0;
+            target[0] = 0;
+        }
+        else if (idx % 2 == 0) {
+            x[0] = 1;
+            x[1] = 0;
+            target[0] = 1;
+        }
+        else {
+            x[0] = 0;
+            x[1] = 1;
+            target[0] = 1;
+        }
+    }
+
+    void XORTest() {
+        //Network Structure
+        int numSamples = 1;
+        int inputDim = 2;
+        int numLayers = 1;
+        int hiddenDim[1] = {5};
+        int outputDim = 2;
+
+        //XOR Input Array and Target Array
+        float *x = new float[numSamples * inputDim];
+        float *target = new float[numSamples * outputDim];
+        //Build Layers
+        std::vector<AffineLayer*> layers;
+        layers.push_back(new AffineLayer(inputDim, hiddenDim[0], numSamples));
+        for (int l = 1; l < numLayers; ++l) {
+            AffineLayer* currLayer = new AffineLayer(hiddenDim[l - 1], hiddenDim[l], numSamples);
+            currLayer->setSigmoid(true);
+            layers.push_back(currLayer);
+        }
+        layers.push_back(new AffineLayer(hiddenDim[numLayers-1], outputDim, numSamples));
+        layers[layers.size() - 1]->setSigmoid(false);
+
+        for (int k = 0; k < NUM_ITERS; ++k) {
+            getXORSample(k, x, target);
+            printf("INPUT\n");
+            printFloatArray(x, inputDim * numSamples);
+            printf("TARGET\n");
+            printFloatArray(target, 1 * numSamples);
+            //FORWARD PROP
+            float *out;
+            out = x;
+            for (int c = 0; c < layers.size(); ++c) {
+                out = layers[c]->forward(out, numSamples);
+            }
+
+            //CALCULATE LOSS
+            float *dout = new float[outputDim * numSamples];
+            float loss = softmax_loss(out, target, dout, numSamples, outputDim);
+            printf("LOSS BACKPROP:%f\n", loss);
+            printFloatArray(dout, outputDim * numSamples);
+
+            //BACKWARD PROP
+            for (int v = layers.size() - 1; v >= 0; v--) {
+                dout = layers[v]->backward(dout, LEARNING_RATE);
+            }
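+            // NOTE: the activations returned by forward() and the per-layer gradient buffers
+            // returned by backward() are leaked each iteration; a cleaner version would free
+            // them once consumed.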
printf("======================================\n", loss); + } + } } diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h index 2096228..7f75d54 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.h +++ b/Project2-Character-Recognition/character_recognition/mlp.h @@ -1,9 +1,37 @@ #pragma once +#include #include "common.h" namespace CharacterRecognition { Common::PerformanceTimer& timer(); + class AffineLayer{ + float *dev_in; + float *W; + float *b; + int numSamples; + int inputDim, outputDim; + bool sigmoid; + bool eval; + bool doneFwd; + public: + AffineLayer(int idim, int odim, int ns); + float* forward(float *in, int num_samples); + float* backward(float *dout, float lr); + void setEval(bool state); + void setSigmoid(bool state); + float softmax_loss(float *pred, float *target, float *dout, int numSamples, int outputDim); + void cpu_softmax(float *pred, int numSamples, int outputDim); + float cpu_crossEntropy(float *pred, float *target, int numSamples, int outputDim, float* dout); + char* getType(); + }; + class FCN { + std::vector layers; + public: + FCN(int inputDim, int outputDim, int numHiddenLayers, int *hiddenDims); + void forward(float *input, float *ouput, bool eval); + }; // TODO: implement required elements for MLP sections 1 and 2 here + void XORTest(); } diff --git a/Project2-Character-Recognition/img/chareg.PNG b/Project2-Character-Recognition/img/chareg.PNG new file mode 100644 index 0000000..2da0aae Binary files /dev/null and b/Project2-Character-Recognition/img/chareg.PNG differ diff --git a/Project2-Character-Recognition/img/goodperceptron.png b/Project2-Character-Recognition/img/goodperceptron.png new file mode 100644 index 0000000..1f7cec6 Binary files /dev/null and b/Project2-Character-Recognition/img/goodperceptron.png differ diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 11dd534..5e2536f 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -11,139 +11,30 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 3; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; -int main(int argc, char* argv[]) { - // Scan tests +void printFloatArray(float *x, int n) { + printf(" [ "); + for (int i = 0; i < n; i++) { + printf("%f ", x[i]); + } + printf("]\n"); +} +int main(int argc, char* argv[]) { + /* + CHARACTER RECOGNITION TESTS + */ printf("\n"); printf("****************\n"); - printf("** SCAN TESTS **\n"); + printf("** CHARACTER RECOGNITION TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. 
-    zeroArray(SIZE, b);
-    printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(SIZE, b, true);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(NPOT, b, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, power-of-two");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
-    onesArray(SIZE, c);
-    printDesc("1s array for finding bugs");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, non-power-of-two");
-    StreamCompaction::Naive::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, power-of-two");
-    StreamCompaction::Thrust::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, non-power-of-two");
-    StreamCompaction::Thrust::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    printf("\n");
-    printf("*****************************\n");
-    printf("** STREAM COMPACTION TESTS **\n");
-    printf("*****************************\n");
-
-    // Compaction tests
-
-    genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    int count, expectedCount, expectedNPOT;
-
-    // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
-    zeroArray(SIZE, b);
-    printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedCount = count;
-    printArray(count, b, true);
-    printCmpLenResult(count, expectedCount, b, b);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact without scan, non-power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedNPOT = count;
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
+    CharacterRecognition::XORTest();
 
     system("pause"); // stop Win32 console from closing on exit
     delete[] a;
diff --git a/Project2-Stream-Compaction/README.md b/Project2-Stream-Compaction/README.md
index 0e38ddb..a713137 100644
--- a/Project2-Stream-Compaction/README.md
+++ b/Project2-Stream-Compaction/README.md
@@ -1,14 +1,101 @@
-CUDA Stream Compaction
-======================
+Project 2 - STREAM COMPACTION
+====================
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture**
+Dhruv Karthik: [LinkedIn](https://www.linkedin.com/in/dhruv_karthik/)
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
+Tested on: Windows 10 Home, Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz, 16GB, RTX 2070 - Compute Capability 7.5
+____________________________________________________________________________________
+![Developer](https://img.shields.io/badge/Developer-Dhruv-0f97ff.svg?style=flat) ![CUDA 10.1](https://img.shields.io/badge/CUDA-10.1-yellow.svg) ![Built](https://img.shields.io/appveyor/ci/gruntjs/grunt.svg) ![Issues](https://img.shields.io/badge/issues-none-green.svg)
+____________________________________________________________________________________
+## Intro
+At a high level, stream compaction removes the zero elements from an array of integers, keeping the surviving elements in their original order. We outline results for three different ways of performing this task below.
+## Performance Analysis
+### Runtime vs Blocksize (ARRAY SIZE = 4096, 1<<12)
+ ![](img/runtimevsblocksize.png)
+**Chosen Blocksizes**
+* Naive : 128
+* Work Efficient: 128
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+### Runtime vs Array Size (BLOCKSIZE = 128)
+ ![](img/graph2.png)
 
-### (TODO: Your README)
+## Questions
+**Can you find the performance bottlenecks? Is it memory I/O? Computation? Is it different for each implementation?**
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Main learning: I removed 2 calls to ```cudaMemcpy``` and saw the execution time get halved across all the GPU implementations.
+Copying back and forth (Device to Device & Device to Host) can seriously increase the execution time of the program.
+* The CPU implementation wins, likely because it never needs to copy memory; it can just pass the pointer around. It takes *O(n)* additions.
+* The naive implementation does *O(n log n)* computations (many of them needless in the event of a non-power-of-2 array) and runs slower than the CPU.
+* The work-efficient scan is the faster of the GPU scans, but still requires that we copy. While it is *O(n)*, many of the threads in a warp don't exit early when they should. Even worse, unlike the CPU version, it still needs 2 cudaMemcpy's.
+
+## Extra Credit Features
+**Why is my GPU implementation slower than my CPU implementation?**
+
+Apart from the calls to ```cudaMemcpy```, some threads do unnecessary work because they are not terminated early, even though they will not be required at the next level of computation in the work-efficient scan. I optimized this by passing an offset value into the kernel call, so any threads that wouldn't be necessary in the future could be terminated early. This was also useful because it saves one call to cudaMemcpy. Despite this, my CPU implementation was still much faster.
+
+This picture illustrates an early termination strategy on the upsweep; a code sketch of the idea follows it.
+
+ ![](img/earlyexits.PNG)
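+
+As a sketch of the idea (a hypothetical kernel, simplified from the actual implementation in this repo): launch only the threads that are still alive at level ```d```, so index arithmetic replaces the modulo test and dead threads are never spawned at all:
+```C++
+__global__ void kernUpsweepCompact(int n, int incr, int twod, int *data) {
+    int k = (blockIdx.x * blockDim.x + threadIdx.x) * incr; //one thread per surviving node
+    if (k + incr - 1 < n) {
+        data[k + incr - 1] += data[k + twod - 1];
+    }
+}
+//host side: launch ceil((n / incr) / blockSize) blocks per level instead of ceil(n / blockSize)
+```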
+
+## Output
+```bash
+****************
+** SCAN TESTS **
+****************
+    [  40  42  40  23  36  45  21   8  44  38   1   3  25 ...  15   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.000642ms    (std::chrono Measured)
+    [   0  40  82 122 145 181 226 247 255 299 337 338 341 ... 6248 6263 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.000321ms    (std::chrono Measured)
+    [   0  40  82 122 145 181 226 247 255 299 337 338 341 ... 6141 6177 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.315392ms    (CUDA Measured)
+    [   0  40  82 122 145 181 226 247 255 299 337 338 341 ... 6248 6263 ]
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.2752ms    (CUDA Measured)
+    [   0  40  82 122 145 181 226 247 255 299 337 338 341 ... 6141 6177 ]
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.08576ms    (CUDA Measured)
+    [   0  40  82 122 145 181 226 247 255 299 337 338 341 ... 6248 6263 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.08608ms    (CUDA Measured)
+    [   0  40  82 122 145 181 226 247 255 299 337 338 341 ... 6141 6177 ]
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.105824ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.107616ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   1   0   0   3   0   0   1   3   2   3   3   0 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.000641ms    (std::chrono Measured)
+    [   2   1   3   1   3   2   3   3   1   1   2   3   2 ...   2   2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.000963ms    (std::chrono Measured)
+    [   2   1   3   1   3   2   3   3   1   1   2   3   2 ...   1   2 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 0.002246ms    (std::chrono Measured)
+    [   2   1   3   1   3   2   3   3   1   1   2   3   2 ...   2   2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.19456ms    (CUDA Measured)
+    [   2   1   3   1   3   2   3   3   1   1   2   3   2 ...   2   2 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.196608ms    (CUDA Measured)
+    [   2   1   3   1   3   2   3   3   1   1   2   3   2 ...   1   2 ]
+    passed
+```
diff --git a/Project2-Stream-Compaction/img/earlyexits.PNG b/Project2-Stream-Compaction/img/earlyexits.PNG
new file mode 100644
index 0000000..bd189ba
Binary files /dev/null and b/Project2-Stream-Compaction/img/earlyexits.PNG differ
diff --git a/Project2-Stream-Compaction/img/goodperceptron.png b/Project2-Stream-Compaction/img/goodperceptron.png
new file mode 100644
index 0000000..1f7cec6
Binary files /dev/null and b/Project2-Stream-Compaction/img/goodperceptron.png differ
diff --git a/Project2-Stream-Compaction/img/graph2.png b/Project2-Stream-Compaction/img/graph2.png
new file mode 100644
index 0000000..52f2c5a
Binary files /dev/null and b/Project2-Stream-Compaction/img/graph2.png differ
diff --git a/Project2-Stream-Compaction/img/output.PNG b/Project2-Stream-Compaction/img/output.PNG
new file mode 100644
index 0000000..abeb9d5
Binary files /dev/null and b/Project2-Stream-Compaction/img/output.PNG differ
diff --git a/Project2-Stream-Compaction/img/perceptron.png b/Project2-Stream-Compaction/img/perceptron.png
new file mode 100644
index 0000000..d6ff089
Binary files /dev/null and b/Project2-Stream-Compaction/img/perceptron.png differ
diff --git a/Project2-Stream-Compaction/img/runtimevsblocksize.png b/Project2-Stream-Compaction/img/runtimevsblocksize.png
new file mode 100644
index 0000000..1c280b5
Binary files /dev/null and b/Project2-Stream-Compaction/img/runtimevsblocksize.png differ
diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp
index d016553..e8edc72 100644
--- a/Project2-Stream-Compaction/src/main.cpp
+++ b/Project2-Stream-Compaction/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1<<12; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -27,8 +27,9 @@ int main(int argc, char* argv[]) {
     printf("** SCAN TESTS **\n");
     printf("****************\n");
 
-    genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
+    genArray(SIZE, a, 50);  // fill the whole array, then leave a 0 at the end to test that edge case
+    a[SIZE - 1] = 0;
+
     printArray(SIZE, a, true);
 
     // initialize b using StreamCompaction::CPU::scan you implement
@@ -50,8 +51,8 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -59,26 +60,25 @@ int main(int argc, char* argv[]) {
     printDesc("1s array for finding bugs");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printArray(SIZE, c, true); */
-
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
@@ -99,13 +99,11 @@ int main(int argc, char* argv[]) {
     printf("*****************************\n");
     printf("** STREAM COMPACTION TESTS **\n");
     printf("*****************************\n");
-
     // Compaction tests
 
     genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
+    a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
-
     int count, expectedCount, expectedNPOT;
 
     // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
@@ -125,7 +123,7 @@ int main(int argc, char* argv[]) {
     expectedNPOT = count;
     printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
-
+
     zeroArray(SIZE, c);
     printDesc("cpu compact with scan");
     count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
@@ -137,16 +135,16 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
+    zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
-
     system("pause"); // stop Win32 console from closing on exit
     delete[] a;
     delete[] b;
diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
index cdbef77..185a604 100644
--- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
+++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_75
     )
diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu
index a2d3e6c..ac9089a 100644
--- a/Project2-Stream-Compaction/stream_compaction/cpu.cu
+++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu
@@ -18,33 +18,92 @@ namespace StreamCompaction {
          * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
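+         * Example (added): an exclusive scan maps idata [1 2 3 4] to odata [0 1 3 6]; odata[i] is the sum of idata[0..i-1].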
         */
        void scan(int n, int *odata, const int *idata) {
+            // idata: orig int array, odata: output int array, n is len(int array)
            timer().startCpuTimer();
-            // TODO
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = odata[i - 1] + idata[i-1]; //n-1 adds
+            }
            timer().endCpuTimer();
        }
+        void scan_notimer(int n, int *odata, const int *idata) {
+            // idata: orig int array, odata: output int array, n is len(int array)
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = odata[i - 1] + idata[i-1]; //n-1 adds
+            }
+        }
+
 
        /**
         * CPU stream compaction without using the scan function.
         *
         * @returns the number of elements remaining after compaction.
         */
        int compactWithoutScan(int n, int *odata, const int *idata) {
+            // idata: orig int array, odata: output int array, n is len(int array)
            timer().startCpuTimer();
-            // TODO
+            int num_nonzeros = 0;
+            for (int i = 0; i < n; i++) {
+                int elt_i = idata[i];
+                if (elt_i != 0) {
+                    odata[num_nonzeros] = elt_i;
+                    ++num_nonzeros;
+                }
+            }
            timer().endCpuTimer();
-            return -1;
+            return num_nonzeros;
        }
 
+        void computeTemporaryArray(int n, int *tempArray, const int *idata) {
+            //Temporary array copies zeros & sets nonzeros to 1
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    tempArray[i] = 1;
+                }
+                else {
+                    tempArray[i] = 0;
+                }
+            }
+        }
+
+        int scatter(int n, int *odata, const int *idata, const int *tempArray) {
+            //odata comes in holding the scan result; writing in place is safe because newIdx <= i always
+            int elt_i, shouldInclude, newIdx;
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                shouldInclude = tempArray[i];
+                elt_i = idata[i];
+                if (shouldInclude) {
+                    newIdx = odata[i];
+                    odata[newIdx] = elt_i;
+                    ++count;
+                }
+            }
+            return count;
+        }
+
        /**
         * CPU stream compaction using scan and scatter, like the parallel version.
         *
         * @returns the number of elements remaining after compaction.
         */
        int compactWithScan(int n, int *odata, const int *idata) {
+            // idata: orig int array, odata: output int array, n is len(int array)
+            //e.g. idata [3 0 1 0 2] -> temp [1 0 1 0 1] -> scan [0 1 1 2 2] -> scatter [3 1 2], returns 3
            timer().startCpuTimer();
-            // TODO
+
+            //1: Malloc & Compute Temporary Array
+            int *tempArray = new int[n];
+            computeTemporaryArray(n, tempArray, idata);
+
+            //2: Exclusive Scan on tempArray
+            scan_notimer(n, odata, tempArray);
+
+            //3: Scatter
+            int newlen = scatter(n, odata, idata, tempArray);
+            delete[] tempArray;
            timer().endCpuTimer();
-            return -1;
+            return newlen;
        }
    }
-}
+}
\ No newline at end of file
diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu
index 2db346e..de2bb61 100644
--- a/Project2-Stream-Compaction/stream_compaction/efficient.cu
+++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu
@@ -3,24 +3,150 @@
 #include "common.h"
 #include "efficient.h"
 
+/*! Block size used for CUDA kernel launch*/
+#define blockSize 1024
 
 namespace StreamCompaction {
-    namespace Efficient {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
+    namespace Efficient {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
+        int nextPowerOf2(int n) {
+            int p = 1;
+            if (n && !(n & (n - 1))) {
+                return n;
+            }
+            while (p < n) {
+                p <<= 1;
+            }
+            return p;
+        }
+
+        __global__ void kernUpsweep(int n, int d, int *odata, int incr, int twod) {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            //also return if index is not a multiple of the incr
+            if (index >= n || (index) % incr != 0) {
+                return;
+            }
+            //if we reached here, index must be a multiple of incr (2^(d+1))
+            odata[index + incr - 1] += odata[index + twod - 1];
+            //on the final level (incr == n), zero the root so the downsweep yields an exclusive scan;
+            //guarding by thread 0 avoids the race of every live thread writing odata[n - 1]
+            if (index == 0 && incr == n) {
+                odata[n - 1] = 0;
+            }
+        }
+        __global__ void kernDownsweep(int n, int d, int *odata, int incr, int twod) {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            //also return if index is not a multiple of the incr
+            if (index >= n || (index) % incr != 0) {
+                return;
+            }
+            //if we reached here, index must be a multiple of incr (2^(d+1))
+            int t = odata[index + twod - 1];
+            odata[index + twod - 1] = odata[index + incr - 1];
+            odata[index + incr - 1] += t;
+        }
+
+        __global__ void kernMapToBoolean(int n, int *mask, int *idata) {
+            //mask[index] = 1 if idata[index] is nonzero, else 0 (the mask buffer is dev_odata at the call site)
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index >= n) {
+                return;
+            }
+            if (idata[index] != 0) {
+                mask[index] = 1;
+            }
+            else {
+                mask[index] = 0;
+            }
+        }
+
+        __global__ void kernScatter(int n, int *mask, int *odata, int *odata2, int *idata) {
+            //odata2 holds the scan result; odata receives the compacted elements
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index >= n){
+                return;
+            }
+            int shouldInclude = mask[index];
+            if (shouldInclude) {
+                int newIdx = odata2[index];
+                odata[newIdx] = idata[index];
+            }
+        }
 
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
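+         * Worked example (added sketch) of the two sweeps on [1 2 3 4]:
+         *   upsweep   d=0: [1 3 3 7]   d=1: [1 3 3 10] -> root zeroed: [1 3 3 0]
+         *   downsweep d=1: [1 0 3 3]   d=0: [0 1 3 6]  == the exclusive scan of the input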
         */
        void scan(int n, int *odata, const int *idata) {
+            int malloc_size = nextPowerOf2(n);
+            //CUDA Malloc buffers
+            int *dev_odata;
+            cudaMalloc((void**)&dev_odata, malloc_size * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+
+            //grid covers the padded (power-of-two) array, matching scan_notimer
+            dim3 fullBlocksPerGrid((malloc_size + blockSize - 1) / blockSize);
+            int max_level = ilog2ceil(n);
+            int incr = 0;
+            int twod = 0;
+            //Copy idata into dev_odata, zeroing the power-of-two padding
+            cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy dev_odata failed!");
+            cudaMemset(dev_odata + n, 0, (malloc_size - n) * sizeof(int));
+
            timer().startGpuTimer();
-            // TODO
+            //Upsweep
+            for (int d = 0; d < max_level; d++) {
+                incr = 1 << (d + 1);
+                twod = 1 << d;
+                kernUpsweep<<<fullBlocksPerGrid, blockSize>>>(malloc_size, d, dev_odata, incr, twod);
+            }
+
+            //Downsweep
+            for (int d = max_level-1; d >= 0; d--) {
+                incr = 1 << (d + 1);
+                twod = 1 << d;
+                kernDownsweep<<<fullBlocksPerGrid, blockSize>>>(malloc_size, d, dev_odata, incr, twod);
+            }
            timer().endGpuTimer();
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            //Free Memory
+            cudaFree(dev_odata);
+        }
+
+        void scan_notimer(int n, int malloc_size, int *dev_odata) {
+            //dev_odata contains the mask info; scans it in place
+            dim3 fullBlocksPerGrid((malloc_size + blockSize - 1) / blockSize);
+            int max_level = ilog2ceil(n);
+            int incr = 0;
+            int twod = 0;
+
+            //Upsweep
+            for (int d = 0; d < max_level; d++) {
+                incr = 1 << (d + 1);
+                twod = 1 << d;
+                kernUpsweep<<<fullBlocksPerGrid, blockSize>>>(malloc_size, d, dev_odata, incr, twod);
+            }
+
+            //Downsweep
+            for (int d = max_level-1; d >= 0; d--) {
+                incr = 1 << (d + 1);
+                twod = 1 << d;
+                kernDownsweep<<<fullBlocksPerGrid, blockSize>>>(malloc_size, d, dev_odata, incr, twod);
+            }
+        }
+
+        void printArray(int n, int *a, bool abridged = false) {
+            printf("    [ ");
+            for (int i = 0; i < n; i++) {
+                if (abridged && i + 2 == 15 && n > 16) {
+                    i = n - 2;
+                    printf("... ");
+                }
+                printf("%3d ", a[i]);
+            }
+            printf("]\n");
+        }
+
        /**
         * Performs stream compaction on idata, storing the result into odata.
         * All zeroes are discarded.
@@ -31,10 +157,53 @@ namespace StreamCompaction {
         * @returns The number of elements remaining after compaction.
         */
        int compact(int n, int *odata, const int *idata) {
+            int malloc_size = nextPowerOf2(n);
+            //CUDA Malloc buffers
+            int *dev_odata;
+            int *dev_odata2;
+            int *dev_idata;
+            int *dev_mask;
+            cudaMalloc((void**)&dev_odata, (malloc_size+1) * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+            cudaMalloc((void**)&dev_odata2, (malloc_size+1) * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata2 failed!");
+            cudaMalloc((void**)&dev_idata, malloc_size * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+            cudaMalloc((void**)&dev_mask, malloc_size * sizeof(int));
+            checkCUDAError("cudaMalloc dev_mask failed!");
+
+            //Memcpy idata into dev_odata for starters, zeroing the power-of-two padding
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy dev_idata failed!");
+            cudaMemcpy(dev_odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            checkCUDAError("cudaMemcpy dev_odata failed!");
+            cudaMemset(dev_odata + n, 0, (malloc_size + 1 - n) * sizeof(int));
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
            timer().startGpuTimer();
-            // TODO
+            //1: Compute mask (Temporary Array)
+            kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, dev_odata, dev_idata);
+
+            //2: Exclusive Scan on the mask
+            cudaMemcpy(dev_mask, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            checkCUDAError("cudaMemcpy dev_mask failed!");
+            scan_notimer(n, malloc_size, dev_odata);
+
+            //2.5: Get count from the scan result
+            int tempcount[1];
+            cudaMemcpy(tempcount, dev_odata + n - 1, 1 * sizeof(int), cudaMemcpyDeviceToHost);
+            //the exclusive-scan value at n-1 counts the nonzeros in idata[0..n-2]; add 1 if the last element is itself nonzero
+            int count = idata[n - 1] == 0 ? tempcount[0] : tempcount[0] + 1;
+
+            //3: Scatter (dev_odata2 gets the scan result; dev_odata receives the compacted output in place)
+            cudaMemcpy(dev_odata2, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            checkCUDAError("cudaMemcpy dev_odata2 failed!");
+            kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_mask, dev_odata, dev_odata2, dev_idata);
            timer().endGpuTimer();
-            return -1;
+            cudaMemcpy(odata, dev_odata, (count) * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_mask);
+            cudaFree(dev_odata);
+            cudaFree(dev_odata2);
+            cudaFree(dev_idata);
+            return count;
        }
    }
}
diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu
index 4308876..03afff3 100644
--- a/Project2-Stream-Compaction/stream_compaction/naive.cu
+++ b/Project2-Stream-Compaction/stream_compaction/naive.cu
@@ -3,6 +3,11 @@
 #include "common.h"
 #include "naive.h"
 
+/*! Block size used for CUDA kernel launch*/
+#define blockSize 1024
+int *dev_A;
+int *dev_B;
+
 namespace StreamCompaction {
    namespace Naive {
        using StreamCompaction::Common::PerformanceTimer;
@@ -11,15 +16,68 @@ namespace StreamCompaction {
            static PerformanceTimer timer;
            return timer;
        }
-        // TODO: __global__
+
+        __global__ void kernNaiveScan(int n, int curr_level, int* devA, int* devB) {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index >= n) {
+                return;
+            }
+            int offset = 1 << (curr_level - 1); //2^(curr_level - 1), in exact integer math
+            if (index >= offset) {
+                devB[index] = devA[index - offset] + devA[index];
+            }
+            else {
+                devB[index] = devA[index];
+            }
+        }
+
+        int nextPowerOf2(int n) {
+            int p = 1;
+            if (n && !(n & (n - 1))) {
+                return n;
+            }
+            while (p < n) {
+                p <<= 1;
+            }
+            return p;
+        }
 
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+            int malloc_size = nextPowerOf2(n);
+            //CUDA Malloc buffers
+            cudaMalloc((void**)&dev_A, malloc_size * sizeof(int));
+            checkCUDAError("cudaMalloc dev_A failed!");
+            cudaMalloc((void**)&dev_B, malloc_size * sizeof(int));
+            checkCUDAError("cudaMalloc dev_B failed!");
+
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+            int max_level = ilog2ceil(n);
+
            timer().startGpuTimer();
-            // TODO
+            //Copy idata into dev_A (note: this H2D copy and the final D2H copy sit inside the timed region)
+            cudaMemcpy(dev_A, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            //loop over each level
+            for (int curr_level = 1; curr_level <= max_level; curr_level++) {
+                //Launch Kernel (thereby updating dev_B)
+                kernNaiveScan<<<fullBlocksPerGrid, blockSize>>>(n, curr_level, dev_A, dev_B);
+
+                //Copy dev_B's updated data into dev_A
+                cudaMemcpy(dev_A, dev_B, n * sizeof(int), cudaMemcpyDeviceToDevice);
+                checkCUDAError("cudaMemcpy dev_B to dev_A failed!");
+            }
+            //Exclusive scan, so shift right when copying back
+            cudaMemcpy(odata+1, dev_A, (n-1) * sizeof(int), cudaMemcpyDeviceToHost);
+            odata[0] = 0;
+            checkCUDAError("cudaMemcpy dev_A to out failed!");
            timer().endGpuTimer();
+
+            //Free Memory
+            cudaFree(dev_A);
+            cudaFree(dev_B);
        }
    }
}
diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu
index 1def45e..0c607be 100644
--- a/Project2-Stream-Compaction/stream_compaction/thrust.cu
+++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu
@@ -18,11 +18,24 @@ namespace StreamCompaction {
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
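+         * Note (added): only the thrust::exclusive_scan call below sits inside the GPU timer;
+         * thrust::device_ptr wraps the raw cudaMalloc'd buffers so thrust can treat them as device iterators.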
         */
        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
-            timer().endGpuTimer();
+            int * dev_idata;
+            int * dev_odata;
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy dev_idata failed!");
+            thrust::device_ptr<int> input(dev_idata);
+            thrust::device_ptr<int> output(dev_odata);
+
+            timer().startGpuTimer();
+            thrust::exclusive_scan(input, input + n, output);
+            timer().endGpuTimer();
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
        }
    }
}
diff --git a/README.md b/README.md
index 3a0b2fe..87eacf8 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,12 @@
 CUDA Number Algorithms
 ======================
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+Dhruv Karthik: [LinkedIn](https://www.linkedin.com/in/dhruv_karthik/)
 
-### (TODO: Your README)
+Tested on: Windows 10 Home, Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz, 16GB, RTX 2070 - Compute Capability 7.5
 
-Link to the readmes of the other two subprojects.
-
-Add anything else you think is relevant up to this point.
-(Remember, this is public, so don't put anything here that you don't want to share with the world.)
+* [Stream Compaction](https://github.com/botforge/Project2-Number-Algorithms/tree/master/Project2-Stream-Compaction)
+* [Character Recognition](https://github.com/botforge/Project2-Number-Algorithms/tree/master/Project2-Character-Recognition)