diff --git a/Project2-Character-Recognition/2x2_XOR_excel_example.xlsx b/Project2-Character-Recognition/2x2_XOR_excel_example.xlsx index d063777..fb29acd 100644 Binary files a/Project2-Character-Recognition/2x2_XOR_excel_example.xlsx and b/Project2-Character-Recognition/2x2_XOR_excel_example.xlsx differ diff --git a/Project2-Character-Recognition/CMakeLists.txt b/Project2-Character-Recognition/CMakeLists.txt index 09e9198..3301b19 100644 --- a/Project2-Character-Recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/CMakeLists.txt @@ -22,6 +22,7 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") endif() include_directories(.) +link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64) add_subdirectory(character_recognition) cuda_add_executable(${CMAKE_PROJECT_NAME} @@ -32,4 +33,6 @@ cuda_add_executable(${CMAKE_PROJECT_NAME} target_link_libraries(${CMAKE_PROJECT_NAME} character_recognition ${CORELIBS} + cublas ) + diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md index 4503fac..8c79eb2 100644 --- a/Project2-Character-Recognition/README.md +++ b/Project2-Character-Recognition/README.md @@ -3,12 +3,43 @@ CUDA Character Recognition **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Klayton Wittler + * [LinkedIn](https://www.linkedin.com/in/klayton-wittler/) +* Tested on: Windows 10 Pro, i7-7700K @ 4.20GHz 16.0GB, GTX 1070 8.192GB (my PC) -### (TODO: Your README) +## Sections -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +* [Introduction](#introduction) +* [Impelmentation](#implementation) +* [Additions](#additions) + +# Introduction +This project is an attempt at developing a multi-layer perceptron from scratch that utilizes some of the parallelization offered by the GPU. + +![MLP](img/MLP.png) + +The architecture for the network is: + +* Input Layer: Number of nodes in this layer is equal to the number of features. Two for XOR and 10201 for the images in the dataset + +* Hidden Layer: This layer is a linear transformation with some weighted matrix activated by some non-linear function (sigmoid) + +* Output Layer: The number of nodes in this layer is equat to the number of labels. One for XOR and 52 for the images in the dataset + +The loss function for the network is currently the sum of squared error, but other losses for the image classification would be better (for example cross entropy). + +# Implementation +My implemenation contains a flag ```XOR``` which switches from the XOR test to the image dataset. I currently have the ability to load the images, store them into an arary and give it to training or testing functions, however the backprogation for training does not work. Below is an example of forward propagation wiht loss calculation on the XOR dataset. + +![XOR](img/XORoutput.png) + +Inside ```mlp.cu``` the main functions are ```train``` and ```test```. Each of these call ```forward``` to run the network utilizing cublas matrix multiplication in ```matrixMultiply``` for the weights and calls ```kernSigmoid``` to run the activations in parallel. This is repeated twice for the two layers in which the prediction is then returned. The loss is then calculated in ```MSE``` which is also set up to be done in parallel but for the XOR example is trivial. The loss could then be fed into backpropagation to adjust the weights in order perform gradient descent on the loss function. In ```main.cpp``` have given an option to specify the number of desired iterations and learning rate to the user as well as the number of hidden nodes. + +# Additions +## Matrix multiplication +```CMakeLists.txt``` has been edited to include ```link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)``` as well as ```cublas``` in ```target_link_libraries```. + +Each layer of the network is handle with a matrix multiplication through cublas in the ```matrixMultiply``` function then feed through the sigmoid activation in parallel with ```kernSigmoid```. Below is the outputs of each layer in the XOR example. There is a flag for testing matrix multiply ```MULT``` which will trigger a console prompt of what size matrix to test, fill in each element with its index and return the result for verification. + +![XOR-matrix](img/XORforwardProp.png) diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt index 7446175..cbe0e75 100644 --- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt @@ -7,5 +7,8 @@ set(SOURCE_FILES cuda_add_library(character_recognition ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_61 ) + +target_link_libraries(character_recognition ${CUDA_LIBRARIES}) +target_link_libraries(character_recognition ${CUDA_CUBLAS_LIBRARIES}) diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index 5a3ed7f..c7a53b0 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -1,7 +1,20 @@ + + #include #include +//#include +//#include "device_launch_parameters.h" #include "common.h" #include "mlp.h" +#include +#include + +/*! Block size used for CUDA kernel launch. */ +#define blockSize 32 +#define index(i,j,ld) (((j)*(ld))+(i)) +#define GPULOSS 0 + +float *dev_X, *dev_wI, *dev_wO, *dev_h1, *dev_pred, *dev_loss, *dev_y; namespace CharacterRecognition { using Common::PerformanceTimer; @@ -10,18 +23,375 @@ namespace CharacterRecognition { static PerformanceTimer timer; return timer; } - - // TODO: __global__ - - /** - * Example of use case (follow how you did it in stream compaction) - */ - /*void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - */ - // TODO: implement required elements for MLP sections 1 and 2 here + + typedef struct _matrixSize { + int WA, HA, WB, HB, WC, HC; + } sMatrixSize; + + + void printMat(float*P, int uWP, int uHP) { + int i, j; + for (i = 0; i < uHP; i++) { + for (j = 0; j < uWP; j++) + printf(" %f ", P[index(i, j, uHP)]); + printf("\n"); + } + } + + + void fixedInit(float *data, int size) { + if (size == 4) { + data[0] = 10.1f; + data[1] = 0.9f; + data[2] = 20.0f; + data[3] = 0.87f; + } + else if (size == 2) { + data[0] = 41.0f; + data[1] = -54.0f; + } + } + + + void randomInit(float *data, int size) { + for (int i = 0; i < size; ++i) + data[i] = rand() / (float)RAND_MAX; + } + + + void indexInit(float *data, int size) { + for (int i = 0; i < size; ++i) + data[i] = (float)i; + } + + + int getNum(int &n, float *v) { + // Generate a random number + srand(time(NULL)); + // Make sure the number is within the index range + int index = rand() % n; + // Get random number from the vector + int num = v[index]; + // Remove the number from the vector + std::swap(v[index], v[n - 1]); + n--; + // Return the removed number + return num; + } + + + void generateRandom(int n, float *permuteData) { + float *v = (float *)malloc(n); + // Fill the vector with the values 1, 2, 3, ..., n + for (int i = 0; i < n; i++) { + v[i] = i; + } + // While vector has elements get a random number from the vector and print it + int i = 0; + while (n > 0) { + permuteData[i] = getNum(n,v); + i++; + } + } + + + void deviceMemory(float *Xi, float *wI, float *wO, sMatrixSize &hidden_matrix_size, sMatrixSize &output_matrix_size, bool create = false) { + if (create) { + unsigned int size_X = hidden_matrix_size.WB * hidden_matrix_size.HB; + unsigned int mem_size_X = sizeof(float) * size_X; + unsigned int size_wI = hidden_matrix_size.WA * hidden_matrix_size.HA; + unsigned int mem_size_wI = sizeof(float) * size_wI; + unsigned int size_wO = output_matrix_size.WA * output_matrix_size.HA; + unsigned int mem_size_wO = sizeof(float) * size_wO; + unsigned int size_h1 = hidden_matrix_size.WC * hidden_matrix_size.HC; + unsigned int mem_size_h1 = sizeof(float) * size_h1; + unsigned int size_pred = output_matrix_size.WC * output_matrix_size.HC; + unsigned int mem_size_pred = sizeof(float) * size_pred; + + cudaMalloc((void **)&dev_X, mem_size_X); + checkCUDAError("cudaMalloc dev_X"); + cudaMalloc((void **)&dev_wI, mem_size_wI); + checkCUDAError("cudaMalloc dev_wI"); + cudaMalloc((void **)&dev_wO, mem_size_wO); + checkCUDAError("cudaMalloc dev_wO"); + cudaMalloc((void **)&dev_h1, mem_size_h1); + checkCUDAError("cudaMalloc dev_h1"); + cudaMalloc((void **)&dev_pred, mem_size_pred); + checkCUDAError("cudaMalloc dev_pred"); + + cudaMemcpy(dev_X, Xi, mem_size_X, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_X"); + cudaMemcpy(dev_wI, wI, mem_size_wI, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_wI"); + cudaMemcpy(dev_wO, wO, mem_size_wO, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_wO"); + } + else { + cudaFree(dev_X); + cudaFree(dev_wI); + cudaFree(dev_wO); + cudaFree(dev_h1); + cudaFree(dev_pred); + cudaFree(dev_y); + cudaFree(dev_loss); + } + } + + + void matrixMultiply(cublasHandle_t* handle, sMatrixSize &matrix_size, float *d_A, float *d_B, float *d_C){ + const float alpha = 1.0f; + const float beta = 0.0f; + cublasSgemm(*handle, CUBLAS_OP_T, CUBLAS_OP_N, matrix_size.WA, matrix_size.WB, matrix_size.HA, &alpha, d_A, matrix_size.HA, d_B, matrix_size.HB, &beta, d_C, matrix_size.HC); + checkCUDAError("matrix multiply"); + } + + __global__ void kernSigmoid(int n, float *input) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + input[index] = 1.0f / (1 + exp(-input[index])); + } + + __global__ void kernMSE(int n, float *pred, float *yi, float *loss) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + loss[index] = powf(pred[index] - yi[index], 2); + } + + + float MSE(float *pred, float *yi, sMatrixSize &output_matrix_size) { +#if GPULOSS + float *loss; + unsigned int size_pred = output_matrix_size.WC * output_matrix_size.HC; + unsigned int mem_size_pred = sizeof(float) * size_pred; + + cudaMalloc((void **)&dev_loss, mem_size_pred); + checkCUDAError("cudaMalloc dev_pred"); + cudaMalloc((void **)&dev_y, mem_size_pred); + checkCUDAError("cudaMalloc dev_pred"); + cudaMalloc((void **)&dev_pred, mem_size_pred); + checkCUDAError("cudaMalloc dev_pred"); + + cudaMemcpy(dev_y, yi, mem_size_pred, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_y"); + cudaMemcpy(dev_pred, pred, mem_size_pred, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_pred"); + + dim3 threads(blockSize); + dim3 grid((output_matrix_size.HC + blockSize - 1) / blockSize); + + kernMSE << > > (output_matrix_size.HC, dev_pred, dev_y, dev_loss); + + thrust::inclusive_scan(dev_loss, dev_loss + output_matrix_size.HC, dev_loss); + + cudaMemcpy(loss, dev_loss + mem_size_pred - 1, sizeof(float), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_loss"); + + cudaFree(dev_loss); + cudaFree(dev_y); + cudaFree(dev_pred); + return *loss; +#else + float loss = 0; + for (int j = 0; j < output_matrix_size.HC; j++) { + printf("prediction: %f \n", pred[j]); + loss += pow(pred[0] - yi[0], 2); + } + return loss; +#endif // GPULOSS + } + + + void backward(float lr, float loss, float *wI, float *wO){ + printf("Backward propagate the error to the weights \n"); + // wI += gradient*lr; + // wO += gradient*lr; + } + + + void forward(float *pred, float *Xi, float *wI, float *wO, sMatrixSize &hidden_matrix_size, sMatrixSize &output_matrix_size) { + // allocate device memory + deviceMemory(Xi, wI, wO, hidden_matrix_size, output_matrix_size, true); + int mem_size_pred = sizeof(float) * output_matrix_size.WC * output_matrix_size.HC; + + cublasHandle_t handle; + cublasCreate(&handle); + dim3 threads(blockSize); + + //hidden layer + dim3 grid((hidden_matrix_size.WC*hidden_matrix_size.HC + blockSize - 1) / blockSize); + + matrixMultiply(&handle, hidden_matrix_size, dev_wI, dev_X, dev_h1); + kernSigmoid <<> > (hidden_matrix_size.HC*hidden_matrix_size.WC, dev_h1); + checkCUDAError("kernSigmoid"); + + //output layer + dim3 grid1((output_matrix_size.WC*output_matrix_size.HC + blockSize - 1) / blockSize); + + matrixMultiply(&handle, output_matrix_size, dev_wO, dev_h1, dev_pred); + kernSigmoid << > > (output_matrix_size.HC*output_matrix_size.WC, dev_pred); + checkCUDAError("kernSigmoid"); + + cudaMemcpy(pred, dev_pred, mem_size_pred, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy pred"); + + cublasDestroy(handle); + checkCUDAError("handle"); + + deviceMemory(Xi, wI, wO, hidden_matrix_size, output_matrix_size); + } + + void train(float *X, float *y, const int iterations, const float lr, const int sizeData, const int hiddenNodes, const int numLabels, const int numData) { + sMatrixSize hidden_matrix_size = {hiddenNodes, sizeData, 1, sizeData, 1, hiddenNodes }; + sMatrixSize output_matrix_size = {numLabels, hiddenNodes, 1, hiddenNodes, 1, numLabels }; + + unsigned int size_wI = hidden_matrix_size.HA * hidden_matrix_size.WA; + unsigned int mem_size_wI = sizeof(float) * size_wI; + float *wI = (float *)malloc(mem_size_wI); + + unsigned int size_wO = output_matrix_size.HA * output_matrix_size.WA; + unsigned int mem_size_wO = sizeof(float) * size_wO; + float *wO = (float *)malloc(mem_size_wO); + + randomInit(wI, size_wI); + randomInit(wO, size_wO); + + float *permuteData = (float *)malloc(sizeof(float)*numData); + float *Xi = (float *)malloc(sizeof(float)*sizeData); + float *yi = (float *)malloc(sizeof(float)*numLabels); + + unsigned int size_pred = output_matrix_size.WC * output_matrix_size.HC; + unsigned int mem_size_pred = sizeof(float) * size_pred; + float *pred = (float *)malloc(mem_size_pred); + float avgLoss = 0.0; + + for (int iter = 0; iter < iterations; iter++) { + generateRandom(numData, permuteData); + printf("training iteration %i \n", iter); + for (int i = 0; i < numData; i++) { + int index = permuteData[i]; + memcpy(Xi, (void **)&X[sizeData*index], sizeData * sizeof(float)); + memcpy(yi, (void **)&y[numLabels*index], numLabels * sizeof(float)); + + printf("index: %i data: %f %f label: %f \n", index, Xi[0] , Xi[1], yi[0]); + + forward(pred, Xi, wI, wO, hidden_matrix_size, output_matrix_size); + for (int j = 0; j < numLabels; j++) { + printf("prediction: %f \n\n", pred[j]); + } + + float loss = MSE(pred, yi, output_matrix_size); + printf("Least square error loss: %f \n \n", loss); + avgLoss += loss; + backward(lr, loss, wI, wO); + } + } + printf("training done \n"); + + avgLoss /= numData; + printf("Average Loss: %f \n", avgLoss); + + free(wI); free(wO); free(Xi); free(yi); free(permuteData); free(pred); + } + + void test(float *X, float *y, float *wI, float *wO, const int sizeData, const int hiddenNodes, const int numLabels, const int numData){ + sMatrixSize hidden_matrix_size = { hiddenNodes, sizeData, 1, sizeData, 1, hiddenNodes }; + sMatrixSize output_matrix_size = { numLabels, hiddenNodes, 1, hiddenNodes, 1, numLabels }; + + float *Xi = (float *)malloc(sizeof(float)*sizeData); + float *yi = (float *)malloc(sizeof(float)*numLabels); + + unsigned int size_pred = output_matrix_size.WC * output_matrix_size.HC; + unsigned int mem_size_pred = sizeof(float) * size_pred; + float *pred = (float *)malloc(mem_size_pred); + float avgLoss = 0.0; + + for (int i = 0; i < numData; i++) { + memcpy(Xi, (void **)&X[sizeData*i], sizeData * sizeof(float)); + memcpy(yi, (void **)&y[numLabels*i], numLabels * sizeof(float)); + + printf("data: %f %f label: %f \n", i, Xi[0], Xi[1], yi[0]); + + forward(pred, Xi, wI, wO, hidden_matrix_size, output_matrix_size); + + float loss = MSE(pred, yi, output_matrix_size); + printf("Least square error loss: %f \n \n", loss); + avgLoss += loss; + } + printf("testing done \n"); + + avgLoss /= numData; + printf("Average Loss: %f \n", avgLoss); + + free(wI); free(wO); free(Xi); free(yi); free(pred); + } + + void testMatrixMultiply(int HA, int WA, int HB, int WB) { + sMatrixSize matrix_size = { WA, HA, WB, HB, WB, HA}; + + // allocate host memory for matrices A and B + unsigned int size_A = matrix_size.WA * matrix_size.HA; + unsigned int mem_size_A = sizeof(float) * size_A; + float *h_A = (float *)malloc(mem_size_A); + unsigned int size_B = matrix_size.WB * matrix_size.HB; + unsigned int mem_size_B = sizeof(float) * size_B; + float *h_B = (float *)malloc(mem_size_B); + + // initialize host memory + indexInit(h_A, size_A); + indexInit(h_B, size_B); + + // allocate device memory + float *d_A, *d_B, *d_C; + unsigned int size_C = matrix_size.WC * matrix_size.HC; + unsigned int mem_size_C = sizeof(float) * size_C; + + // allocate host memory for the result + float *h_C = (float *)malloc(mem_size_C); + + cudaMalloc((void **)&d_A, mem_size_A); + cudaMalloc((void **)&d_B, mem_size_B); + cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); + cudaMalloc((void **)&d_C, mem_size_C); + + // setup execution parameters + dim3 threads(blockSize, blockSize); + dim3 grid(matrix_size.HB / threads.x, matrix_size.WA / threads.y); + + // create and start timer + printf("Computing result using CUBLAS... \n"); + + cublasHandle_t handle; + cublasCreate(&handle); + + matrixMultiply(&handle, matrix_size, d_A, d_B, d_C); + + // copy result from device to host + cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); + + // Destroy the handle + cublasDestroy(handle); + + printf("\n Matriz A: \n"); + printMat(h_A, matrix_size.WA, matrix_size.HA); + printf("\n Matriz B: \n "); + printMat(h_B, matrix_size.WB, matrix_size.HB); + printf("\n Matriz C: \n"); + printMat(h_C, matrix_size.WC, matrix_size.HC); + printf("\n\n"); + + // clean up memory + free(h_A); + free(h_B); + free(h_C); + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + } + } diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h index 2096228..8f9730c 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.h +++ b/Project2-Character-Recognition/character_recognition/mlp.h @@ -1,9 +1,13 @@ #pragma once #include "common.h" +#include namespace CharacterRecognition { Common::PerformanceTimer& timer(); // TODO: implement required elements for MLP sections 1 and 2 here + void test(float *X, float *y, float *wI, float *wO, const int sizeData = 10205, const int hiddenNodes = 256, const int numLabels = 52, const int numData = 52); + void train(float *X, float *y, const int iterations = 1, const float lr = 0.1, const int sizeData = 10205, const int hiddenNodes = 256, const int numLabels = 52, const int numData = 52); + void testMatrixMultiply(int HA, int WA, int HB, int WB); } diff --git a/Project2-Character-Recognition/img/XORforwardProp.png b/Project2-Character-Recognition/img/XORforwardProp.png new file mode 100644 index 0000000..35e0884 Binary files /dev/null and b/Project2-Character-Recognition/img/XORforwardProp.png differ diff --git a/Project2-Character-Recognition/img/XORoutput.png b/Project2-Character-Recognition/img/XORoutput.png new file mode 100644 index 0000000..ad7b67a Binary files /dev/null and b/Project2-Character-Recognition/img/XORoutput.png differ diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 11dd534..c9795d1 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -10,143 +10,188 @@ #include #include #include "testing_helpers.hpp" +#include + +#define XOR 1 +#define MULT 0 + +#if XOR + #define sizeData 2 + #define numLabels 1 + #define numData 4 + #define hiddenNodes 2 +#else + #define sizeData 10205 + #define numLabels 52 + #define numData 52 + #define hiddenNodes 100 +#endif // XOR + +#define index(i,j,ld) (((j)*(ld))+(i)) + +void loadXorExample(float *X, float *y) { + for (int i = 0; i < sizeData; i++) { + for (int j = 0; j < sizeData; j++) { + X[sizeData*(2 * i + j)] = i; + X[sizeData*(2 * i + j) + 1] = j; + y[2*i + j] = i ^ j; + } + } + + for (int i = 0; i <= 1; i++) { + for (int j = 0; j <= 1; j++) { + std::cout << "data: " << X[sizeData*(2 * i + j)] << X[sizeData*(2 * i + j) + 1] << " label: " << y[2 * i + j] << "\n"; + } + } +} + + +void readImageData(float *X, float *y) { + int c = 0; + for (int i = 1; i <= numLabels; i++) { + float ascii = 65 + c; + if (i % 2 == 0) { c++; } + float yi = i % 2 == 0 ? ascii + 32 : ascii; + y[(i - 1)*(numLabels + 1)] = yi; + + float xi; + std::string n = i < 10 ? "0" + std::to_string(i) : std::to_string(i); + std::string filePath = "../data-set/" + n + "info.txt"; + std::ifstream dataFile(filePath); + int k = 0; + while (!dataFile.fail() && !dataFile.eof()) + { + dataFile >> xi; + X[sizeData*(i - 1) + k++] = xi; + }; + } +} + + +void readImageWeights(float *wI, float *wO) { + printf("Doesn't currently load any weights \n"); + return; +} + + +void fixedInit(float *data, int size) { + if (size == 4) { + data[0] = 10.1f; + data[2] = 0.9f; + data[1] = 20.0f; + data[3] = 0.87f; + } + else if (size == 2) { + data[0] = 41.0f; + data[1] = -54.0f; + } +} -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; int main(int argc, char* argv[]) { - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + printf("\n"); + printf("****************\n"); + printf("** MLP TESTS **\n"); + printf("****************\n\n"); + + unsigned int size_X = sizeData * numData; + unsigned int mem_size_X = sizeof(float) * size_X; + float *X = (float *)malloc(mem_size_X); + + unsigned int size_y = numLabels * numData; + unsigned int mem_size_y = sizeof(float) * size_y; + float *y = (float *)malloc(mem_size_y); + +#if XOR + printDesc("loading XOR example"); + loadXorExample(X, y); +#else + printDesc("reading image data"); + readImageData(X, y); +#endif // XOR + + std::string train = "y"; + std::string test = "y"; + std::string input; + float lr = 0.1; + int nodes = 10; + int iterations; + + printf("Would you like to train or test? (q) "); + std::cin >> input; + + while (input != "q") { + if (input == "train") { + while (train == "y" | train == "Y" | train == "yes" | train == "Yes") { + printf("How many iterations would you like to do? "); + std::cin >> iterations; + printf("What should the learning rate be? "); + std::cin >> lr; + printf("How many hidden nodes? "); + std::cin >> nodes; + printDesc("training"); + CharacterRecognition::train(X, y, iterations, lr, sizeData, nodes, numLabels, numData); + printf("Would you like to continue training? (y, n, q) "); + std::cin >> train; + } + } + else if (input == "test") { + while (test == "y" | test == "Y" | test == "yes" | test == "Yes") { + unsigned int size_wI = hiddenNodes * sizeData; + unsigned int mem_size_wI = sizeof(float) * size_wI; + float *wI = (float *)malloc(mem_size_wI); + + unsigned int size_wO = numLabels * hiddenNodes; + unsigned int mem_size_wO = sizeof(float) * size_wO; + float *wO = (float *)malloc(mem_size_wO); + +#if XOR + printDesc("loading XOR weights"); + fixedInit(wI, size_wI); + fixedInit(wO, size_wO); +#else + printDesc("reading image weights"); + readImageWeights(wI, wO); +#endif // XOR + + printDesc("testing"); + CharacterRecognition::test(X, y, wI, wO, sizeData, hiddenNodes, numLabels, numData); + printf("Would you like to continue testing? (y, n, q) "); + std::cin >> test; + } + } + + if (test == "q" || train == "q") break; + printf("Would you like to train or test? (q) "); + std::cin >> input; + } + +#if MULT + std::string mult = "n"; + printf("Would you like to test matrix multiplication? (y) "); + std::cin >> mult; + + while (mult == "y" | mult == "Y" | mult == "mult") { + int HA = 2, WA = 2, HB = 2, WB = 1; + printf("How many rows for the first matrix? "); + std::cin >> HA; + printf("How many columns for the first matrix? "); + std::cin >> WA; + printf("How many rows for the second matrix? "); + std::cin >> HB; + printf("How many columns for the second matrix? "); + std::cin >> WB; + + printDesc("test multiply"); + CharacterRecognition::testMatrixMultiply(HA,WA,HB,WB); + + printf("Would you like continue testing matrix multiplication? (y) "); + std::cin >> mult; + } +#endif + + free(X); + free(y); system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; } diff --git a/Project2-Stream-Compaction/README.md b/Project2-Stream-Compaction/README.md index 0e38ddb..f645d6e 100644 --- a/Project2-Stream-Compaction/README.md +++ b/Project2-Stream-Compaction/README.md @@ -3,12 +3,142 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Klayton Wittler + * [LinkedIn](https://www.linkedin.com/in/klayton-wittler/) +* Tested on: Windows 10 Pro, i7-7700K @ 4.20GHz 16.0GB, GTX 1070 8.192GB (my PC) -### (TODO: Your README) +## Sections -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +* [Introduction](#introduction) + * [Scanning](#scanning) + * [Stream Compaction](#stream-compaction) +* [Performance Analaysis](#performance-analysis) + * [Optimize Block Size](#optimize-block-size) + * [Compare Implementations](#compare-implementations) + * [Explanation](#explanation) +* [Additions](#additionals) + * [Radix Sort](#radix-sort) +# Introduction +This project implements parallel algorithms in reduction, scanning, and stream compaction in CUDA on the GPU. There is both a naive and work efficient scanning algorithm, however the work efficient still requires memory optimizations to see its real benefits. Stream compaction is used to compress data by removing unnecessary elements. + +## Scanning +Scanning an array is getting the accumulated sum of the elements. This can be both inclusive of the last element or exclusive of the last element. For the purposes of this project we will focus on the exclusive version, knowing that we can convert between the two with a shift. + +### Naive +The naive approach to scanning in parallel would be to make multiple passes in the array, each pass moving the summing elements down the array in an exponenial fasion as seen in the figure below. + +![naive-scan](img/naiveScan.png) + +To implement this in parallel, we have the psuedocode below that in practice requires two arrays to buffer read and writes. +``` +for d = 1 to log_2(n) + for all k in parallel + if k >= 2^(d-1) + output[k] = input[k - 2^d+1] + input[k] + else + output[k] = input[k] + end + end + swap(input,output) +end +``` +This will return an inclusive scan so to convert we shift all elements to the right and add in an identity in the first element, which addition is zero. + + +### Work Efficient +The work efficient approach perform two sweeps (referred to as up and down sweep) of these exponential passes but allows for inplace operation. The upsweep performs a parallel reduction and the downsweep cleverly utilizes the array as a binary tree to return the exclusive scan. + +The upsweep, parallel reduction, uses a divide and conquer approach to achieve the total sum of the array in the last element in this example. + +![upsweep](img/upSweep.png) +``` +for d = 0 to log_2(n-1) + for all k to n by 2^(d+1) in parallel + input[k + 2^(d+1) - 1] += input[k + 2^d] + end +end +``` + +The downsweep treats the array as a binary tree that only saves the left child as seen below and traverses down it following 3 rules as it goes. + +![binary-tree](img/binary_tree.png) + +Rules + +1. Set the root of the entire tree to 0 + +2. Add the parent value and left child value to the right child + +3. Copy the parent value to the left child + +The implementation of this is seen below in the figure where the black arrow represents rule 1, green arrows as rule 2, and orange arrows as rule 3. The psuedocode also shows how this would be done algorithmically in parallel. + +![downsweep](img/downSweep.png) +``` +input[n - 1] = 0 +for d = log_2(n-1) to 0 + for all k = 0 to n-1 by 2^(d+1) in parallel + temp = input[k + 2^d] + input[k + 2^d - 1] = input[k + 2^(d+1) - 1] + input[k + 2^(d+1) - 1] += temp + end +end +``` + +## Stream Compaction +Stream compaction in parallel has three parts: condition mapping, scanning, and scatter. + +![parallel-stream-compaction](img/parallel_stream_compact.png) + +We create an array that indicates whether the element should be included. Then scan the map array to determine the elements index in the return array. Then scatter the results into the new array by checking the condition map array, looking up the index, and placing the element in that index. + +# Performance Analysis +To do a performance analysis, the run time of each algorithm is clocked in milliseconds. The faster the algorithm the lower the run time and the better the approach. The scan comparisons are separate from the compact comparison, but are denoted on the graphs as the same name since compact utilizes the scan. + +## Optimize Block Size +In order to perform an accurate comparison between implementations, the implemented methods of naive and work efficient need to have the block sizes optimized. + +Below we see the outcome of run time at two different array sizes (1 << 16 and 1 << 10 respectively) accross block sizes that are multiples of warp size. + +![block-scan-speed](img/block_scan_speed.png) +![block-compact-speed](img/block_compact_speed.png) + +The optimal block size for each approach was chosen and based on the run time analaysis this is generally 128 threads per block. + +## Compare Implementations + +In the run time graphs below we see that the 'efficient' approach does not perform well even compared to the CPU version. We only see the CPU start to become comparable in compactions with a scan and this is because everything is being done serially. On the other hand the naive implementation is somewhat comparable to the CPU versions but still generally performs worse. We can also see the thrust implementation at the very bottom remains relatively constant through array sizes. + +![array-scan-speed](img/array_scan_speed.png) +![array-compact-speed](img/array_compact_speed.png) + + +### Explanation + +* Write a brief explanation of the phenomena you see here. Can you find the performance bottlenecks? Is it memory I/O? Computation? Is it different for each implementation? + +The primary reasons for these performance gaps in the 'efficient' approach despite the parallel capabilities of the GPU is the thread management. In the figure below, what has been implemented is on the left and whats left to optimize is on the right. We can see thread divergence in each pass with a red circle and warp retirement can be seen in greyed out threads. In the implemented version we can see each level has thread divergence which means that threads are sitting waiting for other threads in the warp to finish a set of instructions. The optimized version does not have a thread divergence until the very last pass in which it is returning the answer anyway. We can also see that the optimized version frees up warps much faster as it goes allowing more work to be done overall. + +![thread-management](img/threadManagement.png) + +The performance gaps in the naive approach and secondarily in the 'efficient' approach is memory optimizations. We can further resturture the way we access memory to use shared memory within blocks, change access patterns to avoid bank conflicts and utilize the full band width on the bus. + +We can see from the graphs that the thrust implementation is the quickest and does not suffer from the performance issues since they have done these optimizations inside their functions. In running the NSight performance analysis it can also been seen that they are pulling alot of device information to optimize for my specific hardware. + +## Output +This output is for 1 << 8 array size and includes a test of radix sort algorithm. There are two flags ```RUN_RADIX``` and ```RADIX_CLASS_EXAMPLE``` at the top of ```main.cpp```. Radix tests can be disabled by setting ```RUN_RADIX 0``` and to run radix sort on the full test array set ```RADIX_CLASS_EXAMPLE 0``` but is currently displaying the class example as to quickly verify its working. + +![console-output](img/console_output.png) + +# Additions + +## Radix Sort + +Radix sort can be done in parallel by breaking the array into tiles and sorting each tile in parallel then merging. Further more sorting within it tile can be parallelized by utilizing scanning approaches developed previously. It should also be noted that radix sort operates on bits and requires k-bit passes to sort where k is the number of bits in each cell. It starts at the least significant bit (LSB) and moves to the most significant bit (MSB), partitioning the array and rearranging the array as it goes as seen below. + +![radix-explanation](img/radix_explanation.png) + +Where within each pass we do parallel scans and scatters as seen below. We mantain a true and false condition array (in practice we can use one and just take the opposite when we need the other), scan the false array and then scan the true array starting where the false left off (in practice we can skip the true array and compute real time based on the false array). Finally, we can use the full array of indices and scatter the results. + +![radix-parallel-pass](img/radix_parallel_pass.png) diff --git a/Project2-Stream-Compaction/img/array_compact_speed.png b/Project2-Stream-Compaction/img/array_compact_speed.png new file mode 100644 index 0000000..06b74c5 Binary files /dev/null and b/Project2-Stream-Compaction/img/array_compact_speed.png differ diff --git a/Project2-Stream-Compaction/img/array_scan_speed.png b/Project2-Stream-Compaction/img/array_scan_speed.png new file mode 100644 index 0000000..c7fc67a Binary files /dev/null and b/Project2-Stream-Compaction/img/array_scan_speed.png differ diff --git a/Project2-Stream-Compaction/img/binary_tree.png b/Project2-Stream-Compaction/img/binary_tree.png new file mode 100644 index 0000000..0bb6d0d Binary files /dev/null and b/Project2-Stream-Compaction/img/binary_tree.png differ diff --git a/Project2-Stream-Compaction/img/block_compact_speed.png b/Project2-Stream-Compaction/img/block_compact_speed.png new file mode 100644 index 0000000..cf45f61 Binary files /dev/null and b/Project2-Stream-Compaction/img/block_compact_speed.png differ diff --git a/Project2-Stream-Compaction/img/block_scan_speed.png b/Project2-Stream-Compaction/img/block_scan_speed.png new file mode 100644 index 0000000..d6ccad3 Binary files /dev/null and b/Project2-Stream-Compaction/img/block_scan_speed.png differ diff --git a/Project2-Stream-Compaction/img/console_output.png b/Project2-Stream-Compaction/img/console_output.png new file mode 100644 index 0000000..2570ad1 Binary files /dev/null and b/Project2-Stream-Compaction/img/console_output.png differ diff --git a/Project2-Stream-Compaction/img/data/dataCollection.xlsx b/Project2-Stream-Compaction/img/data/dataCollection.xlsx new file mode 100644 index 0000000..3a53592 Binary files /dev/null and b/Project2-Stream-Compaction/img/data/dataCollection.xlsx differ diff --git a/Project2-Stream-Compaction/img/data/scan_compact_figures.pptx b/Project2-Stream-Compaction/img/data/scan_compact_figures.pptx new file mode 100644 index 0000000..1370f1f Binary files /dev/null and b/Project2-Stream-Compaction/img/data/scan_compact_figures.pptx differ diff --git a/Project2-Stream-Compaction/img/downSweep.png b/Project2-Stream-Compaction/img/downSweep.png new file mode 100644 index 0000000..da05820 Binary files /dev/null and b/Project2-Stream-Compaction/img/downSweep.png differ diff --git a/Project2-Stream-Compaction/img/naiveScan.png b/Project2-Stream-Compaction/img/naiveScan.png new file mode 100644 index 0000000..a1b6837 Binary files /dev/null and b/Project2-Stream-Compaction/img/naiveScan.png differ diff --git a/Project2-Stream-Compaction/img/parallel_stream_compact.png b/Project2-Stream-Compaction/img/parallel_stream_compact.png new file mode 100644 index 0000000..c893762 Binary files /dev/null and b/Project2-Stream-Compaction/img/parallel_stream_compact.png differ diff --git a/Project2-Stream-Compaction/img/radix_explanation.png b/Project2-Stream-Compaction/img/radix_explanation.png new file mode 100644 index 0000000..f9457ec Binary files /dev/null and b/Project2-Stream-Compaction/img/radix_explanation.png differ diff --git a/Project2-Stream-Compaction/img/radix_parallel_pass.png b/Project2-Stream-Compaction/img/radix_parallel_pass.png new file mode 100644 index 0000000..fe8c8f9 Binary files /dev/null and b/Project2-Stream-Compaction/img/radix_parallel_pass.png differ diff --git a/Project2-Stream-Compaction/img/stream_compact.png b/Project2-Stream-Compaction/img/stream_compact.png new file mode 100644 index 0000000..a7e9706 Binary files /dev/null and b/Project2-Stream-Compaction/img/stream_compact.png differ diff --git a/Project2-Stream-Compaction/img/threadManagement.png b/Project2-Stream-Compaction/img/threadManagement.png new file mode 100644 index 0000000..19f4f90 Binary files /dev/null and b/Project2-Stream-Compaction/img/threadManagement.png differ diff --git a/Project2-Stream-Compaction/img/upSweep.png b/Project2-Stream-Compaction/img/upSweep.png new file mode 100644 index 0000000..a1fd577 Binary files /dev/null and b/Project2-Stream-Compaction/img/upSweep.png differ diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp index d016553..46d32ba 100644 --- a/Project2-Stream-Compaction/src/main.cpp +++ b/Project2-Stream-Compaction/src/main.cpp @@ -11,14 +11,18 @@ #include #include #include +#include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 8; //1 << 8; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; +#define RADIX_CLASS_EXAMPLE 1 +#define RUN_RADIX 1 // turn off radix to collect data + int main(int argc, char* argv[]) { // Scan tests @@ -147,6 +151,51 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); +#if RUN_RADIX + printf("\n"); + printf("**********************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("**********************\n"); + + #if RADIX_CLASS_EXAMPLE + const int size = 8; + const int npot = 7; + int unsorted_array[] = { 0, 4, 1, 2, 5, 6, 7, 3 }; //hard coding array to class example + int unsorted_arrayNPOT[] = { 0, 4, 1, 2, 5, 6, 3 }; + #else + const int size = SIZE; + const int npot = NPOT; + int unsorted_array[size]; + int unsorted_arrayNPOT[npot]; + memcpy(unsorted_array, a, sizeof(int) * size); + memcpy(unsorted_arrayNPOT, a, sizeof(int) * NPOT); + #endif // CLASS_EXAMPLE + + int sorted_array[size]; + memcpy(sorted_array, unsorted_array, sizeof(int) * size); // copy and sort to compare + std::sort(sorted_array, sorted_array + size); + + int sorted_arrayNPOT[npot]; + memcpy(sorted_arrayNPOT, unsorted_arrayNPOT, sizeof(int) * npot); // copy and sort to compare + std::sort(sorted_arrayNPOT, sorted_arrayNPOT + npot); + + printArray(size, unsorted_array, true); + + zeroArray(size, c); + printDesc("radix sort, power-of-two"); + StreamCompaction::Radix::sort(size, c, unsorted_array); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(size, c, true); + printCmpResult(size, sorted_array, c); + + zeroArray(npot, c); + printDesc("radix sort, non-power-of-two"); + StreamCompaction::Radix::sort(npot, c, unsorted_arrayNPOT); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(npot, c, true); + printCmpResult(npot, sorted_arrayNPOT, c); +#endif + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt index cdbef77..fc37515 100644 --- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt +++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt @@ -9,9 +9,11 @@ set(SOURCE_FILES "efficient.cu" "thrust.h" "thrust.cu" + "radix.h" + "radix.cu" ) cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_61 ) diff --git a/Project2-Stream-Compaction/stream_compaction/common.cu b/Project2-Stream-Compaction/stream_compaction/common.cu index 2ed6d63..ccffc8c 100644 --- a/Project2-Stream-Compaction/stream_compaction/common.cu +++ b/Project2-Stream-Compaction/stream_compaction/common.cu @@ -23,7 +23,11 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + bools[index] = idata[index] != 0 ? 1 : 0; } /** @@ -32,7 +36,13 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + if (bools[index]) { + odata[indices[index]] = idata[index]; + } } } diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu index a2d3e6c..4079764 100644 --- a/Project2-Stream-Compaction/stream_compaction/cpu.cu +++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu @@ -12,6 +12,13 @@ namespace StreamCompaction { return timer; } + void scanHelper(int n, int *odata, const int *idata) { + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + } + /** * CPU scan (prefix sum). * For performance analysis, this is supposed to be a simple for loop. @@ -19,7 +26,7 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + scanHelper(n, odata, idata); timer().endCpuTimer(); } @@ -30,9 +37,14 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int k = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[k++] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return k; } /** @@ -42,9 +54,28 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int *map_array = new int[n]; + int *scanned = new int[n]; + + // mapping to boolean + for (int i = 0; i < n; i++) { + map_array[i] = idata[i] != 0 ? 1 : 0; + } + // scanning exclusively + scanHelper(n, scanned, map_array); + + //scatter results + int k = 0; + for (int i = 0; i < n; i++) { + if (map_array[i]) { + k++; + odata[scanned[i]] = idata[i]; + } + } + delete[n] map_array; + delete[n] scanned; timer().endCpuTimer(); - return -1; + return k; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu index 2db346e..2d6875b 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.cu +++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu @@ -3,6 +3,11 @@ #include "common.h" #include "efficient.h" + + +/*! Block size used for CUDA kernel launch. */ +#define blockSize 128 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -11,14 +16,80 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + + __global__ void kernUpSweep(int n, int p, int *idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + if (index % (2 * p) == 0) { + idata[index + 2 * p - 1] += idata[index + p - 1]; + } + + } + + __global__ void kernDownSweep(int n, int p, int *idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + if (index % (2 * p) == 0) { + int t = idata[index + p - 1]; + idata[index + p - 1] = idata[index + (2 * p) - 1]; + idata[index + (2 * p) - 1] += t; + } + } + + void workEfficientScan(int n, int *dev_idata, dim3 &threadsPerBlock, dim3 &fullBlocksPerGrid) { + //perform upsweep parallel reduction + for (int d = 0; d < ilog2ceil(n); d++) { + int p = 1 << d; + kernUpSweep << > > (n, p, dev_idata); + checkCUDAError("kernel kernUpSweep failed!"); + } + + //set root to 0 + cudaMemset(dev_idata + n - 1, 0, sizeof(int)); + + //perform down sweep as binary tree + for (int d = ilog2ceil(n)-1 ; d >= 0; d--) { + int p = 1 << d; + kernDownSweep << > > (n, p, dev_idata); + checkCUDAError("kernel kernDownSweep failed!"); + } + + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + //set up variable, allocate space on gpu, and copy over data + int npad = 1 << ilog2ceil(n); //pads adds padding if needed for arrays of not power of 2 length + int *dev_idata; + + cudaMalloc((void**)&dev_idata, npad * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("Memcpy idata failed!"); + + dim3 threadsPerBlock(blockSize); + dim3 fullBlocksPerGrid((npad + blockSize - 1) / blockSize); + timer().startGpuTimer(); - // TODO + + //call work efficient scan helper + workEfficientScan(npad, dev_idata, threadsPerBlock, fullBlocksPerGrid); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("Memcpy odata failed!"); + + cudaFree(dev_idata); } /** @@ -31,10 +102,65 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + //set up variable, allocate space on gpu, and copy over data + int npad = 1 << ilog2ceil(n); + int *dev_idata; + int *dev_indices; + int *dev_odata; + int *dev_bools; + + cudaMalloc((void**)&dev_idata, npad * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + + cudaMalloc((void**)&dev_odata, npad * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + + cudaMalloc((void**)&dev_indices, npad * sizeof(int)); + checkCUDAError("cudaMalloc dev_indices failed!"); + + cudaMalloc((void**)&dev_bools, npad * sizeof(int)); + checkCUDAError("cudaMalloc dev_bools failed!"); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("Memcpy idata failed!"); + + dim3 threadsPerBlock(blockSize); + dim3 fullBlocksPerGrid((npad + blockSize - 1) / blockSize); + timer().startGpuTimer(); - // TODO + + // map data to boolean + StreamCompaction::Common::kernMapToBoolean<< > > (npad, dev_bools, dev_idata); + checkCUDAError("Memcpy kernMapToBoolean failed!"); + + //copy to indices to call on workEfficientScan inplace + cudaMemcpy(dev_indices, dev_bools, sizeof(int) * npad, cudaMemcpyDeviceToDevice); + checkCUDAError("Memcpy odata failed!"); + + //call work efficient exclusive scan helper to edit dev_indices inplace + workEfficientScan(npad, dev_indices, threadsPerBlock, fullBlocksPerGrid); + + //scatter results from scan + StreamCompaction::Common::kernScatter << > > (npad, dev_odata, dev_idata, dev_bools, dev_indices); + checkCUDAError("Memcpy kernScatter failed!"); + timer().endGpuTimer(); - return -1; + + //copy back output + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("Memcpy odata failed!"); + + // find length of output array as final element of indices array + int *k = new int; + cudaMemcpy(k, dev_indices + npad - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Memcpy dev_indices failed!"); + + cudaFree(dev_idata); + cudaFree(dev_indices); + cudaFree(dev_odata); + cudaFree(dev_bools); + + return *k; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.h b/Project2-Stream-Compaction/stream_compaction/efficient.h index 803cb4f..1450345 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.h +++ b/Project2-Stream-Compaction/stream_compaction/efficient.h @@ -6,6 +6,8 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); + void workEfficientScan(int n, int *dev_idata, dim3 &threadsPerBlock, dim3 &fullBlocksPerGrid); + void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu index 4308876..0a146bc 100644 --- a/Project2-Stream-Compaction/stream_compaction/naive.cu +++ b/Project2-Stream-Compaction/stream_compaction/naive.cu @@ -3,6 +3,9 @@ #include "common.h" #include "naive.h" +/*! Block size used for CUDA kernel launch. */ +#define blockSize 128 + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -11,15 +14,74 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + __global__ void kernScan(int n, int p, int *odata, int *idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + if (index >= p) { + odata[index] = idata[index - p] + idata[index]; + } + else { + odata[index] = idata[index]; + } + + } + + __global__ void kernRightShift(int n, int *odata, int *idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + if (index == 0) { + odata[index] = 0; + } + odata[index + 1] = idata[index]; + + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO + //set up variable, allocate space on gpu, and copy over data + int *dev_idata; + int *dev_odata; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("Memcpy idata failed!"); + + dim3 threadsPerBlock(blockSize); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + timer().startGpuTimer(); + + //inclusive scanning + for (int d = 0; d < ilog2ceil(n); d++) { + int p = 1 << d; + kernScan<<>>(n, p, dev_odata, dev_idata); + checkCUDAError("kernel kernScan failed!"); + + std::swap(dev_idata, dev_odata); + } + + //right shift to convert to exclusive scan + kernRightShift << > > (n, dev_odata, dev_idata); + checkCUDAError("kernel kernRightShift failed!"); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("Memcpy odata failed!"); + + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/Project2-Stream-Compaction/stream_compaction/radix.cu b/Project2-Stream-Compaction/stream_compaction/radix.cu new file mode 100644 index 0000000..1f48d4d --- /dev/null +++ b/Project2-Stream-Compaction/stream_compaction/radix.cu @@ -0,0 +1,115 @@ +#include +#include +#include "common.h" +#include "naive.h" +#include "efficient.h" +#include +#include "radix.h" + +/*! Block size used for CUDA kernel launch. */ +#define blockSize 128 + +namespace StreamCompaction { + namespace Radix { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __global__ void kernFullAddress(int n, int totalFalses, int *indices, int *bools){ + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + indices[index] = bools[index] ? indices[index] : index - indices[index] + totalFalses; + } + + __global__ void kernBitMapToBoolean(int n, int bit, int *bools, const int *idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + bools[index] = idata[index] & bit ? 0 : 1; + } + + __global__ void kernFullScatter(int n, int *odata, const int *idata, const int *indices) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + odata[indices[index]] = idata[index]; + + } + + int findMax(int n, const int *idata) { + int mx = idata[0]; + for (int i = 1; i < n; i++) { + if (idata[i] > mx) { + mx = idata[i]; + } + } + return mx; + } + + void sort(int n, int *odata, const int *idata) { + //int mx = findMax(n, idata); + int kbit = ilog2ceil(n); + int npad = 1 << kbit; + + int *dev_idata, *dev_odata, *dev_bools, *dev_indices; + cudaMalloc((void**)&dev_idata, sizeof(int) * npad); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, sizeof(int) * npad); + checkCUDAError("cudaMalloc dev_odata failed!"); + cudaMalloc((void**)&dev_bools, sizeof(int) * npad); + checkCUDAError("cudaMalloc dev_bools failed!"); + cudaMalloc((void**)&dev_indices, sizeof(int) * npad); + checkCUDAError("cudaMalloc dev_indices failed!"); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("Memcpy idata failed!"); + + dim3 threadsPerBlock(blockSize); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + timer().startGpuTimer(); + for(int i = 0; i <= kbit; i++){ + kernBitMapToBoolean<< > > (n, (1 << i), dev_bools, dev_idata); + checkCUDAError("Memcpy kernMapToBoolean failed!"); + + cudaMemcpy(dev_indices, dev_bools, sizeof(int) * n, cudaMemcpyDeviceToDevice); + checkCUDAError("Memcpy dev_bools to dev_indices failed!"); + StreamCompaction::Efficient::workEfficientScan(npad, dev_indices, threadsPerBlock, fullBlocksPerGrid); + checkCUDAError("Efficient scan failed!"); + + int totalFalses, lastBool; + cudaMemcpy(&totalFalses, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Memcpy totalFalses failed!"); + cudaMemcpy(&lastBool, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Memcpy lastBool failed!"); + + totalFalses += lastBool; + kernFullAddress << > > (n, totalFalses, dev_indices, dev_bools); + checkCUDAError("kernFullAddress failed!"); + + kernFullScatter << > > (n, dev_odata, dev_idata, dev_indices); + checkCUDAError("kernScatter failed!"); + + cudaMemcpy(dev_idata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToDevice); + checkCUDAError("Memcpy dev_odata to dev_idata failed!"); + } + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); + + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_indices); + } + } +} diff --git a/Project2-Stream-Compaction/stream_compaction/radix.h b/Project2-Stream-Compaction/stream_compaction/radix.h new file mode 100644 index 0000000..aa489d9 --- /dev/null +++ b/Project2-Stream-Compaction/stream_compaction/radix.h @@ -0,0 +1,11 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace Radix { + StreamCompaction::Common::PerformanceTimer& timer(); + + void sort(int n, int *odata, const int *idata); + } +} diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu index 1def45e..d6835c8 100644 --- a/Project2-Stream-Compaction/stream_compaction/thrust.cu +++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu @@ -22,6 +22,8 @@ namespace StreamCompaction { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(idata, idata + n, odata); + timer().endGpuTimer(); } } diff --git a/README.md b/README.md index 3a0b2fe..2654b00 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,11 @@ CUDA Number Algorithms **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Klayton Wittler + * [LinkedIn](https://www.linkedin.com/in/klayton-wittler/) +* Tested on: Windows 10 Pro, i7-7700K @ 4.20GHz 16.0GB, GTX 1070 8.192GB (my PC) -### (TODO: Your README) +# [Stream Compaction](/Project2-Stream-Compaction) -Link to the readmes of the other two subprojects. - -Add anything else you think is relevant up to this point. -(Remember, this is public, so don't put anything here that you don't want to share with the world.) +# [Character Recognition](/Project2-Character-Recognition)