diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md
index 4503fac..429cd93 100644
--- a/Project2-Character-Recognition/README.md
+++ b/Project2-Character-Recognition/README.md
@@ -2,13 +2,140 @@ CUDA Character Recognition
 ======================
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
+* Jiangping Xu
+  * [LinkedIn](https://www.linkedin.com/in/jiangping-xu-365b19134/)
+* Tested on: Windows 10, i7-4700MQ @ 2.40GHz 8GB, GT 755M 6100MB (personal laptop)
+_________________________________________________________________________
+[Introduction](#introduction) - [Performance](#performance) - [Result](#result)
+_________________________________________________________________________
+## Introduction
+In this project I implement a GPU-accelerated neural network framework with the following features:
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* the number of hidden layers and the number of neurons in each layer can be set freely
+* batch training is supported
+* forward and back propagation use a tiled, shared-memory GPU matrix multiply
+* weights are initialized with Xavier uniform initialization
-### (TODO: Your README)
+One subtlety of the tiled GPU matrix multiply (see the sketch below): launching one thread per element of the output matrix is not enough. For example, to compute M1 * M2 = P, where M1 is a 3 x 4 matrix and M2 is a 4 x 3 matrix, we need 12 threads to load the input tiles into shared memory, even though P has only 9 elements.
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
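+
+A minimal sketch of the idea (illustrative only -- the kernels in `mlp.cu` follow the same pattern; `tiledMatMul` and its parameter names are made up for this example): every thread in the block helps load tiles, and threads outside the output matrix are masked off only after the loads.
+
+```
+#define TILE_WIDTH 8
+
+// C = A (rows x inner) times B (inner x cols), all row-major.
+__global__ void tiledMatMul(int rows, int inner, int cols,
+                            const float *A, const float *B, float *C) {
+    __shared__ float sA[TILE_WIDTH][TILE_WIDTH];
+    __shared__ float sB[TILE_WIDTH][TILE_WIDTH];
+    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
+    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
+    float acc = 0.0f;
+    for (int m = 0; m < (inner + TILE_WIDTH - 1) / TILE_WIDTH; m++) {
+        int a = m * TILE_WIDTH + threadIdx.x;   // column index into A
+        int b = m * TILE_WIDTH + threadIdx.y;   // row index into B
+        // Every thread loads one element of each tile (zero-padded at the
+        // edges) -- including threads that own no element of C.
+        sA[threadIdx.y][threadIdx.x] = (row < rows && a < inner) ? A[row * inner + a] : 0.0f;
+        sB[threadIdx.y][threadIdx.x] = (b < inner && col < cols) ? B[b * cols + col] : 0.0f;
+        __syncthreads();
+        for (int k = 0; k < TILE_WIDTH; ++k)
+            acc += sA[threadIdx.y][k] * sB[k][threadIdx.x];
+        __syncthreads();
+    }
+    // Mask off out-of-range threads only now: returning earlier would skip
+    // the tile loads and the __syncthreads() they participate in.
+    if (row < rows && col < cols)
+        C[row * cols + col] = acc;
+}
+```
+
+Launched with a full TILE_WIDTH x TILE_WIDTH block per output tile, every thread in the block participates in the loads -- not just the threads that own output elements, which is exactly the 12-versus-9 point above.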

+## Performance
+
+![](img/Timeused1.png)

+Computation time goes up as the number of layers increases (each hidden layer has 30 neurons).

+![](img/Timeused2.png)

+Changing the number of neurons per layer has no obvious effect on the time cost; the tiled GPU matrix multiply may simply not be sensitive to matrix size at these scales.

+![](img/Timeused3.png)

+
+## Result
+learning rate = 0.1, batch size = 52
+
+A 5-layer network with a structure of 10201 - 30 - 10 - 30 - 52:
+```
+Epoch: 0 Cost: 6.494277
+Epoch: 1 Cost: 6.283911
+Epoch: 2 Cost: 6.080731
+Epoch: 3 Cost: 5.884087
+Epoch: 4 Cost: 5.693419
+Epoch: 5 Cost: 5.508251
+Epoch: 6 Cost: 5.328183
+Epoch: 7 Cost: 5.152885
+Epoch: 8 Cost: 4.982097
+Epoch: 9 Cost: 4.815609
+Epoch: 10 Cost: 4.653265
+Epoch: 11 Cost: 4.494954
+Epoch: 12 Cost: 4.340601
+Epoch: 13 Cost: 4.190164
+Epoch: 14 Cost: 4.043626
+Epoch: 15 Cost: 3.900990
+Epoch: 16 Cost: 3.762273
+Epoch: 17 Cost: 3.627504
+Epoch: 18 Cost: 3.496716
+Epoch: 19 Cost: 3.369947
+Epoch: 20 Cost: 3.247230
+Epoch: 21 Cost: 3.128597
+Epoch: 22 Cost: 3.014070
+Epoch: 23 Cost: 2.903666
+Epoch: 24 Cost: 2.797389
+Epoch: 25 Cost: 2.695230
+Epoch: 26 Cost: 2.597171
+Epoch: 27 Cost: 2.503177
+Epoch: 28 Cost: 2.413202
+Epoch: 29 Cost: 2.327186
+Epoch: 30 Cost: 2.245058
+Epoch: 31 Cost: 2.166734
+Epoch: 32 Cost: 2.092118
+Epoch: 33 Cost: 2.021107
+Epoch: 34 Cost: 1.953589
+Epoch: 35 Cost: 1.889446
+Epoch: 36 Cost: 1.828552
+Epoch: 37 Cost: 1.770779
+Epoch: 38 Cost: 1.715999
+Epoch: 39 Cost: 1.664077
+Epoch: 40 Cost: 1.614883
+Epoch: 41 Cost: 1.568285
+Epoch: 42 Cost: 1.524154
+Epoch: 43 Cost: 1.482364
+Epoch: 44 Cost: 1.442791
+Epoch: 45 Cost: 1.405317
+Epoch: 46 Cost: 1.369825
+Epoch: 47 Cost: 1.336205
+Epoch: 48 Cost: 1.304351
+Epoch: 49 Cost: 1.274163
+Epoch: 50 Cost: 1.245543
+Epoch: 51 Cost: 1.218400
+Epoch: 52 Cost: 1.192649
+Epoch: 53 Cost: 1.168206
+Epoch: 54 Cost: 1.144996
+Epoch: 55 Cost: 1.122945
+Epoch: 56 Cost: 1.101985
+Epoch: 57 Cost: 1.082053
+Epoch: 58 Cost: 1.063086
+Epoch: 59 Cost: 1.045029
+Epoch: 60 Cost: 1.027830
+Epoch: 61 Cost: 1.011437
+Epoch: 62 Cost: 0.995804
+Epoch: 63 Cost: 0.980887
+Epoch: 64 Cost: 0.966647
+Epoch: 65 Cost: 0.953043
+Epoch: 66 Cost: 0.940041
+Epoch: 67 Cost: 0.927607
+Epoch: 68 Cost: 0.915708
+Epoch: 69 Cost: 0.904317
+Epoch: 70 Cost: 0.893405
+Epoch: 71 Cost: 0.882945
+Epoch: 72 Cost: 0.872915
+Epoch: 73 Cost: 0.863291
+Epoch: 74 Cost: 0.854052
+Epoch: 75 Cost: 0.845177
+Epoch: 76 Cost: 0.836649
+Epoch: 77 Cost: 0.828448
+Epoch: 78 Cost: 0.820560
+Epoch: 79 Cost: 0.812967
+Epoch: 80 Cost: 0.805656
+Epoch: 81 Cost: 0.798613
+Epoch: 82 Cost: 0.791824
+Epoch: 83 Cost: 0.785277
+Epoch: 84 Cost: 0.778962
+Epoch: 85 Cost: 0.772866
+Epoch: 86 Cost: 0.766981
+Epoch: 87 Cost: 0.761295
+Epoch: 88 Cost: 0.755800
+Epoch: 89 Cost: 0.750488
+Epoch: 90 Cost: 0.745350
+Epoch: 91 Cost: 0.740378
+Epoch: 92 Cost: 0.735566
+Epoch: 93 Cost: 0.730906
+Epoch: 94 Cost: 0.726391
+Epoch: 95 Cost: 0.722017
+Epoch: 96 Cost: 0.717776
+Epoch: 97 Cost: 0.713663
+Epoch: 98 Cost: 0.709673
+Epoch: 99 Cost: 0.705801
+```
\ No newline at end of file
diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt
index 7446175..01edd01 100644
--- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt
+++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt
@@ -7,5 +7,5 @@ set(SOURCE_FILES
 
 cuda_add_library(character_recognition
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_30
     )
diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu
index 5a3ed7f..476c5db 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.cu
+++ b/Project2-Character-Recognition/character_recognition/mlp.cu
@@ -2,6 +2,14 @@
 #include <cuda_runtime.h>
 #include "common.h"
#include "mlp.h" +#include +#include +#include +#include + +constexpr int blockSize = 64; +constexpr int maxUnit = 100; +constexpr int TILE_WIDTH = 8; namespace CharacterRecognition { using Common::PerformanceTimer; @@ -10,18 +18,379 @@ namespace CharacterRecognition { static PerformanceTimer timer; return timer; } - - // TODO: __global__ - - /** - * Example of use case (follow how you did it in stream compaction) - */ - /*void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - */ + + __device__ void matrixMul(int size1, int size2, int size3, float* matrix1, float* matrix2, float *output) { + __shared__ float sM[TILE_WIDTH][TILE_WIDTH]; + __shared__ float sN[TILE_WIDTH][TILE_WIDTH]; + + int bx = blockIdx.x; int by = blockIdx.y; + int tx = threadIdx.x; int ty = threadIdx.y; + + int col = bx * TILE_WIDTH + tx; + int row = by * TILE_WIDTH + ty; + + // Initialize accumulator to 0 + float pValue = 0; + + // Multiply and add + for (int m = 0; m < (size2 + TILE_WIDTH - 1) / TILE_WIDTH; m++) { + int tmpCR = m * TILE_WIDTH + tx; + sM[ty][tx] = tmpCR < size2 ? matrix1[row * size2 + tmpCR] : 0; + tmpCR = m * TILE_WIDTH + ty; + sN[ty][tx] = tmpCR < size2 ? matrix2[col + tmpCR * size3] : 0; + __syncthreads(); + + for (int k = 0; k < TILE_WIDTH; ++k) + pValue += sM[ty][k] * sN[k][tx]; + __syncthreads(); + } + if (col >= size3 || row >= size1) + return; + output[0] = pValue; + } + + __global__ void kernSubtract(int n, float* output, float* error) { + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < n) { + error[index] = output[index] - error[index]; + } + } + + __global__ void kernSquare(int n, float *input, float *output) { + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < n) { + float tmp = input[index]; + output[index] = tmp * tmp; + } + } + + __global__ void kernDotMul(int n, float *m1, float *m2, float *output) { + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < n) { + output[index] = m1[index] * m2[index]; + } + } + + __global__ void kernSigGrad(int n, float *m, float *output) { + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < n) { + float tmp = m[index]; + output[index] = tmp *(1.0f - tmp); + } + } + + __global__ void kernCalGrad(int curLSize, int batchSize, int preLSize, float *delta, float *X, float *grad) { + //grad = delta * X' / batchSize; + __shared__ float sM[TILE_WIDTH][TILE_WIDTH]; + __shared__ float sN[TILE_WIDTH][TILE_WIDTH]; + + int bx = blockIdx.x; int by = blockIdx.y; + int tx = threadIdx.x; int ty = threadIdx.y; + + int col = bx * TILE_WIDTH + tx; + int row = by * TILE_WIDTH + ty; + + // Initialize accumulator to 0 + float pValue = 0; + + // Multiply and add + for (int m = 0; m < (batchSize + TILE_WIDTH - 1) / TILE_WIDTH; m++) { + int tmpCR = m * TILE_WIDTH + tx; + sM[ty][tx] = tmpCR < batchSize ? delta[row * batchSize + tmpCR] : 0; + tmpCR = m * TILE_WIDTH + ty; + sN[ty][tx] = tmpCR < batchSize ? 
+            __syncthreads();
+
+            for (int k = 0; k < TILE_WIDTH; ++k)
+                pValue += sM[ty][k] * sN[k][tx];
+            __syncthreads();
+        }
+        if (col >= preLSize || row >= curLSize)
+            return;
+        grad[row * preLSize + col] = pValue / batchSize;
+    }
+
+    __global__ void kernBpDelta(int preLSize, int curLSize, int batchSize, float *weight, float *delta, float *X, float *newDelta) {
+        __shared__ float sM[TILE_WIDTH][TILE_WIDTH];
+        __shared__ float sN[TILE_WIDTH][TILE_WIDTH];
+
+        int bx = blockIdx.x; int by = blockIdx.y;
+        int tx = threadIdx.x; int ty = threadIdx.y;
+
+        int col = bx * TILE_WIDTH + tx;
+        int row = by * TILE_WIDTH + ty;
+
+        // Initialize accumulator to 0
+        float pValue = 0;
+
+        // Multiply and add
+        for (int m = 0; m < (curLSize + TILE_WIDTH - 1) / TILE_WIDTH; m++) {
+            int tmpCR = m * TILE_WIDTH + tx;
+            sM[ty][tx] = tmpCR < curLSize ? weight[row + tmpCR * preLSize] : 0;
+            tmpCR = m * TILE_WIDTH + ty;
+            sN[ty][tx] = tmpCR < curLSize ? delta[col + tmpCR * batchSize] : 0;
+            __syncthreads();
+
+            for (int k = 0; k < TILE_WIDTH; ++k)
+                pValue += sM[ty][k] * sN[k][tx];
+            __syncthreads();
+        }
+        if (col >= batchSize || row >= preLSize)
+            return;
+        float tmp = X[row * batchSize + col];
+        newDelta[row * batchSize + col] = pValue * tmp * (1.0f - tmp);
+    }
+
+    __global__ void kernForwardOne(int curLSize, int preLSize, int batchSize, float *weights, float *input, float *output) {
+        // input   : the input matrix, preLSize * batchSize
+        // weights : the weight matrix, curLSize * preLSize, followed by curLSize bias terms
+        // output  : the output matrix, curLSize * batchSize
+
+        __shared__ float sM[TILE_WIDTH][TILE_WIDTH];
+        __shared__ float sN[TILE_WIDTH][TILE_WIDTH];
+
+        int bx = blockIdx.x; int by = blockIdx.y;
+        int tx = threadIdx.x; int ty = threadIdx.y;
+
+        int col = bx * TILE_WIDTH + tx;
+        int row = by * TILE_WIDTH + ty;
+
+        // Initialize accumulator to 0
+        float pValue = 0;
+
+        // Multiply and add
+        for (int m = 0; m < (preLSize + TILE_WIDTH - 1) / TILE_WIDTH; m++) {
+            int tmpCR = m * TILE_WIDTH + tx;
+            sM[ty][tx] = tmpCR < preLSize ? weights[row * preLSize + tmpCR] : 0;
+            tmpCR = m * TILE_WIDTH + ty;
+            sN[ty][tx] = tmpCR < preLSize ? input[col + tmpCR * batchSize] : 0;
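+            // Threads that own no element of the output still execute these
+            // loads (zero-padded out of range) and must reach the
+            // __syncthreads() below; they are masked off only after the loop.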
+            __syncthreads();
+
+            for (int k = 0; k < TILE_WIDTH; ++k)
+                pValue += sM[ty][k] * sN[k][tx];
+            __syncthreads();
+        }
+        if (col >= batchSize || row >= curLSize)
+            return;
+        // fused bias add and sigmoid activation
+        output[row * batchSize + col] = 1.0f / (1.0f + expf(-pValue - weights[row + curLSize * preLSize]));
+    }
+
+    __global__ void kernUpdateW(int n, float *weight, float *weightgrad, float alpha) {
+        int index = blockDim.x * blockIdx.x + threadIdx.x;
+        if (index >= n)
+            return;
+        weight[index] -= weightgrad[index] * alpha;
+    }
+
+    __global__ void initStates(curandState *state, unsigned long seed) {
+        int idx = blockIdx.x * blockDim.x + threadIdx.x;
+        curand_init(seed, idx, 0, &state[idx]);
+    }
+
+    __global__ void kernIniW(int n, int n2, float r, float *weights, curandState *state) {
+        int index = blockDim.x * blockIdx.x + threadIdx.x;
+        if (index >= n2) {
+            return;
+        }
+        if (index >= n) {
+            weights[index] = 0;
+            return;
+        }
+        float rand = curand_uniform(&state[index]);
+        weights[index] = (rand * 2.0f - 1.0f) * r;
+    }
+
+    void initializeW(float *weights, int *layerSizes, int layerNum) {
+        // Xavier Uniform Initialization
+        int curIdx = 0;
+        for (int i = 1; i < layerNum; i++) {
+            int n = layerSizes[i] * layerSizes[i - 1];
+            /*
+            curandState* dev_states;
+            cudaMalloc(&dev_states, n * sizeof(curandState));
+            checkCUDAError("cudaMalloc dev_states failed!");
+
+            srand(time(0));
+            unsigned long seed = rand();
+            dim3 GridSize2((n + blockSize - 1) / blockSize);
+            initStates<<<GridSize2, blockSize>>>(dev_states, seed);
+
+            float* dev_weights;
+            cudaMalloc((void**)&dev_weights, sizeof(float) * (n + layerSizes[i]));
+            checkCUDAError("cudaMalloc dev_weight failed!");
+            */
+            float r = std::sqrt(6.0f / (layerSizes[i] + layerSizes[i - 1]));
+            /*
+            dim3 GridSize((n + layerSizes[i] + blockSize - 1) / blockSize);
+            kernIniW<<<GridSize, blockSize>>>(n, n + layerSizes[i], r, dev_weights, dev_states);
+            checkCUDAError("weight initialization failed!");
+
+            printf("!!\n");
+            cudaMemcpy(weights + curIdx, dev_weights, sizeof(float) * (n + layerSizes[i]), cudaMemcpyDeviceToHost);
+            checkCUDAError("copy to weights failed!");
+            curIdx += n + layerSizes[i];
+            printf("!\n");
+
+            cudaFree(dev_states);
+            cudaFree(dev_weights);
+            */
+            srand(time(0));
+            for (int j = curIdx; j < n + curIdx; j++) {
+                weights[j] = (2 * ((double)rand() / (RAND_MAX)) - 1) * r;
+            }
+            curIdx += n;
+            for (int j = curIdx; j < layerSizes[i] + curIdx; j++) {
+                weights[j] = 0;
+            }
+            curIdx += layerSizes[i];
+        }
+    }
 
     // TODO: implement required elements for MLP sections 1 and 2 here
+    float computeCostGrad(int *layerSizes, int layerNum, int batchSize, float *weights, float *grad, float *data, float *label) {
+        float *dev_weight;
+        cudaMalloc((void**)&dev_weight, sizeof(float) * maxUnit * (std::max(layerSizes[0], maxUnit) + 1));
+        checkCUDAError("cudaMalloc dev_weight failed!");
+
+        int totalUnit = 0;
+        for (int i = 0; i < layerNum; i++) {
+            totalUnit += layerSizes[i];
+        }
+
+        float *dev_buffer;
+        cudaMalloc((void**)&dev_buffer, sizeof(float) * batchSize * totalUnit);
+        checkCUDAError("cudaMalloc dev_buffer failed!");
+
+        int *indices = new int[layerNum];
+        indices[0] = 0;
+        int *wIndices = new int[layerNum];
+        wIndices[0] = 0;
+
+        int curLSize, preLSize;
+        curLSize = layerSizes[0];
+        cudaMemcpy(dev_buffer, data, curLSize * batchSize * sizeof(float), cudaMemcpyHostToDevice);
+
+        // forward propagation
+        dim3 TileSize(TILE_WIDTH, TILE_WIDTH, 1);
+        for (int i = 1; i < layerNum; i++) {
+            preLSize = curLSize;
+            curLSize = layerSizes[i];
+            indices[i] = indices[i - 1] + preLSize * batchSize;
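+            // indices[i] is the offset of layer i's activations inside dev_buffer;
+            // wIndices[i], set just below, is the offset of layer i's weights and
+            // biases inside the flat weight array.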
+            wIndices[i] = (preLSize + 1) * curLSize;
+            cudaMemcpy(dev_weight, weights + wIndices[i - 1], sizeof(float) * wIndices[i], cudaMemcpyHostToDevice);
+            checkCUDAError("copy to dev_weight failed!");
+            wIndices[i] += wIndices[i - 1];
+            dim3 GridSize((batchSize + TILE_WIDTH - 1) / TILE_WIDTH, (curLSize + TILE_WIDTH - 1) / TILE_WIDTH, 1);
+            kernForwardOne<<<GridSize, TileSize>>>(curLSize, preLSize, batchSize, dev_weight, dev_buffer + indices[i - 1], dev_buffer + indices[i]);
+        }
+
+        float *dev_error;
+        int numOutput = batchSize * layerSizes[layerNum - 1];
+        cudaMalloc((void**)&dev_error, sizeof(float) * numOutput);
+        checkCUDAError("cudaMalloc dev_error failed!");
+        cudaMemcpy(dev_error, label, sizeof(float) * numOutput, cudaMemcpyHostToDevice);
+        checkCUDAError("set dev_error(label) failed!");
+
+        dim3 GridSize2((numOutput + blockSize - 1) / blockSize);
+        kernSubtract<<<GridSize2, blockSize>>>(numOutput, dev_buffer + indices[layerNum - 1], dev_error);
+
+        float *dev_sqrErr;
+        cudaMalloc((void**)&dev_sqrErr, sizeof(float) * numOutput);
+        checkCUDAError("cudaMalloc dev_sqrErr failed!");
+        kernSquare<<<GridSize2, blockSize>>>(numOutput, dev_error, dev_sqrErr);
+
+        thrust::device_ptr<float> thrust_dev_sqrErr(dev_sqrErr);
+        float cost = thrust::reduce(thrust_dev_sqrErr, thrust_dev_sqrErr + numOutput, (float)0, thrust::plus<float>());
+        cost *= 0.5 / batchSize;
+
+        cudaFree(dev_sqrErr);
+
+        /////// Calculate Grad ///////
+
+        float *dev_grad;
+        cudaMalloc((void**)&dev_grad, sizeof(float) * maxUnit * (std::max(layerSizes[0], maxUnit) + 1));
+        checkCUDAError("cudaMalloc dev_grad failed!");
+
+        float *dev_delta;
+        cudaMalloc((void**)&dev_delta, sizeof(float) * maxUnit * batchSize);
+        checkCUDAError("cudaMalloc dev_delta failed!");
+
+        float *dev_deltaBuffer;
+        cudaMalloc((void**)&dev_deltaBuffer, sizeof(float) * maxUnit * batchSize);
+        checkCUDAError("cudaMalloc dev_deltaBuffer failed!");
+
+        kernSigGrad<<<GridSize2, blockSize>>>(numOutput, dev_buffer + indices[layerNum - 1], dev_delta);
+        checkCUDAError("kernSigGrad failed!");
+
+        kernDotMul<<<GridSize2, blockSize>>>(numOutput, dev_error, dev_delta, dev_delta);
+        checkCUDAError("kernDotMul failed!");
+
+        for (int i = layerNum - 1; i >= 1; i--) {
+            curLSize = layerSizes[i];
+            preLSize = layerSizes[i - 1];
+            // compute Wij grad
+            dim3 GridSize3((preLSize + TILE_WIDTH - 1) / TILE_WIDTH, (curLSize + TILE_WIDTH - 1) / TILE_WIDTH, 1);
+            kernCalGrad<<<GridSize3, TileSize>>>(curLSize, batchSize, preLSize, dev_delta, dev_buffer + indices[i - 1], dev_grad);
+            checkCUDAError("kernCalGrad failed!");
+
+            cudaMemcpy(grad + wIndices[i - 1], dev_grad, sizeof(float) * (wIndices[i] - wIndices[i - 1] - curLSize), cudaMemcpyDeviceToHost);
+            checkCUDAError("copy dev_grad to grad failed!");
+
+            // compute bias term grad
+            float *start = dev_delta;
+            for (int j = 0; j < curLSize; j++) {
+                thrust::device_ptr<float> thrust_dev_delta(start);
+                grad[wIndices[i] - curLSize + j] = thrust::reduce(thrust_dev_delta, thrust_dev_delta + batchSize, (float)0, thrust::plus<float>()) / batchSize;
+                start += batchSize;
+            }
+
+            cudaMemcpy(dev_weight, weights + wIndices[i - 1], sizeof(float) * (wIndices[i] - wIndices[i - 1] - curLSize), cudaMemcpyHostToDevice);
+            checkCUDAError("copy to dev_weight failed!");
+            dim3 GridSize4((batchSize + TILE_WIDTH - 1) / TILE_WIDTH, (preLSize + TILE_WIDTH - 1) / TILE_WIDTH, 1);
+            kernBpDelta<<<GridSize4, TileSize>>>(preLSize, curLSize, batchSize, dev_weight, dev_delta, dev_buffer + indices[i - 1], dev_deltaBuffer);
+            checkCUDAError("delta bp failed!");
+            float *tmp = dev_delta;
+            dev_delta = dev_deltaBuffer;
+            dev_deltaBuffer = tmp;
+        }
+
+        cudaFree(dev_delta);
+        cudaFree(dev_error);
+        cudaFree(dev_deltaBuffer);
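+        // dev_delta and dev_deltaBuffer were swapped (ping-ponged) once per
+        // layer in the loop above, so the two frees just above cover both
+        // scratch buffers no matter which pointer held the final deltas.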
+        cudaFree(dev_grad);
+        cudaFree(dev_weight);
+        cudaFree(dev_buffer);
+
+        delete[] indices;
+        delete[] wIndices;
+
+        return cost;
+    }
+
+    void updateWeights(int n, float *weight, float *weightgrad, float alpha) {
+        float *dev_weights, *dev_weightgrad;
+        cudaMalloc((void**)&dev_weights, n * sizeof(float));
+        checkCUDAError("malloc dev_weight failed!");
+        cudaMalloc((void**)&dev_weightgrad, n * sizeof(float));
+        checkCUDAError("malloc dev_weightgrad failed!");
+
+        cudaMemcpy(dev_weights, weight, sizeof(float) * n, cudaMemcpyHostToDevice);
+        checkCUDAError("copy to dev_weights failed!");
+        cudaMemcpy(dev_weightgrad, weightgrad, sizeof(float) * n, cudaMemcpyHostToDevice);
+        checkCUDAError("copy to dev_weightgrad failed!");
+
+        dim3 GridSize((n + blockSize - 1) / blockSize);
+        kernUpdateW<<<GridSize, blockSize>>>(n, dev_weights, dev_weightgrad, alpha);
+
+        cudaMemcpy(weight, dev_weights, sizeof(float) * n, cudaMemcpyDeviceToHost);
+        checkCUDAError("copy to weight failed!");
+
+        cudaFree(dev_weightgrad);
+        cudaFree(dev_weights);
+    }
 }
diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h
index 2096228..5b55845 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.h
+++ b/Project2-Character-Recognition/character_recognition/mlp.h
@@ -6,4 +6,7 @@ namespace CharacterRecognition {
     Common::PerformanceTimer& timer();
 
     // TODO: implement required elements for MLP sections 1 and 2 here
+    void initializeW(float* weights, int *layerSizes, int layerNum);
+    float computeCostGrad(int *layerSizes, int layerNum, int batchSize, float *weights, float *grad, float *data, float *label);
+    void updateWeights(int n, float *weight, float *weightgrad, float alpha);
 }
diff --git a/Project2-Character-Recognition/data-set/01info.txt b/Project2-Character-Recognition/data-set/1info.txt
similarity index 100%
rename from Project2-Character-Recognition/data-set/01info.txt
rename to Project2-Character-Recognition/data-set/1info.txt
diff --git a/Project2-Character-Recognition/data-set/02info.txt b/Project2-Character-Recognition/data-set/2info.txt
similarity index 100%
rename from Project2-Character-Recognition/data-set/02info.txt
rename to Project2-Character-Recognition/data-set/2info.txt
diff --git a/Project2-Character-Recognition/data-set/03info.txt b/Project2-Character-Recognition/data-set/3info.txt
similarity index 100%
rename from Project2-Character-Recognition/data-set/03info.txt
rename to Project2-Character-Recognition/data-set/3info.txt
diff --git a/Project2-Character-Recognition/data-set/04info.txt b/Project2-Character-Recognition/data-set/4info.txt
similarity index 100%
rename from Project2-Character-Recognition/data-set/04info.txt
rename to Project2-Character-Recognition/data-set/4info.txt
diff --git a/Project2-Character-Recognition/data-set/05info.txt b/Project2-Character-Recognition/data-set/5info.txt
similarity index 100%
rename from Project2-Character-Recognition/data-set/05info.txt
rename to Project2-Character-Recognition/data-set/5info.txt
diff --git a/Project2-Character-Recognition/data-set/06info.txt b/Project2-Character-Recognition/data-set/6info.txt
similarity index 100%
rename from Project2-Character-Recognition/data-set/06info.txt
rename to Project2-Character-Recognition/data-set/6info.txt
diff --git a/Project2-Character-Recognition/data-set/07info.txt b/Project2-Character-Recognition/data-set/7info.txt
similarity index 100%
rename from Project2-Character-Recognition/data-set/07info.txt
rename to
Project2-Character-Recognition/data-set/7info.txt diff --git a/Project2-Character-Recognition/data-set/08info.txt b/Project2-Character-Recognition/data-set/8info.txt similarity index 100% rename from Project2-Character-Recognition/data-set/08info.txt rename to Project2-Character-Recognition/data-set/8info.txt diff --git a/Project2-Character-Recognition/data-set/09info.txt b/Project2-Character-Recognition/data-set/9info.txt similarity index 100% rename from Project2-Character-Recognition/data-set/09info.txt rename to Project2-Character-Recognition/data-set/9info.txt diff --git a/Project2-Character-Recognition/img/Timeused1.png b/Project2-Character-Recognition/img/Timeused1.png new file mode 100644 index 0000000..062641b Binary files /dev/null and b/Project2-Character-Recognition/img/Timeused1.png differ diff --git a/Project2-Character-Recognition/img/Timeused2.png b/Project2-Character-Recognition/img/Timeused2.png new file mode 100644 index 0000000..e27d495 Binary files /dev/null and b/Project2-Character-Recognition/img/Timeused2.png differ diff --git a/Project2-Character-Recognition/img/Timeused3.png b/Project2-Character-Recognition/img/Timeused3.png new file mode 100644 index 0000000..ebf6f92 Binary files /dev/null and b/Project2-Character-Recognition/img/Timeused3.png differ diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 11dd534..d29823b 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -10,143 +10,122 @@ #include #include #include "testing_helpers.hpp" +#include +#include +using namespace std; + +const float alpha = 0.1; +const int batchSize = 52; + +void GotoLine(std::ifstream& file, unsigned int num) { + file.seekg(std::ios::beg); + for (int i = 0; i < num - 1; ++i) { + file.ignore(std::numeric_limits::max(), '\n'); + } + return; +} + +using time_point_t = std::chrono::high_resolution_clock::time_point; +time_point_t time_start_cpu; +time_point_t time_end_cpu; + +bool cpu_timer_started = false; -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; +float prev_elapsed_time_cpu_milliseconds = 0.f; + +void startCpuTimer() +{ + if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } + cpu_timer_started = true; + + time_start_cpu = std::chrono::high_resolution_clock::now(); +} + +void endCpuTimer() +{ + time_end_cpu = std::chrono::high_resolution_clock::now(); + + if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } + + std::chrono::duration duro = time_end_cpu - time_start_cpu; + prev_elapsed_time_cpu_milliseconds = + static_cast(duro.count()); + + cpu_timer_started = false; +} int main(int argc, char* argv[]) { - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. 
- zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
-    zeroArray(SIZE, b);
-    printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedCount = count;
-    printArray(count, b, true);
-    printCmpLenResult(count, expectedCount, b, b);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact without scan, non-power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedNPOT = count;
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    system("pause"); // stop Win32 console from closing on exit
-    delete[] a;
-    delete[] b;
-    delete[] c;
+
+    float *data = new float[10201 * 52];
+    float *label = new float[52 * 52];
+
+    for (int i = 1; i < 53; i++) {
+        string filename = to_string(i) + "info.txt";
+        ifstream file(filename);
+        string line;
+        if (file.is_open())
+        {
+            GotoLine(file, 3);
+            for (int j = 0; j < 10201; j++) {
+                file >> data[i - 1 + j * 52];
+            }
+            file.close();
+        }
+    }
+    for (int i = 0; i < 52; i++) {
+        for (int j = 0; j < 52; j++) {
+            if (i != j)
+                label[i * 52 + j] = 0;
+            else
+                label[i * 52 + j] = 1;
+        }
+    }
+
+    // normalization
+    for (int i = 0; i < 52; i++) {
+        float avg = 0;
+        float sigma = 0;
+        for (int j = 0; j < 10201; j++) {
+            avg += data[i + j * 52];
+        }
+        avg /= 10201.0f;
+        for (int j = 0; j < 10201; j++) {
+            sigma += (data[i + j * 52] - avg) * (data[i + j * 52] - avg);
+        }
+        sigma = std::sqrt(sigma / 10201.0f);
+        for (int j = 0; j < 10201; j++) {
+            data[i + j * 52] = (data[i + j * 52] - avg) / (sigma + 0.000001f);
+        }
+    }
+
+    int layerNum = 3;
+    int *layerSizes = new int[layerNum];
+    layerSizes[0] = 10201;
+    layerSizes[1] = 70;
+    layerSizes[2] = 52;
+
+    int totalWNum = 0;
+    for (int i = 1; i < layerNum; i++) {
+        totalWNum += layerSizes[i] * (1 + layerSizes[i - 1]);
+    }
+    float *weights = new float[totalWNum];
+    float *grad = new float[totalWNum];
+
+    CharacterRecognition::initializeW(weights, layerSizes, layerNum);
+
+    startCpuTimer();
+    float cost;
+    for (int i = 0; i < 10; i++) {
+        cost = CharacterRecognition::computeCostGrad(layerSizes, layerNum, batchSize, weights, grad, data, label);
+        CharacterRecognition::updateWeights(totalWNum, weights, grad, alpha);
+        printf("Epoch: %d Cost: %f \n", i, cost);
+    }
+    endCpuTimer();
+    printf("Time used : %f ms", prev_elapsed_time_cpu_milliseconds);
+
+    delete[] layerSizes;
+    delete[] data;
+    delete[] label;
+    delete[] weights;
+    delete[] grad;
 }
diff --git a/Project2-Stream-Compaction/README.md b/Project2-Stream-Compaction/README.md
index 0e38ddb..3f54054 100644
--- a/Project2-Stream-Compaction/README.md
+++ b/Project2-Stream-Compaction/README.md
@@ -3,12 +3,102 @@ CUDA Stream Compaction
 ======================
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Jiangping Xu
+  * [LinkedIn](https://www.linkedin.com/in/jiangping-xu-365b19134/)
+* Tested on: Windows 10, i7-4700MQ @ 2.40GHz 8GB, GT 755M 6100MB (personal laptop)
+_________________________________________________________________________
+[Introduction](#introduction) - [Performance Analysis](#performance-analysis) - [Questions](#questions) - [Output](#output)
+_________________________________________________________________________
+## Introduction
+The goal of stream compaction is to remove unwanted elements (here, zeros) from a given array. In this project I implemented several stream compaction algorithms, on both the CPU and the GPU. The main steps are the same for all implementations (see the sketch below). First, map the input array to a boolean array, where 1 (true) marks an element to keep and 0 (false) marks an element to remove. Then perform an exclusive prefix-sum scan to find the output index of each kept element. Finally, scatter the elements into the output array based on the boolean and index arrays.
-### (TODO: Your README)
+The features are as follows:
+* A CPU scan (a simple for loop), with a CPU stream compaction based on it
+* A CPU stream compaction that does not use the scan function
+* A naive GPU scan (see [Example 39-1](https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html))
+* A work-efficient GPU scan (see [Section 39.2.2](https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html))
+* A GPU scatter function and a map-to-boolean function shared by both GPU scan paths
+* A GPU scan using the Thrust library
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
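+
+A minimal CPU sketch of the three steps (illustrative only; `compactSketch` is a made-up name, and the CUDA version parallelizes each step with the kernels in `stream_compaction/`):
+
+```
+#include <vector>
+
+// Returns the number of kept (non-zero) elements written to odata.
+int compactSketch(int n, const int *idata, int *odata) {
+    if (n <= 0) return 0;
+    std::vector<int> bools(n), indices(n);
+    // 1. map to boolean: 1 = keep, 0 = remove
+    for (int i = 0; i < n; i++) bools[i] = (idata[i] != 0);
+    // 2. exclusive prefix-sum scan of the booleans
+    indices[0] = 0;
+    for (int i = 1; i < n; i++) indices[i] = indices[i - 1] + bools[i - 1];
+    // 3. scatter each kept element to its scanned index
+    for (int i = 0; i < n; i++)
+        if (bools[i]) odata[indices[i]] = idata[i];
+    return indices[n - 1] + bools[n - 1];
+}
+```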

+## Performance Analysis
+__Find The Optimal Block Size__
+
+![](img/ScanTimeCostWithIncreasingBlockSize.png)

+
+First, the performance of the naive and work-efficient methods is tested with different block sizes. Two input array lengths are considered: a power of two (256) and a non-power of two (253). From the graph above we notice that the performance of the naive GPU scan does not change much with block size, while the work-efficient scan achieves its best performance with a block size of 512. Block sizes for the naive and work-efficient scans are therefore both set to 512 in the following tests.
+
+__Comparison__

+![](img/ScanTimeCostWithIncreasingInputArrayLength.png)

+We can see that the best-performing methods are the CPU scan and the Thrust scan. See the next section for a more detailed discussion.
+
+## Questions
+### Can you find the performance bottlenecks? Is it memory I/O? Computation? Is it different for each implementation?
+
+`CPU`: computation is the bottleneck, while memory access is fast.
+
+`GPU Naive`: theoretically it should be faster than the CPU scan. Although the total number of additions, O(n log n), is much larger than that of a simple CPU loop (n - 1), many threads compute at the same time, so in the best case only O(log n) steps are needed. But memory access speed becomes the bottleneck for the GPU scans: reading and writing global memory is very time-consuming. Besides, the naive GPU implementation above needs further optimization; warp divergence and bank conflicts also slow it down.
+
+`Work Efficient`: this algorithm requires only O(n) additions to scan an array, so as the array length grows it theoretically beats the naive GPU scan. But it suffers from the same memory-access problems mentioned above.
+
+`Thrust`: warp divergence occurs less in this implementation. I guess it may also use shared memory to speed up memory access.
+
+## Output
+blockSize = 512, ArraySize = 256 / 253
+```
+****************
+** SCAN TESTS **
+****************
+    [ 9 35 14 13 10 25 14 25 43 0 7 36 26 ... 38 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.0008ms (std::chrono Measured)
+    [ 0 9 44 58 71 81 106 120 145 188 188 195 231 ... 6032 6070 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.0008ms (std::chrono Measured)
+    [ 0 9 44 58 71 81 106 120 145 188 188 195 231 ... 5983 5998 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.05296ms (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.0496ms (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.092896ms (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.090368ms (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.093824ms (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.092896ms (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 1 1 2 1 2 3 0 3 1 2 3 2 2 ... 2 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.0013ms (std::chrono Measured)
+    [ 1 1 2 1 2 3 3 1 2 3 2 2 2 ... 2 2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.0011ms (std::chrono Measured)
+    [ 1 1 2 1 2 3 3 1 2 3 2 2 2 ... 1 2 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 0.0049ms (std::chrono Measured)
+    [ 1 1 2 1 2 3 3 1 2 3 2 2 2 ...
2 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.222208ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.157696ms (CUDA Measured) + passed +``` diff --git a/Project2-Stream-Compaction/img/ScanTimeCostWithIncreasingBlockSize.png b/Project2-Stream-Compaction/img/ScanTimeCostWithIncreasingBlockSize.png new file mode 100644 index 0000000..f8f6317 Binary files /dev/null and b/Project2-Stream-Compaction/img/ScanTimeCostWithIncreasingBlockSize.png differ diff --git a/Project2-Stream-Compaction/img/ScanTimeCostWithIncreasingInputArrayLength.png b/Project2-Stream-Compaction/img/ScanTimeCostWithIncreasingInputArrayLength.png new file mode 100644 index 0000000..b4721b0 Binary files /dev/null and b/Project2-Stream-Compaction/img/ScanTimeCostWithIncreasingInputArrayLength.png differ diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt index cdbef77..e31ca3c 100644 --- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt +++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_30 ) diff --git a/Project2-Stream-Compaction/stream_compaction/common.cu b/Project2-Stream-Compaction/stream_compaction/common.cu index 2ed6d63..aebaddd 100644 --- a/Project2-Stream-Compaction/stream_compaction/common.cu +++ b/Project2-Stream-Compaction/stream_compaction/common.cu @@ -24,6 +24,10 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= n) + return; + bools[index] = (idata[index] != 0); } /** @@ -33,6 +37,11 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= n) + return; + if(bools[index]) + odata[indices[index]] = idata[index]; } } diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu index a2d3e6c..0dc88f1 100644 --- a/Project2-Stream-Compaction/stream_compaction/cpu.cu +++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu @@ -19,7 +19,10 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + odata[0] = 0; + for (int j = 1; j < n; j++) { + odata[j] = odata[j - 1] + idata[j - 1]; + } timer().endCpuTimer(); } @@ -30,9 +33,15 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int count = 0; + for (int j = 0; j < n; j++) { + if (idata[j] != 0) { + odata[count] = idata[j]; + count++; + } + } timer().endCpuTimer(); - return -1; + return count; } /** @@ -42,9 +51,29 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int *valid = new int[n]; + for (int i = 0; i < n; i++) { + if (idata[i] == 0) { + valid[i] = 0; + } + else { + valid[i] = 1; + } + } + int *index = new int[n]; + index[0] = 0; + for (int j = 1; j < n; j++) { + index[j] = index[j - 1] + valid[j - 1]; + } + for (int i = 0; i < n; i++) { + if (valid[i] == 1) { + odata[index[i]] = idata[i]; + } + } + int count = index[n - 1]; + delete valid, index; timer().endCpuTimer(); - return -1; + 
return count; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu index 2db346e..3e8e912 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.cu +++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu @@ -3,6 +3,9 @@ #include "common.h" #include "efficient.h" +#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) +#define blockSize 512 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +15,65 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep(int n, int d, int *data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n >> (d + 1)) + return; + int offset = 1 << d; + int ai = offset * (2 * index + 1) - 1; + int bi = offset * (2 * index + 2) - 1; + data[bi] += data[ai]; + } + + __global__ void kernDownSweep(int n, int d, int *data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= 1 << d) + return; + int offset = n >> (d + 1); + int ai = offset * (2 * index + 1) - 1; + int bi = offset * (2 * index + 2) - 1; + float t = data[ai]; + data[ai] = data[bi]; + data[bi] += t; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + if (n < 2) + return; + int n_ = 1 << ilog2ceil(n); + int *dev_data; + cudaMalloc((void**)&dev_data, n_ * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_data failed!"); + + cudaMemset(dev_data, 0, sizeof(int) * n_); + checkCUDAErrorWithLine("Set dev_data to 0 failed!"); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAErrorWithLine("Copy idata to dev_data failed!"); + timer().startGpuTimer(); - // TODO + + dim3 GridSize((n_ / 2 + blockSize - 1) / blockSize); + for (int d = 0; d < ilog2ceil(n); d++) { + kernUpSweep<<>>(n_, d, dev_data); + checkCUDAErrorWithLine("kernUpSweep failed!"); + } + + cudaMemset(dev_data + n_ - 1, 0, sizeof(int)); + checkCUDAErrorWithLine("set zero failed!"); + + for (int d = 0; d < ilog2ceil(n); d++) { + kernDownSweep<<>>(n_, d, dev_data); + checkCUDAErrorWithLine("kernDownSweep failed!"); + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAErrorWithLine("Copy dev_data to odata failed!"); + cudaFree(dev_data); } /** @@ -31,10 +86,78 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { + if (n == 0) + return 0; + if (n == 1){ + if (idata[0] != 0) { + odata[0] = idata[0]; + return 1; + } + else { + return 0; + } + } + + int n_ = 1 << ilog2ceil(n); + int *dev_idata, *dev_odata, *dev_bools, *dev_indices; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_idata failed!"); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_odata failed!"); + + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_bools failed!"); + + cudaMalloc((void**)&dev_indices, n_ * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_indices failed!"); + timer().startGpuTimer(); // TODO + dim3 GridSize1((n + blockSize - 1) / blockSize); + StreamCompaction::Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); + checkCUDAErrorWithLine("kernMapToBoolean failed!"); + + cudaMemset(dev_indices, 0, sizeof(int) * n_); + checkCUDAErrorWithLine("Set dev_indices to 0 failed!"); + + cudaMemcpy(dev_indices, dev_bools, sizeof(int) * n, cudaMemcpyDeviceToDevice); + checkCUDAErrorWithLine("Copy dev_bools to dev_indices failed!"); + + dim3 GridSize2((n_ / 2 + blockSize - 1) / blockSize); + for (int d = 0; d < ilog2ceil(n); d++) { + kernUpSweep<<>>(n_, d, dev_indices); + checkCUDAErrorWithLine("kernUpSweep failed!"); + } + + cudaMemset(dev_indices + n_ - 1, 0, sizeof(int)); + checkCUDAErrorWithLine("set zero failed!"); + + for (int d = 0; d < ilog2ceil(n); d++) { + kernDownSweep<<>>(n_, d, dev_indices); + checkCUDAErrorWithLine("kernDownSweep failed!"); + } + + StreamCompaction::Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_bools, dev_indices); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAErrorWithLine("copy dev_odata to odata failed!"); + int count; + cudaMemcpy(&count, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAErrorWithLine("copy last indice failed!"); + if (idata[n - 1] != 0) { + count++; + } + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_indices); + return count; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu index 4308876..034e124 100644 --- a/Project2-Stream-Compaction/stream_compaction/naive.cu +++ b/Project2-Stream-Compaction/stream_compaction/naive.cu @@ -3,6 +3,9 @@ #include "common.h" #include "naive.h" +#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) +#define blockSize 512 + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -12,14 +15,55 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void kernAddSkip(int n, int offset, int *odata, int *idata) + { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) + return; + odata[index] = (index < offset ? idata[index] : idata[index] + idata[index - offset]); + } + + __global__ void kernMakeExclusive(int n, int *odata, int *idata) + { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) + return; + odata[index] = (index == 0 ? 0 : idata[index - 1]); + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + dim3 GridSize((n + blockSize - 1) / blockSize); + int *dev_data1, *dev_data2, *tmp; + cudaMalloc((void**)&dev_data1, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_data1 failed!"); + cudaMalloc((void**)&dev_data2, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_data2 failed!"); + + cudaMemcpy(dev_data1, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + int offset = 1; + for(int i = 0; i < ilog2ceil(n); i++) { + kernAddSkip<<>>(n, offset, dev_data2, dev_data1); + checkCUDAErrorWithLine("kernAddSkip failed!"); + offset <<= 1; + tmp = dev_data1; + dev_data1 = dev_data2; + dev_data2 = tmp; + } + kernMakeExclusive<<>>(n, dev_data2, dev_data1); + checkCUDAErrorWithLine("kernMakeExclusive failed!"); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data2, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_data1); + cudaFree(dev_data2); + return; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu index 1def45e..ca92e68 100644 --- a/Project2-Stream-Compaction/stream_compaction/thrust.cu +++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu @@ -18,11 +18,27 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* dev_idata; + int* dev_odata; + cudaMalloc((void**)&dev_idata, sizeof(int) * n); + cudaMalloc((void**)&dev_odata, sizeof(int) * n); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + thrust::device_ptr dv_in(dev_idata); + thrust::device_ptr dv_out(dev_odata); + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dv_in, dv_in + n, dv_out); timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/README.md b/README.md deleted file mode 100644 index 3a0b2fe..0000000 --- a/README.md +++ /dev/null @@ -1,16 +0,0 @@ -CUDA Number Algorithms -====================== - -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** - -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) - -### (TODO: Your README) - -Link to the readmes of the other two subprojects. - -Add anything else you think is relevant up to this point. -(Remember, this is public, so don't put anything here that you don't want to share with the world.) -