diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md
index 4503fac..9993386 100644
--- a/Project2-Character-Recognition/README.md
+++ b/Project2-Character-Recognition/README.md
@@ -3,12 +3,8 @@ CUDA Character Recognition
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Peyman Norouzi
+* [LinkedIn](https://www.linkedin.com/in/peymannorouzi)
+* Tested on: Windows 10, i7-6700 @ 3.40GHz 16GB, Quadro P1000 4096MB (Moore 100B Lab)
+
+I implemented most of the kernels for the MLP, but the code does not run end-to-end. I had a lot on my plate this past week; I put substantial work into this project but could not finish the MLP in time to write a full README. I would appreciate partial credit based on the structure and functions below.
diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu
index 5a3ed7f..8ed4cd8 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.cu
+++ b/Project2-Character-Recognition/character_recognition/mlp.cu
@@ -2,6 +2,14 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "mlp.h"
+#include <cstdlib>  // rand(), RAND_MAX
+
+#define num_examples 4
+#define num_input_chanels 2
+#define num_hidden_channels 2
+#define num_out_channels 2
+#define batch_size 1
+
 namespace CharacterRecognition {
     using Common::PerformanceTimer;
@@ -10,7 +18,193 @@ namespace CharacterRecognition {
         static PerformanceTimer timer;
         return timer;
     }
-
+
+    // host-side buffers and their device counterparts
+    float *inp = new float[batch_size * num_input_chanels];
+    float *dev_inp;
+
+    float *target = new float[batch_size * num_out_channels];
+    float *dev_target;
+
+    float *w1 = new float[num_input_chanels * num_hidden_channels];
+    float *dev_w1;
+
+    float *dev_a1_B_relu;  // hidden activations before / after ReLU
+    float *dev_a1_A_relu;
+
+    float *w2 = new float[num_hidden_channels * num_out_channels];
+    float *dev_w2;
+
+    float *dev_out_B_soft; // outputs before / after the output nonlinearity
+    float *dev_out_A_soft;
+
+    float *dev_dw1;        // weight gradients
+    float *dev_dw2;
+
+    float *initweight(int n, float *w) {
+        for (int i = 0; i < n; i++) {
+            // float math so rand() / RAND_MAX is not integer division;
+            // yields a uniform value in [-0.002, 0.002]
+            w[i] = ((2.0f * rand() / RAND_MAX) - 1.0f) * 0.002f;
+        }
+        return w;
+    }
+
+    // w1 and w2 are filled with small random values inside initialize();
+    // these assignments cannot live at namespace scope
+
+    //cudaMemset(dataGPU, 0, 1000 * sizeof(float));
+
+    //cudaMalloc((void**)&dev_odata, n * sizeof(int));
+    //cudaMemcpy(dev_odata, odata, sizeof(int) * n, cudaMemcpyHostToDevice);
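One gap worth flagging before `initialize()`: `ForwardLayer` below adds a per-row bias `b[row]`, but no bias buffer is ever allocated or copied in the code shown. A minimal sketch of the missing allocations — the names `dev_b1`/`dev_b2` are assumptions, not identifiers from this patch:

```
    // Hypothetical bias buffers (names assumed, not in the original patch).
    // ForwardLayer reads b[row], so each layer needs one bias per output
    // channel; zero-initialize them on the device.
    float *dev_b1, *dev_b2;
    cudaMalloc((void**)&dev_b1, num_hidden_channels * sizeof(float));
    cudaMemset(dev_b1, 0, num_hidden_channels * sizeof(float));
    cudaMalloc((void**)&dev_b2, num_out_channels * sizeof(float));
    cudaMemset(dev_b2, 0, num_out_channels * sizeof(float));
```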
+    void initialize() {
+
+        w1 = initweight(num_input_chanels * num_hidden_channels, w1);
+        w2 = initweight(num_hidden_channels * num_out_channels, w2);
+
+        cudaMalloc((void**)&dev_inp, batch_size * num_input_chanels * sizeof(float));
+        cudaMemcpy(dev_inp, inp, batch_size * num_input_chanels * sizeof(float), cudaMemcpyHostToDevice);
+
+        cudaMalloc((void**)&dev_target, batch_size * num_out_channels * sizeof(float));
+        cudaMemcpy(dev_target, target, batch_size * num_out_channels * sizeof(float), cudaMemcpyHostToDevice);
+
+        cudaMalloc((void**)&dev_w1, num_input_chanels * num_hidden_channels * sizeof(float));
+        cudaMemcpy(dev_w1, w1, num_input_chanels * num_hidden_channels * sizeof(float), cudaMemcpyHostToDevice);
+
+        cudaMalloc((void**)&dev_w2, num_hidden_channels * num_out_channels * sizeof(float));
+        cudaMemcpy(dev_w2, w2, num_hidden_channels * num_out_channels * sizeof(float), cudaMemcpyHostToDevice);
+
+        cudaMalloc((void**)&dev_a1_B_relu, batch_size * num_hidden_channels * sizeof(float));
+        cudaMalloc((void**)&dev_a1_A_relu, batch_size * num_hidden_channels * sizeof(float));
+
+        cudaMalloc((void**)&dev_out_B_soft, batch_size * num_out_channels * sizeof(float));
+        cudaMalloc((void**)&dev_out_A_soft, batch_size * num_out_channels * sizeof(float));
+
+        cudaMalloc((void**)&dev_dw1, num_input_chanels * num_hidden_channels * sizeof(float));
+        cudaMalloc((void**)&dev_dw2, num_hidden_channels * num_out_channels * sizeof(float));
+        cudaMemset(dev_dw1, 0, num_input_chanels * num_hidden_channels * sizeof(float));
+        cudaMemset(dev_dw2, 0, num_hidden_channels * num_out_channels * sizeof(float));
+        // randomly initializes w1, w2 and zeros the rest
+    }
+
+    __global__ void SumVectors(float *vec1, float *vec2, float *out) {
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+        out[index] = vec1[index] + vec2[index];
+    }
+
+    // help from http://luniak.io/cuda-neural-network-implementation-part
+    __global__ void ForwardLayer(float* W, float* activation, float* Z, float* b, int W_x_dim, int W_y_dim, int Acti_x_dim, int Acti_y_dim) {
+
+        int row = blockIdx.y * blockDim.y + threadIdx.y;
+        int col = blockIdx.x * blockDim.x + threadIdx.x;
+
+        int Z_x_dim = Acti_x_dim;
+        int Z_y_dim = W_y_dim;
+
+        float Z_value = 0;
+
+        if (row < Z_y_dim && col < Z_x_dim) {
+            // dot product of one row of W with one column of the activations
+            for (int i = 0; i < W_x_dim; i++) {
+                Z_value += W[row * W_x_dim + i] * activation[i * Acti_x_dim + col];
+            }
+            Z[row * Z_x_dim + col] = Z_value + b[row];
+        }
+    }
+
+    __global__ void BackpropLayer(float* W, float* dZ, float *dActi, int W_x_dim, int W_y_dim, int dZ_x_dim, int dZ_y_dim) {
+
+        int col = blockIdx.x * blockDim.x + threadIdx.x;
+        int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+        int dActi_x_dim = dZ_x_dim;
+        int dActi_y_dim = W_x_dim;
+
+        float dA_value = 0.0f;
+
+        if (row < dActi_y_dim && col < dActi_x_dim) {
+            // dA = W^T * dZ
+            for (int i = 0; i < W_y_dim; i++) {
+                dA_value += W[i * W_x_dim + row] * dZ[i * dZ_x_dim + col];
+            }
+            dActi[row * dActi_x_dim + col] = dA_value;
+        }
+    }
+
+    __global__ void reluActivationForward(float *Z, float *activation, int n) {
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (index < n) {
+            activation[index] = fmaxf(Z[index], 0);
+        }
+    }
+
+    __global__ void reluActivationBackprop(int n, float *Z, float *dZ, float *dactivation) {
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (index < n) {
+            if (Z[index] > 0) {
+                dZ[index] = dactivation[index];
+            }
+            else {
+                dZ[index] = 0;
+            }
+        }
+    }
+
+    __device__ float sigmoid(float x) {
+        return 1.0f / (1 + exp(-x));
+    }
+
+    __global__ void sigmoidActivationForward(int n, float* Z, float* activation) {
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (index < n) {
+            activation[index] = sigmoid(Z[index]);
+        }
+    }
+
+    __global__ void sigmoidActivationBackprop(int n, float *Z, float *dZ, float *dactivation) {
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (index < n) {
+            dZ[index] = dactivation[index] * sigmoid(Z[index]) * (1 - sigmoid(Z[index]));
+        }
+    }
+
+    __global__ void BCELoss(int n, float *preds, float *target, float *loss) {
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (index < n) {
+            float partial_cost = target[index] * logf(preds[index])
+                + (1.0f - target[index]) * logf(1.0f - preds[index]);
+            // every thread accumulates into a single scalar
+            atomicAdd(loss, -partial_cost / n);
+        }
+    }
+
+    __global__ void dBCELoss(int n, float *preds, float *target, float* dY) {
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (index < n) {
+            dY[index] = -1.0 * (target[index] / preds[index] - (1 - target[index]) / (1 - preds[index]));
+        }
+    }
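For reference, `dBCELoss` is the per-element derivative of the loss that `BCELoss` accumulates; with targets t_i and predictions ŷ_i:

```
L = -\frac{1}{n} \sum_{i=1}^{n} \left( t_i \log \hat{y}_i + (1 - t_i) \log(1 - \hat{y}_i) \right)
\qquad
\frac{\partial L}{\partial \hat{y}_i} = -\frac{1}{n} \left( \frac{t_i}{\hat{y}_i} - \frac{1 - t_i}{1 - \hat{y}_i} \right)
```

Note that `dBCELoss` omits the 1/n factor that `BCELoss` applies, so the gradient is scaled n times larger than the loss implies; it is worth normalizing on one side or the other.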
+    __global__ void ElementWiseMultiplication(int n, float *input1, float *input2, float *out) {
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (index < n) {
+            out[index] = input1[index] * input2[index];
+        }
+    }

[...]

diff --git a/Project2-Stream-Compaction/README.md b/Project2-Stream-Compaction/README.md
+... to 0 to 1. Then, using that, we complete stream compaction.
+
+### 2. GPU Naive scan:
+
+In this method, instead of stepping sequentially through the indices of the array, we sweep over all of the indices in parallel log(n) times, as follows:
+
+![](img/figure-39-2.jpg)
+
+This method produces an inclusive scan; afterwards we shift the whole array one element to the right (inserting the identity at index 0), which makes the scan exclusive.
+
+### 3. GPU work efficient scan and stream compaction:
+
+Scan: In this method, instead of re-adding partial sums the way the naive approach does, each pair of elements is summed only once. The scan can be done in place, which means it does not suffer from the race conditions of the naive method: no thread ever reads a location that another thread is writing (it is also more memory efficient). Conceptually, the scan uses a balanced binary tree and runs in the two phases below:
+
+* Parallel Reduction (Up-Sweep)
+* Down-Sweep
+
+The approach can be seen in the following figure:
+
+![](img/figure-39-4.jpg)
+
+Since this approach uses a binary tree structure, it only works on arrays whose length is a power of two. If the array does not satisfy that condition, I round the memory allocation up to the nearest power of two.
+
+For compaction, I apply the work-efficient scan to the boolean mask and then scatter, which removes the zeros from the input array. (This would not have been possible without the scan, since the GPU does not process elements sequentially.)
+
+### 4. Scan using Thrust package:
+
+I have also implemented the exclusive scan using the Thrust package, as a baseline to see how the earlier approaches stack up against each other.
+
+## Why is My GPU Approach So Slow? (Extra Credit) (+5)
+
+I first implemented the work-efficient method by launching n threads, where n is the number of elements in the array. This is very inefficient because, much of the time, most of the launched threads do no work at all. The reason is that the efficient scan uses the following loops for both the up-sweep and the down-sweep:
+
+![](img/extra.PNG)
+
+The inner loop is the part that runs in parallel on the GPU. Its step size is not 1; it depends on the outer loop variable d, so as d increases, the stride of the inner loop grows. With a fixed launch of n threads, a growing stride leaves more and more of those threads idle. To improve this, we should launch fewer and fewer threads as the outer loop progresses (see the worked sketch after the figures below). You can look at efficient.cu to see how I shrink the number of launched threads per level, which improves GPU performance significantly. This is the approach I used for my work-efficient method.
+
+## Performance:
+
+1. The performance of each implemented algorithm changes with the block size used to launch its kernels. I tried several block sizes for each GPU implementation; 256 worked best for the work-efficient approach and 512 worked best for the naive approach.
+
+2. The following are the performance results of the implemented approaches, for both scanning and compaction:
+
+![](img/Scan_2.PNG)
+![](img/Scan_N2.PNG)
+![](img/Compaction.PNG)
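As a concrete illustration of the per-level thread counts discussed in the extra-credit section — a small host-side sketch (illustrative only) using the same `count_thread` formula that appears in efficient.cu:

```
#include <stdio.h>

// Threads launched per up-sweep level for n = 1 << 20, matching
// count_thread = 1 << (ilog2ceil(n) - d - 1) in efficient.cu:
// each level needs half as many threads as the one before it.
int main() {
    int logn = 20;
    for (int d = 0; d < logn; d++) {
        printf("level d=%2d: %7d threads\n", d, 1 << (logn - d - 1));
    }
    return 0; // level 0 launches 524288 threads; level 19 launches just 1
}
```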
+3. The CPU performs best when the array is small, which makes sense: the CPU approach does fewer arithmetic operations overall. But as the array grows, CPU performance degrades significantly, because every addition is performed sequentially. The GPU work-efficient approach shines as the number of elements increases; this also makes sense, because it not only does fewer arithmetic operations than the GPU naive approach but also performs more and more of them in parallel across the GPU's warps. The efficient approach's performance converges toward Thrust's as the array size grows, which suggests the Thrust package may use a similar (or closely related) work-efficient algorithm. I would still expect Thrust to pull ahead at very large sizes, since the library probably manages memory better. For compaction, the CPU performs best when compacting without a scan, which makes sense; but the GPU work-efficient version also performs reasonably well, considering how many kernels have to run to replace such a simple sequential operation.
+
+## Tests output:
+
+```
+****************
+** SCAN TESTS **
+****************
+    [  21  15  25  19  28  10   5  34   4  33  35  23  44 ...  39   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.0019ms    (std::chrono Measured)
+    [   0  21  36  61  80 108 118 123 157 161 194 229 252 ... 24539 24578 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.0016ms    (std::chrono Measured)
+    [   0  21  36  61  80 108 118 123 157 161 194 229 252 ... 24424 24464 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.067008ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.063488ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.088416ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.089088ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.050432ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.05008ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   3   1   1   1   0   0   1   2   2   3   1   1   2 ...   1   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.0027ms    (std::chrono Measured)
+    [   3   1   1   1   1   2   2   3   1   1   2   3   1 ...   1   1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.0028ms    (std::chrono Measured)
+    [   3   1   1   1   1   2   2   3   1   1   2   3   1 ...   2   1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 1.6032ms    (std::chrono Measured)
+    [   3   1   1   1   1   2   2   3   1   1   2   3   1 ...   1   1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.283808ms    (CUDA Measured)
+    [   3   1   1   1   1   2   2   3   1   1   2   3   1 ...   1   1 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.285792ms    (CUDA Measured)
+    [   3   1   1   1   1   2   2   3   1   1   2   3   1 ...   2   1 ]
+    passed
+```
-
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
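To make the scan-then-scatter contract described in the README concrete, here is a minimal CPU sketch (illustrative, not part of the patch) of compaction via an exclusive scan:

```
#include <cstdio>
#include <vector>

int main() {
    // compact {3,0,1,0,2}: mask -> exclusive scan -> scatter
    std::vector<int> in = {3, 0, 1, 0, 2}, mask(in.size()), scan(in.size());
    for (size_t i = 0; i < in.size(); i++) mask[i] = in[i] != 0;
    scan[0] = 0;                                  // exclusive scan of the mask
    for (size_t i = 1; i < in.size(); i++) scan[i] = scan[i - 1] + mask[i - 1];
    std::vector<int> out(scan.back() + mask.back());
    for (size_t i = 0; i < in.size(); i++)
        if (mask[i]) out[scan[i]] = in[i];        // scatter kept elements
    for (int v : out) printf("%d ", v);           // prints: 3 1 2
    return 0;
}
```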
diff --git a/Project2-Stream-Compaction/img/Compaction.PNG b/Project2-Stream-Compaction/img/Compaction.PNG
new file mode 100644
index 0000000..9c340e2
Binary files /dev/null and b/Project2-Stream-Compaction/img/Compaction.PNG differ
diff --git a/Project2-Stream-Compaction/img/Scan_2.PNG b/Project2-Stream-Compaction/img/Scan_2.PNG
new file mode 100644
index 0000000..1a1484d
Binary files /dev/null and b/Project2-Stream-Compaction/img/Scan_2.PNG differ
diff --git a/Project2-Stream-Compaction/img/Scan_N2.PNG b/Project2-Stream-Compaction/img/Scan_N2.PNG
new file mode 100644
index 0000000..84d0e2e
Binary files /dev/null and b/Project2-Stream-Compaction/img/Scan_N2.PNG differ
diff --git a/Project2-Stream-Compaction/img/extra.PNG b/Project2-Stream-Compaction/img/extra.PNG
new file mode 100644
index 0000000..c4d91c9
Binary files /dev/null and b/Project2-Stream-Compaction/img/extra.PNG differ
diff --git a/Project2-Stream-Compaction/img/figure-39-4.jpg b/Project2-Stream-Compaction/img/figure-39-4.jpg
index 5888f20..8964589 100644
Binary files a/Project2-Stream-Compaction/img/figure-39-4.jpg and b/Project2-Stream-Compaction/img/figure-39-4.jpg differ
diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp
index d016553..caf0328 100644
--- a/Project2-Stream-Compaction/src/main.cpp
+++ b/Project2-Stream-Compaction/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 10; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -137,14 +137,14 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
     system("pause"); // stop Win32 console from closing on exit
diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
index cdbef77..4bb0dc2 100644
--- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
+++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
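The kernels in common.cu below all follow the standard one-thread-per-element pattern: compute a global index, guard against n, then do an independent write. A generic sketch of the launch math, assuming a `blockSize` constant like the ones defined in naive.cu and efficient.cu (`kernExample`/`launchExample` are illustrative names, not from the patch):

```
// Generic launch pattern used throughout this project: round the grid size
// up so that gridDim.x * blockSize >= n, then bounds-check inside the kernel.
#define blockSize 256

__global__ void kernExample(int n, int *data) {
    int index = threadIdx.x + (blockIdx.x * blockDim.x);
    if (index >= n) return;   // threads past n do nothing
    data[index] *= 2;
}

void launchExample(int n, int *dev_data) {
    dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
    kernExample<<<fullBlocksPerGrid, blockSize>>>(n, dev_data);
}
```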
diff --git a/Project2-Stream-Compaction/stream_compaction/common.cu b/Project2-Stream-Compaction/stream_compaction/common.cu
index 2ed6d63..f8adceb 100644
--- a/Project2-Stream-Compaction/stream_compaction/common.cu
+++ b/Project2-Stream-Compaction/stream_compaction/common.cu
@@ -22,7 +22,19 @@ namespace StreamCompaction {
          * Maps an array to an array of 0s and 1s for stream compaction. Elements
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
-        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index >= n) {
+                return;
+            }
+
+            // 0 stays 0; everything else becomes 1
+            if (idata[index] == 0) {
+                bools[index] = 0;
+            }
+            else {
+                bools[index] = 1;
+            }
             // TODO
         }
 
@@ -32,6 +44,16 @@ namespace StreamCompaction {
          */
         __global__ void kernScatter(int n, int *odata, const int *idata,
                 const int *bools, const int *indices) {
+
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index >= n) {
+                return;
+            }
+
+            // each kept element lands at the position its scanned index gives it
+            if (bools[index] == 1) {
+                odata[indices[index]] = idata[index];
+            }
             // TODO
         }
diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu
index a2d3e6c..85e1dbe 100644
--- a/Project2-Stream-Compaction/stream_compaction/cpu.cu
+++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu
@@ -6,11 +6,11 @@ namespace StreamCompaction {
     namespace CPU {
         using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
 
         /**
          * CPU scan (prefix sum).
@@ -18,9 +18,20 @@ namespace StreamCompaction {
          * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
-            // TODO
-            timer().endCpuTimer();
+            // compactWithScan() calls scan() while its own timer is running;
+            // in that case startCpuTimer() throws and timing is skipped here
+            bool timer_already_running = false;
+            try {
+                timer().startCpuTimer();
+            }
+            catch (const std::exception&) {
+                timer_already_running = true;
+            }
+            odata[0] = 0; // exclusive scan: identity first
+            for (int i = 1; i < n; i++) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
+            if (!timer_already_running) {
+                timer().endCpuTimer();
+            }
         }
 
         /**
@@ -30,9 +41,16 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[count] = idata[i];
+                    count++;
+                }
+            }
             // TODO
             timer().endCpuTimer();
-            return -1;
+            return count;
         }
 
         /**
@@ -42,9 +60,21 @@ namespace StreamCompaction {
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int *scanned = new int[n];
+            int *mask = new int[n];
+            for (int i = 0; i < n; i++) {
+                mask[i] = idata[i] == 0 ? 0 : 1;
+            }
+            scan(n, scanned, mask);
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[scanned[i]] = idata[i];
+                }
+            }
+            // kept count = last scan value + last mask bit
+            int length = scanned[n - 1] + mask[n - 1];
             timer().endCpuTimer();
-            return -1;
+            delete[] scanned;
+            delete[] mask;
+            return length;
         }
     }
 }
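Before the work-efficient kernels in efficient.cu, a worked trace may help — up-sweep then down-sweep on an 8-element array (values chosen arbitrarily for illustration):

```
// Up-sweep / down-sweep on x = [1 2 3 4 5 6 7 8]
// up-sweep  d=0: [1 3 3 7 5 11 7 15]     (pairwise sums at stride 2)
// up-sweep  d=1: [1 3 3 10 5 11 7 26]    (stride 4)
// up-sweep  d=2: [1 3 3 10 5 11 7 36]    (stride 8: total sum in last slot)
// set last element to the identity:      [1 3 3 10 5 11 7 0]
// down-sweep d=2: [1 3 3 0 5 11 7 10]
// down-sweep d=1: [1 0 3 3 5 10 7 21]
// down-sweep d=0: [0 1 3 6 10 15 21 28]  (exclusive scan of x)
```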
diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu
index 2db346e..b48b8b9 100644
--- a/Project2-Stream-Compaction/stream_compaction/efficient.cu
+++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu
@@ -2,7 +2,7 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
-
+#define blockSize 256
 namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,14 +12,116 @@ namespace StreamCompaction {
             return timer;
         }
 
+        int *dev_odata;
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-        }
+        void printArray(int n, int *a, bool abridged = false) {
+            printf("    [ ");
+            for (int i = 0; i < n; i++) {
+                if (abridged && i + 2 == 15 && n > 16) {
+                    i = n - 2;
+                    printf("... ");
+                }
+                printf("%3d ", a[i]);
+            }
+            printf("]\n");
+        }
+
+        __global__ void up_sweep(int N, int *Dev_odata, int d) {
+
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            // each thread owns one node at this level of the tree
+            index = index * (1 << (d + 1));
+
+            if (index > N - 1) {
+                return;
+            }
+
+            if (((index + (1 << d) - 1) < N) && ((index + (1 << (d + 1)) - 1) < N)) {
+                Dev_odata[index + (1 << (d + 1)) - 1] += Dev_odata[index + (1 << d) - 1];
+            }
+        }
+
+        __global__ void down_sweep(int N, int *Dev_odata, int d) {
+
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            index = index * (1 << (d + 1));
+
+            if (index > N - 1) {
+                return;
+            }
+
+            if (((index + (1 << d) - 1) < N) && ((index + (1 << (d + 1)) - 1) < N)) {
+                // swap-and-accumulate step of the down-sweep
+                int t = Dev_odata[index + (1 << d) - 1];
+                Dev_odata[index + (1 << d) - 1] = Dev_odata[index + (1 << (d + 1)) - 1];
+                Dev_odata[index + (1 << (d + 1)) - 1] += t;
+            }
+        }
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int *odata, const int *idata) {
+
+            // pad the device buffer up to a power of two; the padding must be
+            // zeroed, and only the caller's n elements are copied in and out
+            int full = 1 << ilog2ceil(n);
+
+            cudaMalloc((void**)&dev_odata, full * sizeof(int));
+            cudaMemset(dev_odata, 0, full * sizeof(int));
+            cudaMemcpy(dev_odata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            // compact() calls scan() while its timer is running; skip timing then
+            bool timer_already_running = false;
+            try {
+                timer().startGpuTimer();
+            }
+            catch (const std::exception&) {
+                timer_already_running = true;
+            }
+
+            for (int d = 0; d <= ilog2ceil(full) - 1; d++) {
+                // only 1 << (ilog2ceil(full) - d - 1) threads do work at this level
+                int count_thread = 1 << (ilog2ceil(full) - d - 1);
+                dim3 fullBlocksPerGrid((count_thread + blockSize - 1) / blockSize);
+                up_sweep<<<fullBlocksPerGrid, blockSize>>>(full, dev_odata, d);
+            }
+
+            cudaMemset(dev_odata + full - 1, 0, sizeof(int)); // root := identity
+
+            for (int d = ilog2ceil(full) - 1; d >= 0; d--) {
+                int count_thread = 1 << (ilog2ceil(full) - d - 1);
+                dim3 fullBlocksPerGrid((count_thread + blockSize - 1) / blockSize);
+                down_sweep<<<fullBlocksPerGrid, blockSize>>>(full, dev_odata, d);
+            }
+
+            if (!timer_already_running) {
+                timer().endGpuTimer();
+            }
+
+            cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_odata);
+        }
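A quick way to sanity-check the scan above — an illustrative host-side harness (this `main` is not part of the project's test suite, and it assumes efficient.h is on the include path) that compares the GPU result against a serial exclusive scan:

```
// Illustrative check (not from the original patch).
#include <cstdio>
#include <cstdlib>
#include "efficient.h"

int main() {
    const int n = 1000;                     // deliberately non-power-of-two
    int in[n], gpu[n], cpu[n];
    for (int i = 0; i < n; i++) in[i] = rand() % 10;
    cpu[0] = 0;                             // serial exclusive scan reference
    for (int i = 1; i < n; i++) cpu[i] = cpu[i - 1] + in[i - 1];
    StreamCompaction::Efficient::scan(n, gpu, in);
    for (int i = 0; i < n; i++)
        if (gpu[i] != cpu[i]) { printf("mismatch at %d\n", i); return 1; }
    printf("scan matches\n");
    return 0;
}
```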
         /**
          * Performs stream compaction on idata, storing the result into odata.
          * All zeroes are discarded.
          *
          * @param n      The number of elements in idata.
          * @param odata  The array into which to store elements.
-         * @param idata  The array of elements to compact.
+         * @param idata  The array of elements to compact.
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
+
+            int *dev_mask;
+            int *dev_idata;
+            int *dev_indices;
+            int *temp_dev_odata;
+
+            int count;
+
+            cudaMalloc((void**)&dev_mask, n * sizeof(int));
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            cudaMalloc((void**)&dev_indices, n * sizeof(int));
+            cudaMalloc((void**)&temp_dev_odata, n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            timer().startGpuTimer();
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            StreamCompaction::Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, dev_mask, dev_idata);
+
+            int *temp_idata = new int[n];
+            cudaMemcpy(temp_idata, dev_mask, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            scan(n, odata, temp_idata); // scan's own timer call is swallowed by its try/catch
+
+            // kept count = last scan value, plus one if the last element is kept
+            count = temp_idata[n - 1] == 0 ? odata[n - 1] : odata[n - 1] + 1;
+
+            // scan() frees its device buffer, so copy the scanned indices
+            // back onto the device before scattering
+            cudaMemcpy(dev_indices, odata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            StreamCompaction::Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, temp_dev_odata, dev_idata, dev_mask, dev_indices);
+
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, temp_dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
             // TODO
-            timer().endGpuTimer();
-            return -1;
+            delete[] temp_idata;
+            cudaFree(dev_mask);
+            cudaFree(dev_idata);
+            cudaFree(dev_indices);
+            cudaFree(temp_dev_odata);
+
+            return count;
         }
     }
 }
diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu
index 4308876..a04a2cc 100644
--- a/Project2-Stream-Compaction/stream_compaction/naive.cu
+++ b/Project2-Stream-Compaction/stream_compaction/naive.cu
@@ -1,8 +1,12 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include "device_launch_parameters.h"
 #include "common.h"
 #include "naive.h"
 
+#define blockSize 512
+
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
@@ -11,15 +15,76 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
+        int *dev_idata;
+        int *dev_odata;
+
+        void printArray(int n, int *a, bool abridged = false) {
+            printf("    [ ");
+            for (int i = 0; i < n; i++) {
+                if (abridged && i + 2 == 15 && n > 16) {
+                    i = n - 2;
+                    printf("... ");
+                }
+                printf("%3d ", a[i]);
+            }
+            printf("]\n");
+        }
+
         // TODO: __global__
+        __global__ void scan_GPU(int N, int *Dev_idata, int *Dev_odata, int d) {
+
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index >= N) {
+                return;
+            }
+
+            // threads past the stride add their left partner; the rest copy through
+            if (index >= (1 << (d - 1))) {
+                Dev_odata[index] = Dev_idata[index - (1 << (d - 1))] + Dev_idata[index];
+            }
+            else {
+                Dev_odata[index] = Dev_idata[index];
+            }
+        }
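The host loop in scan() below copies dev_odata back into dev_idata after every pass. A hedged alternative sketch (a fragment under the same buffer names, using std::swap from <utility>; not the original code): ping-pong the pointers instead, which avoids an O(n) device-to-device copy per level.

```
// Sketch: swap buffers instead of copying between passes.
for (int d = 1; d <= ilog2ceil(n); d++) {
    scan_GPU<<<fullBlocksPerGrid, blockSize>>>(n, dev_idata, dev_odata, d);
    std::swap(dev_idata, dev_odata);  // output becomes the next pass's input
}
// after the loop, dev_idata holds the inclusive scan (because of the last swap)
```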
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
+            bool timer_already_running = false;
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+
+            // if another timer is already running, startGpuTimer() throws
+            // and timing is skipped here
+            try {
+                timer().startGpuTimer();
+            }
+            catch (const std::exception&) {
+                timer_already_running = true;
+            }
+
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            for (int d = 1; d <= ilog2ceil(n); d++) {
+                scan_GPU<<<fullBlocksPerGrid, blockSize>>>(n, dev_idata, dev_odata, d);
+                cudaMemcpy(dev_idata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+            }
+
+            if (!timer_already_running) {
+                timer().endGpuTimer();
+            }
+            // shift right by one to turn the inclusive result into an exclusive scan
+            cudaMemcpy(odata + 1, dev_odata, sizeof(int) * (n - 1), cudaMemcpyDeviceToHost);
+            odata[0] = 0;
+            cudaFree(dev_odata);
+            cudaFree(dev_idata);
         }
     }
 }
diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu
index 1def45e..9ef711f 100644
--- a/Project2-Stream-Compaction/stream_compaction/thrust.cu
+++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu
@@ -18,11 +18,26 @@ namespace StreamCompaction {
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
+            thrust::host_vector<int> h_idata(idata, idata + n);
+            thrust::device_vector<int> dev_idata(n);
+            thrust::device_vector<int> dev_odata(n);
+
+            thrust::copy(h_idata.begin(), h_idata.end(), dev_idata.begin());
+
+            // time only the scan itself, not the host<->device copies
+            timer().startGpuTimer();
+            thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin());
+            timer().endGpuTimer();
+
+            thrust::copy(dev_odata.begin(), dev_odata.end(), odata);
+
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
-            timer().endGpuTimer();
         }
     }
 }
diff --git a/README.md b/README.md
index 3a0b2fe..1d5638e 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,12 @@
-CUDA Number Algorithms
+CUDA Stream Compaction
 ======================
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+Peyman Norouzi
+* [LinkedIn](https://www.linkedin.com/in/peymannorouzi)
+* Tested on: Windows 10, i7-6700 @ 3.40GHz 16GB, Quadro P1000 4096MB (Moore 100B Lab)
 
-### (TODO: Your README)
-
-Link to the readmes of the other two subprojects.
-
-Add anything else you think is relevant up to this point.
-(Remember, this is public, so don't put anything here that you don't want to share with the world.)
+1) [Stream Compaction](./Project2-Stream-Compaction)
+2) [Character Recognition](./Project2-Character-Recognition)
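For comparison with the hand-written kernels, a minimal standalone use of `thrust::exclusive_scan` (this file and `main` are illustrative, not part of the patch):

```
// Illustrative standalone Thrust scan.
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cstdio>

int main() {
    thrust::host_vector<int> h(5);
    h[0] = 3; h[1] = 1; h[2] = 4; h[3] = 1; h[4] = 5;
    thrust::device_vector<int> d_in = h, d_out(5);
    thrust::exclusive_scan(d_in.begin(), d_in.end(), d_out.begin());
    thrust::host_vector<int> out = d_out;           // copy result back
    for (int i = 0; i < 5; i++) printf("%d ", out[i]); // prints: 0 3 4 8 9
    return 0;
}
```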