From 202ee000acd51bad44b9a0776858e39ea5f42870 Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Wed, 16 Sep 2020 00:39:44 +0800 Subject: [PATCH 01/23] implement the cpu version --- stream_compaction/cpu.cu | 66 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 2 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..99e3daa 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -2,6 +2,8 @@ #include "cpu.h" #include "common.h" +#include // Jack12 for assert +#include // Jack12 uses vector namespace StreamCompaction { namespace CPU { @@ -17,9 +19,30 @@ namespace StreamCompaction { * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ + inline void cpu_scan(const int& n, int* odata, const int* idata) { + // scan without cputimer inline + if (n == 0) { + return; + } + + assert(odata != nullptr); + assert(idata != nullptr); + + int prefix_sum = 0; + + odata[0] = 0; + for (int i = 1; i < n; i++) { + prefix_sum += idata[i - 1]; + odata[i] = prefix_sum; + } + } + void scan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); // TODO + // should be exclusive + cpu_scan(n, odata, idata); timer().endCpuTimer(); } @@ -31,8 +54,19 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + if (n != 0) { + assert(odata != nullptr); + assert(idata != nullptr); + } + int p = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[p] = idata[i]; + p++; + } + } timer().endCpuTimer(); - return -1; + return p; } /** @@ -43,8 +77,36 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int out = 0; + if (n == 0) { + return out; + } + + assert(odata != nullptr); + assert(idata != nullptr); + + // map to 0, 1 + int* bin_arr = new int[n]; + for (int i = 0; i < n; i++) { + bin_arr[i] = idata[i] == 0 ? 
0 : 1; + } + // scan + int* scan_arr = new int[n]; + cpu_scan(n, scan_arr, bin_arr); + // odata contains the scan result + + for (int i = 0; i < n; i++) { + if (bin_arr[i]) { + out++; + odata[scan_arr[i]] = idata[i]; + } + } + + delete [] bin_arr; + delete [] scan_arr; timer().endCpuTimer(); - return -1; + + return out; } } } From 8ffbcd78a7a4a91354d61a02205a684ddc57e6e7 Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Wed, 16 Sep 2020 21:16:35 +0800 Subject: [PATCH 02/23] update part 2 naive approach --- src/main.cpp | 6 +-- stream_compaction/cpu.cu | 17 +++++---- stream_compaction/naive.cu | 75 ++++++++++++++++++++++++++++++++++++++ stream_compaction/naive.h | 2 + 4 files changed, 89 insertions(+), 11 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..80812d0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -51,11 +51,11 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); + // For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + /*onesArray(SIZE, c); printDesc("1s array for finding bugs"); StreamCompaction::Naive::scan(SIZE, c, a); printArray(SIZE, c, true); */ diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 99e3daa..4ef6609 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -52,12 +52,13 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); + // TODO if (n != 0) { assert(odata != nullptr); assert(idata != nullptr); } + timer().startCpuTimer(); int p = 0; for (int i = 0; i < n; i++) { if (idata[i] != 0) { @@ -75,7 +76,6 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); // TODO int out = 0; if (n == 0) { @@ -85,13 +85,16 @@ namespace StreamCompaction { assert(odata != nullptr); assert(idata != nullptr); - // map to 0, 1 int* bin_arr = new int[n]; + int* scan_arr = new int[n]; + timer().startCpuTimer(); + // map to 0, 1 + for (int i = 0; i < n; i++) { bin_arr[i] = idata[i] == 0 ? 
0 : 1; } // scan - int* scan_arr = new int[n]; + cpu_scan(n, scan_arr, bin_arr); // odata contains the scan result @@ -101,11 +104,9 @@ namespace StreamCompaction { odata[scan_arr[i]] = idata[i]; } } - - delete [] bin_arr; - delete [] scan_arr; timer().endCpuTimer(); - + delete[] bin_arr; + delete[] scan_arr; return out; } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..6426ff6 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,6 +2,10 @@ #include #include "common.h" #include "naive.h" +#include +#include // Jack12 for assert + +//#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) namespace StreamCompaction { namespace Naive { @@ -12,14 +16,85 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void kernScanStep( + int n, + int d_phase, + const int* dev_buf_0, + int* dev_buf_1) { + + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + if (k >= n) { + return; + } + if (k >= d_phase) { + dev_buf_1[k] = dev_buf_0[k - d_phase] + dev_buf_0[k]; + } + return; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + // + if (n == 0) { + return; + } + assert(odata != nullptr); + assert(odata != nullptr); + + int* device_buf_0; + int* device_buf_1; + cudaMalloc((void**)&device_buf_0, n * sizeof(int)); + cudaMalloc((void**)&device_buf_1, n * sizeof(int)); + + //int* device_idata; + cudaMemcpy(device_buf_0, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(device_buf_1, idata, n * sizeof(int), cudaMemcpyHostToDevice); timer().startGpuTimer(); + // TODO + // to device + int it_ceil = ilog2ceil(n); + dim3 blocksPerGrid = (n + blocksize - 1) / blocksize; + + for (int d = 1; d <= it_ceil; d++) { + StreamCompaction::Naive::kernScanStep<<>>(n, (int)std::pow(2, d-1), device_buf_0, device_buf_1); + cudaMemcpy(device_buf_0, device_buf_1, n * sizeof(int), cudaMemcpyHostToDevice); + } + + timer().endGpuTimer(); + cudaThreadSynchronize(); + // to exclusive + cudaMemcpy(odata + 1, device_buf_0, (n-1) * sizeof(int), cudaMemcpyDeviceToHost); + //cudaMemcpy(odata, device_buf_0, n * sizeof(int), cudaMemcpyDeviceToHost); + /*std::memmove(odata + 1, odata, n-1); + odata[0] = 0;*/ + cudaFree(device_buf_0); + cudaFree(device_buf_1); } + } } + +//__global__ void scan(float* g_odata, float* g_idata, int n) { +// extern __shared__ float temp[]; // allocated on invocation +// int thid = threadIdx.x; +// int pout = 0, pin = 1; // Load input into shared memory. +// // This is exclusive scan, so shift right by one +// // and set first element to 0 +// temp[pout * n + thid] = (thid > 0) ? 
g_idata[thid - 1] : 0; +// __syncthreads(); +// for (int offset = 1; offset < n; offset *= 2) +// { +// pout = 1 - pout; +// // swap double buffer indices +// pin = 1 - pout; +// if (thid >= offset) +// temp[pout * n + thid] += temp[pin * n + thid - offset]; +// else +// temp[pout * n + thid] = temp[pin * n + thid]; +// __syncthreads(); +// } +// g_odata[thid] = temp[pout * n + thid]; // write output } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb06..9475500 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,6 +2,8 @@ #include "common.h" +constexpr int blocksize = 128; + namespace StreamCompaction { namespace Naive { StreamCompaction::Common::PerformanceTimer& timer(); From cc3ce75d7f4236f42a2181cbab62c663dd748dc4 Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Sat, 19 Sep 2020 01:34:02 +0800 Subject: [PATCH 03/23] bug not fixed in efficient, the nsight debugger not working again --- src/main.cpp | 6 +- stream_compaction/efficient.cu | 109 +++++++++++++++++++++++++++++++++ stream_compaction/efficient.h | 2 + stream_compaction/naive.cu | 52 +++++++++------- 4 files changed, 144 insertions(+), 25 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 80812d0..78fa8d5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 3; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -71,14 +71,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..f4b3c9d 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,10 @@ #include #include "common.h" #include "efficient.h" +#include +#include +//#include "cis565_stream_compaction_test/testing_helpers.hpp" + namespace StreamCompaction { namespace Efficient { @@ -11,14 +15,79 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + __global__ void kernUpdateArray(const int& idx, const int& val, int* dev_a) { + dev_a[idx] = val; + } + + __global__ void kernUpSweepStep( + int N, + int d_2, + int* dev_idata + ){ + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + if (k >= N) { + return; + } + if (k % (2 * d_2) == 0) { + dev_idata[k + 2 * d_2 - 1] += dev_idata[k + d_2 - 1]; + } + } + + __global__ void kernDownSweepStep( + int N, + int d_2, + int* dev_idata + ) { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + if (k >= N) { + return; + } + if (k % ( d_2 * 2 )== 0) { + int tmp = dev_idata[k + d_2 -1]; + dev_idata[k + d_2 - 1] = dev_idata[k + 2 * d_2 - 1]; + dev_idata[k + 2 * d_2 - 1] = tmp + dev_idata[k + 2 * d_2 - 1]; + } + + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + if (n == 0) { + return; + } + assert(odata != nullptr); + assert(idata != nullptr); + + int log_n = ilog2ceil(n); + int n_2 = 1 << log_n; + + int* dev_idata; + dim3 blocksPerGrid = (n_2 + efficient_blocksize - 1) / efficient_blocksize; + /*int* dev_odata;*/ + cudaMalloc((void**)&dev_idata, n_2 * sizeof(int)); + /*cudaMalloc((void**)&dev_odata, n_2 * sizeof(int));*/ + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); timer().startGpuTimer(); // TODO + for (int d = 0; d <= log_n - 1; d ++) { + kernUpSweepStep<<>>(n_2, 1 << d, dev_idata); + } + + //cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + kernUpdateArray<<<1, 1>>>(n_2 - 1, 0, dev_idata); + //cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + + + for (int d = log_n - 1; d >= 0; d--) { + kernDownSweepStep << > > (n_2, 1 << d, dev_idata); + } + + timer().endGpuTimer(); + cudaMemcpy(odata + 1, dev_idata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); } /** @@ -37,4 +106,44 @@ namespace StreamCompaction { return -1; } } + + __global__ void prescan(float* g_odata, float* g_idata, int n) { + extern __shared__ float temp[]; // allocated on invocation + int thid = threadIdx.x; int offset = 1; + temp[2 * thid] = g_idata[2 * thid]; // load input into shared memory + temp[2*thid+1] = g_idata[2*thid+1]; + + for (int d = n >> 1; d > 0; d >>= 1) // build sum in place up the tree + { + __syncthreads(); + if (thid < d) + { + int ai = offset * (2 * thid + 1) - 1; + int bi = offset * (2 * thid + 2) - 1; + temp[bi] += temp[ai]; + } + offset *= 2; + } + + + if (thid == 0) { temp[n - 1] = 0; } // clear the last element + + for (int d = 1; d < n; d *= 2) // traverse down tree & build scan + { + offset >>= 1; + __syncthreads(); + if (thid < d){ + int ai = offset * (2 * thid + 1) - 1; + int bi = offset * (2 * thid + 2) - 1; + + float t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + __syncthreads(); + + g_odata[2 * thid] = temp[2 * thid]; // write results to device memory g_odata[2*thid+1] = temp[2*thid+1]; + + } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..c81f7f9 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,6 +2,8 @@ #include "common.h" +constexpr int efficient_blocksize = 128; + namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 6426ff6..13d5821 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -7,6 +7,7 @@ //#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -30,6 +31,9 @@ namespace StreamCompaction { if (k >= d_phase) { dev_buf_1[k] = dev_buf_0[k - d_phase] + dev_buf_0[k]; } + else { + dev_buf_1[k] = dev_buf_0[k]; + } return; } /** @@ -58,9 +62,12 @@ namespace StreamCompaction { int it_ceil = ilog2ceil(n); dim3 blocksPerGrid = (n + blocksize - 1) / blocksize; + int offset; for (int d = 1; d <= it_ceil; d++) { - StreamCompaction::Naive::kernScanStep<<>>(n, (int)std::pow(2, d-1), device_buf_0, device_buf_1); - cudaMemcpy(device_buf_0, device_buf_1, n * sizeof(int), cudaMemcpyHostToDevice); + offset = (int)std::pow(2, d - 1); + StreamCompaction::Naive::kernScanStep<<>>(n, offset, device_buf_0, device_buf_1); + //cudaMemcpy(device_buf_0, device_buf_1, 
n* sizeof(int), cudaMemcpyDeviceToDevice); + std::swap(device_buf_0, device_buf_1); } @@ -78,23 +85,24 @@ namespace StreamCompaction { } } -//__global__ void scan(float* g_odata, float* g_idata, int n) { -// extern __shared__ float temp[]; // allocated on invocation -// int thid = threadIdx.x; -// int pout = 0, pin = 1; // Load input into shared memory. -// // This is exclusive scan, so shift right by one -// // and set first element to 0 -// temp[pout * n + thid] = (thid > 0) ? g_idata[thid - 1] : 0; -// __syncthreads(); -// for (int offset = 1; offset < n; offset *= 2) -// { -// pout = 1 - pout; -// // swap double buffer indices -// pin = 1 - pout; -// if (thid >= offset) -// temp[pout * n + thid] += temp[pin * n + thid - offset]; -// else -// temp[pout * n + thid] = temp[pin * n + thid]; -// __syncthreads(); -// } -// g_odata[thid] = temp[pout * n + thid]; // write output } +__global__ void scan(float* g_odata, float* g_idata, int n) { + extern __shared__ float temp[]; // allocated on invocation + int thid = threadIdx.x; + int pout = 0, pin = 1; // Load input into shared memory. + // This is exclusive scan, so shift right by one + // and set first element to 0 + temp[pout * n + thid] = (thid > 0) ? g_idata[thid - 1] : 0; + __syncthreads(); + for (int offset = 1; offset < n; offset *= 2) + { + pout = 1 - pout; + // swap double buffer indices + pin = 1 - pout; + if (thid >= offset) + temp[pout * n + thid] += temp[pin * n + thid - offset]; + else + temp[pout * n + thid] = temp[pin * n + thid]; + __syncthreads(); + } + g_odata[thid] = temp[pout * n + thid]; // write output +} From 221fd2d1176ace703cbb5800c441112a257bacb1 Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Sat, 19 Sep 2020 21:54:31 +0800 Subject: [PATCH 04/23] add 3.1 --- src/main.cpp | 1 + stream_compaction/efficient.cu | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 78fa8d5..8fbe61b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -27,6 +27,7 @@ int main(int argc, char* argv[]) { printf("** SCAN TESTS **\n"); printf("****************\n"); + //onesArray(SIZE - 1, a); genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index f4b3c9d..d2d59ce 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -15,28 +15,29 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - __global__ void kernUpdateArray(const int& idx, const int& val, int* dev_a) { - dev_a[idx] = val; + __global__ void kernUpdateArray(int idx, int val, int* d_data) { + d_data[idx] = val; } __global__ void kernUpSweepStep( int N, int d_2, - int* dev_idata + int* d_data ){ int k = (blockIdx.x * blockDim.x) + threadIdx.x; if (k >= N) { return; } if (k % (2 * d_2) == 0) { - dev_idata[k + 2 * d_2 - 1] += dev_idata[k + d_2 - 1]; + d_data[k + 2 * d_2 - 1] += d_data[k + d_2 - 1]; } + __syncthreads(); } __global__ void kernDownSweepStep( int N, int d_2, - int* dev_idata + int* d_data ) { int k = (blockIdx.x * blockDim.x) + threadIdx.x; if (k >= N) { @@ -44,10 +45,12 @@ namespace StreamCompaction { } if (k % ( d_2 * 2 )== 0) { - int tmp = dev_idata[k + d_2 -1]; - dev_idata[k + d_2 - 1] = dev_idata[k + 2 * d_2 - 1]; - dev_idata[k + 2 * d_2 - 1] = tmp + dev_idata[k + 2 * d_2 - 1]; + int tmp = d_data[k + d_2 -1]; + d_data[k + d_2 - 1] = d_data[k + 2 * d_2 - 1]; + d_data[k + 2 * d_2 - 1] = tmp + d_data[k + 2 * d_2 - 
1]; } + __syncthreads(); + } /** @@ -86,7 +89,7 @@ namespace StreamCompaction { timer().endGpuTimer(); - cudaMemcpy(odata + 1, dev_idata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_idata,n * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(dev_idata); } @@ -115,7 +118,7 @@ namespace StreamCompaction { for (int d = n >> 1; d > 0; d >>= 1) // build sum in place up the tree { - __syncthreads(); + __syncthreads(); if (thid < d) { int ai = offset * (2 * thid + 1) - 1; From fe526bd7d2de82685210e74bdc9b458710e0f12d Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Sun, 20 Sep 2020 01:34:57 +0800 Subject: [PATCH 05/23] add part 3.2 --- README.md | 23 +++++++++--- src/main.cpp | 10 ++--- stream_compaction/common.cu | 19 +++++++++- stream_compaction/efficient.cu | 68 +++++++++++++++++++++++++++++----- stream_compaction/efficient.h | 2 +- 5 files changed, 99 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 0e38ddb..7b6d828 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,23 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Ling Xie + * [LinkedIn](https://www.linkedin.com/in/ling-xie-94b939182/), + * [personal website](https://jack12xl.netlify.app). +* Tested on: + * Windows 10, Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz 2.20GHz ( two processors) + * 64.0 GB memory + * NVIDIA TITAN XP GP102 + +Thanks to [FLARE LAB](http://faculty.sist.shanghaitech.edu.cn/faculty/liuxp/flare/index.html) for this ferocious monster. + +### Intro + +In this project, basically we implement parallel scan algorithm based on CUDA required by [instruction](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/INSTRUCTION.md). + + + +#### Part 5: why GPU version so slow -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) 
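For reference while reading the patches above: the kernUpSweepStep and kernDownSweepStep kernels in efficient.cu implement the two phases of a Blelloch exclusive scan (the up-sweep builds partial sums up a binary tree, the down-sweep distributes prefixes back down). The serial C++ sketch below restates those same two phases; it assumes the length is a power of two, and the function name is illustrative only and does not exist in the repository.

#include <vector>

// Serial restatement of the up-sweep / down-sweep pair used by the kernels above.
// Assumes data.size() is a power of two; rewrites data as its exclusive scan.
inline void blellochScanSerial(std::vector<int>& data) {
    if (data.empty()) {
        return;
    }
    int n = static_cast<int>(data.size());
    // Up-sweep: build partial sums in place (mirrors kernUpSweepStep).
    for (int d_2 = 1; d_2 < n; d_2 <<= 1) {
        for (int k = 0; k < n; k += 2 * d_2) {
            data[k + 2 * d_2 - 1] += data[k + d_2 - 1];
        }
    }
    // Clear the root, as kernUpdateArray does on the device.
    data[n - 1] = 0;
    // Down-sweep: swap-and-add back down the tree (mirrors kernDownSweepStep).
    for (int d_2 = n >> 1; d_2 >= 1; d_2 >>= 1) {
        for (int k = 0; k < n; k += 2 * d_2) {
            int tmp = data[k + d_2 - 1];
            data[k + d_2 - 1] = data[k + 2 * d_2 - 1];
            data[k + 2 * d_2 - 1] += tmp;
        }
    }
}

For an input of {1, 2, 3, 4} this yields {0, 1, 3, 6}, the exclusive scan that Efficient::scan copies back to the host.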
diff --git a/src/main.cpp b/src/main.cpp index 8fbe61b..e3d4140 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 3; // feel free to change the size of array +const int SIZE = 1 << 4; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -70,14 +70,14 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); + StreamCompaction::Efficient::scan(SIZE, c, a, true); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); + StreamCompaction::Efficient::scan(NPOT, c, a, true); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); @@ -138,14 +138,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..15ee84c 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,5 @@ #include "common.h" +#include void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ -22,17 +23,31 @@ namespace StreamCompaction { * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. */ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { + __global__ void kernMapToBoolean(int N, int *bools, const int *idata) { // TODO + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + if (k >= N) { + return; + } + bools[k] = idata[k] == 0 ? 0 : 1; } /** * Performs scatter on an array. That is, for each element in idata, * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. 
*/ - __global__ void kernScatter(int n, int *odata, + __global__ void kernScatter(int N, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + if (k >= N) { + return; + } + + if (bools[k] == 1) { + odata[indices[k]] = idata[k]; + } + } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index d2d59ce..374741f 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -31,7 +31,6 @@ namespace StreamCompaction { if (k % (2 * d_2) == 0) { d_data[k + 2 * d_2 - 1] += d_data[k + d_2 - 1]; } - __syncthreads(); } __global__ void kernDownSweepStep( @@ -49,14 +48,11 @@ namespace StreamCompaction { d_data[k + d_2 - 1] = d_data[k + 2 * d_2 - 1]; d_data[k + 2 * d_2 - 1] = tmp + d_data[k + 2 * d_2 - 1]; } - __syncthreads(); - - } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata, bool ifTimer = true) { if (n == 0) { return; } @@ -72,7 +68,11 @@ namespace StreamCompaction { cudaMalloc((void**)&dev_idata, n_2 * sizeof(int)); /*cudaMalloc((void**)&dev_odata, n_2 * sizeof(int));*/ cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); - timer().startGpuTimer(); + + if (ifTimer) { + timer().startGpuTimer(); + } + // TODO for (int d = 0; d <= log_n - 1; d ++) { kernUpSweepStep<<>>(n_2, 1 << d, dev_idata); @@ -87,8 +87,10 @@ namespace StreamCompaction { kernDownSweepStep << > > (n_2, 1 << d, dev_idata); } + if (ifTimer) { + timer().endGpuTimer(); + } - timer().endGpuTimer(); cudaMemcpy(odata, dev_idata,n * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(dev_idata); } @@ -102,11 +104,59 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. 
*/ - int compact(int n, int *odata, const int *idata) { + int compact(int N, int *odata, const int *idata) { + if (N == 0) { + return 0; + } + assert(odata != nullptr); + assert(idata != nullptr); + + int* dev_idata; + int* dev_odata; + int* dev_bools; + int* dev_indices; + + cudaMalloc((void**)&dev_idata, N * sizeof(int)); + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + cudaMalloc((void**)&dev_bools, N * sizeof(int)); + cudaMalloc((void**)&dev_indices, N * sizeof(int)); + + cudaMemcpy(dev_idata, idata, N * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); // TODO + dim3 blocksPerGrid = (N + efficient_blocksize - 1) / efficient_blocksize; + Common::kernMapToBoolean << > > (N, dev_bools, dev_idata); + + scan(N, dev_indices, dev_bools, false); + + Common::kernScatter << > > ( + N, + dev_odata, + dev_idata, + dev_bools, + dev_indices + ); + + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, N * sizeof(int), cudaMemcpyDeviceToHost); + int* hst_bools = new int[N]; + cudaMemcpy(hst_bools, dev_bools, N * sizeof(int), cudaMemcpyDeviceToHost); + int out = 0; + for (int i = 0; i < N; i++) { + if (hst_bools[i] == 1) { + out++; + } + } + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_indices); + + return out; } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index c81f7f9..45b065a 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -8,7 +8,7 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, bool ifTimer); int compact(int n, int *odata, const int *idata); } From eb7844b94869e9678ed44047b1843ea4bd4e591e Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Mon, 21 Sep 2020 00:23:05 +0800 Subject: [PATCH 06/23] add 4th part --- stream_compaction/thrust.cu | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..7722cd8 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -17,12 +17,37 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata) { + void scan(int N, int *odata, const int *idata) { + if (N == 0) { + return; + } + assert(odata != nullptr); + assert(idata != nullptr); + int* dev_idata; int* dev_odata; + + cudaMalloc((void**)&dev_idata, N * sizeof(int)); + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + /*thrust::host_vector thrust_hst_odata_vec = thrust*/ + + cudaMemcpy(dev_idata, idata, N * sizeof(int), cudaMemcpyHostToDevice); + + thrust::device_ptr dev_thrust_idata = thrust::device_pointer_cast(dev_idata); + //thrust::device_vector< int > dev_thrust_idata_vec(idata, idata + N); + thrust::device_vector dev_thrust_idata_vec(dev_thrust_idata, dev_thrust_idata + N); + + thrust::device_ptr dev_thrust_odata = thrust::device_pointer_cast(dev_odata); + thrust::device_vector dev_thrust_odata_vec(dev_thrust_odata, dev_thrust_odata + N); + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_thrust_idata_vec.begin(), dev_thrust_idata_vec.end(), dev_thrust_odata_vec.begin()); timer().endGpuTimer(); + + thrust::copy(dev_thrust_odata_vec.begin(), dev_thrust_odata_vec.end(), odata); + /*cudaFree(dev_idata); + cudaFree(dev_odata);*/ } } } From 850f130e89aa2a62190d7461644ba02781018ddc Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Mon, 21 Sep 2020 14:43:21 +0800 Subject: [PATCH 07/23] add csvfile --- CMakeLists.txt | 1 + README.md | 2 ++ src/main.cpp | 37 ++++++++++++++++++++++++++++++++++++- stream_compaction/cpu.cu | 1 + 4 files changed, 40 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c504b62..454fafd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,7 @@ include_directories(.) set(headers "src/testing_helpers.hpp" + "src/csvfile.hpp" ) set(sources diff --git a/README.md b/README.md index 7b6d828..35dd182 100644 --- a/README.md +++ b/README.md @@ -21,5 +21,7 @@ In this project, basically we implement parallel scan algorithm based on CUDA re #### Part 5: why GPU version so slow +The reason why the +1. memory bandwidth: the current version fetches memory uncontinuously, which leads to a low memory bandwidth. diff --git a/src/main.cpp b/src/main.cpp index e3d4140..8396779 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -12,8 +12,9 @@ #include #include #include "testing_helpers.hpp" +#include "csvfile.hpp" -const int SIZE = 1 << 4; // feel free to change the size of array +const int SIZE = 1 << 24; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -35,6 +36,7 @@ int main(int argc, char* argv[]) { // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. 
+ zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); @@ -47,6 +49,19 @@ int main(int argc, char* argv[]) { printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(NPOT, b, true); printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("cpu scan, non-power-of-two"); + StreamCompaction::CPU::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(NPOT, b, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, b); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(SIZE, b, true); zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); @@ -152,4 +167,24 @@ int main(int argc, char* argv[]) { delete[] a; delete[] b; delete[] c; + // save to csv + try + { + csvfile csv("MyTable.csv"); // throws exceptions! + // Hearer + csv << "X" << "VALUE" << endrow; + // Data + int i = 1; + csv << i++ << "String value" << endrow; + csv << i++ << 123 << endrow; + csv << i++ << 1.f << endrow; + csv << i++ << 1.2 << endrow; + csv << i++ << "One more string" << endrow; + csv << i++ << "\"Escaped\"" << endrow; + csv << i++ << "=HYPERLINK(\"https://playkey.net\"; \"Playkey Service\")" << endrow; + } + catch (const std::exception& ex) + { + std::cout << "Exception was thrown: " << ex.what() << std::endl; + } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 4ef6609..a07b626 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -32,6 +32,7 @@ namespace StreamCompaction { odata[0] = 0; for (int i = 1; i < n; i++) { + //odata[i] += idata[i - 1]; prefix_sum += idata[i - 1]; odata[i] = prefix_sum; } From 2d3e0c151b8886facbfeda86d351be85439e4841 Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Mon, 21 Sep 2020 21:11:27 +0800 Subject: [PATCH 08/23] add part 5 --- README.md | 9 +++- src/main.cpp | 52 ++++++++++-------- stream_compaction/efficient.cu | 99 ++++++++++++++++++++++++++++++---- stream_compaction/efficient.h | 2 +- 4 files changed, 128 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 35dd182..b20ff0c 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,14 @@ In this project, basically we implement parallel scan algorithm based on CUDA re +#### Part 1~4: + + + #### Part 5: why GPU version so slow -The reason why the +The reason why the GPU is slower than CPU version: -1. memory bandwidth: the current version fetches memory uncontinuously, which leads to a low memory bandwidth. +1. **Spatial coherence:** The cpu version reads the memory in a continuous way while the current version fetches memory uncontinuously, which leads to a low memory bandwidth. +2. **The input size matters:** When the size of input array is trivial (for example 2^4), **cpu** version is faster than **gpu's**. When the size goes up, the situation goes reversed and **gpu** version is much faster than **cpu's** since naturally **gpu** is better in dealing with a large amounts of number. 
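Part of this gap also comes from how the work-efficient kernels are launched. The vanilla up-sweep step launches one thread per element and lets every thread whose index fails the k % (2 * d_2) == 0 test return immediately, so at deeper levels almost every thread in a warp does nothing; the index-scale variant added to efficient.cu later in this patch launches only n / (2 * d_2) threads and maps each one onto its active pair directly (the accesses themselves stay strided). A simplified side-by-side sketch follows; the kernel names here are shortened for illustration and are not the ones in the repository.

// Vanilla up-sweep step: one thread per element; at deep levels most threads
// fail the modulo test and retire without doing any work.
__global__ void upSweepVanilla(int n, int d_2, int* data) {
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= n || k % (2 * d_2) != 0) {
        return;
    }
    data[k + 2 * d_2 - 1] += data[k + d_2 - 1];
}

// Index-scaled up-sweep step: launched with n / (2 * d_2) threads, so every
// thread owns exactly one active pair and the grid shrinks as d grows.
__global__ void upSweepIndexScaled(int n, int d_2, int* data) {
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    int k = 2 * d_2 * t + 2 * d_2 - 1;  // right element of this thread's pair
    if (k >= n) {
        return;
    }
    data[k] += data[k - d_2];
}

The down-sweep in this patch is remapped the same way (kernDownSweepIndexScaleStep).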
diff --git a/src/main.cpp b/src/main.cpp index 8396779..64431e1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,8 @@ #include "testing_helpers.hpp" #include "csvfile.hpp" -const int SIZE = 1 << 24; // feel free to change the size of array +const int power = 24; +const int SIZE = 1 << power; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -42,14 +43,7 @@ int main(int argc, char* argv[]) { StreamCompaction::CPU::scan(SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - + // zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); @@ -57,11 +51,7 @@ int main(int argc, char* argv[]) { printArray(NPOT, b, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); + // zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); @@ -70,11 +60,11 @@ int main(int argc, char* argv[]) { printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - // For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - /*onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + //// For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + ///*onesArray(SIZE, c); + //printDesc("1s array for finding bugs"); + //StreamCompaction::Naive::scan(SIZE, c, a); + //printArray(SIZE, c, true); */ zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); @@ -83,20 +73,40 @@ int main(int argc, char* argv[]) { //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a, true); + StreamCompaction::Efficient::scan(SIZE, c, a, true, false, false); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a, true); + StreamCompaction::Efficient::scan(NPOT, c, a, true, false, false); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("work-efficient scan with index scale, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a, true, true, false); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan with index scale, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a, true, true, false); + 
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + + + + + zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 374741f..7bf1f29 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -19,6 +19,7 @@ namespace StreamCompaction { d_data[idx] = val; } +#pragma region vanilla __global__ void kernUpSweepStep( int N, int d_2, @@ -49,10 +50,65 @@ namespace StreamCompaction { d_data[k + 2 * d_2 - 1] = tmp + d_data[k + 2 * d_2 - 1]; } } +#pragma endregion + +#pragma region indexScale +// for part 5 + __global__ void kernUpSweepIndexScaleStep( + int N, + int d_2, + int* d_data + ) { + int k = 2 * d_2 * ( (blockIdx.x * blockDim.x) + threadIdx.x) + 2 * d_2 - 1; + /*k *= 2 * d_2; + k += 2 * d_2 - 1;*/ + if (k >= N) { + return; + } + d_data[k] += d_data[k - d_2]; + } + + __global__ void kernDownSweepIndexScaleStep( + int N, + int d_2, + int* d_data + ) { + int k = 2 * d_2 * ((blockIdx.x * blockDim.x) + threadIdx.x) + 2 * d_2 - 1; + if (k >= N) { + return; + } + int tmp = d_data[k - d_2]; + d_data[k - d_2] = d_data[k]; + d_data[k] = tmp + d_data[k]; + } +#pragma endregion + +#pragma region SharedMemory + __global__ void kernSharedMemoryScan(int N, float* dev_idata) { + int t_offset = blockIdx.x * blockDim.x; + int t_id = threadIdx.x; + int k = t_offset + t_id; + if (k >= N) { + return; + } + + extern __shared__ float shared[]; + // upsweep + + __syncthreads(); + // make last element to zero + if (k == 0) { + dev_idata[N - 1] = 0; + } + __syncthreads(); + // downsweep + + } + #pragma endregion /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata, bool ifTimer = true) { + void scan(int n, int *odata, const int *idata, bool ifTimer = true,bool ifIdxScale = false, bool ifSharedMemory = false) { if (n == 0) { return; } @@ -74,18 +130,38 @@ namespace StreamCompaction { } // TODO - for (int d = 0; d <= log_n - 1; d ++) { - kernUpSweepStep<<>>(n_2, 1 << d, dev_idata); - } + if (!ifSharedMemory) { + for (int d = 0; d <= log_n - 1; d++) { + if (ifIdxScale) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernUpSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); + } + else { + kernUpSweepStep << > > (n_2, 1 << d, dev_idata); + } + + } + + //cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + kernUpdateArray << <1, 1 >> > (n_2 - 1, 0, dev_idata); + //cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); - //cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); - kernUpdateArray<<<1, 1>>>(n_2 - 1, 0, dev_idata); - //cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); - - for (int d = log_n - 1; d >= 0; d--) { - kernDownSweepStep << > > (n_2, 1 << d, dev_idata); + for (int d = log_n - 1; d >= 0; d--) { + if (ifIdxScale) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernDownSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); + } + else { + kernDownSweepStep << > > (n_2, 1 << d, dev_idata); + } + } } + else { + + //kernSharedMemoryScan + } + if (ifTimer) { timer().endGpuTimer(); @@ -160,6 +236,9 @@ namespace StreamCompaction { } } + + + // ref :: gpu gem __global__ void prescan(float* g_odata, float* g_idata, int n) { extern __shared__ float temp[]; // allocated on invocation int thid = threadIdx.x; int offset = 1; diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 45b065a..92f30dc 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -8,7 +8,7 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata, bool ifTimer); + void scan(int n, int *odata, const int *idata, bool ifTimer, bool ifIdxScale, bool ifSharedMemory); int compact(int n, int *odata, const int *idata); } From 1c0c77727cd62a7b10e3eb9fc7005cc03a16400f Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Mon, 21 Sep 2020 23:24:33 +0800 Subject: [PATCH 09/23] add shared memory not that worked --- README.md | 4 ++ src/main.cpp | 14 ++++- stream_compaction/efficient.cu | 103 ++++++++++++++++++++++----------- 3 files changed, 85 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index b20ff0c..727926d 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,10 @@ CUDA Stream Compaction Thanks to [FLARE LAB](http://faculty.sist.shanghaitech.edu.cn/faculty/liuxp/flare/index.html) for this ferocious monster. +##### Cmake change + +Add [csvfile.hpp]() to get the performance in CSV form. + ### Intro In this project, basically we implement parallel scan algorithm based on CUDA required by [instruction](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/INSTRUCTION.md). 
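The shared-memory kernels introduced in efficient.cu below, and reworked over the next several patches, try to keep each block's portion of the sweeps in on-chip memory instead of global memory. For orientation, here is a minimal single-block exclusive scan in the style of the GPU Gems prescan already quoted at the bottom of efficient.cu. It is only a sketch: it assumes n is a power of two with n <= 2 * blockDim.x, it ignores shared-memory bank conflicts, it must be launched with n * sizeof(int) bytes of dynamic shared memory (for example kernBlockScan<<<1, n / 2, n * sizeof(int)>>>(n, dev_out, dev_in)), and the name kernBlockScan is not part of the repository.

__global__ void kernBlockScan(int n, int* odata, const int* idata) {
    extern __shared__ int temp[];                // n ints of dynamic shared memory
    int tid = threadIdx.x;
    temp[2 * tid] = idata[2 * tid];              // each thread loads two elements
    temp[2 * tid + 1] = idata[2 * tid + 1];

    int offset = 1;
    for (int d = n >> 1; d > 0; d >>= 1) {       // up-sweep entirely in shared memory
        __syncthreads();
        if (tid < d) {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            temp[bi] += temp[ai];
        }
        offset <<= 1;
    }
    if (tid == 0) {
        temp[n - 1] = 0;                         // clear the root for an exclusive scan
    }
    for (int d = 1; d < n; d <<= 1) {            // down-sweep entirely in shared memory
        offset >>= 1;
        __syncthreads();
        if (tid < d) {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            int t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();
    odata[2 * tid] = temp[2 * tid];              // write both results back to global memory
    odata[2 * tid + 1] = temp[2 * tid + 1];
}

Arrays larger than one block still need a host-side loop across the tree levels, which is what the following patches work out.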
diff --git a/src/main.cpp b/src/main.cpp index 64431e1..2d7bea1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,7 @@ #include "testing_helpers.hpp" #include "csvfile.hpp" -const int power = 24; +const int power = 19; const int SIZE = 1 << power; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; @@ -102,7 +102,19 @@ int main(int argc, char* argv[]) { printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("work-efficient scan with shared memory, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a, true, false, true); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + zeroArray(SIZE, c); + printDesc("work-efficient scan with shared memory, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a, true, false, true); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 7bf1f29..22f1ece 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -77,32 +77,59 @@ namespace StreamCompaction { if (k >= N) { return; } - int tmp = d_data[k - d_2]; + int tmp = d_data[k + - d_2]; d_data[k - d_2] = d_data[k]; d_data[k] = tmp + d_data[k]; } #pragma endregion #pragma region SharedMemory - __global__ void kernSharedMemoryScan(int N, float* dev_idata) { + __global__ void kernSharedMemoryUpSweepStep(int N, int d_2, int* dev_idata) { int t_offset = blockIdx.x * blockDim.x; int t_id = threadIdx.x; - int k = t_offset + t_id; + int k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; if (k >= N) { return; } extern __shared__ float shared[]; - // upsweep - + shared[2 * t_id] = dev_idata[k - d_2]; + shared[2 * t_id + 1] = dev_idata[k]; __syncthreads(); - // make last element to zero - if (k == 0) { - dev_idata[N - 1] = 0; + + shared[2 * t_id + 1] += shared[2 * t_id]; + + /*for (int i = 1; i <= unroll_depth; i++) { + int mul = 1 << (i + 1); + shared[mul * (t_id + 1) - 1] += shared[mul * (t_id + 1) - mul >> 1 - 1]; + __syncthreads(); + }*/ + + + dev_idata[k] = shared[2 * t_id + 1]; + } + + __global__ void kernSharedMemoryDownSweepStep(int N, int d_2, int* dev_idata) { + int t_offset = blockIdx.x * blockDim.x; + int t_id = threadIdx.x; + int k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; + if (k >= N) { + return; } + + extern __shared__ float shared[]; + shared[2 * t_id] = dev_idata[k - d_2]; + shared[2 * t_id + 1] = dev_idata[k]; + __syncthreads(); + + int tmp = shared[2 * t_id]; + shared[2 * t_id] = shared[2 * t_id + 1]; + shared[2 * t_id + 1] += tmp; __syncthreads(); - // downsweep + dev_idata[k - d_2] = shared[2 * t_id]; + dev_idata[k] = shared[2 * t_id + 1]; } #pragma endregion /** @@ -130,37 +157,43 @@ namespace StreamCompaction { } // TODO - if (!ifSharedMemory) { - for (int d = 0; d <= log_n - 1; d++) { - if (ifIdxScale) { - blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; - kernUpSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); - } - else { - kernUpSweepStep << > > (n_2, 1 << d, dev_idata); - } - + + for (int d = 0; d <= log_n - 1; d++) { + if (ifIdxScale) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernUpSweepIndexScaleStep << > > (n_2, 1 << d, 
dev_idata); + } + else if (ifSharedMemory) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernSharedMemoryUpSweepStep <<>> (n_2, 1 << d, dev_idata); } + else{ + kernUpSweepStep << > > (n_2, 1 << d, dev_idata); + } + + + } - //cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); - kernUpdateArray << <1, 1 >> > (n_2 - 1, 0, dev_idata); - //cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + //cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + kernUpdateArray << <1, 1 >> > (n_2 - 1, 0, dev_idata); + //cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); - for (int d = log_n - 1; d >= 0; d--) { - if (ifIdxScale) { - blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; - kernDownSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); - } - else { - kernDownSweepStep << > > (n_2, 1 << d, dev_idata); - } + for (int d = log_n - 1; d >= 0; d--) { + if (ifIdxScale) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernDownSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); + } + else if (ifSharedMemory) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernSharedMemoryDownSweepStep << > > (n_2, 1 << d, dev_idata); + } + else { + kernDownSweepStep << > > (n_2, 1 << d, dev_idata); } } - else { - - //kernSharedMemoryScan - } + + if (ifTimer) { @@ -204,7 +237,7 @@ namespace StreamCompaction { dim3 blocksPerGrid = (N + efficient_blocksize - 1) / efficient_blocksize; Common::kernMapToBoolean << > > (N, dev_bools, dev_idata); - scan(N, dev_indices, dev_bools, false); + scan(N, dev_indices, dev_bools, false, true, false); Common::kernScatter << > > ( N, From 892b7deafe2b722c296a82e75444bdc7589a18b7 Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Tue, 22 Sep 2020 02:02:11 +0800 Subject: [PATCH 10/23] add shared memory for shared memory --- src/main.cpp | 2 +- stream_compaction/efficient.cu | 54 +++++++++++++++++++++------------- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 2d7bea1..03d9f06 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,7 @@ #include "testing_helpers.hpp" #include "csvfile.hpp" -const int power = 19; +const int power =16; const int SIZE = 1 << power; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 22f1ece..8af0447 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -85,7 +85,7 @@ namespace StreamCompaction { #pragma endregion #pragma region SharedMemory - __global__ void kernSharedMemoryUpSweepStep(int N, int d_2, int* dev_idata) { + __global__ void kernSharedMemoryUpSweepStep(int N, int d_2, int cur_depth, int target_depth, int* dev_idata) { int t_offset = blockIdx.x * blockDim.x; int t_id = threadIdx.x; int k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; @@ -98,16 +98,23 @@ namespace StreamCompaction { shared[2 * t_id + 1] = dev_idata[k]; __syncthreads(); - shared[2 * t_id + 1] += shared[2 * t_id]; + /*shared[2 * t_id + 1] += shared[2 * t_id];*/ - /*for (int i = 1; i <= unroll_depth; i++) { + for (int i = 0; i < target_depth - cur_depth; i++) { int mul = 1 << (i + 1); - shared[mul * (t_id + 1) - 1] += shared[mul * (t_id + 1) - mul >> 1 - 1]; + int idx_a = mul * (t_id + 1) - 1; + int idx_b = mul * (t_id + 1) - mul 
/ 2 - 1; + if (idx_a < 2 * blockDim.x) { + int a = shared[idx_a]; + int b = shared[idx_b]; + shared[idx_a] += shared[idx_b]; + } __syncthreads(); - }*/ + } dev_idata[k] = shared[2 * t_id + 1]; + dev_idata[k - d_2] = shared[2 * t_id]; } __global__ void kernSharedMemoryDownSweepStep(int N, int d_2, int* dev_idata) { @@ -157,27 +164,34 @@ namespace StreamCompaction { } // TODO - - for (int d = 0; d <= log_n - 1; d++) { - if (ifIdxScale) { - blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; - kernUpSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); - } - else if (ifSharedMemory) { + if (ifSharedMemory) { + int unroll_depth = ilog2ceil(efficient_blocksize); + + for (int cur_depth = 0; cur_depth < log_n; cur_depth += unroll_depth) { + int d = cur_depth; blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; - kernSharedMemoryUpSweepStep <<>> (n_2, 1 << d, dev_idata); + int target_depth = std::min(cur_depth + unroll_depth, log_n); // log_n exclusive + kernSharedMemoryUpSweepStep << > > (n_2, 1 << d, cur_depth, target_depth, dev_idata); } - else{ - kernUpSweepStep << > > (n_2, 1 << d, dev_idata); + } + else { + for (int d = 0; d <= log_n - 1; d++) { + if (ifIdxScale) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernUpSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); + } + /*else if (ifSharedMemory) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernSharedMemoryUpSweepStep <<>> (n_2, 1 << d, dev_idata); + }*/ + else { + kernUpSweepStep << > > (n_2, 1 << d, dev_idata); + } } - - } + - //cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); kernUpdateArray << <1, 1 >> > (n_2 - 1, 0, dev_idata); - //cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); - for (int d = log_n - 1; d >= 0; d--) { if (ifIdxScale) { From ec9387e6b2bc8d05360f3b17a9222131469f43b5 Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Tue, 22 Sep 2020 03:29:17 +0800 Subject: [PATCH 11/23] shared sweep down can work for full block size --- src/main.cpp | 2 +- stream_compaction/efficient.cu | 55 ++++++++++++++++++++++++---------- stream_compaction/efficient.h | 2 +- 3 files changed, 41 insertions(+), 18 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 03d9f06..6f237c1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,7 @@ #include "testing_helpers.hpp" #include "csvfile.hpp" -const int power =16; +const int power =3; const int SIZE = 1 << power; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 8af0447..39f0319 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -117,7 +117,7 @@ namespace StreamCompaction { dev_idata[k - d_2] = shared[2 * t_id]; } - __global__ void kernSharedMemoryDownSweepStep(int N, int d_2, int* dev_idata) { + __global__ void kernSharedMemoryDownSweepStep(int N, int d_2, int cur_depth, int target_depth, int* dev_idata) { int t_offset = blockIdx.x * blockDim.x; int t_id = threadIdx.x; int k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; @@ -130,10 +130,17 @@ namespace StreamCompaction { shared[2 * t_id + 1] = dev_idata[k]; __syncthreads(); - int tmp = shared[2 * t_id]; - shared[2 * t_id] = shared[2 * t_id + 1]; - shared[2 * t_id + 1] += tmp; - __syncthreads(); + for (int i = cur_depth - 1; i >= target_depth; i--) { + int mul = 
1 << (i + 1); + int idx_a = mul * (t_id + 1) - 1; + int idx_b = mul * (t_id + 1) - mul / 2 - 1; + if (idx_a < 2 * blockDim.x) { + int tmp = shared[idx_b]; + shared[idx_b] = shared[idx_a]; + shared[idx_a] += tmp; + } + __syncthreads(); + } dev_idata[k - d_2] = shared[2 * t_id]; dev_idata[k] = shared[2 * t_id + 1]; @@ -166,7 +173,7 @@ namespace StreamCompaction { // TODO if (ifSharedMemory) { int unroll_depth = ilog2ceil(efficient_blocksize); - + // for each upsweep, scan a 2 * blocksize for (int cur_depth = 0; cur_depth < log_n; cur_depth += unroll_depth) { int d = cur_depth; blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; @@ -193,19 +200,35 @@ namespace StreamCompaction { kernUpdateArray << <1, 1 >> > (n_2 - 1, 0, dev_idata); - for (int d = log_n - 1; d >= 0; d--) { - if (ifIdxScale) { - blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; - kernDownSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); - } - else if (ifSharedMemory) { - blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; - kernSharedMemoryDownSweepStep << > > (n_2, 1 << d, dev_idata); + if (!ifSharedMemory) { + for (int d = log_n - 1; d >= 0; d--) { + if (ifIdxScale) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernDownSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); + } + /*else if (ifSharedMemory) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernSharedMemoryDownSweepStep << > > (n_2, 1 << d, dev_idata); + }*/ + else { + kernDownSweepStep << > > (n_2, 1 << d, dev_idata); + } } - else { - kernDownSweepStep << > > (n_2, 1 << d, dev_idata); + } + else { + int unroll_depth = ilog2ceil(efficient_blocksize); + for (int cur_depth = log_n; cur_depth >= 0; cur_depth -= unroll_depth) { + + + int target_depth = std::max(cur_depth - unroll_depth, 0); + int d = target_depth; + //blocksPerGrid = (n_2 / (1 << d) + efficient_blocksize - 1) / efficient_blocksize; + // cur_depth should not be covered + kernSharedMemoryDownSweepStep << > > (n_2, 1 << d, cur_depth, target_depth, dev_idata); } } + + diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 92f30dc..7c427a0 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,7 +2,7 @@ #include "common.h" -constexpr int efficient_blocksize = 128; +constexpr int efficient_blocksize = 8; namespace StreamCompaction { namespace Efficient { From c7dc480d8592c914f20abd94a1c92cf165fd1b26 Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Tue, 22 Sep 2020 03:51:32 +0800 Subject: [PATCH 12/23] fail to add full shared memory to downsweep --- src/main.cpp | 2 +- stream_compaction/efficient.cu | 2 ++ stream_compaction/efficient.h | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 6f237c1..b9c95ad 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,7 @@ #include "testing_helpers.hpp" #include "csvfile.hpp" -const int power =3; +const int power =2; const int SIZE = 1 << power; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 39f0319..d7d21ec 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -135,6 +135,8 @@ namespace StreamCompaction { int idx_a = mul * (t_id + 1) - 1; int idx_b = mul * (t_id + 1) - mul / 2 - 1; if 
(idx_a < 2 * blockDim.x) { + int a = shared[idx_a]; + int b = shared[idx_b]; int tmp = shared[idx_b]; shared[idx_b] = shared[idx_a]; shared[idx_a] += tmp; diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 7c427a0..d541c8b 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,7 +2,7 @@ #include "common.h" -constexpr int efficient_blocksize = 8; +constexpr int efficient_blocksize = 2; namespace StreamCompaction { namespace Efficient { From 83ab6d17861cc78899008e4236d47320845aee7a Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Tue, 22 Sep 2020 03:57:26 +0800 Subject: [PATCH 13/23] return to vanilla shared sweep down --- src/main.cpp | 2 +- stream_compaction/efficient.cu | 57 ++++++++++------------------------ stream_compaction/efficient.h | 2 +- 3 files changed, 18 insertions(+), 43 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index b9c95ad..03d9f06 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,7 @@ #include "testing_helpers.hpp" #include "csvfile.hpp" -const int power =2; +const int power =16; const int SIZE = 1 << power; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index d7d21ec..8af0447 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -117,7 +117,7 @@ namespace StreamCompaction { dev_idata[k - d_2] = shared[2 * t_id]; } - __global__ void kernSharedMemoryDownSweepStep(int N, int d_2, int cur_depth, int target_depth, int* dev_idata) { + __global__ void kernSharedMemoryDownSweepStep(int N, int d_2, int* dev_idata) { int t_offset = blockIdx.x * blockDim.x; int t_id = threadIdx.x; int k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; @@ -130,19 +130,10 @@ namespace StreamCompaction { shared[2 * t_id + 1] = dev_idata[k]; __syncthreads(); - for (int i = cur_depth - 1; i >= target_depth; i--) { - int mul = 1 << (i + 1); - int idx_a = mul * (t_id + 1) - 1; - int idx_b = mul * (t_id + 1) - mul / 2 - 1; - if (idx_a < 2 * blockDim.x) { - int a = shared[idx_a]; - int b = shared[idx_b]; - int tmp = shared[idx_b]; - shared[idx_b] = shared[idx_a]; - shared[idx_a] += tmp; - } - __syncthreads(); - } + int tmp = shared[2 * t_id]; + shared[2 * t_id] = shared[2 * t_id + 1]; + shared[2 * t_id + 1] += tmp; + __syncthreads(); dev_idata[k - d_2] = shared[2 * t_id]; dev_idata[k] = shared[2 * t_id + 1]; @@ -175,7 +166,7 @@ namespace StreamCompaction { // TODO if (ifSharedMemory) { int unroll_depth = ilog2ceil(efficient_blocksize); - // for each upsweep, scan a 2 * blocksize + for (int cur_depth = 0; cur_depth < log_n; cur_depth += unroll_depth) { int d = cur_depth; blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; @@ -202,35 +193,19 @@ namespace StreamCompaction { kernUpdateArray << <1, 1 >> > (n_2 - 1, 0, dev_idata); - if (!ifSharedMemory) { - for (int d = log_n - 1; d >= 0; d--) { - if (ifIdxScale) { - blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; - kernDownSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); - } - /*else if (ifSharedMemory) { - blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; - kernSharedMemoryDownSweepStep << > > (n_2, 1 << d, dev_idata); - }*/ - else { - kernDownSweepStep << > > (n_2, 1 << d, dev_idata); - } + for (int d = log_n - 1; d >= 0; d--) { + if (ifIdxScale) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + 
efficient_blocksize - 1) / efficient_blocksize; + kernDownSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); } - } - else { - int unroll_depth = ilog2ceil(efficient_blocksize); - for (int cur_depth = log_n; cur_depth >= 0; cur_depth -= unroll_depth) { - - - int target_depth = std::max(cur_depth - unroll_depth, 0); - int d = target_depth; - //blocksPerGrid = (n_2 / (1 << d) + efficient_blocksize - 1) / efficient_blocksize; - // cur_depth should not be covered - kernSharedMemoryDownSweepStep << > > (n_2, 1 << d, cur_depth, target_depth, dev_idata); + else if (ifSharedMemory) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernSharedMemoryDownSweepStep << > > (n_2, 1 << d, dev_idata); + } + else { + kernDownSweepStep << > > (n_2, 1 << d, dev_idata); } } - - diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index d541c8b..92f30dc 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,7 +2,7 @@ #include "common.h" -constexpr int efficient_blocksize = 2; +constexpr int efficient_blocksize = 128; namespace StreamCompaction { namespace Efficient { From 2a919c5d54f5f7afea14bb11b54d5d76ca7ec12a Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Tue, 22 Sep 2020 04:14:21 +0800 Subject: [PATCH 14/23] back to life --- stream_compaction/efficient.cu | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 8af0447..e417737 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -138,6 +138,37 @@ namespace StreamCompaction { dev_idata[k - d_2] = shared[2 * t_id]; dev_idata[k] = shared[2 * t_id + 1]; } + + /*__global__ void kernSharedMemoryDownSweepStep(int N, int d_2, int cur_depth, int target_depth, int* dev_idata) { + int t_offset = blockIdx.x * blockDim.x; + int t_id = threadIdx.x; + int k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; + if (k >= N) { + return; + } + + extern __shared__ float shared[]; + shared[2 * t_id] = dev_idata[k - d_2]; + shared[2 * t_id + 1] = dev_idata[k]; + __syncthreads(); + + for (int i = cur_depth - 1; i >= target_depth; i--) { + int mul = 1 << (i + 1); + int idx_a = mul * (t_id + 1) - 1; + int idx_b = mul * (t_id + 1) - mul / 2 - 1; + if (idx_a < 2 * blockDim.x) { + int a = shared[idx_a]; + int b = shared[idx_b]; + int tmp = shared[idx_b]; + shared[idx_b] = shared[idx_a]; + shared[idx_a] += tmp; + } + __syncthreads(); + } + + dev_idata[k - d_2] = shared[2 * t_id]; + dev_idata[k] = shared[2 * t_id + 1]; + }*/ #pragma endregion /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
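For orientation while reading patches 12 through 15: the kernels above are variations on the classic single-block Blelloch scan, where one block loads 2 * blockDim.x elements into shared memory, runs the up-sweep, zeroes the last element, and runs the down-sweep before writing back. Below is a minimal sketch of that baseline, assuming a power-of-two n equal to twice the thread count; the kernel name and launch configuration are illustrative and not taken from the repository.

```
// Illustrative single-block Blelloch (work-efficient) exclusive scan kept
// entirely in shared memory. Assumes n is a power of two and the kernel is
// launched with n / 2 threads and n * sizeof(int) bytes of dynamic shared memory.
__global__ void kernBlockScanShared(int n, int* data) {
    extern __shared__ int s[];
    int t = threadIdx.x;

    // Each thread loads two elements.
    s[2 * t]     = data[2 * t];
    s[2 * t + 1] = data[2 * t + 1];
    __syncthreads();

    // Up-sweep (reduction): build partial sums in place.
    for (int stride = 1; stride < n; stride *= 2) {
        int idx = (t + 1) * stride * 2 - 1;
        if (idx < n) {
            s[idx] += s[idx - stride];
        }
        __syncthreads();
    }

    // Clear the root, then down-sweep to turn the sum tree into an exclusive scan.
    if (t == 0) {
        s[n - 1] = 0;
    }
    __syncthreads();

    for (int stride = n / 2; stride >= 1; stride /= 2) {
        int idx = (t + 1) * stride * 2 - 1;
        if (idx < n) {
            int left        = s[idx - stride];
            s[idx - stride] = s[idx];
            s[idx]         += left;
        }
        __syncthreads();
    }

    // Write the scanned tile back to global memory.
    data[2 * t]     = s[2 * t];
    data[2 * t + 1] = s[2 * t + 1];
}

// Example launch for a single tile:
//   kernBlockScanShared<<<1, n / 2, n * sizeof(int)>>>(n, dev_data);
```

The patches above generalize this by sweeping only a limited number of levels per launch (from cur_depth down to target_depth), so arrays much larger than one block can still keep each partial sweep in shared memory.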
From cb7c625c2e3f8af39417ea627196b7c47c8732b0 Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Tue, 22 Sep 2020 12:28:31 +0800 Subject: [PATCH 15/23] finish part 7 shared memory for both upsweep and downsweep --- src/main.cpp | 2 +- stream_compaction/efficient.cu | 46 ++++++++++++++++++++-------------- stream_compaction/efficient.h | 2 +- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 03d9f06..2d7bea1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,7 @@ #include "testing_helpers.hpp" #include "csvfile.hpp" -const int power =16; +const int power = 19; const int SIZE = 1 << power; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index e417737..f83d3bb 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -117,7 +117,7 @@ namespace StreamCompaction { dev_idata[k - d_2] = shared[2 * t_id]; } - __global__ void kernSharedMemoryDownSweepStep(int N, int d_2, int* dev_idata) { + /*__global__ void kernSharedMemoryDownSweepStep(int N, int d_2, int* dev_idata) { int t_offset = blockIdx.x * blockDim.x; int t_id = threadIdx.x; int k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; @@ -137,9 +137,9 @@ namespace StreamCompaction { dev_idata[k - d_2] = shared[2 * t_id]; dev_idata[k] = shared[2 * t_id + 1]; - } + }*/ - /*__global__ void kernSharedMemoryDownSweepStep(int N, int d_2, int cur_depth, int target_depth, int* dev_idata) { + __global__ void kernSharedMemoryDownSweepStep(int N, int d_2, int cur_depth, int target_depth, int* dev_idata) { int t_offset = blockIdx.x * blockDim.x; int t_id = threadIdx.x; int k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; @@ -152,13 +152,13 @@ namespace StreamCompaction { shared[2 * t_id + 1] = dev_idata[k]; __syncthreads(); - for (int i = cur_depth - 1; i >= target_depth; i--) { + for (int i = cur_depth - 1 - target_depth; i >= 0; i--) { int mul = 1 << (i + 1); int idx_a = mul * (t_id + 1) - 1; int idx_b = mul * (t_id + 1) - mul / 2 - 1; if (idx_a < 2 * blockDim.x) { - int a = shared[idx_a]; - int b = shared[idx_b]; + /*int a = shared[idx_a]; + int b = shared[idx_b];*/ int tmp = shared[idx_b]; shared[idx_b] = shared[idx_a]; shared[idx_a] += tmp; @@ -168,7 +168,7 @@ namespace StreamCompaction { dev_idata[k - d_2] = shared[2 * t_id]; dev_idata[k] = shared[2 * t_id + 1]; - }*/ + } #pragma endregion /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
@@ -224,23 +224,31 @@ namespace StreamCompaction { kernUpdateArray << <1, 1 >> > (n_2 - 1, 0, dev_idata); - for (int d = log_n - 1; d >= 0; d--) { - if (ifIdxScale) { - blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; - kernDownSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); - } - else if (ifSharedMemory) { + if (ifSharedMemory) { + int unroll_depth = ilog2ceil(efficient_blocksize); + for (int cur_depth = log_n; cur_depth > 0; cur_depth -= unroll_depth) { + int target_depth = std::max(0, cur_depth - unroll_depth); + int d = target_depth; blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; - kernSharedMemoryDownSweepStep << > > (n_2, 1 << d, dev_idata); + kernSharedMemoryDownSweepStep << > > (n_2, 1 << d, cur_depth, target_depth, dev_idata); } - else { - kernDownSweepStep << > > (n_2, 1 << d, dev_idata); + } + else { + for (int d = log_n - 1; d >= 0; d--) { + if (ifIdxScale) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernDownSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); + } + /*else if (ifSharedMemory) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernSharedMemoryDownSweepStep << > > (n_2, 1 << d, dev_idata); + }*/ + else { + kernDownSweepStep << > > (n_2, 1 << d, dev_idata); + } } } - - - if (ifTimer) { timer().endGpuTimer(); } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 92f30dc..8ce8ef3 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,7 +2,7 @@ #include "common.h" -constexpr int efficient_blocksize = 128; +constexpr int efficient_blocksize = 256; namespace StreamCompaction { namespace Efficient { From aec70a8164d7b67da8166d2fc536158145e4886b Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Tue, 22 Sep 2020 12:29:07 +0800 Subject: [PATCH 16/23] add csv --- src/csvfile.hpp | 112 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 src/csvfile.hpp diff --git a/src/csvfile.hpp b/src/csvfile.hpp new file mode 100644 index 0000000..67bac0f --- /dev/null +++ b/src/csvfile.hpp @@ -0,0 +1,112 @@ +#pragma once +#include +#include +#include +#include + +class csvfile; + +inline static csvfile& endrow(csvfile& file); +inline static csvfile& flush(csvfile& file); + +class csvfile +{ + std::ofstream fs_; + bool is_first_; + const std::string separator_; + const std::string escape_seq_; + const std::string special_chars_; +public: + csvfile(const std::string filename, const std::string separator = ";") + : fs_() + , is_first_(true) + , separator_(separator) + , escape_seq_("\"") + , special_chars_("\"") + { + fs_.exceptions(std::ios::failbit | std::ios::badbit); + fs_.open(filename); + } + + ~csvfile() + { + flush(); + fs_.close(); + } + + void flush() + { + fs_.flush(); + } + + void endrow() + { + fs_ << std::endl; + is_first_ = true; + } + + csvfile& operator << (csvfile& (*val)(csvfile&)) + { + return val(*this); + } + + csvfile& operator << (const char* val) + { + return write(escape(val)); + } + + csvfile& operator << (const std::string& val) + { + return write(escape(val)); + } + + template + csvfile& operator << (const T& val) + { + return write(val); + } + +private: + template + csvfile& write(const T& val) + { + if (!is_first_) + { + fs_ << separator_; + } + else + { + is_first_ = false; + } + fs_ << val; + return *this; + } + + std::string escape(const std::string& val) + { + std::ostringstream 
result; + result << '"'; + std::string::size_type to, from = 0u, len = val.length(); + while (from < len && + std::string::npos != (to = val.find_first_of(special_chars_, from))) + { + result << val.substr(from, to - from) << escape_seq_ << val[to]; + from = to + 1; + } + result << val.substr(from) << '"'; + return result.str(); + } +}; + + +inline static csvfile& endrow(csvfile& file) +{ + file.endrow(); + return file; +} + +inline static csvfile& flush(csvfile& file) +{ + file.flush(); + return file; +} \ No newline at end of file From f91b5df3fefd56497e4ad66f736937c93f8fcf9d Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Tue, 22 Sep 2020 16:15:53 +0800 Subject: [PATCH 17/23] add size_t to anti overflow, add simple standard sort --- src/main.cpp | 44 ++++++++++++++++++++++++-------- stream_compaction/CMakeLists.txt | 2 ++ stream_compaction/efficient.cu | 43 ++++++++----------------------- stream_compaction/radixSort.cu | 36 ++++++++++++++++++++++++++ stream_compaction/radixSort.h | 11 ++++++++ 5 files changed, 94 insertions(+), 42 deletions(-) create mode 100644 stream_compaction/radixSort.cu create mode 100644 stream_compaction/radixSort.h diff --git a/src/main.cpp b/src/main.cpp index 2d7bea1..9c29585 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,10 +11,13 @@ #include #include #include +#include #include "testing_helpers.hpp" #include "csvfile.hpp" -const int power = 19; + + +const int power = 25; const int SIZE = 1 << power; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; @@ -30,7 +33,7 @@ int main(int argc, char* argv[]) { printf("****************\n"); //onesArray(SIZE - 1, a); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + genArray(SIZE - 1, a, 2); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); @@ -89,32 +92,34 @@ int main(int argc, char* argv[]) { printCmpResult(NPOT, b, c); zeroArray(SIZE, c); - printDesc("work-efficient scan with index scale, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a, true, true, false); + printDesc("work-efficient scan with shared memory, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a, true, false, true); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); - printDesc("work-efficient scan with index scale, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a, true, true, false); + printDesc("work-efficient scan with shared memory, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a, true, false, true); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); - printDesc("work-efficient scan with shared memory, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a, true, false, true); + printDesc("work-efficient scan with index scale, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a, true, true, false); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); - printDesc("work-efficient scan with shared memory, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a, true, false, true); + printDesc("work-efficient scan with 
index scale, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a, true, true, false); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + + @@ -185,6 +190,25 @@ int main(int argc, char* argv[]) { printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + printf("\n"); + printf("*****************************\n"); + printf("** STREAM COMPACTION TESTS **\n"); + printf("*****************************\n"); + + + genArray(SIZE - 1, a, 256); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + + printf("The array to be sorted is : \n"); + printArray(SIZE, a, true); + printDesc("std sort"); + StreamCompaction::RadixSort::CpuStandardSort(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(SIZE, b, true); + //printCmpResult(NPOT, b, b); + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 567795b..8656c2f 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -4,6 +4,7 @@ set(headers "naive.h" "efficient.h" "thrust.h" + "radixSort.h" ) set(sources @@ -12,6 +13,7 @@ set(sources "naive.cu" "efficient.cu" "thrust.cu" + "radixSort.cu" ) list(SORT headers) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index f83d3bb..7076393 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -59,7 +59,8 @@ namespace StreamCompaction { int d_2, int* d_data ) { - int k = 2 * d_2 * ( (blockIdx.x * blockDim.x) + threadIdx.x) + 2 * d_2 - 1; + // use size_t in case overflow + size_t k = 2 * d_2 * ( (blockIdx.x * blockDim.x) + threadIdx.x) + 2 * d_2 - 1; /*k *= 2 * d_2; k += 2 * d_2 - 1;*/ if (k >= N) { @@ -73,7 +74,7 @@ namespace StreamCompaction { int d_2, int* d_data ) { - int k = 2 * d_2 * ((blockIdx.x * blockDim.x) + threadIdx.x) + 2 * d_2 - 1; + size_t k = 2 * d_2 * ((blockIdx.x * blockDim.x) + threadIdx.x) + 2 * d_2 - 1; if (k >= N) { return; } @@ -86,9 +87,9 @@ namespace StreamCompaction { #pragma region SharedMemory __global__ void kernSharedMemoryUpSweepStep(int N, int d_2, int cur_depth, int target_depth, int* dev_idata) { - int t_offset = blockIdx.x * blockDim.x; - int t_id = threadIdx.x; - int k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; + size_t t_offset = blockIdx.x * blockDim.x; + size_t t_id = threadIdx.x; + size_t k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; if (k >= N) { return; } @@ -105,8 +106,8 @@ namespace StreamCompaction { int idx_a = mul * (t_id + 1) - 1; int idx_b = mul * (t_id + 1) - mul / 2 - 1; if (idx_a < 2 * blockDim.x) { - int a = shared[idx_a]; - int b = shared[idx_b]; + /*int a = shared[idx_a]; + int b = shared[idx_b];*/ shared[idx_a] += shared[idx_b]; } __syncthreads(); @@ -117,32 +118,10 @@ namespace StreamCompaction { dev_idata[k - d_2] = shared[2 * t_id]; } - /*__global__ void kernSharedMemoryDownSweepStep(int N, int d_2, int* dev_idata) { - int t_offset = blockIdx.x * blockDim.x; - int t_id = threadIdx.x; - int k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; - if (k >= N) { - return; - } - - extern __shared__ float shared[]; - shared[2 * t_id] = dev_idata[k - d_2]; - shared[2 * t_id + 1] = dev_idata[k]; - __syncthreads(); - - int tmp = shared[2 * t_id]; - shared[2 * t_id] = shared[2 * t_id 
+ 1]; - shared[2 * t_id + 1] += tmp; - __syncthreads(); - - dev_idata[k - d_2] = shared[2 * t_id]; - dev_idata[k] = shared[2 * t_id + 1]; - }*/ - __global__ void kernSharedMemoryDownSweepStep(int N, int d_2, int cur_depth, int target_depth, int* dev_idata) { - int t_offset = blockIdx.x * blockDim.x; - int t_id = threadIdx.x; - int k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; + size_t t_offset = blockIdx.x * blockDim.x; + size_t t_id = threadIdx.x; + size_t k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; if (k >= N) { return; } diff --git a/stream_compaction/radixSort.cu b/stream_compaction/radixSort.cu new file mode 100644 index 0000000..2e8e5e4 --- /dev/null +++ b/stream_compaction/radixSort.cu @@ -0,0 +1,36 @@ +#include "efficient.h" +#include +#include +#include "radixSort.h" +#include +#include +#include + +namespace StreamCompaction{ + namespace RadixSort { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + void CpuStandardSort(const int& N, int* out, const int* in) { + if (N == 0) { + return; + } + assert(in != nullptr); + assert(out != nullptr); + + std::vector a_vec(in, in + N); + + timer().startCpuTimer(); + std::sort(a_vec.begin(), a_vec.end()); + timer().endCpuTimer(); + + std::copy(a_vec.begin(), a_vec.end(), out); + } + + /*inline void GpuradixSort(const int& N, )*/ + } +} diff --git a/stream_compaction/radixSort.h b/stream_compaction/radixSort.h new file mode 100644 index 0000000..12553ff --- /dev/null +++ b/stream_compaction/radixSort.h @@ -0,0 +1,11 @@ +#pragma once +#include "common.h" +constexpr int radix_blocksize = 256; + +namespace StreamCompaction { + namespace RadixSort { + StreamCompaction::Common::PerformanceTimer& timer(); + + void CpuStandardSort(const int& N, int* out, const int* in); + } +} \ No newline at end of file From f7a819a61022496c0d9c06e91e59423d6c4db335 Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Tue, 22 Sep 2020 20:08:04 +0800 Subject: [PATCH 18/23] debugging radixsort --- src/main.cpp | 76 +++++++++++++++++---------------- stream_compaction/radixSort.cu | 77 ++++++++++++++++++++++++++++++++-- stream_compaction/radixSort.h | 2 + 3 files changed, 117 insertions(+), 38 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 9c29585..2a47cb9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,10 +14,10 @@ #include #include "testing_helpers.hpp" #include "csvfile.hpp" +#include - -const int power = 25; +const int power = 8; const int SIZE = 1 << power; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; @@ -119,24 +119,47 @@ int main(int argc, char* argv[]) { printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - - + //zeroArray(SIZE, c); + //printDesc("thrust scan, power-of-two"); + //StreamCompaction::Thrust::scan(SIZE, c, a); + //printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + ////printArray(SIZE, c, true); + //printCmpResult(SIZE, b, c); - + //zeroArray(SIZE, c); + //printDesc("thrust scan, non-power-of-two"); + //StreamCompaction::Thrust::scan(NPOT, c, a); + //printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + ////printArray(NPOT, c, true); + //printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - 
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + printf("\n"); + printf("*****************************\n"); + printf("** STREAM SORT TESTS **\n"); + printf("*****************************\n"); - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); + int sort_size_power = 2; + assert(sort_size_power <= power); + int sort_size = 1 << sort_size_power; + + int num_power = 2; + genArray(sort_size - 1, a, 1 << num_power); // Leave a 0 at the end to test that edge case + a[sort_size - 1] = 0; + printArray(sort_size, a, true); + + printf("The array to be sorted is : \n"); + printArray(sort_size, a, true); + printDesc("Std sort"); + StreamCompaction::RadixSort::CpuStandardSort(sort_size, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(sort_size, b, true); + + printDesc("Radix sort"); + zeroArray(sort_size, c); + StreamCompaction::RadixSort::GpuRadixSort(sort_size, c, a, num_power); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(sort_size, c, true); + printCmpResult(sort_size, b, c); printf("\n"); printf("*****************************\n"); @@ -190,24 +213,7 @@ int main(int argc, char* argv[]) { printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - - genArray(SIZE - 1, a, 256); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - - printf("The array to be sorted is : \n"); - printArray(SIZE, a, true); - printDesc("std sort"); - StreamCompaction::RadixSort::CpuStandardSort(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - //printCmpResult(NPOT, b, b); + system("pause"); // stop Win32 console from closing on exit delete[] a; diff --git a/stream_compaction/radixSort.cu b/stream_compaction/radixSort.cu index 2e8e5e4..e579a82 100644 --- a/stream_compaction/radixSort.cu +++ b/stream_compaction/radixSort.cu @@ -5,9 +5,26 @@ #include #include #include +#include -namespace StreamCompaction{ - namespace RadixSort { +int k_th_bit(int k, int n) { + return (n >> k) & 1; +} + +void myprint(int n, int* a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... 
"); + } + printf("%3d ", a[i]); + } + printf("]\n"); +} + +namespace StreamCompaction { + namespace RadixSort { using StreamCompaction::Common::PerformanceTimer; PerformanceTimer& timer() { @@ -31,6 +48,60 @@ namespace StreamCompaction{ std::copy(a_vec.begin(), a_vec.end(), out); } - /*inline void GpuradixSort(const int& N, )*/ + void GpuRadixSort(const int& N, int* hst_out, const int* hst_in, const int max_bit ){ + // + if (N == 0) { + return; + } + assert(hst_in != nullptr); + assert(hst_out != nullptr); + + /*int* dev_in, dev_out, dev_out_buf; + cudaMalloc((void**)&dev_in, N * sizeof(int)); + cudaMalloc((void**)&dev_out, N * sizeof(int)); + cudaMalloc((void**)&dev_out_buf, N * sizeof(int)); + cudaMemcpy(dev_in, hst_in, N * sizeof(int), cudaMemcpyHostToDevice);*/ + + int* hst_e,* hst_f,* hst_d; + int* hst_out_buf; + hst_e = new int[N]; + hst_f = new int[N]; + hst_d = new int[N]; + + hst_out_buf = new int[N]; + std::copy(hst_in, hst_in + N, hst_out_buf); + + timer().startGpuTimer(); + for (int k = max_bit; k > 0; k--) { + for (int i = 0; i < N; i++) { + hst_e[i] = 1 - k_th_bit(k-1, hst_out_buf[i]); + } + + std::cout << "hst_out_buf: "; + myprint(N, hst_out_buf); + + Efficient::scan(N, hst_f, hst_e, false, false, true); + + int total_falses = hst_e[N - 1] + hst_f[N - 1]; + for (int i = 0; i < N; i++) { + hst_d[i] = hst_e[i] == 0 ? (i - hst_f[i] + total_falses) : hst_f[i]; + } + + for (int i = 0; i < N; i++) { + hst_out[i] = hst_out_buf[hst_d[i]]; + } + std::copy(hst_out, hst_out + N, hst_out_buf); + + std::cout << "hst_out: "; + myprint(N, hst_out); + } + + timer().endGpuTimer(); + + delete[] hst_e; + delete[] hst_f; + delete[] hst_d; + delete[] hst_out_buf; + } } } diff --git a/stream_compaction/radixSort.h b/stream_compaction/radixSort.h index 12553ff..ea918b9 100644 --- a/stream_compaction/radixSort.h +++ b/stream_compaction/radixSort.h @@ -7,5 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void CpuStandardSort(const int& N, int* out, const int* in); + + void GpuRadixSort(const int& N, int* out, const int* in, const int max_bit); } } \ No newline at end of file From d4d3b8306eb5a8bb141871bdc506a6d02ced8343 Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Tue, 22 Sep 2020 21:23:45 +0800 Subject: [PATCH 19/23] finish part 6 --- src/main.cpp | 4 ++-- stream_compaction/radixSort.cu | 25 ++++--------------------- 2 files changed, 6 insertions(+), 23 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 2a47cb9..1967041 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -138,11 +138,11 @@ int main(int argc, char* argv[]) { printf("** STREAM SORT TESTS **\n"); printf("*****************************\n"); - int sort_size_power = 2; + int sort_size_power = 5; assert(sort_size_power <= power); int sort_size = 1 << sort_size_power; - int num_power = 2; + int num_power = 5; genArray(sort_size - 1, a, 1 << num_power); // Leave a 0 at the end to test that edge case a[sort_size - 1] = 0; printArray(sort_size, a, true); diff --git a/stream_compaction/radixSort.cu b/stream_compaction/radixSort.cu index e579a82..446e96c 100644 --- a/stream_compaction/radixSort.cu +++ b/stream_compaction/radixSort.cu @@ -11,18 +11,6 @@ int k_th_bit(int k, int n) { return (n >> k) & 1; } -void myprint(int n, int* a, bool abridged = false) { - printf(" [ "); - for (int i = 0; i < n; i++) { - if (abridged && i + 2 == 15 && n > 16) { - i = n - 2; - printf("... 
"); - } - printf("%3d ", a[i]); - } - printf("]\n"); -} - namespace StreamCompaction { namespace RadixSort { using StreamCompaction::Common::PerformanceTimer; @@ -72,28 +60,23 @@ namespace StreamCompaction { std::copy(hst_in, hst_in + N, hst_out_buf); timer().startGpuTimer(); - for (int k = max_bit; k > 0; k--) { + for (int k = 0; k < max_bit; k ++) { for (int i = 0; i < N; i++) { - hst_e[i] = 1 - k_th_bit(k-1, hst_out_buf[i]); + hst_e[i] = 1 - k_th_bit(k, hst_out_buf[i]); } - std::cout << "hst_out_buf: "; - myprint(N, hst_out_buf); - Efficient::scan(N, hst_f, hst_e, false, false, true); int total_falses = hst_e[N - 1] + hst_f[N - 1]; + for (int i = 0; i < N; i++) { hst_d[i] = hst_e[i] == 0 ? (i - hst_f[i] + total_falses) : hst_f[i]; } for (int i = 0; i < N; i++) { - hst_out[i] = hst_out_buf[hst_d[i]]; + hst_out[hst_d[i]] = hst_out_buf[i]; } std::copy(hst_out, hst_out + N, hst_out_buf); - - std::cout << "hst_out: "; - myprint(N, hst_out); } timer().endGpuTimer(); From 8ce587b682170fa62eadf32102d8fafa72f4843d Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Tue, 22 Sep 2020 21:56:37 +0800 Subject: [PATCH 20/23] change efficient function signature with enum class --- README.md | 7 ++++-- src/csvfile.hpp | 2 +- src/main.cpp | 45 +++++++++++++++++++++++++++------- stream_compaction/efficient.cu | 15 ++++++------ stream_compaction/efficient.h | 10 ++++++-- stream_compaction/radixSort.cu | 2 +- 6 files changed, 59 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 727926d..d008251 100644 --- a/README.md +++ b/README.md @@ -15,11 +15,14 @@ Thanks to [FLARE LAB](http://faculty.sist.shanghaitech.edu.cn/faculty/liuxp/flar ##### Cmake change -Add [csvfile.hpp]() to get the performance in CSV form. +Add + +1. [csvfile.hpp]() to get the performance in CSV form. +2. [radixSort.h](), [radixSort.]() for [Part 6]() ### Intro -In this project, basically we implement parallel scan algorithm based on CUDA required by [instruction](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/INSTRUCTION.md). +In this project, basically we implement parallel scan algorithm based on CUDA parrallelism required by [instruction](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/INSTRUCTION.md). 
diff --git a/src/csvfile.hpp b/src/csvfile.hpp index 67bac0f..5ebde7b 100644 --- a/src/csvfile.hpp +++ b/src/csvfile.hpp @@ -17,7 +17,7 @@ class csvfile const std::string escape_seq_; const std::string special_chars_; public: - csvfile(const std::string filename, const std::string separator = ";") + csvfile(const std::string filename, const std::string separator = ",") : fs_() , is_first_(true) , separator_(separator) diff --git a/src/main.cpp b/src/main.cpp index 1967041..325a40e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -79,42 +79,42 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a, true, false, false); + StreamCompaction::Efficient::scan(SIZE, c, a, EFF_method::nonOptimization, true); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a, true, false, false); + StreamCompaction::Efficient::scan(NPOT, c, a, EFF_method::nonOptimization, true); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan with shared memory, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a, true, false, true); + StreamCompaction::Efficient::scan(SIZE, c, a, EFF_method::sharedMemory, true); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan with shared memory, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a, true, false, true); + StreamCompaction::Efficient::scan(NPOT, c, a, EFF_method::sharedMemory, true); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan with index scale, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a, true, true, false); + StreamCompaction::Efficient::scan(SIZE, c, a, EFF_method::idxMapping, true); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan with index scale, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a, true, true, false); + StreamCompaction::Efficient::scan(NPOT, c, a, EFF_method::idxMapping, true); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); @@ -201,19 +201,46 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); + count = StreamCompaction::Efficient::compact(SIZE, c, a, EFF_method::nonOptimization); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); - count = 
StreamCompaction::Efficient::compact(NPOT, c, a); + count = StreamCompaction::Efficient::compact(NPOT, c, a, EFF_method::nonOptimization); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); - + zeroArray(SIZE, c); + printDesc("work-efficient compact with idx mapping, power-of-two"); + count = StreamCompaction::Efficient::compact(SIZE, c, a, EFF_method::idxMapping); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact with idx mapping, non-power-of-two"); + count = StreamCompaction::Efficient::compact(NPOT, c, a, EFF_method::idxMapping); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact with shared memory, power-of-two"); + count = StreamCompaction::Efficient::compact(SIZE, c, a, EFF_method::sharedMemory); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact with shared memory, non-power-of-two"); + count = StreamCompaction::Efficient::compact(NPOT, c, a, EFF_method::sharedMemory); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + system("pause"); // stop Win32 console from closing on exit delete[] a; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 7076393..7ccb806 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -152,7 +152,7 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata, bool ifTimer = true,bool ifIdxScale = false, bool ifSharedMemory = false) { + void scan(int n, int *odata, const int *idata, EFF_method cur_method, bool ifTimer = true ) { if (n == 0) { return; } @@ -174,7 +174,7 @@ namespace StreamCompaction { } // TODO - if (ifSharedMemory) { + if (cur_method == EFF_method::sharedMemory) { int unroll_depth = ilog2ceil(efficient_blocksize); for (int cur_depth = 0; cur_depth < log_n; cur_depth += unroll_depth) { @@ -186,7 +186,7 @@ namespace StreamCompaction { } else { for (int d = 0; d <= log_n - 1; d++) { - if (ifIdxScale) { + if (cur_method == EFF_method::idxMapping) { blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; kernUpSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); } @@ -195,6 +195,7 @@ namespace StreamCompaction { kernSharedMemoryUpSweepStep <<>> (n_2, 1 << d, dev_idata); }*/ else { + // non optimization kernUpSweepStep << > > (n_2, 1 << d, dev_idata); } } @@ -203,7 +204,7 @@ namespace StreamCompaction { kernUpdateArray << <1, 1 >> > (n_2 - 1, 0, dev_idata); - if (ifSharedMemory) { + if (cur_method == EFF_method::sharedMemory) { int unroll_depth = ilog2ceil(efficient_blocksize); for (int cur_depth = log_n; cur_depth > 0; cur_depth -= unroll_depth) { int target_depth = std::max(0, cur_depth - unroll_depth); @@ -214,7 +215,7 @@ namespace StreamCompaction { } else { for (int d = log_n - 1; d >= 0; d--) { - if (ifIdxScale) { + if (cur_method == EFF_method::idxMapping) { blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; kernDownSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); } @@ -245,7 +246,7 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. 
*/ - int compact(int N, int *odata, const int *idata) { + int compact(int N, int *odata, const int *idata, EFF_method cur_method) { if (N == 0) { return 0; } @@ -269,7 +270,7 @@ namespace StreamCompaction { dim3 blocksPerGrid = (N + efficient_blocksize - 1) / efficient_blocksize; Common::kernMapToBoolean << > > (N, dev_bools, dev_idata); - scan(N, dev_indices, dev_bools, false, true, false); + scan(N, dev_indices, dev_bools, cur_method, false); Common::kernScatter << > > ( N, diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 8ce8ef3..168ca4d 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -4,12 +4,18 @@ constexpr int efficient_blocksize = 256; +enum class EFF_method { + nonOptimization, + idxMapping, + sharedMemory +}; + namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata, bool ifTimer, bool ifIdxScale, bool ifSharedMemory); + void scan(int n, int *odata, const int *idata, EFF_method cur_method, bool ifTimer); - int compact(int n, int *odata, const int *idata); + int compact(int n, int *odata, const int *idata, EFF_method cur_method); } } diff --git a/stream_compaction/radixSort.cu b/stream_compaction/radixSort.cu index 446e96c..995d499 100644 --- a/stream_compaction/radixSort.cu +++ b/stream_compaction/radixSort.cu @@ -65,7 +65,7 @@ namespace StreamCompaction { hst_e[i] = 1 - k_th_bit(k, hst_out_buf[i]); } - Efficient::scan(N, hst_f, hst_e, false, false, true); + Efficient::scan(N, hst_f, hst_e, EFF_method::sharedMemory, false); int total_falses = hst_e[N - 1] + hst_f[N - 1]; From 236cffc30f58452f6546a6f3b723924837572bb4 Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Wed, 23 Sep 2020 02:04:46 +0800 Subject: [PATCH 21/23] add image and readme draft for test --- README.md | 231 +++++++++++++++++++++++++++++++++++++- img/Compact.svg | 1 + img/SCAN.svg | 1 + img/SCAN_for_block.svg | 1 + img/Thrust.png | Bin 0 -> 32884 bytes img/Thrust_timeline.png | Bin 0 -> 53947 bytes src/main.cpp | 174 ++++++++++++++++++++-------- stream_compaction/naive.h | 2 +- 8 files changed, 355 insertions(+), 55 deletions(-) create mode 100644 img/Compact.svg create mode 100644 img/SCAN.svg create mode 100644 img/SCAN_for_block.svg create mode 100644 img/Thrust.png create mode 100644 img/Thrust_timeline.png diff --git a/README.md b/README.md index d008251..19f4b13 100644 --- a/README.md +++ b/README.md @@ -17,23 +17,244 @@ Thanks to [FLARE LAB](http://faculty.sist.shanghaitech.edu.cn/faculty/liuxp/flar Add -1. [csvfile.hpp]() to get the performance in CSV form. +1. [csvfile.hpp]() to automatically record the performance in CSV form. 2. [radixSort.h](), [radixSort.]() for [Part 6]() -### Intro +#### Intro -In this project, basically we implement parallel scan algorithm based on CUDA parrallelism required by [instruction](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/INSTRUCTION.md). +In this project, basically we implement **scan** algorithm(in specific prefix sum) based on CUDA parrallelism, as required by [instruction](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/INSTRUCTION.md). The detailed algorithm content can be viewed at [nvidia gpu gem](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda). +Here we managed to implement **all** the compulsory and extra point sections. 
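As a concrete reminder of what the scan above computes: an exclusive prefix sum, where out[i] is the sum of in[0..i-1] and out[0] = 0. The small sketch below uses thrust::exclusive_scan as a reference; the input values are just an example, and it is only meant to show the expected semantics, not the project's own thrust.cu wrapper.

```
#include <cstdio>
#include <thrust/device_vector.h>
#include <thrust/scan.h>

int main() {
    // Exclusive scan example: [3 1 7 0 4 1 6 3] -> [0 3 4 11 11 15 16 22].
    int h_in[8] = {3, 1, 7, 0, 4, 1, 6, 3};
    thrust::device_vector<int> d_in(h_in, h_in + 8);
    thrust::device_vector<int> d_out(8);

    thrust::exclusive_scan(d_in.begin(), d_in.end(), d_out.begin());

    for (int i = 0; i < 8; i++) {
        printf("%d ", static_cast<int>(d_out[i]));   // 0 3 4 11 11 15 16 22
    }
    printf("\n");
    return 0;
}
```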
-#### Part 1~4: +#### Overview +##### Optimized block size -#### Part 5: why GPU version so slow +After implementing all the functions, first we try to find the optimized block size. + +![alt text](https://github.com/Jack12xl/Project1-CUDA-Flocking/blob/master/images/2_x_baseline.png) + +From the image, we may choose the optimized block size as 256 + +##### Implementation Comparisons + +Here we compare each scan, compact implementations under different array size. The results below are ran under block size = 256. + +![alt text](https://github.com/Jack12xl/Project1-CUDA-Flocking/blob/master/images/2_x_baseline.png) + +![alt text](https://github.com/Jack12xl/Project1-CUDA-Flocking/blob/master/images/2_x_baseline.png) + +##### notes: + +- the **non-opt** refers to the non-optimization scan function before Part 5. +- The **idx** refers to the optimized version in Part 5. +- The shared memory refers to the optimized version in Part 7 + +##### Output of test program + +Here we add test for radix sort, shared memory based scan and compact. + +``` + +**************** +** SCAN TESTS ** +**************** + [ 1 0 0 1 1 1 0 1 1 1 0 0 1 ... 0 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0589ms (std::chrono Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32801 32801 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.056ms (std::chrono Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32799 32800 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.042656ms (CUDA Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32801 32801 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.041024ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.104704ms (CUDA Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32801 32801 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.108032ms (CUDA Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32799 32800 ] + passed +==== work-efficient scan with shared memory, power-of-two ==== + elapsed time: 0.0256ms (CUDA Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32801 32801 ] + passed +==== work-efficient scan with shared memory, non-power-of-two ==== + elapsed time: 0.025024ms (CUDA Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32799 32800 ] + passed +==== work-efficient scan with index scale, power-of-two ==== + elapsed time: 0.083584ms (CUDA Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32801 32801 ] + passed +==== work-efficient scan with index scale, non-power-of-two ==== + elapsed time: 0.077536ms (CUDA Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32799 32800 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.094944ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.091936ms (CUDA Measured) + passed + +***************************** +** STREAM SORT TESTS ** +***************************** + [ 31 24 18 1 17 25 4 15 25 3 30 22 7 ... 5 0 ] +The array to be sorted is : + [ 31 24 18 1 17 25 4 15 25 3 30 22 7 ... 5 0 ] +==== Std sort ==== + elapsed time: 0.0011ms (std::chrono Measured) + [ 0 1 1 1 3 4 5 5 5 5 7 7 9 ... 30 31 ] +==== Radix sort ==== + elapsed time: 0.0009ms (std::chrono Measured) + [ 0 1 1 1 3 4 5 5 5 5 7 7 9 ... 30 31 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 0 2 1 1 1 0 3 1 3 2 2 3 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.2151ms (std::chrono Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 
2 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.4586ms (std::chrono Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 1 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.5532ms (std::chrono Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 2 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.443296ms (CUDA Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 2 1 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.403328ms (CUDA Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 1 2 ] + passed +==== work-efficient compact with idx mapping, power-of-two ==== + elapsed time: 0.362304ms (CUDA Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 2 1 ] + passed +==== work-efficient compact with idx mapping, non-power-of-two ==== + elapsed time: 0.493792ms (CUDA Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 1 2 ] + passed +==== work-efficient compact with shared memory, power-of-two ==== + elapsed time: 0.394784ms (CUDA Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 2 1 ] + passed +==== work-efficient compact with shared memory, non-power-of-two ==== + elapsed time: 0.463968ms (CUDA Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 1 2 ] + passed +``` + + + +#### Part 1~3: + +The performance is showed in previous image. + +#### Part 4: + +Here shows the thrust summary and timeline: + +![alt text](https://github.com/Jack12xl/Project1-CUDA-Flocking/blob/master/images/2_x_baseline.png) + +![alt text](https://github.com/Jack12xl/Project1-CUDA-Flocking/blob/master/images/2_x_baseline.png) + +#### Part 5: why GPU version so slow [Extra point] The reason why the GPU is slower than CPU version: 1. **Spatial coherence:** The cpu version reads the memory in a continuous way while the current version fetches memory uncontinuously, which leads to a low memory bandwidth. 2. **The input size matters:** When the size of input array is trivial (for example 2^4), **cpu** version is faster than **gpu's**. When the size goes up, the situation goes reversed and **gpu** version is much faster than **cpu's** since naturally **gpu** is better in dealing with a large amounts of number. +3. **Occupancy low**: Not all the threads are doing its job in non-optimization version. + +##### Simple solution + +We can increase the **Occupancy** by mapping the each thread index to the active index to force them to work. Also, we can dynamically adjust the grid dimension to stop calling useless grid. + +##### Tips: + +The mapping step may cause integer overflow. So we use **size_t** for thread index. + +#### Part 6 Radix Sort [Extra point] + +For simplicity and less memory copy between gpu and cpu, we mainly implement the algorithm in cpu side, except for the scan function, which we call the shared memory version from part 7. + +We compare the results with built-in std::sort. Here we show the correctness of the radix sort. + +``` +***************************** +** STREAM SORT TESTS ** +***************************** + [ 31 24 18 1 17 25 4 15 25 3 30 22 7 ... 5 0 ] +The array to be sorted is : + [ 31 24 18 1 17 25 4 15 25 3 30 22 7 ... 5 0 ] +==== Std sort ==== + elapsed time: 0.0011ms (std::chrono Measured) + [ 0 1 1 1 3 4 5 5 5 5 7 7 9 ... 30 31 ] +==== Radix sort ==== + elapsed time: 0.0009ms (std::chrono Measured) + [ 0 1 1 1 3 4 5 5 5 5 7 7 9 ... 
 30 31 ]
+    passed
+
+```
+
+
+#### Part 7 Scan with shared memory [Extra point]
+
+As shown in the previous figure, adding shared memory boosts performance considerably, since shared memory provides much higher bandwidth than global memory.
+
+##### Implementation explanation
+
+First, the implementation in GPU Gems is not very robust to the input block size, because it tries to load an entire block's data into a single shared memory buffer. If the block size keeps increasing, it soon exhausts the shared memory limit (48 KB).
+
+So instead, we split the up-sweep and down-sweep into several passes (based on the block size), each covering a different range of sweep depths. For each pass, the shared memory is sized according to the largest range of elements that the pass touches.
+
+##### Detail:
+
+In our design, we set the shared memory size to twice the block size, so that we can reuse the index mapping from [Part 5]().
+
+We do not yet account for shared memory bank conflicts.
+
+##### Tips
+
+The shared memory version is prone to integer overflow, so we decrease the range of the elements in the input array.
+
+
+
+#### Questions:
+
+- ##### Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU.
+
+  - As discussed [here](), we adopt a block size of 256 for both the naive and the work-efficient version.
+
+- ##### Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis).
+
+  - The plot is shown [here]().
+
+- ##### Can you find the performance bottlenecks? Is it memory I/O? Computation? Is it different for each implementation?
+
+  - Personally I believe the bottleneck lies mainly in memory I/O, since the computation in each implementation is straightforward (**O(n)** or **O(n log n)** work). Once shared memory is introduced, performance goes up drastically.
+
+- ##### Paste the output of the test program into a triple-backtick block in your README.
+
+  - Pasted [here]()
diff --git a/img/Compact.svg b/img/Compact.svg
new file mode 100644
index 0000000..3783ce7
--- /dev/null
+++ b/img/Compact.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/img/SCAN.svg b/img/SCAN.svg
new file mode 100644
index 0000000..af77cd7
--- /dev/null
+++ b/img/SCAN.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/img/SCAN_for_block.svg b/img/SCAN_for_block.svg
new file mode 100644
index 0000000..9175408
--- /dev/null
+++ b/img/SCAN_for_block.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/img/Thrust.png b/img/Thrust.png
new file mode 100644
index 0000000000000000000000000000000000000000..39487a47e445f45b900062337485d375f10a471e
GIT binary patch
literal 32884
[base85-encoded binary data for img/Thrust.png truncated]
zsP1Y6lb7meY1&B2P?3)n6gb1qvObz}*+y)LEE+$?u$hgRdf(z-*VPKF7vPp@snYJH zK>yHHbn+dMAOu~>*Hu`2l6 z58HQ1E)Fm>kvgO#wcqgdoi_5w$6Hf)=jIl=*!Xjj2rT-c2^3ZgJWg$;gdc|Am9XSP zkZZxkiv;M#=Sd%Wbx8q&LJ}VJm%=}^oZ*dDXc@jU-Whv(xzI6qzW>?c!KoUY{er8f zE1=7I=%}Z{TFq&ig3Fm@ANA@9Z4|t@7MV5;iBtO+A6~!UtI781gAin&r)A}6WF-8s zT2%nyLpovnT?utV_j?o)yqQHAB9;`QX2A1yd>@<~x;;x1mC-=)zCCX?=5BtzBcpj; zx1}Y@9l`LFoto@lHhh5K{e3Lff?TxiWqcSC*6QXI3(}nGY@TeIou+my~;YcWSB|PYZjj43~)ryU^82 zB$tcFlGH?(?Y_JiiM?A&Bc6ty$D!Zscq}w(x}sVEG1Z%3vrv+dw%*0(1VkVA{0xM_ z!04#aNloo`zt#R^jp+xNCx<0R&-tNBDIL)GzrEnA@_IE?Cj@Ho&hxh5gKHY)K_ssWT-Tza#T;Ng z!0|tGlA!!bFhFkWjb}jCNdw%sn$?!2s1@TQws#7vOQ59e=p=6eI?>Y~i@rLTHArpP z0^cz5uvJ#TkcEBpMD*Re@86jzHey`)Pu)OeS8EwXL#4ZjyQ#Dg68Y5)FDDtdI+GYl z4`_vyS1Ri-i?6}%fW|bSlq>%IP^8lO+naIkSZT1jyO|$+7&Yo2hecXMe?53Wp?ut< zKlh}v4+i>BC(DJY&D&`V$p&hcXo?YJOqo`SmCb7=ehbW}npbouBh zx5SIG-tymxeo-#=z>N0R{Entgw&v@`9XQIzmhD7=p{pxU_C!OBtN2ai$dz&&I>9IK zTaNaV830Eb50nwNc;ladC!M3)ow10N@R82HzTuO*Vi{+-dfrvtke?k|vVWZ8Ovlk$ zo{tz1`+S*h(HV5wAU=)rMCi}!F&AzsR`9BZF?Y~OKe1tN-|ciNCa=py_K76`rkwP9 zJ$mpSPxZ^Nq=s%rgK3pr&52ds%QOwWqb0NA>z7w)@5nM~;-rw5^BR56zsBIcgi5w2 zy-|;*$!(+V9xH&U3Zf+o;x3-kJqO3Pi$d#30)E*#!4GC*tTMJVciv#ITnc^?tP@6{ zb0<8YyIg|&m=hT?mWe&Zb}TpFauN(+P*@)Be+3qB#d~e9I^ngb0tis@P<=)<_+CGQ zdrS5P$Fa!gTFe_RuAc`Ua01Su66#Pg4VlL*myCSzNa}YZsj)`h9_+JPK z3oAG!dBz1p4gJ$U^p z%kHU~X^~=Zm*)n^{nBd_FP<_K9HR9tRh0O0MU9M$~mCZLq%i(H13Kxjh6`H-}_H7{n;8jP8<&)o5QRQb-p z<2D!;k?+eJ^9RJn6<=35dCq6(3ciH{&lixLr zW0ge#I4L)90t8cIh+InAK^b15!KGWk90Bg>u;qLuG}J;HcUeo@x*@$su=?FE^w&71 zL_gsa(7Mq+dTrrUUnQa<5^iMXta7#jF_eeg;eI94M@hJ)x8|BdA=(^q2LzXCYIFG3 zX}IZyuMe7JMj;!J<>isV1A0pGXdLrnYkB^4c)x*tx=%u=bz`oZ4`Rl=Sb{Bsg)L-c z!&^_Fp7&5qgU7?poO)@>-hfT@4Ht?J9$}Tr!+%H~vU}A3{xUK(S3GalORZecufs#M z&tSN6q{=oywJs(L>EB)SJl$Kg<4tfF-`2n4(^nN#L-aJWp?UO509G$FQ_Yy(*TJVg zOnD&qUi5sC0`gX2%^cpTZ}lvZmCx=9XfWOyVy;Z)wF8QI~fP`$W*4!%hBC5z$m=`w(E)>(n&_e|AUWB^5CEri?yzH3)&dM9=|4 zi_j%nlROnf-Aab8Q*or5nm6K-#t5AVlq}kFbJDS019mx`yzVbWzwfyT_$v}d`;x21 zZ(CiJ&0TD(C5CnfA&-y$13zz@C{a+><*MIsf+3 zuuPmjVvJFjb45h9 zSgy9EkNMNS5%W}mNDg7JU(qtmX;jyi~^;Ce-Y+e zJX%=(HE_@rqU$8b^EhEy+hUKd9KouQLoMNh{AW~$-C^b7y?rfd)&3G>rai8$e$a4o zuHUrIp4Y$biIjh;W{KMuZ~bMh|8h8$@VYNj9%JRrLLIs2Fp^DxORq}Eul6Ot%Zv1o zQ*vY>d#vRp(0R@RkPf|Np0~}U_Ia~F001y0^HA^xS`^4UO>nhUp-xN`EVBy2TNlP)%=Z{aB^&7sQxWJ_%d&f}P(#BVr!>FteB zg-sM=Q;ttV+&3D~l+Tgf&(0gFCow-*F|`bSTZ*%CBrI|9bhXI{JRXSd^(Ik-n^!dx zF7b$`h2&8}46KZ2XeSfMrD%HH>c+->aYbdJ{L6K|vA=SI;I&8v24FB>$m&@t;YJm= zGBLk=;%8_l<<@_ES$Y1dzQ2IK+&S<^J4L#0`0s2+v+lcP7fV+IkKKSpEJ3q5|9jEv z8yofK<}kb3m+(tw?DzW&rtpeY4*emxt;^?g-L`AfZvsaDH z`llW_Z_t2%VGH{k>06NI!~kmBY<62lrweiruBCT(wz-;|esJ0WA#hq}OQBf`cXGK7 z9-}6SU=`9N5bM&VI%6iVP4UjtbN3o{9+lME=^(4^WFjj(2d&$b0#tk^=V|(=u9oyL zByUWWPip`@1w5NwmOo^&74LUZHV%^vIqBeOQ(tygCfQb2$q~Q(@{n@rC-Ln`p|$2H zNxte@^nn+A(S42DD;pvRU#$=bh<`FEY@wm`Q~W5Urm|7`gTE<+%8|i3ynBPIunQ5; zmM?#%j6Mh@%nk{A9JWa-oh}$h0Fa8{m$oYV9?dCEHId16ymAPXCFXaEAAA0W1qc`l zP8T- z`W%?JBf!DfBmScB!TL>o`n}X3ry>w#%~(X0m1_1(giZAsu^V<7Wia>TrE-{Y7#PO7 z0|n@{ls{k_!7>_@5&Hz^H#1LPa~;EjhYXXo7-*yk+WDz|gZFnRdc1t?9(;Ekk2^ z64lit_7cRC_`HeHOh#>gpM-&aqgNPXQ_mSauPv}xs0k*&(X~<&*Q%-QW{aLs?^LH> z(MmUyfWBKLDfj+2Hge3firqV|s}Nh?*6FM9cfg=5qf*4`+Q9JY&x|L!$l=gz>$F`t zA3}WgwSm9~t}JXvdJ&iHd%RwfM1Mm(RwBJNEwZwT!H+$DQ&}ixz8zT8>>ut~{k1iMuVG0KIU%}m902Bn`PT7xzrS+>hS5I__MeOC~RyW*7`sIVlm4mE2yIe)|yzM9( z7FOkRWiIV}ija&*7G}?7wAG$7`{kkDBk9mS85rpqzRpF8tAhtW-q>_tPyhg;{eLwA z_LpL0GII-auMR}HDQ(#_T*0kB_VqKwJQlz=WpuHkLO&~e@g;RkNtOg=cc3Ai zl@aU^TyA!>n3#!0gt@(_QXlMm+T(?p(tN^wt~*?><*n<@bB_&_qi2kO3Pp} zjLs1AZj3h_TQc9#iC@a*?t+!}g-4A$`>X0vauW(n?bCDTk&m#hpIcf(R>7djwIc{< 
zfU|UVqLokbb~p0w`+`T14Xf<>xyF-2a%4aQ=6b20B+0{>Iu;T09)DLyjusmDcoygW zxt>DI3RHv&5c&48kIJFxw%-f89%$YQkM>I<*k1oM9W3|S8^hE-py~6DIxwQ(b51P@ zV@LYywWjvmXPUFIfjBXMNoP)?hW|Th2@arZm+(+xoD~@gb$d0X&8+g9kS$qn;-3K; z0I{zR_M(j~SM1nn(p{7l;l_ydmw=&E%Cho>g95;ZPu0skRxXIV90DL{tSRR8Y{GUj z!HMfEPbk@V)pYKrejHi1TJadgi`t^WVXTp z|Dy6))KG}5uaxR&_A7XQTU-bcWS;8J=J+LMl}}q+ho)A5GgqW_DE7j30|M@-*&<2} zJHdY>1!l%8I2x^#m;s1Q)+5T}5-;vWv4TfOa}qz?6*=eOX|rsp-dHnG(@cir?xvr)IO{ zxf+@&MYY8>PhLyV5Rf*mBr?coOm_^*=!-j^MGsqlf+GJCe4vxQG?Xl-|CQj@r#JU1 zj?XCR5yexLwx%!Ao=-e<_-N&6`rylD8u>dT(%UK`7n2o55a8ciH0uM6F490p5LIYm zu5V_is4aiEtfkt>#oE2zTXg32HnOVTj*w^Iu+%j!piW;&8_^&G1l=Y^Tm{@p|KcobRmf>*)LksFufV>r zsRxnH>69%zTzR&pb+>py+BaOpK0BveTg}on&db%}$QT)4hC8D-xcYx-vnZ=7(o3Br zF>JX3>5)>7eYW~EAovZ+kN-THb<(&w6G)8^;y}0OxX05U@E9QtGYtmJ;LG=pv7lF# zlv#dZsf7b5I84b1PXArdnkV0|Nn)8mhf5=-*hYsdc!flZ22frW&aD zYjphz0T|T2)?}K^@gr@C%1k_nInAeu(2hoT5G?#L$|Y=MHX;~R1a}zph2P6_y0)n6*Pe-`l%O1H555Ce^{@IGKOOgiETkgD^g zbUT(w%a?7ppZ+qsyUKu5$V~O^H3iG}m_MKp%Fa`1q%f^^h8SmsMfH*i9>!7L88uaca)6Ylt&1Et7b&J_2^R03n zrKKSVlkTjqub`0Z*=p6LQ4_iz(1P$-vd{bVPf^x{>igZ_5jxUE4LDSsAvYh~4% zl5a2O^m?|m%QK8O(O$*@KNjRpy=fHX)xixat6%461mJX~on;?Wn`HV2E33qKCzEqH zbg=;^*)?IbPj; zwk>`olfK?uu4o8@7y=9UP8dO#wRkS7DeE-hR*fAu zDm(?$-yaxlf{-+)8GDZS{8AGAS~%p(_E8j-EEPQlAD7r(PNWI_2&`M|n#K0uJ#Bpv zk}zDYqq$jho%HlGv&Yy)%O2T1P6X#2(w>KXq%o<=j!sALGBKCh=5eiqMz`y_KC55b zrDb)5b%a+al%BVQQuOrQ6fsV?Ec4St3L3*Zenf6h6D0dg(V*_0waoxA(!f|uv~q?f z9=eZhh4@TT^Ygzl)(8ss&Dv#1jKAb^6=f*jeC=kF`ee!H;et(isv#z(o(n;gt9;gx z+d@UTJjwtJx~WR;a<p;enqvG?utPs+LU~URsDC@rrbF>x>zBDPRi9<$Wsv)dhT8H72Q`r=? zpLATI3aXI1nx#;*eSg5kMSJ->m%HV=5e@j&!^5zJ_YcK3CJWwLJK|ExB#dlFu&_7N z;x@jlTI!x~221&z!=ThQ6|Wk~fV@n82CqDGKWCIn&(X?h41LtwtcLTs8^k)SjGxa% zW}rqE5|}duE9t2z9Tev)c5jX{rxbJgp2vVecwDRBsQ3cSQr0hYzNAK+Ou^##rDNT( zFfKDm+M&!j2AS9ZSv!LUY2j*{3OnLr?ybMpHr+g0kmw=ybOBIlu7jE5ce@Fh|9m>L zyL6;|2{pNhc_F)?=C!>r#t?JxBm@cMRMl~oJ3ZFMe2Ks?61$BlCn6iZWl1}~ zp`TfNKOar48ckFl>qbOFJ~*BcsW~9R(Eqq3LMacA<_FI%`>xas5N*1H^15xXyA)f?^9f=@b3aJP;QzJ??K@U1W9 zM!z69WZ1>G&zq>F;aXhNlqCn-f2>I}>Pz3|v<_J)80!ZW{VSJ}#97V*LL+W>O49pw z95T-AGTaIOl;q*u*E+#Nd23(BNG%MI6lxq&BEQ>rTR@%BTW_Ta`f6R4TfKa=RvOZ& zg3Xb(0|Q(Rd)Z-NwZFA8biK<*{CFY`wWBo#8~s&cn@VK4T2@BzRzIwIGvu|qY+Vn2 z$gMLSKEA#&f;#j`;vi2ZZmBhP&F<^7b=cP7W&ChXc==;WT;&Uo^#jR3$MvnJyX=Hze}kd}x^UjeSX86l&khuyel{3<)Rb z*$P(pd49VkHmNt1)%B7({;4L!tmzXo)wLtpq2BvCSdnW57^ftiSU7`@zG`heBcbU-DTwnl+y` ztFOBT2hO=!+tUv+q+k4w(y#%vDZ zC++rpI~=MWxflqf4u@+C*;3RM(EK>+?>oB&<#J*PeOTgAN4*JAjm}T5A3))=F7XiZw>`{`ez584}Ub_ z+?^vAK0c$g4|!S6BoZ_Oy<|?-31WT?5M=f##D%*&#P(9#IF#o@AP6JFd;$L{2#@e< ziomB-LX_M4?-n{cr|L8_G3o2W@K==oTqXv!3Pfau5&HK9)SMXk9h7VMUnjr)iroE} zJFoB(i2j^Mu51&vp=l$ZbJL7-bUXNI02&P$i(MS0W~t@WP1{F1O;gH?(*Z zaLmgz1pjUcGhko}v)&sTu2NVZ3NL3Eq>MoTL5eCnEc{;`{cpWO{x9)C4Me&Fp-BEq z*nE8v=|24Ry?Y!k7fkze~UxVwy(qvufvyMv{ z5s4;DZ965qG4Xm%TJ9{a3RwdbU^viNMmX8yT|hxHV@VHSX^a?An4K@Ivz|!3d2vNH zgzss?8tj-6ua(?bScaWj2NqTr49}z~w_&XXrbbAT&p~@6o=t?SWl(aPW;X90aUKPq z{f%(H=pJyUUTz&ftr&IRbCdB}$0(pPG)2sZ?Q^Dt0`LBElwW9(?5+rcoZb^%cBiir#>FXBNbu?xFE_I*JS?#t`iS^lKLoj9xlrBv%Y%Eanz6rwaPSh6hQ9Lf^f`ktBT}hkXl=NJAuR*LRL?_icxw_ z+hnWlOj3>w9>zz!25M@?O~>7y(r0PI^{i|1)EtS8{TE@G6oL zpr5hf4=R}0%6c@USR9|NLKcV1P6m%(y-9F`x{mLAmS}-)HRNpF>-Vek@6f(mwUG~3 zsBA3|F@krD`jZp)h@aL?Q`#^0m~a^^kPubSTkd8(k(!jJ zs~(bdh3)MsFEq59-l`Ck_yB%w4(Fi`EaE2#a zYmy_KUX6i+Kv`S)b5oTKU9>JbKR>*-7NZsVn+Yb9yiJSA#0B~|>$9$?ulYgEO5nLF zeEya+wMhUcdfuKP92NH1;Y$ON-po(dayeVhC$Ga^lU_ z)Md(z0?5*LZC_*CR%0YX#LT-xNp98hh*{%W{~Z3RJVsLK#&!q#J6r&8?pM{qRu&e8 zZet%ewY0gjU5h_5&=PuO3>CcD(Y0GfVvb#B#d@lrPt_PuQFhEQ(RGg>S< 
ztdJh`Kn1;JHIXWn{iJ$*1IFQsHu|U~AuiG33CO{5AIwy^D8=v-4Oq|a{!ZZ62RNO?XGmS@oL(m5 zzQ1hkn1koV$DMRVg`5o3q({oyFvoAg>g>9W*nGXLRP=VkHyFlLTEw0FT&{I-ui(yk z%vDv9PX}{!jzW;Hs{Fr%JJJaV3Y|PnQ4q-4644T|@OifKq&RrCjdQaK>rS6-R$?1w z)Zm6-rWSY1*R(J!ox8PJCQlt5931i)&9r%}Pj;^P78cY`A?l`+_Yqzdbpj82??{5n}ksE;FQu<&HoNLY}^!5j@5+ zW-Y6S=wXi3+$O;>i2BV`~_7N%8dfwAP3_v|nytf8Ud!ONJEYbkgV6HRv) zJ^7Y}U31&buTlApj;@FH*o8Sp%h9=)87*WV2wTPFxj)a=Py);@qvn>2k=x2v*j)*W!cq0Q1L>kgH3ATTwS-A&b^H|qaAjRC&-=lQBp9^tP!M92 zJrb=xv#|`zOq;FOj`(0Xp8^zEs9$}kFFh(rE?Tam#fnI_N%s0lo|M+l9hdGuE^!J_ z@0o3;RcR@Wfz!?zuhhFenwr7*dNKmkBZ!TTqEBs|NimCa7##DL*Ohs_?bLRL%r$v(CG? ziF{Z2^wQ%pk3bvziI3@=B|aWAR)yKa2{UwV7Rm7omj%}xad;pNddIxFjZIthAy`{x zgcilWztOJsxaa(;JpmP?Dg7!W49ijg(Axciax3r+AO(#p}D18sJP|n&YFH+eOT5 zH>2x4Uj`x*NABcT55K;f4FM0FHIV;iBs(-8B@M1OTG^)T zFdO_Fe=F;l;0ma z_V)=zW<0Hk6qH(U#=63J1{)!{cei2~)D{x(XPG;<&?*Xp~gw`tyxln-LCort@<8%E@2ac)E7|wU_idk6M z%$6_Sq7KCKqwNF^s+qX{`^nndRVv~)3hx@43L9lD7~r>6#VvH=EYR*VM-lW+Xqx4) z7N()mDp^G{N^p959=;#P?|#pEeYl==eVh3Wc#~2VBPaJqO^W|_prY^Xp1zSqgzqFS zhFNAN(&YyRcF5S*U+QlU$z)j9kdhA&CS`)6RiN6mEKx`jzpAcvq|As$xHya~5&dQy z=gLp2)f4r4HpX;j{tMr6Gs?bYX#pn>(OJ%zEjy6BLI6r{0!_CC&|>k~BjvQGWJenW5qVH3+GtSuR#f$`;hO9Jo%LlW`=%Zs)&|`yZJ>QX6=*zR0?ruDKd1 zvs^_MS!xrcOw6J2eU)FX_V zqo^zJowYKX=BrHWDT!IjV}AAo^q)Q4BV)%x?Ms!EGL#TKreMOpmb zu-@u)R(MDB`V2Mr$BumY@08zKRQTkEyuTo~H5th&4sV5)WH}|Sv+hDCU8J0^C`TT) zIi^F4W_SE1!HCp!#V%Hf4x_OC;gr%SK@VT2*!gC@#wk}?LuBT__g4yuU1zx3P^sgZ&BXn^5n=CR6xi=7ziVI* zO-8vEfmc4QkR@D*fAwJheGEQ7&ktq*(RNda$CXE3-MD<)&t>T+uqTe}b%T9YQCTR+IQAo^^VC-s>&dd~cjr^4t zoS7IoLOMJ@nKq)b;BBO=p64LDW>1i2hk2t#e$nh@04A(fHK`6d+w-@O?Hr9OBZySn z9Umtia6qVc`&Ui#q7>#DZ8ejp_7?6xDquMG>!ryk)>^mVPE`o^X_afV{pl@zEu)PD z2kjm5OM1c9x`wA<7fMu_2NOJI*}YBL9y)s;POzknrW_9&9B5;IkcZ>(B@hG3gxvb^ zC~s~k9XPnT!bliZmEpid5eM4GI_sJ`KFj~2SJ2G6)Rn5r`gx#_@kFXt`1o_u%`l3M zoGy%*1-Nhabw23>yMkTGqDK0JT8>5Yly=qwPPaUX48p2(F+ey`5X43ra^`oVo zpR~ew<+Fi%1G+m&4uN(bW00?0g^3y2sAhzdW%!_Oa+kx~tFEtz5l2|*lkCl-;kGX| z3$MK)y9GGX`sX(5S#H5ktNjO*={Xd1&cOo&8c+uI-tIp4$8+jD9)-X!#zr$U3~q|A_Q0 z1n;S{yD;uHx0wal9f{$DfCv;|Dw@a#FPU-mwYqV;#)$(`m!bh7XF&rEa-zB8K++ zz1*`>4K$MIZb@FhZnv7ReW$+C>b%z*MYgxnDM8J|!ty-gw~2{KZ*T9g(Ms9D>wVlS zx;ZjP^gUrY7?^mkQ+UmPH0Gm4MN^6hTh6dv(KQHs`xNy`3HwKKGJLSJLsRn=kyV@o zH9wb?>3o+7Bywj29fLon0NV1^w!dc?1qN;GosR2HCMmuq&u3#Kf}`ixuV!z=1z}i0 zNr@EIvxKCi#H7Thk01JrYG`Q@Mg{hhN7Vd{1-~fufRMRn!ir-1>1;k~T6I zKoO^>E3PuTWItgbmw|neWJz}sM90z0A(6z{Li*D3O=ahBIgdg2Wx=n4p`pltfQP{b zLqkK5dUGQ)GZbyC&z?Mb_~FBcuK7As(9MY~Hg<%E4a?Se;4PP@_5w-M-dtC^Zn|1g zu$Z)a(8epHp|hQx_vZUU(Ju7r2brIRTxE=&=jF8(`^?|0f{iUZug<4B45fu=CW?ZW z$V7TGeKpkjmh6!~YXvNoBZ!g`3w#7jukJGF7rM@u$nyG-6bPRs!9lUkIRe)P(QFXO z{S-z%v7tn)jwKon3-mVF(ktBT|Hx_O=O%>jzjF{TmAWgO>-T07Sc5!@YdkMJrp&4= zky9Mbx;aI}WTX_FDMkIRvh})xM$KAU!N9>xJe?idpiql^ULmH$betBqt-lF3OwhX2rCst>v5m z;8+vwD@!0UILboEct>JsA-L+i5=X8II*uyP^AvYF?DI=~6O!#h#%1HSB^kyrX>r5K$s?Je5w+2_;>a+Wv%)T{AcgiKmH9o}Q zl*L3r^><`R_1e`ZO-*A>-zr^K+K>`jn!9`wgqydgWZ|R1T2>ZTkDF$x5+B`9L*S-H zeN)XO<4Rp()-1E)P7P;y+tj<`!vxBAx`F9d%MTNgmrx6yq86Tzg5!|4EH z5hQ%Yx9qWnYyVsZ55eG4NYuH0a?0;o)^h%2%dP#6uU&Rp)!(`whKzL}gS{>u5)i1q zUtJ<`a@e2PZ0Y3t2C(UR=sV9an7qYYISTSPdB~Qa{pClz-lCd050OIF6G3N-o=R`S z#ih>pup-#2moGK6wG}+wcf00RvJ(?Kj=Hf8!!{HT^6M9xMzaNz6B6oemRjG42frjE z6VVr$YO%jH@mx>~HmSt1iI121EFxvU}1TzkD(fs&Ynq{(_g zt%l!rWx>*B7?NC_lU>i_ST++D?%)U>YK7D*H^(cfBp|+5R4lwAMDkJ=avd<&=xAPi`FS zrLA@l22`-BQ?U{jMHX^l(p+)H^Q)3Pt{W~ECN|gbt{$r3DD%0JZ5YSm-1-4(0mLR# z*0C6zFvxP;G_QkeoqMMAGU*XiZfS|S%ej)XE;8zAc=;H&b&j(gE#)4L^PyA* z>zjh)ta8x71mWu#+p*uvWqisFpO1K1B(02#=Te+<>vM7nBn+2(6W-7 zfVwu|pqei6ZlAW!joSwkY^F)|_SO7A)?PZ*#iCkDEJ3xZyc@F8T61!4Hvzw`E&-7RGRAH||<2D6aZF$ube4wwxDB 
z#4V+ByH=U2$!|%!J|`q6@Ax+Gqe{08Wq=NrI}oR%B0xE|X)|D!%pPJVTi*w;|8lUl zB27`#y|B>O#Kbyu+u-JeV8-S7iq7gJ&u^o6I_K33qzt4$u17YR~cUanR$(AA^G0hlVybHaI#j z^Kwr#a=&Q2Vdv-N@;xI{4NY;#{m>$vKJ%MMLDMKMw=_4m)PM=Pxv{KX zg|r<~h-nYlx=@w}W3$8D9J}W<2Sv05jHb2$G?UDsp(HTCaRU`n{U%V)k?iVvB!2}2 z72+_d@oYSX61qr@x@4 zE(z51b>fe-L2p6&;4mhjZe@sq&tdb#6@u9c!RC}>U=X#3^Q_B)4_21?TYTa&g^9fv4R$Xe$KOt zVN!fpOLa{aQk!em)*$DevYR`EDUZpG?Qfl?IN&V2A7xiXBjU~n;mM17-dhc+c{E(J zk0c~^2q>0{KbeGuo#3~R_ZjU8jW;;cT(gU4$|oGJmq+XNwRir3-t~ydC$co~a+{@& zN1ywOtP4Z?tQ1~?9Q4%9EvTDsZx=iHZ}su8QogO2+esM`$VvnZYPWm3kiWOQtMAGe zey7XKva>hmJhW#kUwbI*EZG{mz0KV>0=&Cy=RttMSq(PU@)9k5E3St^y&`kIpKamJ zBt)B{rapi=B)0Hipedf69X?3R$!-4UEOlKiwLaNfyLp4wllkrOY~9=`IHtml*1J)S z>cgmI+uJu|irb!}tMT(@-L@Z-$hmXbRZq~}A1NZPH>V@=vfyjG*r($?mCrq|Y~gpF zMWcI`F&iDAA?~{)A+jFNupQo3&&6X;VOgOBt+x7}SwM8CncLW%D+6Pt7c>M@r~)yP z-_ZnmEl=f^$>YbeE{sQA;&5C1B^{pgE>WM znuw`TJNek^OZfI!YMXXm?+ml~zCPJLyZj@ToqearWsgSy71`nSMHZnxlR#F$wK~?p z+k9mNuGtOeZt&_FvR3j)&9CAum?Xi;tWkno(~uk#@K<|A#YoOR8Ip@n1DVhJ*oBA* zHSy9mS0yAADCR3}d7KLb;!5@>-*WIMql1N-kIs9xu=_kqU>*#4H$RqFD3%YtDWT*l zf(-u_0EcxZrh}urMQEKTvUnm8j^;cZ?>25?k9zbSXmW7#vpfSU^zL}qeC%TT<}!s3 zD{(WIn_!o(;zj1&&OZEns2#Yt(TrF}?L!mjZbF_WutOZWsu&!*9EFJm6A?y9v^qaag2*+g9{F%*$X(@ zXFs`hcdj^6D<0wbYUu}YXQS;N(IGn0CiQDsW2Fs{wR;)zgkR!J&cs|Weq9Dm-_AFug1=xSYK%7W+H&S-s5 z>WgkIWw*PeSl%=&&=_Fc0=7za7tzx+v-#`<-;+KyjJ=xrjMTBAsTNFA#&^xdu7dE2stYiHJ3C2VF76bUF=T;l2T z_1Ktx0^C9rF_MY+cooZMuC1@Hc%8~&Vup}nhOorM1{)?GTfRroxcA{AJ_E3i(9cQNh^qB= z;fi3hcm>*UN2Jcq-rgSKdSuXPRR@cJ;5_AR3nc+Tl*22}o293N`J*DG)0{v2{r#h& zaC^Qi$1}hWS4>~_c6H^VY$i1~KR>_8!m}Rk$~jD9SKBT9l#Mg22I3l%wL4wOMfUd! z^n@0_e>|`pnRBW+C{kG}Xecdc*MZs#yN(v8b-Xo|z(l9c!+Y+%c&enFO-RtNHw{XE z8mA~2;lghuBEIo1=AXqbztG6wmME6a8-=L+2)bEi4wBQmORwLx z#)HC(zHrU!M6&Z&aTUnrXKF4`&4VbOVg+OkqK>>cqQIMMRw3;qf`Zl;jtZu1*|9`= z28Y~=TJ|zAa9}oK64tLVFT5_^Zl&sg6jru{JY9U}k8GPB^k`Au+ChtqnocawGyLi2 z$Rwr=w(}@|#Mox))mn~B@%3fDj7+&!T8&#dd>WQ)$7y;t<-a-oy>e&F1-7?1;8!9p zi50P5vF=kH*aKF-?qNd;kF(B1CBVmnPlK@ zU(Q&X_Y3WJTg;e!mnq%+!}IzV+pM~Y3qKY0x{vbvOl^BQ4wc2lx+7J6{S{-7iHeQ@ zLO~!^+f^}ZM5l71qD~#9e6-#VrEFU{gkK)(Q$E7Lz+k^RCe^*$%J7ozg7svx;p1eq z{Ap`TMJ`Q_EzSXMhoW!ngSL0NdUooz+^5HiGU^7HMu6=*zhD^Y0Fx-e_zcD-$k0^% zerju9@rK^wozK@l?dFt}ETcDv^i5Xv2$h4i7C3FwNHV)#>2y}LW9nFX==RRQuDSq0^+ihwq5!2)U~OUws@+w;Z@u$ z?h%uF_7fJPN{RDbD8>Y}4{#jhzDhl^uM z#|rVxwwTStWloFfvZ*5`BUS_4U(|7PdCs#gu!O#wiff!*aGO`@*G~0wV57CMMVS{r z>C0E8pOHXn!l$Wfm1Rs+M#6R%CtqC6zhamqjk=T6*LuKki*tn>ksU`;+|!Vs7B{s= z$4)4}5CfW}yL_W1HA6-{Kj|GOBk_4L~%^^QwuG>g&9P{xwQc}=8rQ3#9PjaQB)u0c5k#1c!6 zjI_a`7V|607>L%zdiJq(VU@hPo|-e79L7Pe z%!0PeK^DuDB%s&8_wOwC3EI~jV@C1nM-|duoD^mnH1@JJ@$+!7F>%>U7M+}YWG%Z9 z_X^>%^==%IgVYh@MH^~|+HOG41n1`JcBpGX7A&(-`4tLX>~+Z&zFE+ zB<$YH=RKL8qpb=KZ@5$68JkTMA7+sYTR+5wr@3)lr~8In4~^fPm(Kt3A5mAxvW9Kn zx49)qYrc|&s-og}(db5#Z_5ke%Q<^F-_b8S!D05FK4HWpyKjw<$CdbpWb1hye>`PC z|4u|is3y_7Mx)?@b9l4%IfO`ECWfnsFo_^6h+6TJDFEQi(~Uq3`L6C?$KWqkd7;OY zR8;6pcy7$qid>uqq>W;*dB_F6u?a^vQlP|d6eNbjIgb?KS`?3YUiB@8&Uwx0_S?CfP9g=Z^FKCAfgb@TbRpz7GKk_ z^0t()z>4kLA(gUbB=Dx?QyN#$|0-u3p}&V>H;yE$ektb>gZN zR$MlQF_+egI@$q~2p=Z$z41Q#UnwpOc=RvqT!-)Mw~Jy!S+DW;?cNA1l!5T2E4W2^ z7|;#3`q0YhW|7X^vzC6tw;rB8xp&wcluj%8yLbTTWfJta+p9O^U# zX{z-+k7P+5x-?nO686s?ZK3GoS%wv|9|!g|cE)RQIof7`=m=8X6n5JFOfKuYKO6sU zzP@E8r+q6+&OPs+uZyLZeJpkpiGO#?Z-EGV<1KhV!PV$|{rL3H)H(}Mup|D^A)%M! 
zQRvLh>}N4wRdNE2@dX>w%X_=Z7;2YKEFhh=YvB>Xm7#hb!y|w(#B~ndV|v|p z>UZQrKHEnjmYeNPkiEdlL$8+0N_UEN7d)8trM^egA8`U6OTLjMDbM_#OMDlgR+HDk z_Gsk&@Ppw4$LDrD)GY%WOG{oEjgJ(3gZ30HhjoE2uJfZ$JrOqn0_*r4W3!BtXNfDq z+&ZTg5;i~zr^8Oz#Nl-rC^J6{j)A3o)uh>qZ|65zK~89kId ztGZ;UbnMTaJ1+XgdJ8rU&$eFqKm*xuN_jh8N<=Uoil3h>e^TABw=*H>bzXJ4sVy=h z@8dZ(v2{3b0~rU0l8Cc&K_Ue= zxGcoXZ7Dif{rC|kCWm%o#YpCt+$db#3iqHG#T<=;(pnA-@$4 zHn*~O#}*7X0)95(;Ff!@gq1J?iv1w2@xE-}(+-*t5*Ue{(9xeQ@xcU^QqR@!5=LNu ziIHTL%GD$y<}s|*+=nh=cf|a<T0%E+4~$5rA@WC<$!CYnG=#oqDGf5B^8$t@|J)W!$I(HiTA5f ze{@XLUXuDk_35DqE8ihxNeoILSQvAF&6<#uEbJjMQwAImvC5WVG%SPKGf2PHFD5+(Jd0$38AJ2TLAy87*SGp5j6(%X^Xg7{wVkie7S<#qF=x zs+%*+z-!I}vtoL3uBHQ~rdR9y=flL2Y^{PCW=~4nWDHGhtaX9>VUcVVQA9FjuNYMq z^8&?Q^b#*k2j%E|HWx1|x2ymftspsP3^NuDs z^x(F#Hkco2=2!5r@RDl>IF;^M@J=l^pPf$?_U$KqY(JcO5>Ji73@xprsEvD;fAV4m zmd;6F{+{pRL8(=sf#ytqe}8wk*hS4wNvHnADJ zZgT|`uQo-5F0KojI8qxeO)V_SemAFm3c(|PO-`Q7p>JbqS`5=bp>q`Bmv$mn^0Ba1 zI#+K;b{&0x?@KJn4)mE@LcgP{NxS4jIFI|mNF;tWgAA7OVvn>r9RWcQZ7td225E)W zr&~zbuu20inilOXXnsQ9XSFD9-KNT5K5-((=xU#$Y8x!)meA3$)aF8y+&>G?V0<@NW1^9q0z>gWV4Fc8lh|o?C{6X1JPy`5&Qa@w?z> z8v~-hXf*Dal78sI>ERsems5=xU7;vdUin&;P_htt$o1w4iD2L|lavb*nSP&0FwPd5?>4d(s zw@`^yD%+NpCqz5Pil~NAqOjxu2b&lg{zy$Ng)xe5mFSW1L4&>w${EAZ*DM8224q1L z^2?~fQ`>;Rm<Tm>!jRlL8Us0}u?o_Z)kGzt4{r)5#Y9GL!3Czz20zGyr!oYt#d(+ z7*Tt8Z3z>tC;wSLdQ z-pw+7SqX_(=8^+(={~PTQ9>LiWOvBw*H}2xlqMf?>3Q+`q>;3bb20v^*M@Z9kv`QA zpC6un$DvWIpNezll|0||XPx9;yUF@(`Xe?4pUdt0hzK)>_x~EVl!u;X>a0MXQ!VrQ z!u6RDc@r$5p{$aZvY3|Sn-%z{_4Ij?l09m$!3v9UXJ{5$cm{|`v*sm%ZrJ^WLD-_9 zb7CPjM*N})ubMhjDHAQ4+>|~EA1vruPpy@at$0Vl7y6Ml^Y~dX1MYK_j#>N=K|GFk z{wi+~J*0*E1fLVHzr^a()zjWG1zI(mFIT384XWloQq*m^k_8j-Kkn6}=-}eg5)4Mf zKBS|5@bsk@6r9*q=$YD7g}`r8V?5+E^-dYit_u_vj=lB^ZH>x=j<-cwMHQd#jDcmU zSl~DUGv0Ha_|4(y?{7o2l?Rp{RZ+WM?LJF^?TC0a?h@aVN#@{wMGLG1h!M^gwm&|k z#af7M-K7jUR8&{8mO62JuV-jz*mL;Buhl}a^dCbZ+EG$eD{;1Mm~h&(ANg&wQl;?4 z7>YjzrR#cl%&$u_$Xcs(R@ek5>#T9=>bVnjSGX28dl;Cd3v>i6&3Tibve+u7VcIri zL&q1~M<9NfgPXsaU`wCm&XL|ZH3c$KC$Zpws_{5a<9M+`*YET}+_}<=%84mswDUY( z#_$3##cqV4)IZz-XyC&SXZ<5p)lsz@4s`jqqwx8?W6z6<>~$*b)mCo5yG5^$qLmA( zL=6{VCcwVK`ML?!7V=pG-Dn>WTSL!J@V<(91m03>tj5t-Kc|k|N=IKk`LNdAG}*G? zM>&&Dw26r%2NYYwLTq!Hni7H2dq0^~j4`Q#jKp<>FY4XGC6xv0$xMkHnP7kRO|5QxueaR-pw#U?YL*6U5 z$G?O3j!~1nJuZt+aZ96JsG_SoulDPV3b)}QQC(_t zoppJFLPidBd!da>SL;Ygpa__^E3d}6(AcW9-Eg}2#;Iq7e(V!nPkYnmF@G@>aV7FZ?wy2_nr9Y#G4b9| z0ysAm9`+nBcBt4GD3ypqn3JM_tl}dl3u zq3W0QZviUo`oz;VBDKbtBde<2tn zO@DviOrc$h13Io=iyvw_|La+hv9$dQu>;lo!dK?A<)wLNiZ#_r$`=nF*kG|SdHs5M zpKI~Ul3M0P(#%U)Y<8*+@@K~Tyai>Foz@>;hntGXz7bCVCbXA2n5oo!Jt?}f^zon~ zH=1t#UAD`wm#t&T;ptwPwwoqXW=SL(vNQc?{aTQ8`zPmTD7N*zk&VYcodAZ!0|tdu zQ`7&jT1(OqRN3U~#l3fyBS6{Ps>7yyX2KmwJ+Vk0S4+<2PDDl$hkh088SljWoD)Ci z1Z>VJY2<)66F1W7Eq^BN=j5Fe`N{ttZD`^rlYy-!7xI~<>nRQn|7p1%ZRK8tU_VdySr+A&=J`n|?%tci5g)CfP-5r4j| z9956A{FFx&1Sm%#bWkEJl#vEz1|wU=V&VH8aTb1BMe3}2-9M~q(y?vNr;xAzOf|iwAekVd^t|sW z`Kt%Dn6DX_yc5)%LBb2@_4tL5l)0((<%rdr5Mee)z~Bj0}gyZWNvnKRXu2F264e{W|&YQ!9RsG_-FhBX$- z3faPyKGNru+2uApjcT`&VGfog9K{zo;ZI@D`r*WoQP!6cqi_S*Y!x_b! 
zeIMxNBw^rF+$?(@dSA?&($Ue;NJeiQzbLvSA|YW(RM%1(mUMJH-PQK2>aDD-%r0-R z5Kc==>qHeKJ*WRng{B*@h<1k2!wJFr8_qaARMyfegZ+(zaxgYGH=CU>W6c%eV19sr zfB>rE4y;9fX|MJXjoUeO^ov zw-N=3Dj4~u8#_MPS>h9A`Tkv*w{4QjsK>-Y{?Pc|X;muu<7poGMu}2awhbrrFW9A4 zbxIl#wcQU`0iggkch_#C5DJan*!j8dT4o1c~{J{@K_lfPHee{;~)a^=e$+&d^Qpu)(IC|S!cMBe^5g14N~W4vG73BJ)OX3iB76= zwk}DES5Nm;@+M8dmJ|OvP(HlgAv4>mc6;RI;NXA?#0J6Ovc}XzYjb0B3qwOQGbsty zK^a=Y8VTwbSRch!>r6+(BO-nq+8Y?G_Q%oeLAYQ)OIS;;-3~2LY|VjQe_Q6@%L>dM zLKK<@n0s6zG3%+lR##&)C#R|`dW*^Y=!d_m%or_Hir=~5PA2Kd@3b~jNc*Two!!Y| zyk=t)`o%{^ln8}<;ZiGhG-VMM8J(z&4+{&6i$j$I;PjvvgC4x9r^te;_6iTjzbLA!ajjL-qv*4@IgM!)?{+S-eAJx4=kB2lsx>EZyhSs$YPb{Zsa~KAga5z9VB<{ z@c7AamtA#*#~Qw~mrVcBH5=F>_xoWI3*ODyAeY7H$N$yJKW&$XLNzhbSYAH5u&^*S zH5CHWm+6r%a0B!SXmtHXSoHsihGekGM>+jZ z+ATOGyIUwMj`bu5CLMVO6yTL<=WKeb^M16*8(AL{_% zkybf9&fGja@lv!X+AAthil-zeC&&I|4kM9> zNM2G_R+q`W;VSwYGB}+ZZ&(2eJiV(!cl}&Lm514vw<-ME)nN|yRNIztd~i^#4_UXm zq~hVOX|_#84MxJ|p09B$BaTIUIm^fh2r>}{J#D2DhEEaOi?%V>&CYVIBkO78zbORa zU+n^+-*AZ;Sg?1rwzr5;A6S5aHNMOZltx$6Rh`AezwV?ZHqNHEl0^3N@JYr@DKlCN zbxvwdB=@2VVnKpTyba}trId_Lb1>Wu@^_)z<1HV{tcAZxQ)kth8LRr&2s4BTT@8Co zvBa!3>+KNigDM5R$ll;=rj0DUo9@?Vo0R&}N~ z;}a7N3GI;g?=9lWLF9D617Ew%$_epWdr(86q;wL9UX}zkFF@v`c8Hfpf>X(thAuEE za_MNhO^Oap5Gh`GH$j|Snr$}}wMkK?e$s1Px76waJ$54|CbnK^Bnp#&LZNaQec-4H zn?mD6U)_?7pOu^kWg1*8$A8d(;t~>75sg8IQuOK@wK-1#1K{&Sa^=gt+oszQHmAf` ziC>j*QlUiHkMi}NHpt_iDYSo>>C%6!G=l3dD(d>&1i|o(przBnWVd-)It=jLkouiV z76<_qUa@MI|0ZP5DPo>JXUiEz`i);buve34V#%_Y@@meN=!up7yn(kxB+{9B?+?%` zK*^O>_c)7F#*QTGEc{TeH8P;m&naQ{qXz_bSp36WUPL(Evl(} zERFelT^!TB?cmOEX!ahf5z548pP4(9nW>V-a`g85^dkqXh!NEknPzI_<^#{kEhph7 zb~)pZb+-e4BzY}AU^K@ltytroVwL-?_8Qs?*XdAm>pczKzhBW$m@4h>m=9gL!(+it zv(DpGE0~*=1%A5I9RqA~FAOu)J z5M#+~vB0m++Od_u;J^ z9VRB!ADqa|*Ek8d>^yx#vTLgQP7YQ!A^7`YH-c4H_`m+9B=C{yZ`2+T$b0 zFQvX}Q%;Vj-#vt*QVm`zD=R0_p-MLu6}vb;PeN-XP@l&}CC2wPezfes_|9N*1&;)A zImVG2(=k_B*H-7u-}CWD*&reHkGs@N#8BlA4k{B(8yNOo7f|H#BB()N<1KVFkru@^ zLoQibViU@ixt)lX5xgG3Oh^J7EkmQf&9&9`TNZ5xCsIkk%p$>s0h2Q#?Ki_lllzrQ zvNX?kLjb@1G5H$-85ZV`)QZY+p`iz_kA0vgWtEQ0jcv_$^2W-#Hv7#qFI<3FdPB9W zVP!ky>Y|KI1+VsdJbWnA%o| zeo)TT!y1Ot3;}w&taaS(5BCx zVRAwYvE?NyO1d9k5eyQtR9Jv777MX;jw&>?#6Hs!m+oX5KDeS&8U@KZn^Ml4B67sZ zOBM8)N4rSDyy^{e;=_ML_r8$qUVog9%MpWakBgqg)O;5!(L1G z;-nW3L8V5Gn0Q0ktA5l={Gx=WU%wvh&)0>ApO}tjO@kUpRif)XtlFIWeXXV1&l+`X z?wa=UQde(nL+pS62UVTDs=4SrNd80v>hraawO_~&I3^`sf}@ttPTv5i!FNK7-dl0o zsc$IBAx8vDQY|RoCktt#a26Vyjo(6WtjvBACLxx4)JLh%-s0XYWwqT+vg3)n>9(G#w!=~7H7v7X_REv(-h7W)nA`jB{mH zI=vgnABxg+dzI01U(CkaMi-Y`(r0wzVLoQNdTlk}jS1^^#TUguWyrQ!mQugbAx%%d zTTd)(2qn$gi&$P`w=TaGaNHj#S}wY=vD&o;Jkuokc2>_(rf;M<%bOX9a_W870LA55 z=LJoYsrj>8x;%ndWu_TcVH57Ngz^A1xmfT+zs`wFm87Z1G}?K}_Pd#z_3*P+4vWCq zJ&I|4o@IvXWo+-5>?6U)IWBg<3X1Y$z4^^YaPU7{!=^Si(Xy^+ukzd5-&^;goll5G z=QK6BV(&2B$Mp2guOG&8p&YD97o0DWK|w(h-)T@89JDd9Ki&AEy`l?$LmV`;_mux` z4WZ?jq7Xv8a-R_}zf3;PkR6n1i7RhiH1-=V@i})JR(GjC9Qz|dO^p=}4QVO(u1V*1 z)T6VZwxT_P?-<9zz*0U3UNuj?K+Mkw5&i=RZLBOsl~UYIP6h zYq5r~uSiqa_O=xX=J4+&fZAr+{CKB$rI#@!6Jo=ln5y_s{|k|;oBy_(q@T-ivU6|% zC1DBuUm32tR7pS(yfVl&3{)Q%K%?9oGtYo8=>FaP(MQ^XDVo>+4gX2nc#w$b#m0MZ zhcbLqiI6(r{qFlv3qgZ4#PE<6wK@Hs7<>P}+gE3r>ycm?u=Ze0^|n@aZ!M2)E4;AWrCyx^tU_iWQI2fJoI}4CkY9(5WrJ6P*~JvqD&oV4x!VQ35(1L zeNz6Ug%2$E?=%RTWhl zWHlH{3L2OU)Pw;5ADF2u(^G;EH8;#P9|*L+vHc5;!u(QEGx-6(r;=lsRw+BsGu_}u z_MYVdVWFP0(QI+nbA?f;+lprD4>ex%Mp4fLSt69au=;|qrAS1&MyJTT)#;Ha^b90raPLK$SP5`Qg>o)^-;P`h9-hq5kg-=HyZ(-fy|0 zn$Z93B1S4|YNj?C`FV5^JVHG&dtgUlzuq$ zsNKc;=9l$>`7evnJnkj*pSJxfzi4r;rM@D8tfg#F7h^7R*Hj(uQkU-?g26yYp!kJV zWT^0fZ5n_EkGfx_{8tx$r9dlzUhe;El_qaV(JOaX5Wdf%L8z(=(^+k!mii^anu%^` 
zH*@=`l*wOEq=^>{FO!p#XY*ZTzE#KHh*m`C$8W{m&&Oa(W$PoWf*yvt^tXQz_9HGT zRi1bF`1r9vjKkqnM@z!@P%3;r@q0;7Q!-$h$TLtgU}7>j!A6bjMa|gV2L=P)Y3HhG zV8Nxe1n7qD?+V{Hl@6sJbnF*WT9IcE9v<&5Q~m{Cq8T6azpdvN6a8g{sQJcMX#0ga zq3Pwrdewh2@+A8azHs(v3~u-wVW!*$VQ&m>UJgn&LD|+QITbu>s8G@fuUwfWv$Y8u zDN5j^3|~4lE!XH2w*Na5qC&h^B^99q9IXALzWuj@Lzhj$m8F!Api8n98OP;sw*ZVu z>0$e81O-}At{hxkZj1oNz2_h97yr!Ro_R=!*438^VhXx{mj)^Cj!|}K$q`dbC^+Jj zv`7GlsZf>S4jIjnH(}*V@_VhiqHuVmty=dU0zo?**qENB2Uyi?t z6l{06@^!WynXz^MA~B6)2#&^!7->9&j>u8bQ_F7ftLVBAk2xrwoH(7lF-f&P=BKP~^iWvHi`wftU$CMGAz$;lhY|KTt1dT=?2#ZM(jV@+4*;VqnmZ!)q}l#P(1OQg8?#$dx&Bcof|L+OtngHU+B?EU%k zCkpvD++SbXh=fYyb$558%3S!R{}sHbJon;S)qV6e9@lt>+Idm8r(7e;WbNYJ>r?!; zhg=J%9v((`z2KCT6b)_Hk>O!BHZ~%xphaFI6eFHJVP5cmhsWEr^CkcB|m)d|HAkpOahg04aC3|qxh?t zhsCiyUMZk(;T4?B;-0E>I>ob(k%frbqmZj`(v-?uqsjd-)ksPrzq}|!U{=cB0q)gk9X-s-&XD2#3 zdiq7P^K1Un-2Z9;_J{#p&F$@r)6)$D4=r&Pnr!vk-D*%L@lkDz^dC`7Bt~V_v8O=3 zC_nZufn!H$@JB{}ug!x0$0hq8gBQGd>;$!MJ8J(fcCRti)RG#}?}f(ilc}w3%)Q2- z;o;#y`Gn3Y8X8g(Zv*auu{~nHLHow}UscclxqRm(AGCL*(7151Usd;j`=^PDNLy=D zIyp>`_bn=t4KTbiV0X4HrBFgyBdOt;!Y*m__@G^suJaMmuG0F>zy4RNj0wu6fr}RV zb>TCzWrHio1F1~q`F*pM+%EYVuR9-`=i}q^>^_YR_)J60UF!n1QVr05>>Fyu^QxA8 z)@J&@8&|}wm2UT?R}5$QcP+`kdUI^e+&949-Q8WtqF4e$Z~)(|^-@TZV&RJp>P4Rv z$tdW(ub1R_-167tk_qwt!%{z!lK-D3w`HJ%Kv+$!b9^4FpAnw8DL+1-qsPF9VirU4G@m8R)CM-dJS zGN|wfCB5$MxV+l;|L_CNDJ3zHh<)k7w|b=BJTVoQTS;OO)BW_T2@}YX;s2%Ir>CTX zqNn8E_Y@uUSJXsvfBC=sTO$=UwR;Cv&+gH^U}`XvuZvX>68ipR zq}AInaf5?{gJ+Lzw~a(an_B}iThLLcAnmZ=N*6P>7xm9L*HpMy=XdY4962J}Xg-B( zcYoa>W`6P}GFk-~P>OIf_+`8R(F8lRXOrgi#0OfYOa$YxXtdh2x zBHP{=NhRS%#hu)cRPkdn*(K%3ix;b|qLTM?GoNKRD=5)njLjVmGj_Qb&dO3Ias51Z z9qg3ma2I%(F&7vCkCr{({yiznNnJ`MC$U@j5q;#9E2JytF9DP@+Htbdokxhoa`G&F zpGrwe=H>GUIPEqK4i18rDjgkSbhrUR5HMJGjO-q`=S40AwBSs)0^`vL1DgP5Hdfpyov%9Ns{i`AJWVe zkQ%k|(VehGWQl7v4O#nxqQHDuUG3rysraZpBZ)$F>q-h#S?aR9eXEU|BMs)>G1tJQ z;q1QSEBDT1j5V|5b@8@jyfMY^QEPJlbUD|IO0qonz!q!4&o=pFEx`#xVl}~+VFGCH zr#ZUi;t*{(l_vEC7XlQ%dAQWZBp8tqFx~QeI_X^<#98CkQh2I3fOFvH)5=d!lFtUp zVE`PrxJ>+*B)g>Kk7{UU;bQ)RB_-%WO9^p1-K9 zb1Qee5?@0NWLj|@!8Os6725{VX@v-;v{ zoxGK$&Y-^FETsnajpw4MLi6VtqabmJ7hRNMVad+ql}s9DjzhI7i80^$>ryTlYm)W! 
z6R9o7!if}1q}@QsrEmdTC@8?F*3K^hi)Qr8}WmDa!_*WzP+M zHqzGzCZP;;9RW$4m>BX9uVtSEIr}Txu@hH#Y=tdFrej5V%f5DwhqJ_6t~wz#SKP=1o*UO>KBGi)U5+#0(^1ADQ-hR?k4YL_@(?zuW(X| zv_5s=Wz@6b>wA7eFJY8rp4_Q?m`PV_18wQcdQ>kSGmqt!!7cAdQfB_)e7hwtg=99) z5`O5OR=2O*K%Y?5roV>VAFj5sI^rbkKg5UzdgfQ>e_G>jeICkn+0?eelG_%jd*EJG?%D>lQ8Ck7j*q)JKUEd z8(it^ z3q{#ftiIWdA?#J{VuMPmDJoJy090~#J z>h&hf-3fceeHA_{cAJd;7e~IcHN`s#)ccb0p{x#%Wz)UczgKZq706MUihovb?C`d^ z1qj*<_X?BsM54u(JIo$jK^S}a$d@Rb660E;YND6N7bjx(If^LnY# zb3w0|F=HVMBY{yHUfFoPuRK$wrKnhaRq1iUvTv!swh>Zz)#WHGw8-K~tfa`6hwuGG z;Ml9tW0_Q2QQ%m;(DS-uXXe80l&vKcKTao)1U^?+vGP!N>78a^2P=Co27TWBVEFr) zmj1PP8XLvAB+{^^PGdr(+5rSwg-cE!B@O@QUMpb#Oc$@hJOxyzTos$c9_UzZ=5E{t{Y(&}$f zwJVO~ODv4Nv6N@qf_~=_cwuwy_RCIgqpk2O{ZQuK$qJn!mD{{U*FkQM+O%ZjG?(|9 zv~j%7r($}-H0EZfI#x=&sUxAO!&2_%yLvoqWOG6k(IC7Lo8MFzEwi+`=zjCPG`d9p zJMuG9&ua1b_{glPaAO9ZqfqPn*urrfR>|*tq0nZXx3bv#Dx4uuXs2T%WoF49?H9Hd zH`^;u{!8=LZYpWIu!HM^xyF$;In@!kUw^|vW&d^F#yC$3+;e*1J8Qw?`9?}*Ma2+) zH@OgVC?O=REk7dwbJX76Tz2}QnqAH@x)k|Hh_>q-xAV>{!_FY9CXOH$CcRpI-AVTc zx3t6iiX8e{Q;rht$*P&_HV4y{!S{}eOs8dCK0_OyOfZp!V-v;)Sf2KLdc=Wzy~>l$ zH_}S#uXHp@UQLb8J1$QmKa=b$sPI}T+;Ced2VdhBsBqy(C zY4uTijj&KnNd0>||K6;%wuFuN2Ap~`hq?tujxAG;wz8dg<6itsYJdenevb*S=e+NO z>XV0FuTw*&k281J8h;Y|KQ!B~SJndgrqnU5vt;AB&{)%& zRmwNLXYstwkZ)eBTc}6knDi8ixoNbC=#7V2c+f1b?>lv>Dh)4(Ua#+^IWnP?VOR)emieK)V+5?UsC^K(+ zaqM6ie2QQ>VrV};+c@=+NakRr$3Ap(&LGb)u?nO+_tCk0DDK1T7(6%J+#Vk~Me1E9 zEd;eqZB5omR4dz3x|V?igePqNz)BL75?i^(>sz(NI;wiG6V>~9`@SRvsfdAahRsHL zZR8}3FsL3~;3D6kJMuzk-C+xwV3%M|=r}S;BW2(3K%Dc&mP&MTRw<1oZ~<9;!SG4Q z2s~d8re0|{7<-6=jU5;qeE;)DP~*P2{h;X68Q(+O|Q5R!|Q-by~o{Z7{%cm@|u?fF57#|gSsZ7BjGF{#hkU5|b95+fU+bug|%>v&^8WW;T~xr6gfE2UB*%hR{k;H z;&Jo4YTphGpZd5F)G$0AeY~D0sYZ5I{k8b9CadO1SrMhnyUWCyjOt)6JCtk`{wwl$ z)@Dv0g77{+u~V@SGXktYJvqMS@hmEC4uRaYOHx7bKTgN^&}G%o2QEZdfgkn#naLYt zAJQgr*=9jaOIR<4bL5xrF1&nNCF040euU%qdp~ureL^LKU z(yo=U%LlQTWJC-OhW&A9^U!W}q$S47uq|N)T5JY-46#6``B%(^YtTuPF+0)a%ai<) zbkgvWtb|6)g5aPFNlI_FEK2TN2%>fdamd=iu z57al&!#p^1K(qManSiG{g)T+^r|#r+zV#po_{^s3VGpIR*e}~#r9ZMPmU-7w?$T+n-r&7<$>9av>n@7J zTDMipCLvH#Hm;Zpwxi06pYKH3Ki)=!;?B~tK(N_H4TDWwOX2DJK%t6?ihO*0O2~k7_>h;I zm$x`b?E9RCmbR4pzWyp`D+u|H53*1lSRh`JK;+T5_sJsk1&F^>A6?P|fESubm&XVKT9qN=L#B^!E;s$H6)5#`$D ze#Yl<%5UF(g+jAT1O)|EQEu3hlAh+AJl{Oe{o$gu;Rvb?jZTxr$#!oaQgzqXi!~EZ z>ycwcbUr@*=fT_XVHjgUte+bf>X`OU!?+_e9*Svv7WEs@niAw2AaY-+L7?f4iRJ_M zxP~g`8a}(poOV1k$}Z8Psi?DC7Z;agHxiS%2*_pUt$25FMgBau@LRnrytA})KY**X zwbjy5)yisTcee-%U1nW)&j~E~r{-V34{$~ehB0%Yo0Q5#7-LRepg|+MNIgqJ;$b4;bylISi_Hyrr~y+tFMTKBQ&y*LB9M_Y&~E}+MOz#91-HdAqyzna55{3J1?C*Ni)bX;u_ zjs_RDV#6AwgBv2tRN3Pu%nb~dnwma9p>yw@jEz^L)h)?F5DA%ID9(p^yu>mB6pks- z&~1fBW0vDuju{KgT>C$vs4Zb7TcYg(k7F&`h|q8Zamz(m_qrCc8Wh+CROPUZzF(Xa>^ex2pisP3^HtN3#tTkhb^ zSr0p>o`J13b2fEs;TxI<<^E@-_M3Ia(l(J}Zth={D;w3}&r_wtqQ!;m6 zTZN7c^95tIa{j)74dDCOkZlbK<_?+GCl!c6WD&&B&;d1Uws{ zP*Z(<{WKg8_=a`@R^XkFO-B3&aT9?SZN%Jm#@*5MN(f#|2S28}HIoM;LiD6-Id!Q! zM*Vkss)yG|=6C9f6XSQ*x5@C&;!}3{;ju9zdz31Ov(0@D%jsAudN9MWf5ZsB3+z)lU@!NWpd_rrn zm$nvPyAl^iN@3KNzoxYOgVLxLi$15XJV@ou98qG0Mz`btn(R)i!p#F{JEp(&hm^|53OQ)1$7u{y2(ytj@`1gqQrG(HOz6;+k^IT9wO=2$*jvtgq3kHmq8lAZWelehBU zzr;eySUhw!NHBFIO|i+ecMUShhPFG>nhf(!0S&%g@s2JN-Y~0_Y$V|0!3BgfUdn>P zQWSC`vBl zII$CW7rL(>u`pe(+Lo$^)M7UK+L=NsGO>D@aW;=-(+ zeHcW7oYC8yUILLjF~8trbjz)Fg^j++6^tVW<|Y7F^ZM#yCnhQ?Dmoe$2gf8H7tKd7 z^(_+~(aoVz?^52q6kqL5GMhil9>?e?#aV{>^w~kU@Q?uYL!ITPLI`Rd@n2V zJ%hPRfu3uz$>r;iJBi(E=_MSN?uXGAB*xn$+LN%lSIk@d3=1MEAv?hg#+EZh@62nB ztE-da>e)tUy1ait^E0X6SX=L*a!l`j?B`%*#ja&>0DD5`=Eyow!NkmQt>|V#Nq1Ks zsjx}v*@IJ?LT#7cLz-_VvYd`B-D{a;ZmRROv{^A6mI-e1VBqW;=wx;y? 
zb-qZU9<>dOVwuYMB+F+Wt<-@C1GO4l8+Ui@{ zdo74F*BoBf5gz^;Q$pr|+YDq%1Tesh-cM-<+KHwSdvvNetqoqvY%~|ba<;CQ(j;PV z;T+_rTdNF%%oCX1sSE5|l;Z_B5v+G3BzS|5dB@oO1@f`N4)S9&$dDd$3Pc4_l*$m1 z(pK#)*JMp%FXZerxZ91ThfUkTTKY#@alvfL&wyxw*!=>wDZ+CH_^>uYD&p*u~$Wj&n*yE|2`8xE|i zU;9Av%B09%wmrnA0Wr}pKe>q+2#@kzZYEgwdqSsOelR6RBR}?r*YwwMU8EZLrBNR& z_p+eKDzf3B(D=)#sm13fjqptw;A!?Z;3sJj;VWo5%noq$+XeUvH9Q4(h8r_%o5gB- zvPxR$CZ>Zt3$s(nv2ZyEG^Ptf3=0$7y69zh!V!aF&cGlgWpB_Ca2)cwTrZOzF zE4J4MZBpY}%s=$zA6=Gge>Sh&5%l+^bYoP(QcA)KyZF777DZ`jqtsJ8j(e@#H+rpw zMFWpDrNXQQF~ho!PKH}&NIfBzpNch~`vb}xXl_yNm(~|{<%h(7E`ar-*TKJW(Wu9` z^g`9se9lh0rNY8H;1%9nMzuB@pz?;jAyKaqa)*V1`^{64OTLWaN^jmX##d59WoSp8 z<6RQ|$rp|>dvaCkL#%>Z#=Ofm>Cs;H#1{~&_f>-B zfg~16m6Ip7G@88=CT2rW8&j$@xsY9?Wa)km*pOaf##!E15Zq$Rg#seCO?iYW0|eSS zOibL|@$mZmsVRnX!uZVf=qxp1*+^uqSmRcobe}VW37m<;fy(K?CeMFp0b*J$ql0E> z)GwC#Hf*<)^mPn{3*Fl^)mmp}Y7Az!WKbD3*R{_JC*f&pYNv%U=P?XNjPS+XeV@o@ z@wo13hKJ|kq*LrBVY1iS%lJHuh)>t>lwBTkakx{@O^S!`9gEkNe^VrFIH@kmdhV)b zutEiwK%9oyxj>Xq*~Y3!RQ|Y5$!XlpHDvVM-M}x)Bz`qb*r7q_jM0^RW5+|(G9d{3h)i0eIDmQL46MHPlX5PwCA(rx2>quEz;0p63nt?r zp5=2mFbd|jU6xc-9HF31jv!}93-r#cnn zw+&St9Y+nXWai`WnXu3o77(q5k_Ou%UVl*eeM&ji*Vp;JD9u&Zq)N%w&s0J6id{B}#9xiWkA%p;s^F?9w!F=D>*&RkY#KDRpIK-+sX0JKM z+RKEZ$zi7nI6f%U`*kWP(Nx2G5!ye@)M$1)TXv*qglv(>g}da-_Wd86oKZ*HTChn9 zK=0HSWGgGj8x&qA>_?3WXKWWc z1_opm71=x6cy+6_(=GORnwr$l@xAuNL1M+1G6DlYdq~U z+U53sz8!nYS|JR>Nkh9;-Ws0uQ!5*^w6z6QRaM~86|&gEvUZw`Z@~&iY5Bva<@4vy zpyNpilh)IIukU1WdiEGife*dDe{Xl;1;e1gej#db=5$M1+Q1$&Oh$RQe)8yh?p1!m z6W_X%eX^VjyVe0^S0Ro>#HFyGJgJ2ltn{Za($dlb0T~U=AS*o)n$PQ09VSjrPUGcb zhM4Z}P9J%*p)|@*-{S!$zexI}jA*pb%{AmG;^jTr)t+GHoCmoECSP2Y*SZe*cWKv6 zQIzCDH&=7{bdAHjPm(9!uY`p7`-;J|ExByQK8bZ^)2w@p+A0WT=#G&}44t?N30*mr z7Q^pjhH2M9r#&Ghq@_$Axia|^-(l=e2SC(G=M?HWO!>)FFim*u&7(tc^dg}$$8 z)ZivaMAJE6Pgj?06o4U&*z;jO3MorQ}CE zZ^7%H`wSY_6A#ZOuDKlgIA@$+FX|BrSon(RC)rYQOrPnUO{U-#a#UOw6TuUXmu-s& zCZXBdR-5N)UScv4ZqrV1nPhlm$BE~FTVE*R8=kMh0QOwc_lDaSFD$yQPRP~an>C`^ zR;*}l>vR1|zK3;SD&l*mtQI zChCxzTPn9^{h<>?8}Yur1&SV*S=?M#-_5A3&Kf#my2kKf5J(L}{3v{KEgiBSwDWwH ziDS+Af=?FC4bx0fqll*@rJa3tJ#i@Q zS?P@3sL&uiB$Tn=7qWdWA~V!@Z0D(=bvmKq>_8B&QX1E*q@;9wc-WJ`!FU7OU^dProzDVT_B(YaKL9Tv-Y`*svi zUAgVVSLM3|bp)(FTu8<=5PD>abACt_YmJC>_qW&$^Jr={XCz`j?d^?4PK3}Z>nEGx2V9o&9SJh>x#VPSs@{Q)uD3c zc-(k#lgln|q5GOolu0K%#V4*x_R%2_n;l|N_GmGyX{D?X5963isCb4YY`!axWYLe7 zk(wG%7pV8|q^GB=q98TdF)7_yc^6BK4-Cv}$2+=2iCtQaC#Nwd{=Nm>M-RKf)iYGb4U9*Pl_SJS-x8l;r8ENu#K^WC=|yvyASe0|am2t- z*fTqhr``(ew7~*Hau;3Lpa0qRVx5r6R1lXLPun%~xoadql=aH0iC8DSpb-AjQdnXQ6 zCJDZUY<13&i%=~V13CZ0uCq(DlD|Wh;sGL#7tX@6&nyYubpk?kTV4L z&fISN5c~=Xg*G(s=OP4(FGs6kmWXfP&b=W7k#tbiiYs&+W*1c zg84a!b75gUPzmWh6q0+VNfPLiX#RK4L*`+FTSO%AjY*S;dEA$UzW_w)NZB^$-|{Lm zXNc$4%D7`U4=#`&J$w`p5TMC0_TB4I;R{Xoa6 zSvlrL2jIfi&IVF~f^BmXt<%-Eg|*w0Gcz+lKn%n);7U?M-{;;_V?VqP9NT9@pf6H0 z1tUyknr80)4^~K$BAW^i`S!;ij%{9tB5Qa5y#Z|bU>-h88KZ0}HC)Z~hSr2C)|#No zO#TwPP5bQj`m?V@(?qcM=Axw><*Krg)wU}PYGtc;Qml-PsU3_qL8Ce#!)|Iqa;6LA zscMR6_%EYMRbXeDX=*xqRKKIIkx;pjKe^tOF&KT$=EIS+dj@2ag;MP8`vad7Lh?_} z+^CN0%O~pPNpAkx8NA$QE#cl!(e^ok>W!osZ$pKMiTS~q8evyG{gENtvWMXwza~q9 zW{H#azx#n|ZxgGbr9krKKqZ|weVhYm#$I>gk#rzjkzM{I)PAk=_N52 z-Yqe+o}PV@iYg0xNya(a3o~R8Ncn|yS#*qq*_4T6*|*}}n3<3RR?F}tOGnpfN;dTq zVwKXidcll};$rrS_M@N#j#~85ZJvQijK=gycPh2g3znBTf6sPK%<_EbWd0wfF~__Y z+B)lAu1vuklCN5R`UXmGWqkDP@~%u!`^o_Y&WPl>T=cV%h_vM|UGkg86@W(k<1SwK zJsYeiD*tU^REKxXxY!D3HIfEAP*esS==n71tc>x6^k(}%k6D$NMkU%%JJk^diN zWVz_nuMLSwkLZ06XE&#A5vGxwM$*7TBG`L>mi-Jj~;& zTl-R2uEEW>okU+*$sQ@wFg#UqaLArgs8~mT)SHBwOt5CoU$?e5y1!UMIOUd1fVtNI znR07T!rZGEF5II#a5n8CRCBMa9Ny`nUM`D_(sTq&x9WsF_<6#@!v1O0{%IR7AJY)R 
zOA$^Y5+WjvVqM{5T`kpqNo3UauVv*_{qk;BK3h`qoskd@$@mWGs>V$@Yy)tu09^X>CWN%t zABNBK;>8Ol#PJdoXpGK(C-{9c&6#Lu7qG@aP0t^|>bR8>zDDd2UtizKOsxtjeh6Xz zREceSR|oZuIaBGxvw$bA^)qG#)rQTV?%go909C(a(5WG3O1POHeFY`-$eY7gOkBK^ zZH%s*?y*&O-_R5<+wwtDmdlq2;F5t22MU}dK0|hOEiHZ~rms*_Fy{mWYSlK+oWT|d zW`^jn4}djVDoR862QB>Sj7`;hmE>k(bTB8SZdOMRG6q0Z zFKx~}J)F*jMlZe(m;0B*79S z;F^o>|NgCn86}l78ewxg?P~MC3{TWAq%-OS!A@^96s;ZJmZ0&6Ch2se0Q1+cq6u=0 zO?IMx-lSJFW>gXAqgi-4y>`2}NIEisn72y(0A`BiBe?+ko8|pK_BW!~xF?y#=onbg z-#ZEiM3gGTcM8Ok6PtqwIhMsG092&>zJ!4xuYiCnC>^1*J5MnCo&o@w42qY1K_CF2 zC=w#`_rHTj8jrwsS#NJ|P(=z3)E1y#Ly#71QY;{4hN?OCIyDpuO?_*8w!fr={AiIU zIhX5S;A7XAtg4kn{nQpPbVu|vWzix5y3b~!&lfFt_{8m9)YsGRg>nDi*tx+rws|}2 zA3EfHr&ZkPI7o?I_tQ%RJZrNL^sQ{1fx|+~zHC%d@(ubakCp%!t90pj)O)`(#zoyB zd1m>b+`lrjp}MqLo;<17hl6iMEm^}+FQH!^OIh_G*dYgWa)m6R<~K~KuR9uI+p7Ja zAy0@aGm@A*H8$`J33I;9B8hv}_9C=MoKdK96MYf&vvt?Oi~EKA2mSb2B}wgQ?3M>n z0lvfLFh`vg+8OUE4fc`23<6$cs61uK1F#kKXB#G4;)LS-03g(Igq7FV*BclZfM^!j z+4wB4mN`%?hH@Vn8M&7?+GzGx^@q9OwfVnawwDr*)?c>)beh0dp5JoGt*CjipXD~u zbvy)D0Xr%ZrZR=V~gc0M;BGL+Oxj4BM~>@+kw#e!yc4wB+Lj@xO%0sOmX;O^@> zwgm&u;^qLhs`|6pkAJ-ys)&Js+dKid+DDJ;4^|F>-}X-3NH7t8qr0L^MBg#qHdMWY zXwu1$y;gruw{Xn07g#lwqNhY?$hU6+EH}nvYgO}iFK!SatIGElR3FI6K_H+gZcL+e zxJ0auo=3*&U^uqlF@6zK8r1ml_i01*fX20qA7kVm0@ ziSoF9>3PkCxwgYIb+pY`m{9>*ytemJCA<7LTNtPNWG4C!^3pHEO0bLzCkAF3^n#`$ zOzZP*G!^4aLW(0gOy8*e&xDaaOgI1g@+8U0yOyiGmOHt(^BlAK{Wh#=sR8`;JF<1)}MA+m!hb8D|0t@rRURT>kT4>A^NljZ; z`VIq-=+Zi;FZFNQ1Hb>6zo$_MC|Fx~?B65K(D><%o{Y6Wjg_%Ql9NeBnc+U0oPQx_ ze!06iF+mB?t3d~w?qmf?cZ%79%}tt%uD0Sd;9oIX=Op3;+9isu(F1T z$eq;J6L<)D*#;-voQ4+impbh^D$aP>AoK3R6=!bwka-Ibny4-x3`cU5om*Og$Nb3l zm^=rM2hg2uEh*Umo$Lbq{Q*UcEj)H~TbLQhAOb-U$H`fx+v>mB3j91MIQk1tm%ai} zu;e7+WHwCWYQ*S@s|7^t}m4^HzD?9FwwzLVX`w5I) zX7gIk@bb9+_rMvch)7SU$vmz77Mfv*0y~D%Is6VBz9N*-xMMz6+gKMq+1C49Iv&n* zHrTh^l29?~q>d(*(7$q!G;UW_ZTkrZmV{&1Lc{2x5v2dAuq?R6sDePRcmK10TC6q@ z|NU+B&M#b4ZrxFL7w#06IM+>shW;)wrATQL?>bLk>|OIm%E>Bx7*7xut}{@ zP9UzCWi4nC2{7)~nxs?MTK@_4UkOzfC=wD=OYHU^f5tl1%Evad?@#wHcKW$4LBzb) zcll99mMytLZ<}qK7&17%EYZV3r8c=XLANYbxP&_OW6L2x3>en@=Z|RH{O2kGK0IDApz*@Aup@DCpY|K;F?RO1eecmxzZj7>W|Z1`ZuSis zv%BJg&Vsp&Yr^a1&Ojh|Lr&g$>i8FK16Iqbz8cap8&dwGxp%wrI7!Ue2as!k`vRyV zMgd!qr#TuQT2PQ&T!Vzf@^LNr&ztmFVN{xuG*60IE@AB#JyKIL9?1FKAmbY|hh+5Y zANw02Z06p-A6KBmQoUdMA4uz6W)jKCGNQPlc1Ore(0O?d zAWtCh6fMXo0Rd>E^EGkjlD7MD3V7x@V3ZOvdslLSa?spl?~R~ycDuWIeZRb|LGjI% z=%Q$I%-?{Wau5U9ruGABRBC!m@PGOneOm|O}20>K)v}5r#%(3Rqo@bo$*qUF^50~1Rc=O(_ z^~ul3^F9jonpT}%21^hKz^qxdEeMCJzR;{UO?T6uU?Pe(y@V|n*YjjVD~zjiDQ>i? 
zyk=%=21v5|yGB7Uf8N%c07EuANix=CuP_9`)RYN`*n-DvP^SoB;2q(N;$r3?_aOKO z3%!Ity8?GGw5}Ye65n~Tcyf^og`S@qy1iisL$(P%M2{={hZbOJYZ(ZSJ5wLDA*KT< zcnyF^&S@K@arc;5eRfvcj^Q{qfYC?^52uwQKSS z*;Cq8JFeBs>txZ8JCedm$RFas&$|hV{+Fl-^rc5cMbFvtM6|wvXKs(9iP9P6n2xVw zgyPuM5D9@~2;hM#^g?Lo3k3N}>lgTlC!nw$QaWln@X!;7YK=!v80xz0jz$X34kX?z zgWwft6HO+6{(M^jL1Ky;&mHnl%cHoyDQ@~&~V5j?1m?*1(uxzzX6`w;9BFgw4K?vKB`M!|aKe`Kd_ z+tfaCOG*qhHUas-=Zx|b+uEd44eu-#dnf7DX$*}j%}5D>xY%5f-BDIUW78q#==c~7 z9X%InD$RoC(~|m@8V`{d1xJBedDoJS0n0SN7w zTy<iZ+eSh7+@GRD@` zlclMa7DkIOK}SBwEV~heG^&P=7=oj&^bA}=tN3nvlUUC_kjW$+(4l1zP}EWxoXYGF^YZPY5>FX{5ZMu~Q< z?O$Snn?S>d#=_#;_ z0;>H1DAw2AXOCjbci&t+hKo`X^TJ5n`GtJ?{>vGrx*628qk82|SMr`fh zxKh5dDXwZS@8v7j9tKwpc<=pBT#z4p@8w;rK$hj-r_-s?dqIh}@&__2*)33pa{OhuHfB1LfQ^QvW>;a=IYg>p_M>+meOrac*zJ^4-#^s z>dFx(aw4`)7MRthJjB^Jb-4oYL!^bXyI+C)rwBd?NyVR~8=$mx>V&j<<7Ro5@UCHUJqeR&8m%$zXo}_Eb`Hhio@`Kd@(YRJpf@e5)pF;rhC_c*g zf%uwEriQqI{KLDqm5b~V8NIzb#cZgkEErgbH=9hkK!X^ax{n_Zp190**46asArp>e zCF=R z{O{Nyk0U-4V&G_LrKdkV0%%~hc7eB0fEiX?H5^|2?s#LU zftFbUGG9+FX5F)SFs=cL2Y8xb0a>J;J{i}_z&@W#Sw$$L)Y-?prd|W_TSEf-T(r5S zlMUeu(0J-E06XYaV3{KBNe4&u{@sPYf$E-(!&B+9VFX8k8HUJJb$4f$Zr(23%8Zu;Q7Q9h?Mhr8Q_8;etNXaqpPp#FkKDLw8n+oO|n%0>n+2J0> zqV=u6Syqyw9$6)jlB-VnmMnRhh>YTVba; z?XKJzD?c^$Z2i(%KJgs?=xWR7edVWm9jz(1VpFX_t5tXLv7F-OacA#^#S9h>iE1`5 z&nmdKPQMLrt<98}ff^=5>EES6gejg5o2q;BurSrGkbJ3PXSWx%$IPXdne;Z_#2vyq z5YzH<8JxuK`h2|DM=3TPYJah`IFVdV2&pzl6h*aBeI*t>wj$LIuXEmKa6#kS-_Q)n z-x^;axW=YN%;L*QAT<`^PWGUFu~-9NecuDz3q?4h=m=E!EF>Qd+aBxt`#&5Nc8s*p`CoBdAYHi7c0wBzj` zQSuhaMO2pIH>-y>rV`Zc&>-q|L`2@w-gt>T-WO+;;FA#-GQo{fN{^LihL{JI^|+6R zg)PI0ud%;P7$Y7Ag2d_ff|u7EuP#e)4?>We9c81-Roeu zMfE_h0Z^`XB6yZW`Ir_Ks6er*S)hmfF@lf7%>S}46yg76}0``l`kM}cDjQTpLeL3F;e#RyO? zwD?d?m9RS(5$Qec-y5fTswJzfoif7w%l|#1m)!o{Cmw$<0h;5dMdV0Rsey;kJ7vBJ zsXp}n*=Gx3_K$amP`ab5YuM zoj;}ffha~oauA_Ops*aZvRk#!Gd{8AYEaLy2+vY`_U@07*f#_do9C|UGrTQTh+acT zl2G8W^K5{czsucvXCoO0NY9s1yS1vayE)~aLqH`DVSH_{x%-LVg{28ca9%)OO`q|T zrKKQHOtfH{_(Fb(D*whxi!5$^+Z^YEdcU*dQ-AW@<76wnmIS6|(x7^>Te|(^$@(aq5YSJ==e(5&tSaAr&uSQd>gdc^*5na2W>{l z0?f%)bEqW|vcUS(6*qo6x#WKodrMNa~k|V|4#Vj+HV`ZB^dG}K`;ho_3T*K~M zLRP>aR6)Sk6)pJ&J-XeydyIHT-du#`M5_bkk+2{#ruF^7^M| zJ=7^~WDW^wQ=gzR|CTd=M1X|IX1EF|!bLNilYesUGlFQsH;#vhufjpkLy~KmUWJPG z>!FWLs=4G~l@!s+f$uzPJpToBRczNDN6ofMqF`C*D1%T7DPtoiI~j0UU3zQ= zX)k8%=G;e;yaNi;P)>qD6COUm_WrqYN#M6C28M?8{;V{;jTQKg%67R&wMgfciAhcx zHCDbJp2$;-Dz`HmPNL-AainaQo%$2$WM2F~?#@42=}W3L9)P6u$GlAr_Nzz&Sr8wA z%XgFsU?JN98Gtk{or(@#o7bZ!lyOpDso$8GtN|SqD0MVqVQB@fr~%}IXEhi;Q&C(KMiym zpoIY4+#_5%YU*FNU`;3>r*g2;Pl}K4$4Ey{kLuIXxwJ-2PtS%zZje+{Fl2iR3E{xr zvG{i^I_GsV1%6^xx)VyNDZ_`Oo3{TPWW*z<*5j2Fel$vf>3pcc&}0}{CTc#Yjn%Y> zC`sc>rKrBl7J!>D;a%p>p66@B*OebI&aXU9m_kEE1rL3p8pB8NTJA|o7+D7Q5cz48 zSH2|EC;^(!^)lh?sac3o7uzj;$|Tu|zCP)g?sAu-^~EO)W0x7IN5l4m&fwP(AjT&G zEgqH6mgYPC_1UU!uWhXH>Zo(xShvc5^vr-VbD-{`!@Cx;!z$s)z&2} zyIKc+@|%p3qPQ<}@r@U?jThaG=easev30&5iFKCoY6_?#DPm%xqk&VNWeQw;vcMrz zY>}rSi_Oglh~>vQbmo+EaWWv!b+kT$nEtRj>ZEMJTVkS5F*Y}9@w9bkmgC=Qy?y)3 zrW)AjaP4oVobRUIs~lX&563O1!+_^WCXe#!rRd?UyFkDCRUzZP9-w_W?|HT2NlB+0 z3}KQ-0U1$xFoku1c3QHD`&PC1~DR$7w^(T?o*t?6Vz7qDwVz1~kJg|`M11G=H) z2UFng-5hhy=TSfreC9>wmy`EStcyDi0Z9gd!Aub6(vX)^JMUrX&nTt*fU;VX>Q(+F zd~rWhLZD2i`L(9Z-fKJhcj_%-n%O{EkAx(4Cx>oRNZd7#4bFB;vXkks2v?d~oI$(j z{l_dq$lT(S&MAMq=NlOZw|_5Wr9skycRI+iTqZNL2_6be;{SiDdgs8%n&p3ZvcbmKIN8|V*mkn9y|Hb3 zW81dvWMkX5Z71*S^S#f#zjyvRGt;L}pRTUz>cXd9((s;#qoPROAj)&et(rJnxNt&q zsez$xeRN**Y;4K0Kp2*NyO#eWxx}3)l8@%%KU~b0Im0BSH2>sH2MpPwq?AV?$>G95 zwlq&fN4>@YZbQjf|7Nj<0zH-Gk^gA9KBnX&TibDeWj#POSk(NVi~pTR{Eu~^Uw(x{Qp;Ns4jyQHQe<6dujX1JYB=eTRaUB*YA3f=7e?3lv9rzk 
zPDofelFI3I$aV)48g7k`cJd@ibXw3tI$vq;h4dk{&yHI#!gcJQ-j9F$u1j#W(q zW=?WF20ov5UG2|zc|K;9YdQ61Ot$M|Sidc9w!NlYR2qL|zc@d3>ZZRYWps5|JmWlG ze&j{1Uv1Mhr6czu=Iz5&a$cP2wzTZN9g(W0e z1zo(n#{kIAzY!k~FCIN`@v>SD+Yc+;FKss2A2ZEa38|M+Jz)AKM~wDS;ozbyOmZ1nfVD{2 z!SV|VKzt1Cpg`t~`fxcvhz}P+V3@=7;g9fN5)!tDeVaBbeh`76;6c`hv+aC7UgdzV z>W?OkMR(aVh7ramMEieNwB{bd<;WOtbKI1}1pP9k0x!ucNz|Q3B2pL0o7HG&dRD-3 zYQy$o=MVQT(d=!vb^bV-f<|44_t3?ngbUCP0XOvGkoY2|0lLo@=E)@&=w%!^9Grl4 zJ^Y9xhL^X_g)k`00Sod7N5RfX405E>(;%~9@^SiyjE1DlhV22^E#M_?8{&PWlwuOp zy2Dt6VM%W#hTmI881Ot^SyN?P%-0{2^U^kHRVdZRN~HgPlkpe-=P~G)aw0w6s@mWe z9~$)bIp#fXFLA6c+elETt>IpO@ze=TAa!(kbX0{=%awkE82o&!c{WQWe*H1F)(*x-PwiobO zI&-qX2M`bt9FPyJPvJR%L2!L{H9I5~oWIGCR#3I93AJ6~$0sG7ARIQ)pZxm&jI0ye zqOMlWRhXcT<-R&{A0z~=Vf}q47jD=A9U)3((210h)Hy6FOY-5$0RkV~MT1nNO1u%? z0o0s#^32{{TLT_vxe41|i)&p6XLVk8yRYRCH*=yxKKkP(VZPXOf+N)~Au7x0rluEd z`1ueQ)ZO^&|7PXKJLGCPwUQ+B`2f$rArKvnhxHrdR;IrYP#>x#d5OPIweKm^MZHJNHO`eG7>D0k4UXe*e$vZN6>2$u z5j}AfvD0nm^5f#^)cx<0gFSd9_;XsQ`cYs3#D~tz)Yj6{dPF-)c-G+)7rc0_2)?uY z`%Q@$&QN@z@ptSuStIKgBTfuh7*GhW=4GvCp7rqF52~&D?|wJl#G15jsHa@e|TB9ata@)zZq!=xwtLUiZhR zs~EAoq-0q273&5v$W&~z`AYcGgcOKvoM8=J?z(Unzu{^TOj*%6HKw;|Z4xa)NVpDK zxeU1y?)Mf7F1~l{pV#~5!@G4~W!|S*?iL34Oz7U9Sh{5IdJo;IV_vn2l{({|ZD# zo>NRznIkU9p?IW8paE6Xh~F&yGk4ceTZqOtM&;2PkEHd!ye}})iZW3!BdaBhD&9E@ zipE{sNM0nKY6xaVIZE|Wx->oWa4fLb3QDr;2?FR7?Tc^dt3KIB{SN%QDq(8R4(vZO zf2)%hRY0Bc^W){-92i{-#8CR=2{*8Qu_gzV&&Pu2bnjm0Ju84RqD?5%w#=n#@ZM%TXVBcVji*m$G(}&GlemeV zDg*B8`!M4LCOAA&XxB|+3jKZ?NFt{D*%*Y+ED?BRc$cC#pVxb|tp;*v8VHz}nO7RE zmii8PZ1bNbedCzHE8k-kWC)#?9ot}~sDSqyowtjMwRBA)zL``HmU9!t=VvUHsw0%b zwz|o_^XN#uI(cTIZs486Pm*xnk@%3bs?V6S+13N&U85S{2|;+$?cXXFWm@|Gvnb!9 z@p1F>^RG@$60I@Ay5BBk$EB+V7x6H_@X5_DctQ{W6vmf`k% z{n?xY2|4t$eMS52XpT}x7$8@~9u&FmX-g`E1x9RPlrHXJW6%z4Q#u}8z$gVtxgw+0 z`7)(Yh#Tgrf}06H3l889?Cv7l(72RQIU!g%+C29a~Nvbc_hK(q~>IOVAu207lXmBD~L_zJ?jSIGTyjQq2d`Qgzr zk7DZPY-3vOG~^mAUPH(Av8= zLngDqAQB30)#1tJrVxOvATt!djZ0JnKM#(03zKzu(C|d2;!`!x%#1HI9WS=ZOJeeC zs*#wZzSH$SW*qudu&u9q8WZBZu<{Bnopl2WaCJ`jFXK7dPylJzhQy+Urt)^eYJI&H z2P`B?wNF5jk9DLEeixrM?W|)Cug=`BRJ`-N$;!)2`SNm8X3zz~PjQS(GkwSW`M6!F z@X*Pu?Md(TsXBkT`+qDAoUSeiT0{C|Z(e~`M)*ow)XKJ+Unez`qM}qWjjUf6ExOWN z^lw)bT))IdS~ceOyzpeUo@edi8Bx5vK!CZdHg!Jjo~?pt^MJluDfg`nD`TZvS)_SC zO!w;BJCXU}1B2`9oh4T6ZlN{6hd3HV2BHP$WNL zGYc;{N|Hok+2Ax|;xsZ0j3Ax3O0t@afWmHkH%zJyp2zCh`gf=KG>60wfaH`{P_VSL zd@a1w7|{Fi$!SymYptrP@nkaT+QB?*an!}FJiFw~J|!Bus0t{(+SJ4G1H2j7{xi{I z2;z7;13)R|WdYWwfr$4BJgx%7h4>Xw8%jRoX(*IR=U6EaRpw^S0yqBOyP!I|G5+6O zg~2O8ZTZIdknYeFJjzM&m*u<7q0JW;p(o4_@8;W5h=OR+JvL|470zjK6 zxq2j|P4wk`DF-b04B98&#D7)2Yn}_#p`R59&rTWlIn8QNjuyZI`8CZ;0I z9A{2HP-c6&s+X`&-v9sb1Sntt541aEPV48T$s3=o=a_0O2}=2$Y-P=v`l_C`+Ta<+ z&Yrf)#;lU&;Lg}&J9dht(3tp!6`wN;mUukcVcHnLAh+w=X$2VS=csrF?C(f5KVF`m zpQ3d1g1BMQn!D1MZ*4YpQzGPk7r|>2Gg)uxIazP!RE{Kh8IERtGjxq~X=dQ78ag$} zQ`xiqEm<|Ri~Hp+6uOdymPw0olSq6FfzSV9=Hv?$Fj#jcJTgz9U%>8{?!1ef3Gk2WiTd#s1s^Sc0Dp zor(?~w?N2LtKCo<{5^!&iu1qAM4X?~Zd9#SEp^imJi z=6+{7M3PSENzi&FK^KuBZYF3Zi(Y4?yLWflh9xSd5|zRHJ(kST^YEbKjhyIr0duHm zeVJZI!`t|HJyej#nb^KP!8Zae*zv~}cfx;gsZ3TNQaj3l1&522Ps4W^PPpk{S%Im8 zbP|CjRsnBNOcL0(H`{ii6^(Qk)IAZzXXUqutt}M(i0QKx-ElS0DsttO7s0nB#{u}W zu12ZdHtK59`(Irv`D4pHp|+%*ARZl;nftC+cR*_3F6?U++eGRDJ&|}*E0pRmkL{~37c0+ta!OaXeHWu-c2^L!tBdx1T z3z(wI?6Tq{6yhh~n%f=e)(=R?d>9L!U|)TFMNs*fdPqIS3%X^`LPJ4u`Ub?{Kxch_ zV4Dh!q6#Kj1Q#z_EQ}N2k^>{3Dc{`mjY&S5lV3F22$#*ges2sMNhMs3L zE{Nv&L)w@Apj~3Qn9u?YwR*I5XYXy(0K{6{C)r%Sd-45G`}C0M9Ich{bSo)8EKlk2 zf8-8d6JBoImz%OK5^pKOzzMm2T$Q9m3zr513e&s7OHy7R$28=}Ev~PbyQHx7Z+TA*3etp_5PoZkjA<0GLaUW{! 
z7#tA|HP)1^@?L;k9Dety(GW zxDwzSX1(c~0s-LA?iAsk;4^j8E9pIn$WsIv*@d)gHIJalY_# zwQH}x#MB%Hf}8!HNKU>SiK_OaN6r%-{jS7v{>aQjP{%ZljNeo?>M~YXEXzJ{NDc-4 zhomtkw`ZNYhKhjS$QnN)6}Q!6Hqb848J&rYclX*)qO>F9?IS+LMJ+8&%wHPBFaZH=%Q4|;0cw_xaUfEQ!NDO&{skR(0?w}klc@XOqNDLa znO%wvR5e4@+xxN^1xaYD+T0hPTq71)-g;#I-QFQ?J)ikAfnb;-j`zC4N2r5pv#NM9 z4GgUA_dmqbB?{zA6dj?3BgYg#LBVxWN0zz62L*k-9`{@c=yWPiM(h{zLt$4K%ZP)Npao+uf-^*;y&1P_71ED>EpGLx~9#0Dz-(N z+L%1i2oj&_+dS8RHq?l7NbzLYOg!}Vdd8p@Is~7Gim#%{D4CLZTOFRX$ZJ2UVTnm%$x1P%c9B+c=$U&be=pb)2kk;!#@(AkYc zWcL8CtPmH6s8p?%fbH~pZ>95%Yt6aYML-o%GG2GLUq4ctpuQNf)1%x^uQQP`o#V6qi>>te7l?$eXeitwKsF*n zxWi*a212F5zb7!Dl|=+VAhZ;dnz~XXs@;Wt7uxRUR@<8B!TR{EPF5#fiPfjH#~k|h zT_kNAE{I3SK2e*33YHbqyO|-T*QJ3xPz$|-uqspVG^mG;ZKm2 z-H61`=~vF&J=NFI!TU^>~)cX|w%8v>o(3NZht&bW&8Nk+ zU*Z3?0|2Bt(xTORA-`zm=cuw*K_WAE4{JY;q{}lZ9iNGO# z7;ee`Poe_P^9=te;hfLvH&17}ERlo8g1dOT_Xlfd9gQbRDpB<=UG(&0dWZ+Dkf_Hx2i?&n`{<{K?8Je&di|XjeXL$UoMI!nma! zrX%NZqYW-T3^#n_1!;S5aOrK0h~9q&i1NaEu2oH0Hku(Xwor~zXiRo#z0-U+N^XK~ z_4C?|J>0dweim%DziG2eXkvF6%DPWt@lxDK-{ra1B+!Wq z(3-b};Tm|4S*-BAke`k3G_U@h9&5Xl;-}5Np^-Y13kq@R@{XWs@A1~ZN_29MP!uC3 z{+yCUU8G|Q;nL9Gt1roM7!9*+7B~5*=siN0d18kmab)^n^p|9xR_a-2srF||gJ@>u z)Rkrz9f^DXjE%e&V8tFa?&&f6e(sy9z85OC>4T$Ur&p@NzS=;zbqyOi;;cu|X_K(n$){*2$uhq|D@`gXt8P zdeYbNnwH4>d2_MHcE91efeg%wL(p+*zyRfC)7RMgefBmZBY>A@&P*Z<5=knn653QuZ z60`;k06aa%zg?>KFoF55E8M+Cz7eEY=s8L|U9V*++1&lObUTZQfittEz}Rt*#}jx<0TRuk~jdLdvj3 z)3Oh^2=H@vnO9U~!j+i1>V20YZjummRBD`UsSYelM^CwwVFKj%%pHXnvvLYcLnA@j z=k1S5CcRSQ2ZxHFzIddn zfvU8y#i}Sli%B$U;UQG+tsPWFwITEKWRn#84svH%biKs87KB)3P~DSqN!u&~yQ3fY z88Te~5VWP6fO$AWOz-6Op}O{1vnZn&quMU!Q(M8h&Ey`}m6EV`)%HSlALg6t6Sh|p z4vJ|^?K!jBZ$VtRgT*u$x-plscjYPC#Ia$Q+9Q(N!cpvSd#bJN;H=}Gr_g7oUJ{oJ zRw}J_rXV(*B?Xd3D7;aLFb(W4y~3T9zlqcO^A2CYf&aM*TimNj?jNRCCLZ*}58{~vYYhj)Yl*wvko#wk%$Z0kgO!)hSd63l6jG3eXErq{otC~q4gM7n zYStI)9Cz1$e$Y_FCWqIS4v%$>vPJ2Lof0r=O-7FT|Yd2HI~)I)zpt= zzfaOl)H2mI&Eu!h$e0O1PQ!d?pA{DxoZ)VjGV)DX?GA>6_RbN5i!+Rra`0x9oZNl! 
z4AIgn%hx+U&L%KqVr|UDA#gAc9oSMz6XpD&WZYnvzxQFGH0y)NslX}9O<@}37%3X4~0TUB~VX%3ri zaXP*hs(x3zUzg+i-ap9A1bT@`S4v6pOJ)0gxiqnNV7aVJeY{Pnzv5#THf-)d< zBJhL)vvsB%b@;{JN-1ZMs)AyM7zxT1bw1}cjHBNLa}S-(9sw-SJ!QqM`Dl2=ov|*X zeOO$0eFQj8DT4clA_jaAn^m0>Qi)0$LX89?yIP*vQCLXViI2`Sc9{wE`8wY`PeO@68xSB- zkSLY0H&Jqow(>Ms_=dJGCRE zadas29bAZd#reBOr-l-8c9B1e`5{rWur0S37^W1JmR&vFu3M&G9{2^npMRG^H8@vJ zXXl!p8JkwmD5rlhaH0|*H{blltc1+UssOGw*dTMtYw;*=WTN7@c>~>{Wp33>2uIM@ z8`=5(!&{P3)Z9@Vyo>vijgFJU#KYa8zA`o@EiEuri&$Y*OpAmF*hwf$ZgKtB+2RAD{f4G89x6&BQU0 zh-RHQInr9O!Rj;0Y7A*B3{5R1Ov!x z)I*B`?k8|Yee(ps?;<$fHCRX~eBh#AhU7QGYXM3ba50IuoA;%SIG)c$VrtX6{1(ET z0!@WO2y1G;kE(Jr;7C8UW(KNF*zTQh1eA;g9R_sDjcy5don@E{C0;sQTWRI+G^S5Y zL~FV7+zNGI;&7VdZr%Ux2K|Tw4KJsb(j2;rlyZ_g^7^x`&ZDbHz+ItAmbvZT`lvi2 z;t~A_rpQ__<1l$HifeaIL31y z!SB4AHV0t)czC%Bx@9Dmb^ZfOF#7>Oq2ArU54-t3QcKjLOJn$qxqoarh${%(u23@a zX3`MN#I=p3NBh%mId@UmKQTb+P%Y*Q`NsEVWm=g$jf{O*QFU=7;*G**o)&f zr6iI`a&(7pQ95;mYql>ZnXpH=Q zJ6xjrWI)lgircmLXie4f)p3}*LCYBPw|bVSx;l-d86RXr=f<`b%d3~v>8+S#9ww{g zDx!c?luB|aVp46awtMYLG8RXl+4vS6x0bZML>i5xDIs)v5Y0lA4Eiy7RGaM5Z`t{VABMJV zj>B-_FRL~W?^uKx0R`pX6^?poWio2eB|>UPj$vW6`b{RRtjCeaYBhjb0=;WO!9jdl zgGKK6v3a@UA-4%Xq$fLeF7L@t@x!CkCpFRu4V}gxzgLn)Nt51?EPfncss}(nepSOv zj^I%6TVc%O&RC{Yb3#BIiExVlRr+klA~ddhN;Q{Rx2oYhGY=gg>)wq=bccCQPgUXQ zC>4z4gYIbL0ppu?9^rUT14*EW%sU$1_(~*4F-jAMeNJxHcw>uPH~oi+p~W2Exv#y) zt!+Vb#QpJAIbiB=SFrvo-l$->y5rNZN5Be0V~ecdj{< z1$uY%hDuXtU1j}cR|h+IK=*#*j#>#9zr7ISycRb>61Zt;F$zF77+Tn~Xx)@22y>p4 z|8QNM&OlZ1M>O+j?OU=LX2flNP-A5OZJdBl0J9!f_jX^h2d?8sYu(<8FwR^yxNdvOp#TYq-pZvo* zg!!^9I)<^3{n}5sh!zCIFjDE(mOM_XDwDo&xZ@`HD|FDIj_>T2WdiKfU*zY?p zHFb1u-y^RW&#YYVhC9AvLbc}MgcPQM)WTc09N2f-WlcWPf7qlg!eu;J5&%G{2WS`T z2|V!_F@Dk^H8?sUg9BpIDokyAlB)XE5H@DOodY8`z^@u^B=z@iRkn<~Id*2l=9F0V^x14Tp9b;ZosUUZrNE5zE&Mzj_1u2G66h zZ%1c+n5R)9zoIh8Ynv=_AOOUq#1+!)J<77u@$2^t49wXz;y}LP5tlBD!11OPP0 zGCF1}%P}C|vWiTWK6rfQ!}fsqPy@I|#VAVC^0c4vi<(+7sh2JHCl*9am~%BQca|J%gOQRr8aU1j=Q;l}G0(=l6bV(|UYHAm4!XE6@Rm?x}_(9m%> zd=$4*!b2q*b-LE=0YVmar{_DdjFdPJ!s8YF$m(n)Hv|CR+#-hp0x$#3fVDxi9uEdG z*U$iFX1>yD|FY_gHT_|WX+T8>2_Urc<(V(TQ&~Jg7SHA<>QJbgA!SLYZ3g0$kA;xkq`g?7`$`^52?FU+J~JHQ&H0vX&ugE zR{R2Q=$m+$N7vlTk2tfo5_{lL%F|1}0dYy2SWR87I-$dZt9F_TG;sG87nAJ4wRtq+ zvVOzQjg>0}0t`@4`{07YYZ3tfVH(GHg>`}Gv72roN{8=jOo4zue>TwccovWhQ6VId zA`J4*&)mcfC`d?0=<%Z#BbWvQip6J?q?~%=T!JK&}2+e#_)bjh9RkoJrnP``{;S7z31T(t7!Uh%}C)lDrHja z;>32-=bwI-^vKI1LkS*z*!Y&9AOh;MgL*^=Tmy#3musz@%rFkFBI)2xin#nGOKn)t zpgzC-&wY-fPywkKTHsL^^YH3O#k8;hKBp8EYAmcQ@ZSJ=79J-F9Qjx`G(`#kMmB!Y z+1X_U5iEXHo^WMF4k%!*>+>|}J!#f{liT&V|5*@<4RKDc{XCfyQyj#nxv_(t<2?uL zdob=s0b5JXHL}R)TdiqfmT$aGcb+@mNvX3GjwdAGTns&lMn136K7RV=a<+XZw=pCgLSFa-GurPF3c~m-o)s3ZwSBjRkn>j zJpV9neeAMw7g089E6dQ1QJEr?Q6fN&FwOzGl?4>FBs=ps^8eb??G{IElqd?IbhFUQ7JHWh91tCI zBQd)WnT~pEMYFv`^4kD`@mBzQ%q4)v$!RnKc36y8BrEk^=@!i3(H@{Dzl0Xm<}aJ>`t`}qVulFsT% zppXzajI=aH^Q6mVjb&K<5qc23Wn#T|F|G1a;NTLy?+38vcQe(La*d)0K^)Y{S#2@wf(3s)>Q z`fRk7>uuXzk7=?KGGJc$^&SJ?E-Fo|6yshas63QaAnl^q_JHRcUg}LSV zd8!Pd-Zz0?Xg~7b58G_@CQ&@2URbp+`jst>mg`nAF0m4U9e`>pt`yXpea;48=@is)y-_1 zu`+zB$>&9dp=tIeqk)U>HIAv3LwS&wv+V=-3-FvbAFtZR`I;}5m{RV7edF$NASfoc zELHXfnuS78qs4;9^YFU#yYy6fq(5#6$ScKoon?Kv_`+pvFSi-SB+NZMb$&9>vBr{c zCd%hDJsjCuLy3F!BTwoO!$?vE$!M+a0_%|}_M&rSuDg||w>;ZyBHj3bv*2|6{{s}z z6~F+9$i#>!y=RwCnqJN|4e4?=eHfw%w)6l#?>bIdt$xoN183CxXbkPJq@wa4b1<;B z3%5}4Q;G8_*rGvqG`Rkc2YY8Y zK)Y9FPx2u+P}2DIT%mO~Hh}Vto_eOqFek5`hJfrDddSW&#sGX0)58S@OvFt~Rzhdc z046_IUte$eXGMoQU2jy!W~4=`1sggfWzdS4uCGdYZ{5$5fVe5Bi!uYIE^WhDoSX^T zyq}Gjfb5-#z_xt=faBmGAu5xpOs+(cQb{3&knVn6ndxjIJ~XfCVHcE4+qtrWA46DY z4!vEvfHMkjTd$b*`DQx~I+N1PI66XR5;?T%3*Tv&WRX3!XL*60cD|fn;Y*r{LaGi~ 
zr`Z<$&rmad*R&@&fuO(6@=_FV*9Ugl%5<@1H*8Wzruxoiht{Ei!Q@41H!^Pf!Z2ly zjQo8T$IafjUdG+nmj|P$PS+LpqSxlx;0ABknE2T-MF)lcQ4L=<=+IVmdoS&E6+p2| z++Agnx%y3Z8*;BpwzH*i007Xz@M&q>9RC^bQ=UBk%4ov))^bp$e(XR1jQVt3to$gy z+Vd)8u$DByfk}`FxfHGkzJ5Zfh+I4w)#~$65iVDZgv)lJuGPlVo^1wn0&wsRisaDp zgl-ek4NVh8tX}|%n_Y;j<7Iw#>m93fXYF!x@BmrQ!#zt^JXR9KM%+E+M(247@h;zy zl?#6%tBW?;ZC?4RPw%uNAN(t8mIdPsFtxzO(w&P$J9&rOndZ93xgceLPhtJ8_q>a{ z-FjCs2g4ez>tksc&91kdQHi~HDcW3Vz@3}aZ>iAO_WIy#BcG(b6y~l(_ZO^DcDNZ& zyFHUKQE(uvp7Gdc+t_NB>l!i{7P40EnUz|E z&NVRkQ={|jV;3R7$Yn^8m{;5y=;sJ*`Vf=7vz^{Sx1}S}&%QXIxqqw-W48nV?q#Gk z1vhZj*WvUE&V;dUvY2KhM3sI}5fr2+sZO3V84hGbz5ltET9YnsTxJV}{{~XPh32-$C$>%cNUv@};PnCk*$b=+ zm9+5UD$?j*0DRLWUAVTA)E-2A5kQ4w76c z(F5`VM5PekZCpOx0f3i&f0ggXQm0Reg*uuH1g5HMhj&k>;^5YaNfYVKS%U;N`jb`a zU8uzUWQNL_DUerQ8cR;+$o-ht*O=CcD%6^`fyTsHY237iq$GxWXH%R{s}!yKr2*J= zjt)kfi_m!vVcf}k%&qqH6q%m^vSB?Y3Q0%|Gw>cv zPId}6H-J46(;+W4T%rMdjPF)w+Vh|C%Xc?eWwZ)3DkaF@#u47UsHlIc&fnLe%W!5X z0X=In5|quTFWXH-;#~TFeszKpcj1r)ZNSUUIYNCmDb(vaU>Ef^PJjYLZfNW5SnoIa zt_+dv)*Y&?u&XIpvV>XiHjD|O@Tsci^Z7KA`cc9%%*a>83Kvn=AEw6|mcX@17$mw! z&MM3#ClL9D08f;6T%HA-_YM{wZZVYI)ZAr29frPk5gByX$96?UK<1);br#0N}gw{(sKXMyo&LRk6ssrE8u5g8=awn-ri&_xDQ5)rP0v|h<{k0=ptHWZBOM2;Sm0bWg zUF37_`?AFRC4eC7%++4-G$9V%Rsn(7MGL*42pJz|w{YX&9UHQ9* zW&Iax@b9Y=w*{pa@2OeV#MzybD^j zaSqR|%qr>d=DJMR7#8OC$%##qMo?;-TK|2rb9rh{djP3TA)LZmCU^t?UX z)el?M?>dqenP0hid!af|Y`-5oA6L=&7jfE7GNJT^&&wLA00*TDnMs*+ZxAfhet&QMrDCA|5Bm#X-xD(Z&iQEhFPb;hYG4uip2 z8Rj7>yR~BXvqzc#oTBUfXQBSL9tCvy4hB|j=5T9%}AF}7TU>P5FpPfH^5%G`eoh)GYndNb5b2TwWl?qUs3lg=e zwiZ^}&5^DCRaFUz^WP`#N+OO&RAkIwQL|7uF35k`D-5tZTqPw~HELTO?Kaxn=cte4F zArhRA1W#>i(`dDzmDP)93q_ka34t+96l`+tlx;Y4g8qyI7AVWeyPSomL)9YVwO&PP zvN(*KLUvYnzbiu5XKKiJYaeg;p2S^#{B8cp^wKE-$Emcio`1rPqZi@Or+Gd09R^E9 z&4m5QRYN^chnn;*8mfO7?i^F^g8h|ndQ$Zb78DRul9~O(Ro~#%h{%zQj;teweV`JI zP-OIhrb64Ya-V5wRMAlWB3APB{JC_`%Y%inX=={6wD2}y!W(RI$u72i?bj1jz=mbb zV#X^V#~Hhv`GAM-sLuw&>k9!2t3pa+nY~uR0TBpc-pARAaPjGF+c~WPNmPqB7nQXL zWWsr_?{5W2P+ols|NY_b%9u16c#znmPSf$m1CEhyt;uFDw~|h`)X=d>*s|!nhL6zm zp3Qao(qcouM5SsW6`~EAEL_1dczUd7egP$KQ-0?9{s0~Ovr;-0TfO4G@xG13 z!?eLLdZ$|}^SpdQR{sHk9Ezvx=F>i-iu-O7VWsBX48A}{*q%*6V*zEC4aKUzb*mp#=|R&VNZjbI$#n{+H*^+Wx?{oi68YsVPS3VLU9v zHZTol2V&HbFz++D1Ia3j$1f9!{fZ&~M`n<7?ciIbxyPDfQZ4FtFV2_j0Q1 zQxhE?^)?~PX1TzczZeUvwx;>#A#gln=#Q6Xv$X^%c>2YnRA`Sw*RI4X6+L`l>H-ks z75BPa{IiLLMU!-@M4WkGMu7l2{bJNa)Mb_9?m~D(2SO%3p~^D2lsoLHr>iQ8(t*D$ z%dF0sRG8C(B3KBHkP>G1RHJ2pjo|4FbL~lQEpa{w;Zx#hs_-8l%%i?{d~V$2O}y{U^<64&Sb+1 zU{*Sk#mjgh+W8vZ!UvWiO@{qM%MtewrgYX}42@>=4JyiR?yt%xQ6Rh5rt=&E_`fjO zv9FfL&N7+kH#g_=O-kx3lB+d!>&5Cfs8CTnUc8(bW8OHM@8Hfxh~7DRKgxq8 -const int power = 8; + +const int power = 16; const int SIZE = 1 << power; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; + + int main(int argc, char* argv[]) { // Scan tests + int sort_size_power = 5; + assert(sort_size_power <= power); + int sort_size = 1 << sort_size_power; + + int num_power = 5; + printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); @@ -40,28 +49,53 @@ int main(int argc, char* argv[]) { // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. 
- + std::ostringstream stringStream; + stringStream << "Compact_" << power; + stringStream << "_Sort_" << sort_size_power << "_" << num_power; + stringStream << "_naiveblck_" << blocksize << "_effblck_" << efficient_blocksize; + stringStream << ".csv"; + std::string file_name = stringStream.str(); + + csvfile my_csv(file_name); + my_csv << "Compact at power " << "Sort power" << "Sort num power" << endrow; + my_csv << power << sort_size_power << num_power << endrow; + my_csv << " " << endrow; + + my_csv << "Naive block size" << "Efficient block size" << endrow; + my_csv << blocksize << efficient_blocksize << endrow; + my_csv << endrow; + + my_csv << endrow; + my_csv << "SCAN" << endrow; + my_csv << endrow; + + float cur_time; zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + cur_time = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(std::chrono Measured)"); printArray(SIZE, b, true); + my_csv << "cpu scan p_2 " << cur_time << endrow; // zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + cur_time = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(std::chrono Measured)"); printArray(NPOT, b, true); printCmpResult(NPOT, b, c); - - // + my_csv << "cpu scan n_p_2 " << cur_time << endrow; zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + cur_time = StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); printArray(SIZE, c, true); printCmpResult(SIZE, b, c); + my_csv << "naive scan p_2" << cur_time << endrow; + //// For bug-finding only: Array of 1s to help find bugs in stream compaction or scan ///*onesArray(SIZE, c); @@ -72,77 +106,94 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + cur_time = StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); - + my_csv << "naive scan n_p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a, EFF_method::nonOptimization, true); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); printArray(SIZE, c, true); printCmpResult(SIZE, b, c); + my_csv << "efficient scan non-optimization p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a, EFF_method::nonOptimization, true); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + 
cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + my_csv << "efficient scan non-optimization n_p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient scan with shared memory, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a, EFF_method::sharedMemory, true); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); printArray(SIZE, c, true); printCmpResult(SIZE, b, c); + my_csv << "efficient scan shared p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient scan with shared memory, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a, EFF_method::sharedMemory, true); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + my_csv << "efficient scan shared n_p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient scan with index scale, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a, EFF_method::idxMapping, true); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); printArray(SIZE, c, true); printCmpResult(SIZE, b, c); + my_csv << "efficient scan idx p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient scan with index scale, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a, EFF_method::idxMapping, true); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + my_csv << "efficient scan idx n_p_2" << cur_time << endrow; - //zeroArray(SIZE, c); - //printDesc("thrust scan, power-of-two"); - //StreamCompaction::Thrust::scan(SIZE, c, a); - //printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - ////printArray(SIZE, c, true); - //printCmpResult(SIZE, b, c); + zeroArray(SIZE, c); + printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, c, a); + cur_time = StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + my_csv << "thrust scan p_2" << cur_time << endrow; - //zeroArray(SIZE, c); - //printDesc("thrust scan, non-power-of-two"); - //StreamCompaction::Thrust::scan(NPOT, c, a); - //printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - ////printArray(NPOT, c, true); - //printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("thrust scan, non-power-of-two"); + StreamCompaction::Thrust::scan(NPOT, c, a); + cur_time = StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); + 
//printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + my_csv << "thrust scan n_p_2" << cur_time << endrow; printf("\n"); printf("*****************************\n"); printf("** STREAM SORT TESTS **\n"); printf("*****************************\n"); - int sort_size_power = 5; - assert(sort_size_power <= power); - int sort_size = 1 << sort_size_power; + my_csv << endrow; + my_csv << "SORT" << endrow; + my_csv << endrow; - int num_power = 5; + genArray(sort_size - 1, a, 1 << num_power); // Leave a 0 at the end to test that edge case a[sort_size - 1] = 0; printArray(sort_size, a, true); @@ -151,21 +202,28 @@ int main(int argc, char* argv[]) { printArray(sort_size, a, true); printDesc("Std sort"); StreamCompaction::RadixSort::CpuStandardSort(sort_size, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + cur_time = StreamCompaction::RadixSort::timer().getCpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(std::chrono Measured)"); printArray(sort_size, b, true); + my_csv << "std sort" << cur_time << endrow; printDesc("Radix sort"); zeroArray(sort_size, c); StreamCompaction::RadixSort::GpuRadixSort(sort_size, c, a, num_power); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + cur_time = StreamCompaction::RadixSort::timer().getCpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(std::chrono Measured)"); printArray(sort_size, c, true); printCmpResult(sort_size, b, c); + my_csv << "Radix sort" << cur_time << endrow; printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); printf("*****************************\n"); + my_csv << endrow; + my_csv << "CAMPACTION" << endrow; + my_csv << endrow; // Compaction tests genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case @@ -179,67 +237,85 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + cur_time = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(std::chrono Measured)"); expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); + my_csv << "cpu campact no scan p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + cur_time = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(std::chrono Measured)"); expectedNPOT = count; printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + my_csv << "cpu campact no scan n_p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("cpu compact with scan"); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + cur_time = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(std::chrono Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + my_csv << 
"cpu campact scan n_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a, EFF_method::nonOptimization); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + my_csv << "eff compact non-opt p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a, EFF_method::nonOptimization); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + my_csv << "eff compact non-opt n_p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient compact with idx mapping, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a, EFF_method::idxMapping); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + my_csv << "eff compact idx map p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient compact with idx mapping, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a, EFF_method::idxMapping); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + my_csv << "eff compact idx map n_p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient compact with shared memory, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a, EFF_method::sharedMemory); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + my_csv << "eff compact shared p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient compact with shared memory, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a, EFF_method::sharedMemory); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + my_csv << "eff compact shared n_p_2" << cur_time << endrow; system("pause"); // stop Win32 console from closing on exit @@ -249,18 +325,18 @@ int main(int argc, char* argv[]) { // save to csv 
try { - csvfile csv("MyTable.csv"); // throws exceptions! - // Hearer - csv << "X" << "VALUE" << endrow; - // Data - int i = 1; - csv << i++ << "String value" << endrow; + //csvfile csv("MyTable.csv"); // throws exceptions! + //// Hearer + //csv << "X" << "VALUE" << endrow; + //// Data + //int i = 1; + /*csv << i++ << "String value" << endrow; csv << i++ << 123 << endrow; csv << i++ << 1.f << endrow; csv << i++ << 1.2 << endrow; - csv << i++ << "One more string" << endrow; - csv << i++ << "\"Escaped\"" << endrow; - csv << i++ << "=HYPERLINK(\"https://playkey.net\"; \"Playkey Service\")" << endrow; + csv << i++ << "One more string" << endrow;*/ + /*csv << i++ << "\"Escaped\"" << endrow; + csv << i++ << "=HYPERLINK(\"https://playkey.net\"; \"Playkey Service\")" << endrow;*/ } catch (const std::exception& ex) { diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 9475500..f0d94c2 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,7 +2,7 @@ #include "common.h" -constexpr int blocksize = 128; +constexpr int blocksize = 256; namespace StreamCompaction { namespace Naive { From 5a73220589762a6f9137ba3d475c1822f2413a8c Mon Sep 17 00:00:00 2001 From: Jack12xl Date: Wed, 23 Sep 2020 02:12:46 +0800 Subject: [PATCH 22/23] update readme --- README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 19f4b13..b49b492 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,8 @@ Thanks to [FLARE LAB](http://faculty.sist.shanghaitech.edu.cn/faculty/liuxp/flar Add -1. [csvfile.hpp]() to automatically record the performance in CSV form. -2. [radixSort.h](), [radixSort.]() for [Part 6]() +1. [csvfile.hpp](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/src/csvfile.hpp) to automatically record the performance in CSV form. +2. [radixSort.h](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/stream_compaction/radixSort.h), [radixSort.cu](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/stream_compaction/radixSort.cu) for [Part 6](https://github.com/Jack12xl/Project2-Stream-Compaction#part-6-radix-sort-extra-point) #### Intro @@ -34,7 +34,7 @@ Here we managed to implement **all** the compulsory and extra point sections. After implementing all the functions, first we try to find the optimized block size. -![alt text](https://github.com/Jack12xl/Project1-CUDA-Flocking/blob/master/images/2_x_baseline.png) +![alt text](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/img/SCAN_for_block.svg) From the image, we may choose the optimized block size as 256 @@ -42,15 +42,15 @@ From the image, we may choose the optimized block size as 256 Here we compare each scan, compact implementations under different array size. The results below are ran under block size = 256. -![alt text](https://github.com/Jack12xl/Project1-CUDA-Flocking/blob/master/images/2_x_baseline.png) +![alt text](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/img/SCAN.svg) -![alt text](https://github.com/Jack12xl/Project1-CUDA-Flocking/blob/master/images/2_x_baseline.png) +![alt text](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/img/Compact.svg) ##### notes: - the **non-opt** refers to the non-optimization scan function before Part 5. - The **idx** refers to the optimized version in Part 5. 
-- The shared memory refers to the optimized version in Part 7 +- The **shared memory** refers to the optimized version in Part 7 ##### Output of test program @@ -167,15 +167,15 @@ The array to be sorted is : #### Part 1~3: -The performance is showed in previous image. +The performance is showed in previous [image](https://github.com/Jack12xl/Project2-Stream-Compaction#implementation-comparisons). #### Part 4: Here shows the thrust summary and timeline: -![alt text](https://github.com/Jack12xl/Project1-CUDA-Flocking/blob/master/images/2_x_baseline.png) +![alt text](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/img/Thrust.png) -![alt text](https://github.com/Jack12xl/Project1-CUDA-Flocking/blob/master/images/2_x_baseline.png) +![alt text](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/img/Thrust_timeline.png) #### Part 5: why GPU version so slow [Extra point] @@ -230,7 +230,7 @@ So instead, we tear the up-sweep and down-sweep process in several part( based o ##### Detail: -In our design, we set the shared memory size twice as big as the block size. The reason for this is to utilize the index mapping from [part 5](). +In our design, we set the shared memory size twice as big as the block size. The reason for this is to utilize the index mapping from [part 5](https://github.com/Jack12xl/Project2-Stream-Compaction#part-5-why-gpu-version-so-slow-extra-point). Sadly we do not consider the bank conflict effect. @@ -244,17 +244,17 @@ The shared memory version is prone to cause integer overflow so we decrease the - ##### Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU. - - As is discussed in [here](), we adopt the 256 block size for both naive and efficient version. + - As is discussed in [here](https://github.com/Jack12xl/Project2-Stream-Compaction#optimized-block-size), we adopt the 256 block size for both naive and efficient version. - ##### Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis). - - The picture is showed [here](). + - The picture is showed [here](https://github.com/Jack12xl/Project2-Stream-Compaction#implementation-comparisons). - ##### Can you find the performance bottlenecks? Is it memory I/O? Computation? Is it different for each implementation? - - Personally I believe the bottlenecks lie mainly in memory I/O. Because for each implementation the computation is pretty straight(with complexity **O(n)** and **O(n * log(n)**). When the shared memory is introduced, the performance goes up drastically. + - Personally I believe the bottlenecks lie mainly in memory I/O. Because for each implementation the computation is pretty straight(with complexity **O(n)** and **O(n * log(n)**). Besides, when the shared memory is introduced to decrease the I/O latency, the performance goes up drastically. - ##### Paste the output of the test program into a triple-backtick block in your README. 
-  - Pasted [here]()
+  - Pasted [here](https://github.com/Jack12xl/Project2-Stream-Compaction#output-of-test-program)


From 9781e4808aeebf874ca8641e3b5eb3ff426c0791 Mon Sep 17 00:00:00 2001
From: Jack12xl
Date: Wed, 23 Sep 2020 02:34:35 +0800
Subject: [PATCH 23/23] update readme

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index b49b492..715b37c 100644
--- a/README.md
+++ b/README.md
@@ -254,6 +254,12 @@ The shared memory version is prone to cause integer overflow so we decrease the
 
   - I believe the bottleneck lies mainly in memory I/O: the computation in every implementation is straightforward (**O(n)** or **O(n log n)** work), and once shared memory is introduced to reduce global-memory traffic, performance improves drastically.
 
+- ##### Thrust implementation
+
+  - As depicted in this [figure](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/img/Thrust_timeline.png):
+    - **Memory**: Thrust wraps device memory in its own pointer and vector types and automatically copies data between host and device when necessary.
+    - 
+
 - ##### Paste the output of the test program into a triple-backtick block in your README.
 
   - Pasted [here](https://github.com/Jack12xl/Project2-Stream-Compaction#output-of-test-program)
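
For the Thrust bullet above, the snippet below is a minimal, stand-alone sketch (not taken from the patches in this series) of the memory behaviour we describe: `thrust::device_vector` owns the GPU allocation, the host-to-device copy happens in its constructor, and reading an element back triggers a device-to-host copy. The array size and values are made up for illustration.

```cuda
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <cstdio>

int main() {
    // Host-side input of eight ones.
    thrust::host_vector<int> h_in(8, 1);

    // Constructing a device_vector from a host_vector copies host -> device.
    thrust::device_vector<int> d_in = h_in;
    thrust::device_vector<int> d_out(d_in.size());

    // Exclusive prefix sum on the device (same semantics as the project's scan).
    thrust::exclusive_scan(d_in.begin(), d_in.end(), d_out.begin());

    // Reading through operator[] copies that single element device -> host.
    std::printf("d_out[7] = %d\n", static_cast<int>(d_out[7]));  // prints 7
    return 0;
}
```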
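
Similarly, for the shared-memory scan described in the Detail section above, here is a rough per-block sketch of the layout we describe (each thread loads two elements, so the shared buffer is twice the block size). This is not the actual `StreamCompaction::Efficient` code from the patches: the kernel name and launch configuration are placeholders, bank-conflict avoidance is ignored, and the second pass that scans the per-block sums and adds them back is omitted.

```cuda
// Sketch only: per-block exclusive (Blelloch) scan over 2 * blockDim.x elements
// held in shared memory. Launch with shared memory = 2 * blockDim.x * sizeof(int).
__global__ void blockExclusiveScan(int n, int* odata, const int* idata) {
    extern __shared__ int temp[];              // 2 * blockDim.x ints
    const int tid  = threadIdx.x;
    const int base = 2 * blockIdx.x * blockDim.x;
    const int ai   = base + 2 * tid;
    const int bi   = base + 2 * tid + 1;
    int offset = 1;

    // Each thread loads two elements; pad with 0 past the end (non-power-of-two case).
    temp[2 * tid]     = (ai < n) ? idata[ai] : 0;
    temp[2 * tid + 1] = (bi < n) ? idata[bi] : 0;

    // Up-sweep (reduce) phase.
    for (int d = blockDim.x; d > 0; d >>= 1) {
        __syncthreads();
        if (tid < d) {
            int left  = offset * (2 * tid + 1) - 1;
            int right = offset * (2 * tid + 2) - 1;
            temp[right] += temp[left];
        }
        offset <<= 1;
    }

    // Clear the root, then down-sweep to turn the reduction tree into an exclusive scan.
    if (tid == 0) temp[2 * blockDim.x - 1] = 0;
    for (int d = 1; d <= blockDim.x; d <<= 1) {
        offset >>= 1;
        __syncthreads();
        if (tid < d) {
            int left  = offset * (2 * tid + 1) - 1;
            int right = offset * (2 * tid + 2) - 1;
            int t = temp[left];
            temp[left]   = temp[right];
            temp[right] += t;
        }
    }
    __syncthreads();

    // Write back the per-block exclusive scan; block sums still need a second pass.
    if (ai < n) odata[ai] = temp[2 * tid];
    if (bi < n) odata[bi] = temp[2 * tid + 1];
}
```

A launch would look roughly like `blockExclusiveScan<<<numBlocks, blockSize, 2 * blockSize * sizeof(int)>>>(n, dev_out, dev_in);` with `blockSize` a power of two, followed by a scan of the per-block sums and a uniform add.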