diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..3aa2982 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,13 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx >= n) { + return; + } + + bools[idx] = idata[idx] != 0 ? 1 : 0; } /** @@ -33,6 +40,14 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) { + return; + } + + if (idata[idx] != 0) { + odata[bools[idx]] = idata[idx]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..dfb900b 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,11 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + // TODO -> DONE + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -30,9 +34,17 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + // TODO -> DONE + + int num = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[num] = idata[i]; + num++; + } + } timer().endCpuTimer(); - return -1; + return num; } /** @@ -42,9 +54,33 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + // TODO -> DONE + const int size = n; + int* temp = new int[size]; + + //mapping + for (int i = 0; i < n; i++) { + temp[i] = (idata[i] != 0) ? 1 : 0; + } + + // scanning + int* scannedArray = new int[size]; + scannedArray[0] = 0; + for (int i = 1; i < n; i++) { + scannedArray[i] = scannedArray[i - 1] + temp[i - 1]; + } + + // Scatter + int count = 0; + for (int i = 0; i < n; i++) { + if (temp[i] == 1) { + odata[scannedArray[i]] = idata[i]; + count++; + } + } + timer().endCpuTimer(); - return -1; + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..1df373c 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,14 @@ #include "common.h" #include "efficient.h" +#define blockSize 256 +#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) + +int* dev_data; +int* dev_oData; +int* dev_scanData; +int* dev_boolData; + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +20,71 @@ namespace StreamCompaction { return timer; } + + + __global__ void kern_UpSweep(int n, int* arr, int pow) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= n) { + return; + } + + if (index % (2 * pow) == 0) { + arr[index + 2 * pow - 1] += arr[index + pow - 1]; + } + + } + + __global__ void kern_SetRoot(int n, int* arr) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + + arr[n - 1] = 0; + } + + __global__ void kern_DownSweep(int n, int* arr, int pow) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) { + return; + } + + if (index % (2 * pow) == 0) { + int temp = arr[index + pow - 1]; + arr[index + pow - 1] = arr[index + 2 * pow - 1]; + arr[index + 2 * pow - 1] += temp; + } + } + /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void scan(int n, int* odata, const int* idata) { + int blocks = ceil((float)n / (float)blockSize); + int logN = ilog2ceil(n); + const int len = (int)powf(2, logN); + + cudaMalloc((void**)&dev_data, sizeof(int) * (int)powf(2, logN)); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + for (int d = 0; d <= logN - 1; d++) { + kern_UpSweep << > > (len, dev_data, (int)powf(2, d)); + } + + kern_SetRoot << <1, 1 >> > (len, dev_data); + + for (int d = logN - 1; d >= 0; d--) { + kern_DownSweep << > > (len, dev_data, (int)powf(2, d)); + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + + cudaFree(dev_data); } /** @@ -31,10 +97,49 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + + int numBlocks = ceil((float)n / (float)blockSize); + int logN = ilog2ceil(n); + const int len = (int)powf(2, logN); + + cudaMalloc((void**)&dev_data, sizeof(int) * len); + cudaMalloc((void**)&dev_boolData, sizeof(int) * len); + cudaMalloc((void**)&dev_oData, sizeof(int) * n); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); - // TODO + + // TODO -> DONE + StreamCompaction::Common::kernMapToBoolean << > > (len, dev_boolData, dev_data); + + for (int d = 0; d <= logN - 1; d++) { + kern_UpSweep << > > (len, dev_boolData, (int)powf(2, d)); + } + + kern_SetRoot << <1, 1 >> > (len, dev_boolData); + + for (int d = logN - 1; d >= 0; d--) { + kern_DownSweep << > > (len, dev_boolData, (int)powf(2, d)); + } + + StreamCompaction::Common::kernScatter << > > (n, dev_oData, dev_data, dev_boolData, nullptr); + timer().endGpuTimer(); - return -1; + + int* finalBoolArr = new int[n]; + cudaMemcpy(odata, dev_oData, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaMemcpy(finalBoolArr, dev_boolData, sizeof(int) * n, cudaMemcpyDeviceToHost); + + cudaFree(dev_data); + cudaFree(dev_boolData); + cudaFree(dev_oData); + + if (idata[n - 1] == 0) { + return finalBoolArr[n - 1]; + } + + return finalBoolArr[n - 1] + 1; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..5d98b76 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,12 @@ #include "common.h" #include "naive.h" +#define blockSize 256 +#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) + +int* dev_idata; +int* dev_odata; + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -12,14 +18,63 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void kern_NaiveScan(int n, int* odata, int* idata, int pow) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) { + return; + } + + if (idx >= pow) { + odata[idx] = idata[idx - pow] + idata[idx]; + } + else { + odata[idx] = idata[idx]; + } + } + + __global__ void kern_Exclusive(int n, int* odata, int* idata) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) { + return; + } + if (idx == 0) { + odata[idx] = 0; + } + else { + odata[idx] = idata[idx - 1]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + + int blocks = ceil((float)n / (float)blockSize); + + cudaMalloc((void**)&dev_idata, sizeof(int) * n); + cudaMalloc((void**)&dev_odata, sizeof(int) * n); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); // TODO + int logVal = ilog2ceil(n); + for (int d = 1; d <= logVal; d++) { + kern_NaiveScan <<>> (n, dev_odata, dev_idata, (int)powf(2, d - 1)); + if (d < logVal) { + int* tempPtr = dev_odata; + dev_odata = dev_idata; + dev_idata = tempPtr; + } + } + kern_Exclusive <<>> (n, dev_idata, dev_odata); + timer().endGpuTimer(); + cudaMemcpy(odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..2c7013b 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -6,6 +6,9 @@ #include "common.h" #include "thrust.h" +int* dev_inData; +int* dev_outData; + namespace StreamCompaction { namespace Thrust { using StreamCompaction::Common::PerformanceTimer; @@ -22,6 +25,14 @@ namespace StreamCompaction { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + //DONE + thrust::host_vector hostVec(n); + thrust::copy(idata, idata + n, hostVec.begin()); + thrust::device_vector devVec = hostVec; + thrust::device_vector outVec(n); + thrust::exclusive_scan(devVec.begin(), devVec.end(), outVec.begin()); + thrust::copy(outVec.begin(), outVec.end(), odata); timer().endGpuTimer(); } }