CIS565-Fall-2020 · tusadi · Sep 23, 2020
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
@@ -24,6 +24,13 @@ namespace StreamCompaction {
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
             // TODO
+            int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if (idx >= n) {
+                return;
+            }
+
+            bools[idx] = idata[idx] != 0 ? 1 : 0;
         }
 
         /**
@@ -33,6 +40,14 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices) {
             // TODO
+            int idx = blockIdx.x * blockDim.x + threadIdx.x;
+            if (idx >= n) {
+                return;
+            }
+
+            if (idata[idx] != 0) {
+                odata[bools[idx]] = idata[idx];
+            }
         }
 
     }

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
@@ -19,7 +19,11 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            // TODO -> DONE
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
             timer().endCpuTimer();
         }
 
@@ -30,9 +34,17 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            // TODO -> DONE
+
+            int num = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[num] = idata[i];
+                    num++;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return num;
         }
 
         /**
@@ -42,9 +54,33 @@ namespace StreamCompaction {
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            // TODO -> DONE
+            const int size = n;
+            int* temp = new int[size];
+
+            //mapping 
+            for (int i = 0; i < n; i++) {
+                temp[i] = (idata[i] != 0) ? 1 : 0;
+            }
+
+            // scanning
+            int* scannedArray = new int[size];
+            scannedArray[0] = 0;
+            for (int i = 1; i < n; i++) {
+                scannedArray[i] = scannedArray[i - 1] + temp[i - 1];
+            }
+
+            // Scatter
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (temp[i] == 1) {
+                    odata[scannedArray[i]] = idata[i];
+                    count++;
+                }
+            }
+
             timer().endCpuTimer();
-            return -1;
+            return count;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
@@ -3,6 +3,14 @@
 #include "common.h"
 #include "efficient.h"
 
+#define blockSize 256
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
+
+int* dev_data;
+int* dev_oData;
+int* dev_scanData;
+int* dev_boolData;
+
 namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,13 +20,71 @@ namespace StreamCompaction {
             return timer;
         }
 
+
+
+        __global__ void kern_UpSweep(int n, int* arr, int pow) {
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if (index >= n) {
+                return;
+            }
+
+            if (index % (2 * pow) == 0) {
+                arr[index + 2 * pow - 1] += arr[index + pow - 1];
+            }
+
+        }
+
+        __global__ void kern_SetRoot(int n, int* arr) {
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) {
+                return;
+            }
+
+            arr[n - 1] = 0;
+        }
+
+        __global__ void kern_DownSweep(int n, int* arr, int pow) {
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) {
+                return;
+            }
+
+            if (index % (2 * pow) == 0) {
+                int temp = arr[index + pow - 1];
+                arr[index + pow - 1] = arr[index + 2 * pow - 1];
+                arr[index + 2 * pow - 1] += temp;
+            }
+        }
+
         /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
+        * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+        */
+        void scan(int n, int* odata, const int* idata) {
+            int blocks = ceil((float)n / (float)blockSize);
+            int logN = ilog2ceil(n);
+            const int len = (int)powf(2, logN);
+
+            cudaMalloc((void**)&dev_data, sizeof(int) * (int)powf(2, logN));
+            cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
-            // TODO
+
+            for (int d = 0; d <= logN - 1; d++) {
+                kern_UpSweep << <blocks, blockSize >> > (len, dev_data, (int)powf(2, d));
+            }
+
+            kern_SetRoot << <1, 1 >> > (len, dev_data);
+
+            for (int d = logN - 1; d >= 0; d--) {
+                kern_DownSweep << <blocks, blockSize >> > (len, dev_data, (int)powf(2, d));
+            }
+
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_data);
         }
 
         /**
@@ -31,10 +97,49 @@ namespace StreamCompaction {
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+
+            int numBlocks = ceil((float)n / (float)blockSize);
+            int logN = ilog2ceil(n);
+            const int len = (int)powf(2, logN);
+
+            cudaMalloc((void**)&dev_data, sizeof(int) * len);
+            cudaMalloc((void**)&dev_boolData, sizeof(int) * len);
+            cudaMalloc((void**)&dev_oData, sizeof(int) * n);
+            cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+
             timer().startGpuTimer();
-            // TODO
+
+            // TODO -> DONE
+            StreamCompaction::Common::kernMapToBoolean << <numBlocks, blockSize >> > (len, dev_boolData, dev_data);
+
+            for (int d = 0; d <= logN - 1; d++) {
+                kern_UpSweep << <numBlocks, blockSize >> > (len, dev_boolData, (int)powf(2, d));
+            }
+
+            kern_SetRoot << <1, 1 >> > (len, dev_boolData);
+
+            for (int d = logN - 1; d >= 0; d--) {
+                kern_DownSweep << <numBlocks, blockSize >> > (len, dev_boolData, (int)powf(2, d));
+            }
+
+            StreamCompaction::Common::kernScatter << <numBlocks, blockSize >> > (n, dev_oData, dev_data, dev_boolData, nullptr);
+
             timer().endGpuTimer();
-            return -1;
+
+            int* finalBoolArr = new int[n];
+            cudaMemcpy(odata, dev_oData, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            cudaMemcpy(finalBoolArr, dev_boolData, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_data);
+            cudaFree(dev_boolData);
+            cudaFree(dev_oData);
+
+            if (idata[n - 1] == 0) {
+                return finalBoolArr[n - 1];
+            }
+
+            return finalBoolArr[n - 1] + 1;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
@@ -3,6 +3,12 @@
 #include "common.h"
 #include "naive.h"
 
+#define blockSize 256
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
+
+int* dev_idata;
+int* dev_odata;
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,14 +18,63 @@ namespace StreamCompaction {
             return timer;
         }
         // TODO: __global__
+        __global__ void kern_NaiveScan(int n, int* odata, int* idata, int pow) {
+            int idx = blockIdx.x * blockDim.x + threadIdx.x;
+            if (idx >= n) {
+                return;
+            }
+
+            if (idx >= pow) {
+                odata[idx] = idata[idx - pow] + idata[idx];
+            }
+            else {
+                odata[idx] = idata[idx];
+            }
+        }
+
+        __global__ void kern_Exclusive(int n, int* odata, int* idata) {
+            int idx = blockIdx.x * blockDim.x + threadIdx.x;
+            if (idx >= n) {
+                return;
+            }
 
+            if (idx == 0) {
+                odata[idx] = 0;
+            }
+            else {
+                odata[idx] = idata[idx - 1];
+            }
+        }
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+
+
+            int blocks = ceil((float)n / (float)blockSize);
+
+            cudaMalloc((void**)&dev_idata, sizeof(int) * n);
+            cudaMalloc((void**)&dev_odata, sizeof(int) * n);
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
             // TODO
+            int logVal = ilog2ceil(n);
+            for (int d = 1; d <= logVal; d++) {
+                kern_NaiveScan <<<blocks, blockSize >>> (n, dev_odata, dev_idata, (int)powf(2, d - 1));
+                if (d < logVal) {
+                    int* tempPtr = dev_odata;
+                    dev_odata = dev_idata;
+                    dev_idata = tempPtr;
+                }
+            }
+            kern_Exclusive <<<blocks, blockSize >>> (n, dev_idata, dev_odata);
+
             timer().endGpuTimer();
+            cudaMemcpy(odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
         }
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
@@ -6,6 +6,9 @@
 #include "common.h"
 #include "thrust.h"
 
+int* dev_inData;
+int* dev_outData;
+
 namespace StreamCompaction {
     namespace Thrust {
         using StreamCompaction::Common::PerformanceTimer;
@@ -22,6 +25,14 @@ namespace StreamCompaction {
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+            //DONE
+            thrust::host_vector<int> hostVec(n);
+            thrust::copy(idata, idata + n, hostVec.begin());
+            thrust::device_vector<int> devVec = hostVec;
+            thrust::device_vector<int> outVec(n);
+            thrust::exclusive_scan(devVec.begin(), devVec.end(), outVec.begin());
+            thrust::copy(outVec.begin(), outVec.end(), odata);
             timer().endGpuTimer();
         }
     }