Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
386 changes: 380 additions & 6 deletions README.md

Large diffs are not rendered by default.

Binary file added img/1.JPG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/2.JPG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,14 +137,14 @@ int main(int argc, char* argv[]) {
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

system("pause"); // stop Win32 console from closing on exit
Expand Down
2 changes: 1 addition & 1 deletion stream_compaction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ set(SOURCE_FILES

cuda_add_library(stream_compaction
${SOURCE_FILES}
OPTIONS -arch=sm_20
OPTIONS -arch=sm_60
)
12 changes: 11 additions & 1 deletion stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) {
exit(EXIT_FAILURE);
}


namespace StreamCompaction {
namespace Common {

Expand All @@ -24,6 +23,10 @@ namespace StreamCompaction {
*/
/**
 * Maps each element to a boolean flag: 1 if the element is non-zero,
 * 0 otherwise. One thread per element; expects a 1D launch whose total
 * thread count covers at least n.
 */
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid < n)
    {
        bools[tid] = idata[tid] ? 1 : 0;
    }
}

/**
Expand All @@ -33,6 +36,13 @@ namespace StreamCompaction {
/**
 * Scatters kept elements for stream compaction: wherever bools[i] is set,
 * idata[i] is written to odata at position indices[i] (the exclusive-scan
 * result of bools). One thread per input element.
 */
__global__ void kernScatter(int n, int *odata,
        const int *idata, const int *bools, const int *indices) {
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= n || !bools[tid]) return;

    odata[indices[tid]] = idata[tid];
}

}
Expand Down
4 changes: 4 additions & 0 deletions stream_compaction/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)

#define blocksize 512

//#define SYNC_GRID

/**
* Check for CUDA errors; print and exit if there was a problem.
*/
Expand Down
40 changes: 38 additions & 2 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ namespace StreamCompaction {
/**
 * CPU exclusive prefix-sum (scan) of idata into odata.
 *
 * odata[0] = 0 and odata[i] = idata[0] + ... + idata[i-1]. Timed with the
 * CPU timer.
 *
 * @param n     number of elements in idata/odata (no-op when n <= 0)
 * @param odata output buffer, length >= n
 * @param idata input buffer, length >= n
 */
void scan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();

    // Guard the empty case: the unconditional odata[0] = 0 of a naive
    // implementation is an out-of-bounds store when n == 0.
    if (n > 0)
    {
        odata[0] = 0; // exclusive scan starts at the additive identity
        for (int i = 1; i < n; i++)
        {
            odata[i] = odata[i - 1] + idata[i - 1];
        }
    }

    timer().endCpuTimer();
}

Expand All @@ -31,8 +38,18 @@ namespace StreamCompaction {
/**
 * CPU stream compaction without the scan primitive: copies every non-zero
 * element of idata to the front of odata, preserving relative order.
 *
 * @returns the number of elements kept.
 */
int compactWithoutScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();

    int kept = 0;
    for (int src = 0; src < n; ++src)
    {
        int value = idata[src];
        if (value != 0)
        {
            odata[kept] = value;
            ++kept;
        }
    }

    timer().endCpuTimer();
    return kept;
}

/**
Expand All @@ -43,8 +60,27 @@ namespace StreamCompaction {
/**
 * CPU stream compaction via scan + scatter, performed in place in odata.
 *
 * Step 1 writes the exclusive scan of the boolean map (idata[i] != 0) into
 * odata. Step 2 scatters: each non-zero idata[i] goes to position odata[i].
 * The in-place scatter is safe because the destination odata[i] counts the
 * non-zeros strictly before i, so it is always <= i: every write lands at a
 * position whose scan value has already been consumed.
 *
 * @returns the number of elements kept (0 when n <= 0).
 */
int compactWithScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();

    int count = 0;
    if (n > 0) // guard: odata[0] = 0 would be out of bounds when n == 0
    {
        // exclusive scan of the boolean "keep" map
        odata[0] = 0;
        for (int i = 1; i < n; i++)
        {
            odata[i] = odata[i - 1] + (idata[i - 1] != 0);
        }

        // scatter in place (safe; see header comment)
        for (int i = 0; i < n; i++)
        {
            if (idata[i] != 0)
            {
                odata[odata[i]] = idata[i];
                count++;
            }
        }
    }

    timer().endCpuTimer();
    return count;
}
}
}
207 changes: 173 additions & 34 deletions stream_compaction/efficient.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,37 +4,176 @@
#include "efficient.h"

namespace StreamCompaction {
namespace Efficient {
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
timer().startGpuTimer();
// TODO
timer().endGpuTimer();
}

/**
* Performs stream compaction on idata, storing the result into odata.
* All zeroes are discarded.
*
* @param n The number of elements in idata.
* @param odata The array into which to store elements.
* @param idata The array of elements to compact.
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
timer().startGpuTimer();
// TODO
timer().endGpuTimer();
return -1;
}
}
}
namespace Efficient {
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
}

/**
 * One up-sweep (reduce) level of the work-efficient scan.
 * POT = 2^d and POT_EX = 2^(d+1) for level d; only threads whose index is a
 * multiple of POT_EX act, folding the left child's partial sum into the
 * right child in place. n is the (power-of-two) padded array length.
 */
__global__ void kernUpSweep(int n, int POT, int POT_EX, int *data)
{
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid < n && tid % POT_EX == 0)
    {
        data[tid + POT_EX - 1] += data[tid + POT - 1];
    }
}

/**
 * One down-sweep level of the work-efficient scan.
 * POT = 2^d and POT_EX = 2^(d+1) for level d; only threads whose index is a
 * multiple of POT_EX act, swapping the left child up and accumulating the
 * prefix into the right child. n is the (power-of-two) padded array length.
 */
__global__ void kernDownSweep(int n, int POT, int POT_EX, int *data)
{
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= n || tid % POT_EX != 0) return;

    int left = tid + POT - 1;
    int right = tid + POT_EX - 1;

    int saved = data[left];
    data[left] = data[right];
    data[right] += saved;
}


/**
 * Performs work-efficient exclusive prefix-sum (scan) on idata on the GPU,
 * storing the result into odata.
 *
 * The input is padded with zeros up to the next power of two; an up-sweep
 * builds partial sums in place, the root is cleared, and a down-sweep
 * distributes the exclusive prefixes. Only the sweep kernels are timed --
 * allocation and host<->device copies are excluded.
 *
 * No-op when n <= 0.
 */
void scan(int n, int *odata, const int *idata) {
    if (n <= 0) return; // nothing to scan; also protects ilog2ceil(n)

    int levels = ilog2ceil(n);
    int padded = 1 << levels; // power-of-two working size
    int *dev_data;
    dim3 gridsize((padded - 1) / blocksize + 1);

    cudaMalloc((void**)&dev_data, padded * sizeof(int));
    checkCUDAErrorFn("malloc dev_data");

    cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
    checkCUDAErrorFn("copy idata to dev_data");
    if (padded > n)
    {
        // zero the padding so it contributes nothing to the sums
        cudaMemset(dev_data + n, 0, (padded - n) * sizeof(int));
        checkCUDAErrorFn("set dev_data padding");
    }

    //start ticking
    timer().startGpuTimer();
    for (int i = 0; i < levels; i++)
    {
        kernUpSweep << <gridsize, blocksize >> > (padded, 1 << i, 1 << (i + 1), dev_data);
#ifdef SYNC_GRID
        cudaDeviceSynchronize(); // cudaThreadSynchronize is deprecated
#endif
    }

    // exclusive scan: clear the root before the down-sweep
    cudaMemset((void*)(dev_data + (padded - 1)), 0, sizeof(int));
    checkCUDAErrorFn("set dev_data[padded-1]");

    for (int i = levels - 1; i >= 0; i--)
    {
        kernDownSweep << <gridsize, blocksize >> > (padded, 1 << i, 1 << (i + 1), dev_data);
#ifdef SYNC_GRID
        cudaDeviceSynchronize();
#endif
    }
    //stop ticking
    timer().endGpuTimer();

    cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
    checkCUDAErrorFn("copy dev_data to odata");

    cudaFree(dev_data);
    checkCUDAErrorFn("free dev_data");
}

/**
 * Performs stream compaction on idata on the GPU, storing the result into
 * odata. All zeroes are discarded.
 *
 * Pipeline: map-to-boolean, exclusive scan of the booleans (padded to the
 * next power of two), then scatter. The kept count is the last scan value
 * plus one if the final input element is itself non-zero. Only the kernels
 * are timed -- allocation and host<->device copies are excluded.
 *
 * @param n The number of elements in idata.
 * @param odata The array into which to store elements.
 * @param idata The array of elements to compact.
 * @returns The number of elements remaining after compaction (0 when n <= 0).
 */
int compact(int n, int *odata, const int *idata) {
    if (n <= 0) return 0; // guard: code below reads idata[n - 1]

    int result = 0;
    int levels = ilog2ceil(n);
    int padded = 1 << levels; // power-of-two scan size
    int *dev_idata;
    int *dev_odata;
    int *dev_indices;
    int *dev_bools;
    dim3 gridsize((padded - 1) / blocksize + 1);
    dim3 gridsize_EXACT((n - 1) / blocksize + 1);

    cudaMalloc((void**)&dev_idata, n * sizeof(int));
    checkCUDAErrorFn("malloc dev_idata");

    cudaMalloc((void**)&dev_odata, n * sizeof(int));
    checkCUDAErrorFn("malloc dev_odata");

    cudaMalloc((void**)&dev_indices, padded * sizeof(int));
    checkCUDAErrorFn("malloc dev_indices");

    cudaMalloc((void**)&dev_bools, n * sizeof(int));
    checkCUDAErrorFn("malloc dev_bools");

    cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
    checkCUDAErrorFn("memcpy dev_idata");

    Common::kernMapToBoolean << <gridsize_EXACT, blocksize >> > (n, dev_bools, dev_idata);
    checkCUDAErrorFn("launch kernMapToBoolean");

    cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice);
    checkCUDAErrorFn("copy dev_bools to dev_indices");

    if (padded > n)
    {
        // zero the padding so it contributes nothing to the scan
        cudaMemset(dev_indices + n, 0, (padded - n) * sizeof(int));
        checkCUDAErrorFn("set dev_indices padding");
    }

    //start ticking
    timer().startGpuTimer();

    for (int i = 0; i < levels; i++)
    {
        kernUpSweep << <gridsize, blocksize >> > (padded, 1 << i, 1 << (i + 1), dev_indices);
#ifdef SYNC_GRID
        cudaDeviceSynchronize(); // cudaThreadSynchronize is deprecated
#endif
    }

    // exclusive scan: clear the root before the down-sweep
    cudaMemset((void*)(dev_indices + (padded - 1)), 0, sizeof(int));
    checkCUDAErrorFn("set dev_indices[padded-1]");

    for (int i = levels - 1; i >= 0; i--)
    {
        kernDownSweep << <gridsize, blocksize >> > (padded, 1 << i, 1 << (i + 1), dev_indices);
#ifdef SYNC_GRID
        cudaDeviceSynchronize();
#endif
    }

    Common::kernScatter << <gridsize_EXACT, blocksize >> > (n, dev_odata, dev_idata, dev_bools, dev_indices);
    checkCUDAErrorFn("launch kernScatter");

    //stop ticking
    timer().endGpuTimer();

    // exclusive scan excludes the last element, so add it back if kept
    cudaMemcpy(&result, dev_indices + (n - 1), sizeof(int), cudaMemcpyDeviceToHost);
    checkCUDAErrorFn("copy count back");
    result += (int)(idata[n - 1] != 0);

    cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
    checkCUDAErrorFn("copy dev_odata to odata");

    cudaFree(dev_idata);
    checkCUDAErrorFn("free dev_idata");

    cudaFree(dev_odata);
    checkCUDAErrorFn("free dev_odata");

    cudaFree(dev_indices);
    checkCUDAErrorFn("free dev_indices");

    cudaFree(dev_bools);
    checkCUDAErrorFn("free dev_bools");
    return result;
}
}
}
Loading