diff --git a/.gitignore b/.gitignore
index b8bd026..b056ede 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,14 @@
 *.exe
 *.out
 *.app
+
+# Windows specific
+[Rr]elease/
+[Dd]ebug/
+*.suo
+*.pdb
+*.sdf
+*.opensdf
+*.user
+*.deps
+*.ipch
\ No newline at end of file
diff --git a/2014-09-27_stream_compaction/2014-09-27_stream_compaction.sln b/2014-09-27_stream_compaction/2014-09-27_stream_compaction.sln
new file mode 100644
index 0000000..e29554e
--- /dev/null
+++ b/2014-09-27_stream_compaction/2014-09-27_stream_compaction.sln
@@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "2014-09-27_stream_compaction", "2014-09-27_stream_compaction\2014-09-27_stream_compaction.vcxproj", "{1450E427-18FD-4EF6-922C-C02C27920CE1}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{1450E427-18FD-4EF6-922C-C02C27920CE1}.Debug|Win32.ActiveCfg = Debug|Win32
+		{1450E427-18FD-4EF6-922C-C02C27920CE1}.Debug|Win32.Build.0 = Debug|Win32
+		{1450E427-18FD-4EF6-922C-C02C27920CE1}.Release|Win32.ActiveCfg = Release|Win32
+		{1450E427-18FD-4EF6-922C-C02C27920CE1}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/2014-09-27_stream_compaction/2014-09-27_stream_compaction/2014-09-27_stream_compaction.vcxproj b/2014-09-27_stream_compaction/2014-09-27_stream_compaction/2014-09-27_stream_compaction.vcxproj
new file mode 100644
index 0000000..0847178
--- /dev/null
+++ b/2014-09-27_stream_compaction/2014-09-27_stream_compaction/2014-09-27_stream_compaction.vcxproj
@@ -0,0 +1,87 @@
+[MSBuild XML markup not recoverable from the extracted text. Surviving settings: Debug|Win32 and Release|Win32 configurations; ProjectGuid {1450E427-18FD-4EF6-922C-C02C27920CE1}; RootNamespace My20140927_stream_compaction; ConfigurationType Application with the MultiByte character set; warning level Level3; optimization Disabled (Debug) / MaxSpeed (Release); links cudart.lib via %(AdditionalDependencies); $(CudaToolkitIncludeDir) on the include path; CudaCompile output $(ProjectDir)$(Platform)/$(Configuration)/%(Filename)%(Extension).obj.]
\ No newline at end of file
diff --git a/2014-09-27_stream_compaction/2014-09-27_stream_compaction/2014-09-27_stream_compaction.vcxproj.filters b/2014-09-27_stream_compaction/2014-09-27_stream_compaction/2014-09-27_stream_compaction.vcxproj.filters
new file mode 100644
index 0000000..d9d5b45
--- /dev/null
+++ b/2014-09-27_stream_compaction/2014-09-27_stream_compaction/2014-09-27_stream_compaction.vcxproj.filters
@@ -0,0 +1,38 @@
+[MSBuild XML markup not recoverable from the extracted text. Surviving structure: the standard Source Files filter ({4FC737F1-C7A5-4376-A066-2A32D752A2FF}; cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx), Header Files filter ({93995380-89BD-4b04-88EB-625FBE52EBFB}; h;hpp;hxx;hm;inl;inc;xsd), and Resource Files filter ({67DA6AB6-F800-4c08-8B7A-83BB121AAD01}; rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms); two compilation units and one CUDA source under Source Files, two headers under Header Files.]
\ No newline at end of file
diff --git a/2014-09-27_stream_compaction/2014-09-27_stream_compaction/CPUTimer.cpp b/2014-09-27_stream_compaction/2014-09-27_stream_compaction/CPUTimer.cpp
new file mode 100644
index 0000000..e6e8970
--- /dev/null
+++ b/2014-09-27_stream_compaction/2014-09-27_stream_compaction/CPUTimer.cpp
@@ -0,0 +1,43 @@
+#include "CPUTimer.h"
+
+#include <windows.h>
+
+
+CPUTimer::CPUTimer()
+{
+    start_time = GetTimeMs64();
+}
+
+CPUTimer::~CPUTimer()
+{
+}
+
+void CPUTimer::start()
+{
+    start_time = GetTimeMs64();
+}
+
+void CPUTimer::stop( std::string some_descriptor )
+{
+    __int64 end_time = GetTimeMs64();
+    std::cout << some_descriptor << ": " << end_time - start_time << " milliseconds\n" << std::endl;
+}
+
+__int64 CPUTimer::GetTimeMs64()
+{
+    /* Windows */
+    FILETIME ft;
+    LARGE_INTEGER li;
+
+    /* Get the number of 100-nanosecond intervals elapsed since January 1, 1601 (UTC) and copy it
+     * to a LARGE_INTEGER structure. */
+    GetSystemTimeAsFileTime( &ft );
+    li.LowPart = ft.dwLowDateTime;
+    li.HighPart = ft.dwHighDateTime;
+
+    __int64 ret = li.QuadPart;
+    ret -= 116444736000000000LL; /* Convert from file time to UNIX epoch time. */
+    ret /= 10000; /* From 100-nanosecond (10^-7) to 1 millisecond (10^-3) intervals. */
+
+    return ret;
+}
\ No newline at end of file
diff --git a/2014-09-27_stream_compaction/2014-09-27_stream_compaction/CPUTimer.h b/2014-09-27_stream_compaction/2014-09-27_stream_compaction/CPUTimer.h
new file mode 100644
index 0000000..d7b9733
--- /dev/null
+++ b/2014-09-27_stream_compaction/2014-09-27_stream_compaction/CPUTimer.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <iostream>
+#include <string>
+
+class CPUTimer
+{
+public:
+    CPUTimer( void );
+    ~CPUTimer( void );
+
+    // methods
+    void start( void );
+    void stop( std::string some_descriptor );
+
+private:
+    // method
+    __int64 GetTimeMs64( void );
+
+    // member
+    __int64 start_time;
+};
\ No newline at end of file
diff --git a/2014-09-27_stream_compaction/2014-09-27_stream_compaction/StreamCompaction.cu b/2014-09-27_stream_compaction/2014-09-27_stream_compaction/StreamCompaction.cu
new file mode 100644
index 0000000..8f593cf
--- /dev/null
+++ b/2014-09-27_stream_compaction/2014-09-27_stream_compaction/StreamCompaction.cu
@@ -0,0 +1,572 @@
+#pragma once
+
+#include "StreamCompaction.h"
+#include <iostream>
+
+
+// TODO: Clean this up so there isn't so much duplicated code present.
+// For instance, the only real difference between the global and shared memory wrappers for scan is the buffer allocation.
+
+
+
+/******************************************/
+/*          Function prototypes.          */
+/******************************************/
+
+
+__global__
+void naiveScanGlobalMemoryOneBlock( float *g_out, float *g_in, float *g_buffer, int n );
+
+__global__
+void computeBlockSumsAndInitializeBuffer( float *g_in, float *g_buffer, float *g_block_sums, int n );
+
+__global__
+void naiveScanGlobalMemory( float *g_out, float *g_in, float *g_buffer, float *g_block_sums, int n );
+
+__global__
+void naiveScanSharedMemoryOneBlock( float *g_out, float *g_in, int n );
+
+__global__
+void computeBlockSums( float *g_in, float *g_block_sums, int n );
+
+__global__
+void naiveScanSharedMemory( float *g_out, float *g_in, float *g_block_sums, int n );
+
+__global__
+void computeBinaryArray( float *g_out, float *g_in, int n );
+
+__global__
+void streamCompaction( float *g_out, float *g_in, float *g_scatter_results, int n );
+
+
+
+/*************************************/
+/*          KERNEL WRAPPERS          */
+/*************************************/
+
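+
+//////////////////////////////////////////////////////
+// Worked example of the naive (Hillis-Steele) exclusive scan implemented by the kernels below,
+// for n = 4:
+//   input:          [ 3, 1, 7, 0 ]
+//   shift right, 0: [ 0, 3, 1, 7 ]
+//   offset 1:       [ 0, 3, 4, 8 ]   (each element adds its neighbor one position to the left)
+//   offset 2:       [ 0, 3, 4, 11 ]  (each element adds its neighbor two positions to the left)
+// After log2(n) rounds, element i holds the sum of input[0..i-1].
+//////////////////////////////////////////////////////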
+
+//////////////////////////////////////////////////////
+// Wrapper to call the kernel that performs a naive parallel prefix sum using global memory exclusively.
+//////////////////////////////////////////////////////
+void naiveScanGlobalMemoryWrapper( float *h_out, const float *h_in, const int &n )
+{
+    float *h_block_sums;
+    float *d_in, *d_out, *d_buffer, *d_block_sums;
+
+    // Compute number of blocks.
+    int num_blocks = ( int )ceil( ( float )n / ( float )BLOCK_SIZE );
+
+    // Compute bytes needed for various array allocations.
+    int size = sizeof( float ) * n;
+    int buffer_size = size * 2;
+    int block_sums_size = sizeof( float ) * num_blocks;
+
+    // Allocate memory for host data.
+    h_block_sums = ( float* )malloc( block_sums_size );
+
+    // Allocate memory for device data.
+    cudaMalloc( ( void** )&d_in, size );
+    cudaMalloc( ( void** )&d_out, size );
+    cudaMalloc( ( void** )&d_buffer, buffer_size );
+    cudaMalloc( ( void** )&d_block_sums, block_sums_size );
+
+    // Copy host input to device input array.
+    cudaMemcpy( d_in, h_in, size, cudaMemcpyHostToDevice );
+
+    // CUDA timers start.
+    cudaEvent_t start, stop;
+    float time;
+    cudaEventCreate( &start );
+    cudaEventCreate( &stop );
+    cudaEventRecord( start, 0 );
+
+    if ( num_blocks > 1 ) {
+        // Initialize h_block_sums values to 0.
+        for ( int i = 0; i < num_blocks; ++i ) {
+            h_block_sums[i] = 0.0f;
+        }
+
+        // Copy h_block_sums to d_block_sums.
+        cudaMemcpy( d_block_sums, h_block_sums, block_sums_size, cudaMemcpyHostToDevice );
+
+        // Kernel calls.
+        computeBlockSumsAndInitializeBuffer<<< dim3( num_blocks ), BLOCK_SIZE >>>( d_in, d_buffer, d_block_sums, n );
+        naiveScanGlobalMemory<<< dim3( num_blocks ), BLOCK_SIZE >>>( d_out, d_in, d_buffer, d_block_sums, n );
+    }
+    else {
+        // Kernel call.
+        naiveScanGlobalMemoryOneBlock<<< 1, n >>>( d_out, d_in, d_buffer, n );
+    }
+
+    // CUDA timers end.
+    cudaEventRecord( stop, 0 );
+    cudaEventSynchronize( stop );
+    cudaEventElapsedTime( &time, start, stop );
+    cudaEventDestroy( start );
+    cudaEventDestroy( stop );
+    std::cout << "Naive scan global memory kernel call: " << time << " milliseconds\n" << std::endl;
+
+    // Copy device result to host output array.
+    cudaMemcpy( h_out, d_out, size, cudaMemcpyDeviceToHost );
+
+    // Release allocated memory.
+    free( h_block_sums );
+    cudaFree( d_in );
+    cudaFree( d_out );
+    cudaFree( d_buffer );
+    cudaFree( d_block_sums );
+}
+
+
+//////////////////////////////////////////////////////
+// Wrapper to call the kernel that performs a naive parallel prefix sum using shared memory.
+//////////////////////////////////////////////////////
+void naiveScanSharedMemoryWrapper( float *h_out, const float *h_in, const int &n )
+{
+    float *h_block_sums;
+    float *d_in, *d_out, *d_block_sums;
+
+    // Compute number of blocks.
+    int num_blocks = ( int )ceil( ( float )n / ( float )BLOCK_SIZE );
+
+    // Compute bytes needed for various array allocations.
+    int size = sizeof( float ) * n;
+    int buffer_size = size * 2;
+    int block_sums_size = sizeof( float ) * num_blocks;
+
+    // Allocate memory for host data.
+    h_block_sums = ( float* )malloc( block_sums_size );
+
+    // Allocate memory for device data.
+    cudaMalloc( ( void** )&d_in, size );
+    cudaMalloc( ( void** )&d_out, size );
+    cudaMalloc( ( void** )&d_block_sums, block_sums_size );
+
+    // Copy host input to device input array.
+    cudaMemcpy( d_in, h_in, size, cudaMemcpyHostToDevice );
+
+    // CUDA timers start.
+    cudaEvent_t start, stop;
+    float time;
+    cudaEventCreate( &start );
+    cudaEventCreate( &stop );
+    cudaEventRecord( start, 0 );
+
+    if ( num_blocks > 1 ) {
+        // Initialize h_block_sums values to 0.
+        for ( int i = 0; i < num_blocks; ++i ) {
+            h_block_sums[i] = 0.0f;
+        }
+
+        // Copy h_block_sums to d_block_sums.
+        cudaMemcpy( d_block_sums, h_block_sums, block_sums_size, cudaMemcpyHostToDevice );
+
+        // Kernel calls.
+        computeBlockSums<<< dim3( num_blocks ), BLOCK_SIZE >>>( d_in, d_block_sums, n );
+        naiveScanSharedMemory<<< dim3( num_blocks ), BLOCK_SIZE, sizeof( float ) * BLOCK_SIZE * 2 >>>( d_out, d_in, d_block_sums, n );
+    }
+    else {
+        // Kernel call.
+        naiveScanSharedMemoryOneBlock<<< 1, n, buffer_size >>>( d_out, d_in, n );
+    }
+
+    // CUDA timers end.
+    cudaEventRecord( stop, 0 );
+    cudaEventSynchronize( stop );
+    cudaEventElapsedTime( &time, start, stop );
+    cudaEventDestroy( start );
+    cudaEventDestroy( stop );
+    std::cout << "Naive scan shared memory kernel call: " << time << " milliseconds\n" << std::endl;
+
+    // Copy device result to host output array.
+    cudaMemcpy( h_out, d_out, size, cudaMemcpyDeviceToHost );
+
+    // Release allocated memory.
+    free( h_block_sums );
+    cudaFree( d_in );
+    cudaFree( d_out );
+    cudaFree( d_block_sums );
+}
+
+
+//////////////////////////////////////////////////////
+// Wrapper to call the kernels that perform the scatter operation for stream compaction.
+// [ 0, 0, 3, 4, 0, 6, 6, 7, 0, 1 ] => [ 0, 0, 1, 1, 0, 1, 1, 1, 0, 1 ] => [ 0, 0, 0, 1, 2, 2, 3, 4, 5, 5 ]
+//////////////////////////////////////////////////////
+void scatterWrapper( float *h_output, const float *h_input, const int &n )
+{
+    float *h_binary_array;
+    float *d_input, *d_binary_array;
+
+    // Compute number of blocks.
+    int num_blocks = ( int )ceil( ( float )n / ( float )BLOCK_SIZE );
+
+    // Compute bytes needed for various array allocations.
+    int size = sizeof( float ) * n;
+
+    // Allocate memory for host data.
+    h_binary_array = ( float* )malloc( size );
+
+    // Allocate memory for device data.
+    cudaMalloc( ( void** )&d_input, size );
+    cudaMalloc( ( void** )&d_binary_array, size );
+
+    // Copy host input to device input array.
+    cudaMemcpy( d_input, h_input, size, cudaMemcpyHostToDevice );
+
+    // Kernel call.
+    computeBinaryArray<<< dim3( num_blocks ), BLOCK_SIZE >>>( d_binary_array, d_input, n );
+
+    // Copy device result to host output array.
+    cudaMemcpy( h_binary_array, d_binary_array, size, cudaMemcpyDeviceToHost );
+
+    // Perform parallel prefix sum on the binary array.
+    naiveScanSharedMemoryWrapper( h_output, h_binary_array, n );
+
+    // Release allocated memory.
+    free( h_binary_array );
+    cudaFree( d_input );
+    cudaFree( d_binary_array );
+}
+
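+
+//////////////////////////////////////////////////////
+// Continuing the example above, the scan result assigns each surviving element its output index:
+//   input:  [ 0, 0, 3, 4, 0, 6, 6, 7, 0, 1 ]
+//   scan:   [ 0, 0, 0, 1, 2, 2, 3, 4, 5, 5 ]
+//   output[scan[i]] = input[i] for every nonzero input[i]  =>  [ 3, 4, 6, 6, 7, 1 ]
+//////////////////////////////////////////////////////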
+
+//////////////////////////////////////////////////////
+// Wrapper to call the kernels that perform full stream compaction.
+//////////////////////////////////////////////////////
+void streamCompactionWrapper( float **h_output, const float *h_input, const int &n, int &output_length )
+{
+    float *h_scatter_results;
+    float *d_input, *d_output, *d_scatter_array;
+
+    // Compute bytes needed for scatter_results and d_input.
+    int input_array_size = sizeof( float ) * n;
+
+    // Get results of performing the scatter operation on the input array.
+    h_scatter_results = ( float* )malloc( input_array_size );
+    scatterWrapper( h_scatter_results, h_input, n );
+
+    // Allocate memory for stream compaction result (h_output).
+    // The compacted length is the last scan value, plus one if the last input element survives;
+    // an exclusive scan does not count the final element, so adding one unconditionally would
+    // overcount when the input ends in a zero.
+    output_length = ( int )h_scatter_results[n - 1];
+    if ( h_input[n - 1] < -EPSILON || h_input[n - 1] > EPSILON ) {
+        ++output_length;
+    }
+    int output_array_size = sizeof( float ) * output_length;
+    *h_output = ( float* )malloc( output_array_size );
+
+    // Allocate memory for device data.
+    cudaMalloc( ( void** )&d_input, input_array_size );
+    cudaMalloc( ( void** )&d_output, output_array_size );
+    cudaMalloc( ( void** )&d_scatter_array, input_array_size );
+
+    // Copy host input to device input array.
+    cudaMemcpy( d_input, h_input, input_array_size, cudaMemcpyHostToDevice );
+
+    // Copy host scatter array to device scatter array.
+    cudaMemcpy( d_scatter_array, h_scatter_results, input_array_size, cudaMemcpyHostToDevice );
+
+    // Compute number of blocks.
+    int num_blocks = ( int )ceil( ( float )n / ( float )BLOCK_SIZE );
+
+    // Stream compaction kernel call.
+    streamCompaction<<< dim3( num_blocks ), BLOCK_SIZE >>>( d_output, d_input, d_scatter_array, n );
+
+    // Copy device result to host output array.
+    cudaMemcpy( *h_output, d_output, output_array_size, cudaMemcpyDeviceToHost );
+
+    free( h_scatter_results );
+    cudaFree( d_input );
+    cudaFree( d_output );
+    cudaFree( d_scatter_array );
+}
+
+
+
+/**********************************************************/
+/*          NAIVE SCAN (GLOBAL MEMORY) FUNCTIONS          */
+/**********************************************************/
+
+
+//////////////////////////////////////////////////////
+// Parallel (GPU) version of a naive exclusive prefix sum.
+// This method can only operate on a single block.
+// This method utilizes global memory exclusively.
+//////////////////////////////////////////////////////
+__global__
+void naiveScanGlobalMemoryOneBlock( float *g_out, float *g_in, float *g_buffer, int n )
+{
+    int threadId = threadIdx.x;
+    int pout = 0;
+    int pin = 1;
+
+    // Populate g_buffer.
+    // This is an exclusive scan, so shift right by one and set the first element to 0.
+    g_buffer[pout * n + threadId] = ( threadId > 0 ) ? g_in[threadId - 1] : 0;
+
+    // Do not continue until g_buffer is populated.
+    __syncthreads();
+
+    for ( int offset = 1; offset < n; offset *= 2 ) {
+        // Swap pout and pin.
+        pout = 1 - pout;
+        pin = 1 - pin;
+
+        if ( threadId < offset ) {
+            // output[index] = input[index].
+            g_buffer[pout * n + threadId] = g_buffer[pin * n + threadId];
+        }
+        else {
+            // output[index] = input[index] + input[index - offset].
+            g_buffer[pout * n + threadId] = g_buffer[pin * n + threadId] + g_buffer[pin * n + threadId - offset];
+        }
+
+        // All threads must complete this "level" before proceeding.
+        __syncthreads();
+    }
+
+    // Write output.
+    g_out[threadId] = g_buffer[pout * n + threadId];
+}
+
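+// Note: the one-block kernels are launched as <<< 1, n >>>, so they are only valid while n fits in
+// a single block (at most 1024 threads on compute capability 2.0 and later, 512 on older devices).
+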
+
+//////////////////////////////////////////////////////
+// Helper method for the parallel (GPU) version of the naive exclusive prefix sum.
+// This method initializes the global buffer.
+// Additionally, this method sums up the elements in each block and saves those sums to global memory.
+//////////////////////////////////////////////////////
+__global__
+void computeBlockSumsAndInitializeBuffer( float *g_in, float *g_buffer, float *g_block_sums, int n )
+{
+    int block_thread_id = threadIdx.x;
+    int global_thread_id = threadIdx.x + ( blockIdx.x * blockDim.x );
+
+    // Guard against threads past the end of the input when the block count was rounded up.
+    if ( global_thread_id > n - 1 ) {
+        return;
+    }
+
+    // Populate g_buffer per block.
+    // This is an exclusive scan, so shift right by one and set the first element of each block to 0.
+    g_buffer[global_thread_id] = ( block_thread_id > 0 ) ? g_in[global_thread_id - 1] : 0;
+
+    // Compute sum of all elements for every block.
+    atomicAdd( &g_block_sums[blockIdx.x], g_in[global_thread_id] );
+}
+
+
+//////////////////////////////////////////////////////
+// Parallel (GPU) version of a naive exclusive prefix sum.
+// This method can operate on any number of blocks of any size.
+// This method utilizes global memory exclusively.
+//////////////////////////////////////////////////////
+__global__
+void naiveScanGlobalMemory( float *g_out, float *g_in, float *g_buffer, float *g_block_sums, int n )
+{
+    int block_thread_id = threadIdx.x;
+    int global_thread_id = threadIdx.x + ( blockIdx.x * blockDim.x );
+    int pout = 0;
+    int pin = 1;
+
+    // In cases where blocks do not evenly divide the input array, some threads index past the end
+    // of the input. Those threads must not read or write, but they must still reach every
+    // __syncthreads() below; letting only some threads of a block return early is undefined behavior.
+    bool active = ( global_thread_id < n );
+
+    for ( int offset = 1; offset < blockDim.x; offset *= 2 ) {
+        // Swap pout and pin.
+        pout = 1 - pout;
+        pin = 1 - pin;
+
+        if ( active ) {
+            if ( block_thread_id < offset ) {
+                // output[index] = input[index].
+                g_buffer[pout * n + global_thread_id] = g_buffer[pin * n + global_thread_id];
+            }
+            else {
+                // output[index] = input[index] + input[index - offset].
+                g_buffer[pout * n + global_thread_id] = g_buffer[pin * n + global_thread_id] + g_buffer[pin * n + global_thread_id - offset];
+            }
+        }
+
+        // All threads must complete this "level" before proceeding.
+        __syncthreads();
+    }
+
+    if ( !active ) {
+        return;
+    }
+
+    // Compute sum of all previous blocks to add to the values in the current block.
+    float previous_block_sum = 0.0f;
+    for ( int i = blockIdx.x; i > 0; --i ) {
+        previous_block_sum += g_block_sums[i - 1];
+    }
+
+    // Write output.
+    g_out[global_thread_id] = g_buffer[pout * n + global_thread_id] + previous_block_sum;
+}
+
+
+
+/**********************************************************/
+/*          NAIVE SCAN (SHARED MEMORY) FUNCTIONS          */
+/**********************************************************/
+
+//////////////////////////////////////////////////////
+// Parallel (GPU) version of a naive exclusive prefix sum.
+// This method can only operate on a single block.
+// This method utilizes shared memory.
+//////////////////////////////////////////////////////
+__global__
+void naiveScanSharedMemoryOneBlock( float *g_out, float *g_in, int n )
+{
+    // Shared memory is allocated on kernel invocation.
+    // buffer is twice the size of the input array.
+    extern __shared__ float buffer[];
+
+    int threadId = threadIdx.x;
+    int pout = 0;
+    int pin = 1;
+
+    // Populate buffer.
+    // This is an exclusive scan, so shift right by one and set the first element to 0.
+    buffer[pout * n + threadId] = ( threadId > 0 ) ? g_in[threadId - 1] : 0;
+
+    // Do not continue until buffer is populated.
+    __syncthreads();
+
+    for ( int offset = 1; offset < n; offset *= 2 ) {
+        // Swap pout and pin.
+        pout = 1 - pout;
+        pin = 1 - pin;
+
+        if ( threadId < offset ) {
+            // output[index] = input[index].
+            buffer[pout * n + threadId] = buffer[pin * n + threadId];
+        }
+        else {
+            // output[index] = input[index] + input[index - offset].
+            buffer[pout * n + threadId] = buffer[pin * n + threadId] + buffer[pin * n + threadId - offset];
+        }
+
+        // All threads must complete this "level" before proceeding.
+        __syncthreads();
+    }
+
+    // Write output.
+    g_out[threadId] = buffer[pout * n + threadId];
+}
+
+
+//////////////////////////////////////////////////////
+// Helper method for the parallel (GPU) version of the naive exclusive prefix sum.
+// This method sums up the elements in each block and saves those sums to global memory.
+//////////////////////////////////////////////////////
+__global__
+void computeBlockSums( float *g_in, float *g_block_sums, int n )
+{
+    int global_thread_id = threadIdx.x + ( blockIdx.x * blockDim.x );
+
+    // Guard against threads past the end of the input when the block count was rounded up.
+    if ( global_thread_id > n - 1 ) {
+        return;
+    }
+
+    // Compute sum of all elements for every block.
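+    // atomicAdd serializes the concurrent updates to g_block_sums[blockIdx.x], so the per-block
+    // sum is race-free; note, though, that the accumulation order of floating-point atomics is
+    // non-deterministic, so block sums can vary in their last bits from run to run.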
+    atomicAdd( &g_block_sums[blockIdx.x], g_in[global_thread_id] );
+}
+
+
+//////////////////////////////////////////////////////
+// Parallel (GPU) version of a naive exclusive prefix sum.
+// This method can operate on any number of blocks of any size.
+// This method utilizes shared memory.
+//////////////////////////////////////////////////////
+__global__
+void naiveScanSharedMemory( float *g_out, float *g_in, float *g_block_sums, int n )
+{
+    // Shared memory is allocated on kernel invocation.
+    // buffer is twice the size of the current block.
+    extern __shared__ float buffer[];
+
+    int block_thread_id = threadIdx.x;
+    int global_thread_id = threadIdx.x + ( blockIdx.x * blockDim.x );
+    int block_size = blockDim.x;
+    int pout = 0;
+    int pin = 1;
+
+    // In cases where blocks do not evenly divide the input array, some threads index past the end
+    // of the input. Those threads must not read or write, but they must still reach every
+    // __syncthreads() below; letting only some threads of a block return early is undefined behavior.
+    bool active = ( global_thread_id < n );
+
+    // Populate buffer.
+    // This is an exclusive scan, so shift right by one and set the first element to 0.
+    buffer[pout * block_size + block_thread_id] = ( active && block_thread_id > 0 ) ? g_in[global_thread_id - 1] : 0;
+
+    // Do not continue until buffer is populated.
+    __syncthreads();
+
+    for ( int offset = 1; offset < blockDim.x; offset *= 2 ) {
+        // Swap pout and pin.
+        pout = 1 - pout;
+        pin = 1 - pin;
+
+        if ( active ) {
+            if ( block_thread_id < offset ) {
+                // output[index] = input[index].
+                buffer[pout * block_size + block_thread_id] = buffer[pin * block_size + block_thread_id];
+            }
+            else {
+                // output[index] = input[index] + input[index - offset].
+                buffer[pout * block_size + block_thread_id] = buffer[pin * block_size + block_thread_id] + buffer[pin * block_size + block_thread_id - offset];
+            }
+        }
+
+        // All threads must complete this "level" before proceeding.
+        __syncthreads();
+    }
+
+    if ( !active ) {
+        return;
+    }
+
+    // Compute sum of all previous blocks to add to the values in the current block.
+    float previous_block_sum = 0.0f;
+    for ( int i = blockIdx.x; i > 0; --i ) {
+        previous_block_sum += g_block_sums[i - 1];
+    }
+
+    // Write output.
+    g_out[global_thread_id] = buffer[pout * block_size + block_thread_id] + previous_block_sum;
+}
+
+
+
+/***************************************/
+/*          SCATTER FUNCTIONS          */
+/***************************************/
+
+
+//////////////////////////////////////////////////////
+// Returns a binary array (0s and 1s) based on an input array.
+// [ 0, 0, 3, 4, 0, 6, 6, 7, 0, 1 ] => [ 0, 0, 1, 1, 0, 1, 1, 1, 0, 1 ]
+//////////////////////////////////////////////////////
+__global__
+void computeBinaryArray( float *g_out, float *g_in, int n )
+{
+    int global_thread_id = threadIdx.x + ( blockIdx.x * blockDim.x );
+
+    // In cases where blocks do not evenly divide the input array, some invalid indices may be present.
+    // If an invalid index is detected, then return immediately.
+    if ( global_thread_id > n - 1 ) {
+        return;
+    }
+
+    // Treat anything within +/-D_EPSILON of zero as zero.
+    if ( g_in[global_thread_id] > -D_EPSILON && g_in[global_thread_id] < D_EPSILON ) {
+        g_out[global_thread_id] = 0.0f;
+    }
+    else {
+        g_out[global_thread_id] = 1.0f;
+    }
+}
+
+
+
+/*************************************************/
+/*          STREAM COMPACTION FUNCTIONS          */
+/*************************************************/
+
+
+//////////////////////////////////////////////////////
+// Perform stream compaction by looking up output indices in g_scatter_results.
+//////////////////////////////////////////////////////
+__global__
+void streamCompaction( float *g_out, float *g_in, float *g_scatter_results, int n )
+{
+    int global_thread_id = threadIdx.x + ( blockIdx.x * blockDim.x );
+
+    // In cases where blocks do not evenly divide the input array, some invalid indices may be present.
+    // If an invalid index is detected, then return immediately.
+    if ( global_thread_id > n - 1 ) {
+        return;
+    }
+
+    // Each surviving (nonzero) element is written to the output index computed by the scan.
+    if ( g_in[global_thread_id] < -D_EPSILON || g_in[global_thread_id] > D_EPSILON ) {
+        g_out[( int )g_scatter_results[global_thread_id]] = g_in[global_thread_id];
+    }
+}
\ No newline at end of file
diff --git a/2014-09-27_stream_compaction/2014-09-27_stream_compaction/StreamCompaction.h b/2014-09-27_stream_compaction/2014-09-27_stream_compaction/StreamCompaction.h
new file mode 100644
index 0000000..7db47cd
--- /dev/null
+++ b/2014-09-27_stream_compaction/2014-09-27_stream_compaction/StreamCompaction.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#ifndef _STREAM_COMPACTION
+#define _STREAM_COMPACTION
+
+#include <cuda_runtime.h> // Assumed header; something must supply the __device__/__constant__ qualifiers used below.
+
+
+// Constants.
+const int BLOCK_SIZE = 128;
+const float EPSILON = 0.0001f;
+__device__ __constant__ float D_EPSILON = 0.0001f;
+
+
+// Naive parallel prefix sum that uses global memory exclusively.
+void naiveScanGlobalMemoryWrapper( float *h_out, const float *h_in, const int &n );
+
+// Naive parallel prefix sum that uses shared memory.
+void naiveScanSharedMemoryWrapper( float *h_out, const float *h_in, const int &n );
+
+// Scatter operation.
+void scatterWrapper( float *h_output, const float *h_input, const int &n );
+
+// Full stream compaction.
+void streamCompactionWrapper( float **h_output, const float *h_input, const int &n, int &output_length );
+
+#endif
\ No newline at end of file
diff --git a/2014-09-27_stream_compaction/2014-09-27_stream_compaction/main.cpp b/2014-09-27_stream_compaction/2014-09-27_stream_compaction/main.cpp
new file mode 100644
index 0000000..783fd5b
--- /dev/null
+++ b/2014-09-27_stream_compaction/2014-09-27_stream_compaction/main.cpp
@@ -0,0 +1,208 @@
+#pragma once
+
+#include "StreamCompaction.h"
+#include <iostream>
+#include "CPUTimer.h"
+
+
+//////////////////////////////////////////////////////
+// Function prototypes for algorithms.
+//////////////////////////////////////////////////////
+
+template <typename T>
+void serialScan( T *output, const T *input, const int &n );
+
+template <typename T>
+void serialScatter( T *output, const T *input, const int &n );
+
+
+//////////////////////////////////////////////////////
+// Helper methods.
+//////////////////////////////////////////////////////
+
+template <typename T>
+void printArray( const T *input, const int &n );
+
+template <typename T>
+bool testTwoArraysForEquality( const T *arr1, const T *arr2, const int &n );
+
+
+//////////////////////////////////////////////////////
+// main()
+//////////////////////////////////////////////////////
+int main( int argc, char** argv )
+{
+    ////////////////////////////////////////////////////
+    // Create input array for testing.
+    ////////////////////////////////////////////////////
+
+    int n = 2500;
+    //float input[] = { 3.0f, 4.0f, 6.0f, 7.0f, 9.0f, 10.0f };
+    //float input[] = { 0.0f, 0.0f, 3.0f, 4.0f, 0.0f, 6.0f, 6.0f, 7.0f, 0.0f, 1.0f };
+    float *input = new float[n];
+    for ( int i = 0; i < n; ++i ) {
+        input[i] = ( float )i + 1.0f;
+        //input[i] = ( i % 2 ) ? 0.0f : ( float )i;
+    }
+
+
+    ////////////////////////////////////////////////////
+    // Serial scan.
+ //////////////////////////////////////////////////// + + float *serial_scan_result = new float[n]; + + CPUTimer *serial_scan_timer = new CPUTimer(); + serialScan( serial_scan_result, input, n ); + serial_scan_timer->stop( "Serial scan time" ); + + + //////////////////////////////////////////////////// + // Serial scatter. + //////////////////////////////////////////////////// + + float *serial_scatter_result = new float[n]; + serialScatter( serial_scatter_result, input, n ); + + + //////////////////////////////////////////////////// + // Global memory naive parallel scan. + //////////////////////////////////////////////////// + + float *naive_parallel_scan_result_global = new float[n]; + naiveScanGlobalMemoryWrapper( naive_parallel_scan_result_global, input, n ); + + // Validate global memory naive parallel scan result. + printf( "Global memory naive parallel scan: %s\n\n", ( testTwoArraysForEquality( naive_parallel_scan_result_global, serial_scan_result, n ) ? "OK" : "ERROR" ) ); + + + //////////////////////////////////////////////////// + // Shared memory naive parallel scan. + //////////////////////////////////////////////////// + + float *naive_parallel_scan_result_shared = new float[n]; + naiveScanSharedMemoryWrapper( naive_parallel_scan_result_shared, input, n ); + + // Validate global memory naive parallel scan result. + printf( "Shared memory naive parallel scan: %s\n\n", ( testTwoArraysForEquality( naive_parallel_scan_result_shared, serial_scan_result, n ) ? "OK" : "ERROR" ) ); + + + //////////////////////////////////////////////////// + // Parallel scatter operation. + //////////////////////////////////////////////////// + + float *parallel_scatter_result = new float[n]; + scatterWrapper( parallel_scatter_result, input, n ); + + // Validate parallel scatter result. + printf( "Scatter operation: %s\n\n", ( testTwoArraysForEquality( parallel_scatter_result, serial_scatter_result, n ) ? "OK" : "ERROR" ) ); + + + //////////////////////////////////////////////////// + // Full stream compaction. + //////////////////////////////////////////////////// + + float *stream_compaction_result = NULL; + int stream_compaction_length = 0; + streamCompactionWrapper( &stream_compaction_result, input, n, stream_compaction_length ); + + // Print stream compaction results. + //printArray( input, n ); + //printArray( stream_compaction_result, stream_compaction_length ); + + + //////////////////////////////////////////////////// + // Free allocated memory. + //////////////////////////////////////////////////// + + delete[] serial_scan_result; + delete[] naive_parallel_scan_result_global; + delete[] naive_parallel_scan_result_shared; + delete[] parallel_scatter_result; + delete[] stream_compaction_result; + + // Prevent output window from closing prematurely. + std::cin.ignore(); + + return 0; +} + + +//////////////////////////////////////////////////// +// Serial (CPU) version of an exclusive prefix sum. +// output: Pointer to a number that represents our output array. +// input: Pointer to a constant number that represents our input array. +// n: Constant reference to an integer that represents the size of our arrays. +//////////////////////////////////////////////////// +template +void serialScan( T *output, const T *input, const int &n ) +{ + // Validate arguments. + if ( n <= 0 || input == NULL || output == NULL ) { + return; + } + + // Perform exclusive prefix sum. 
+    output[0] = 0;
+    for ( int i = 1; i < n; ++i ) {
+        output[i] = output[i - 1] + input[i - 1];
+    }
+}
+
+
+//////////////////////////////////////////////////////
+// Serial (CPU) version of the scatter section of stream compaction.
+// output: Pointer to the output array.
+// input: Pointer to the (constant) input array.
+// n: Constant reference to an integer that represents the size of our arrays.
+//////////////////////////////////////////////////////
+template <typename T>
+void serialScatter( T *output, const T *input, const int &n )
+{
+    // Convert input array into a boolean array.
+    // 0 in the boolean array if the input is 0.
+    // 1 in the boolean array if the input is not 0.
+    T *boolean_array = new T[n];
+    for ( int i = 0; i < n; ++i ) {
+        if ( input[i] > -EPSILON && input[i] < EPSILON ) {
+            boolean_array[i] = false;
+        }
+        else {
+            boolean_array[i] = true;
+        }
+    }
+
+    // Perform a scan on the newly created boolean array.
+    serialScan( output, boolean_array, n );
+
+    // Free allocated memory.
+    delete[] boolean_array;
+}
+
+
+//////////////////////////////////////////////////////
+// Helper method to print the contents of an array.
+//////////////////////////////////////////////////////
+template <typename T>
+void printArray( const T *input, const int &n )
+{
+    for ( int i = 0; i < n; ++i ) {
+        std::cout << i << ": " << input[i] << std::endl;
+    }
+    std::cout << std::endl;
+}
+
+
+//////////////////////////////////////////////////////
+// Helper method to test if two arrays are equivalent.
+//////////////////////////////////////////////////////
+template <typename T>
+bool testTwoArraysForEquality( const T *arr1, const T *arr2, const int &n )
+{
+    for ( int i = 0; i < n; ++i ) {
+        if ( arr1[i] != arr2[i] ) {
+            return false;
+        }
+    }
+    return true;
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 6e02afa..bb85afd 100644
--- a/README.md
+++ b/README.md
@@ -131,3 +131,27 @@ project. Like other projects, please open a pull request and email Harmony.
 # REFERENCES
 
 "Parallel Prefix Sum (Scan) with CUDA." GPU Gems 3.
+
+# RESULTS
+
+I was able to implement the parallel prefix sum algorithm using both global and shared memory. Unfortunately, my implementations fail once the size of the input array grows beyond a certain threshold, which currently sits at around 6000 elements. As of this writing, I have been unable to determine why my parallel scan implementations fail on large input arrays.
+
+Because I could not test very large input arrays, my data points for performance analysis are severely limited. GPUs are best suited to performing simple operations on large data sets--much larger than 6000 elements. With small input arrays (< 6000 elements), program execution time on the GPU can vary significantly between runs. Additionally, with small input arrays, the serial CPU implementation of an algorithm is often faster than the parallel GPU implementation, because of the overhead incurred when launching work on the GPU. That is the case here: since my tests were limited to small input arrays, the serial version of my prefix sum algorithm consistently clocked in at 0 ms--faster than its GPU counterparts. Of course, this could also be a shortcoming of the methods I used to time my CPU algorithms.
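+
+One caveat on those CPU numbers: `CPUTimer` is built on `GetSystemTimeAsFileTime` and reports whole milliseconds only, so any sub-millisecond serial run reads as 0.0. If finer resolution is needed, a minimal sketch of a higher-resolution Windows timer using `QueryPerformanceCounter` might look like this:
+
+```cpp
+#include <windows.h>
+#include <iostream>
+
+int main()
+{
+    LARGE_INTEGER frequency, t0, t1;
+    QueryPerformanceFrequency( &frequency ); // ticks per second
+
+    QueryPerformanceCounter( &t0 );
+    // ... code to time, e.g. serialScan( output, input, n ) ...
+    QueryPerformanceCounter( &t1 );
+
+    double ms = 1000.0 * ( double )( t1.QuadPart - t0.QuadPart ) / ( double )frequency.QuadPart;
+    std::cout << ms << " milliseconds" << std::endl;
+    return 0;
+}
+```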
+
+Here are a few data points comparing my serial scan, my naive parallel scan that uses global memory exclusively, and my naive parallel scan that uses shared memory.
+
+Number of elements in the input array | Scan method | Execution time (ms)
+:---: | :---: | :---:
+3000 | Serial | 0.0
+4000 | Serial | 0.0
+5000 | Serial | 0.0
+3000 | Parallel - Global | 0.183936
+4000 | Parallel - Global | 0.216736
+5000 | Parallel - Global | 0.273824
+3000 | Parallel - Shared | 0.13968
+4000 | Parallel - Shared | 0.153888
+5000 | Parallel - Shared | 0.177312
+
+Even with this limited number of test cases, it is easy to see that the parallel scan implementation that utilizes shared memory is more efficient than the one that relies on global memory alone. This is to be expected, as global memory accesses are expensive inside a GPU kernel; reducing the number of accesses to global memory, and replacing them with shared memory accesses wherever possible, should therefore improve performance, and that holds true in these test cases. As the number of elements in the input array increases, I expect this trend to continue, and the performance gap between the shared memory method and the global-memory-only method to grow.
+
+As the number of input elements increases, I also expect the parallel implementations of scan to overtake the serial implementation, though the crossover will take a while: the naive parallel scan performs O(n log n) additions versus the serial scan's n - 1, so the extra work must first be outweighed by the parallelism. After I locate the logic error in my parallel scan implementations, I will run additional tests to check the hypotheses stated here.
\ No newline at end of file