Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
386 changes: 380 additions & 6 deletions README.md

Large diffs are not rendered by default.

Binary file added img/1.JPG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/2.JPG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,14 +137,14 @@ int main(int argc, char* argv[]) {
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

system("pause"); // stop Win32 console from closing on exit
Expand Down
2 changes: 1 addition & 1 deletion stream_compaction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ set(SOURCE_FILES

cuda_add_library(stream_compaction
${SOURCE_FILES}
OPTIONS -arch=sm_20
OPTIONS -arch=sm_60
)
12 changes: 11 additions & 1 deletion stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) {
exit(EXIT_FAILURE);
}


namespace StreamCompaction {
namespace Common {

Expand All @@ -24,6 +23,10 @@ namespace StreamCompaction {
*/
/**
 * Maps each element to a boolean flag: 1 if the element is non-zero,
 * 0 otherwise. One thread per element; expects a 1D launch whose total
 * thread count covers at least n.
 */
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid < n)
    {
        bools[tid] = idata[tid] ? 1 : 0;
    }
}

/**
Expand All @@ -33,6 +36,13 @@ namespace StreamCompaction {
/**
 * Scatters kept elements for stream compaction: wherever bools[i] is set,
 * idata[i] is written to odata at position indices[i] (the exclusive-scan
 * result of bools). One thread per input element.
 */
__global__ void kernScatter(int n, int *odata,
        const int *idata, const int *bools, const int *indices) {
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= n || !bools[tid]) return;

    odata[indices[tid]] = idata[tid];
}

}
Expand Down
4 changes: 4 additions & 0 deletions stream_compaction/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)

#define blocksize 512

//#define SYNC_GRID

/**
* Check for CUDA errors; print and exit if there was a problem.
*/
Expand Down
40 changes: 38 additions & 2 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ namespace StreamCompaction {
/**
 * CPU exclusive prefix-sum (scan) of idata into odata.
 *
 * odata[0] = 0 and odata[i] = idata[0] + ... + idata[i-1]. Timed with the
 * CPU timer.
 *
 * @param n     number of elements in idata/odata (no-op when n <= 0)
 * @param odata output buffer, length >= n
 * @param idata input buffer, length >= n
 */
void scan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();

    // Guard the empty case: the unconditional odata[0] = 0 of a naive
    // implementation is an out-of-bounds store when n == 0.
    if (n > 0)
    {
        odata[0] = 0; // exclusive scan starts at the additive identity
        for (int i = 1; i < n; i++)
        {
            odata[i] = odata[i - 1] + idata[i - 1];
        }
    }

    timer().endCpuTimer();
}

Expand All @@ -31,8 +38,18 @@ namespace StreamCompaction {
/**
 * CPU stream compaction without the scan primitive: copies every non-zero
 * element of idata to the front of odata, preserving relative order.
 *
 * @returns the number of elements kept.
 */
int compactWithoutScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();

    int kept = 0;
    for (int src = 0; src < n; ++src)
    {
        int value = idata[src];
        if (value != 0)
        {
            odata[kept] = value;
            ++kept;
        }
    }

    timer().endCpuTimer();
    return kept;
}

/**
Expand All @@ -43,8 +60,27 @@ namespace StreamCompaction {
/**
 * CPU stream compaction via scan + scatter, performed in place in odata.
 *
 * Step 1 writes the exclusive scan of the boolean map (idata[i] != 0) into
 * odata. Step 2 scatters: each non-zero idata[i] goes to position odata[i].
 * The in-place scatter is safe because the destination odata[i] counts the
 * non-zeros strictly before i, so it is always <= i: every write lands at a
 * position whose scan value has already been consumed.
 *
 * @returns the number of elements kept (0 when n <= 0).
 */
int compactWithScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();

    int count = 0;
    if (n > 0) // guard: odata[0] = 0 would be out of bounds when n == 0
    {
        // exclusive scan of the boolean "keep" map
        odata[0] = 0;
        for (int i = 1; i < n; i++)
        {
            odata[i] = odata[i - 1] + (idata[i - 1] != 0);
        }

        // scatter in place (safe; see header comment)
        for (int i = 0; i < n; i++)
        {
            if (idata[i] != 0)
            {
                odata[odata[i]] = idata[i];
                count++;
            }
        }
    }

    timer().endCpuTimer();
    return count;
}
}
}
207 changes: 173 additions & 34 deletions stream_compaction/efficient.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,37 +4,176 @@
#include "efficient.h"

namespace StreamCompaction {
namespace Efficient {
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
timer().startGpuTimer();
// TODO
timer().endGpuTimer();
}

/**
* Performs stream compaction on idata, storing the result into odata.
* All zeroes are discarded.
*
* @param n The number of elements in idata.
* @param odata The array into which to store elements.
* @param idata The array of elements to compact.
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
timer().startGpuTimer();
// TODO
timer().endGpuTimer();
return -1;
}
}
}
namespace Efficient {
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
}

/**
 * One up-sweep (reduce) level of the work-efficient scan.
 * POT = 2^d and POT_EX = 2^(d+1) for level d; only threads whose index is a
 * multiple of POT_EX act, folding the left child's partial sum into the
 * right child in place. n is the (power-of-two) padded array length.
 */
__global__ void kernUpSweep(int n, int POT, int POT_EX, int *data)
{
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid < n && tid % POT_EX == 0)
    {
        data[tid + POT_EX - 1] += data[tid + POT - 1];
    }
}

/**
 * One down-sweep level of the work-efficient scan.
 * POT = 2^d and POT_EX = 2^(d+1) for level d; only threads whose index is a
 * multiple of POT_EX act, swapping the left child up and accumulating the
 * prefix into the right child. n is the (power-of-two) padded array length.
 */
__global__ void kernDownSweep(int n, int POT, int POT_EX, int *data)
{
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= n || tid % POT_EX != 0) return;

    int left = tid + POT - 1;
    int right = tid + POT_EX - 1;

    int saved = data[left];
    data[left] = data[right];
    data[right] += saved;
}


/**
 * Performs work-efficient exclusive prefix-sum (scan) on idata on the GPU,
 * storing the result into odata.
 *
 * The input is padded with zeros up to the next power of two; an up-sweep
 * builds partial sums in place, the root is cleared, and a down-sweep
 * distributes the exclusive prefixes. Only the sweep kernels are timed --
 * allocation and host<->device copies are excluded.
 *
 * No-op when n <= 0.
 */
void scan(int n, int *odata, const int *idata) {
    if (n <= 0) return; // nothing to scan; also protects ilog2ceil(n)

    int levels = ilog2ceil(n);
    int padded = 1 << levels; // power-of-two working size
    int *dev_data;
    dim3 gridsize((padded - 1) / blocksize + 1);

    cudaMalloc((void**)&dev_data, padded * sizeof(int));
    checkCUDAErrorFn("malloc dev_data");

    cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
    checkCUDAErrorFn("copy idata to dev_data");
    if (padded > n)
    {
        // zero the padding so it contributes nothing to the sums
        cudaMemset(dev_data + n, 0, (padded - n) * sizeof(int));
        checkCUDAErrorFn("set dev_data padding");
    }

    //start ticking
    timer().startGpuTimer();
    for (int i = 0; i < levels; i++)
    {
        kernUpSweep << <gridsize, blocksize >> > (padded, 1 << i, 1 << (i + 1), dev_data);
#ifdef SYNC_GRID
        cudaDeviceSynchronize(); // cudaThreadSynchronize is deprecated
#endif
    }

    // exclusive scan: clear the root before the down-sweep
    cudaMemset((void*)(dev_data + (padded - 1)), 0, sizeof(int));
    checkCUDAErrorFn("set dev_data[padded-1]");

    for (int i = levels - 1; i >= 0; i--)
    {
        kernDownSweep << <gridsize, blocksize >> > (padded, 1 << i, 1 << (i + 1), dev_data);
#ifdef SYNC_GRID
        cudaDeviceSynchronize();
#endif
    }
    //stop ticking
    timer().endGpuTimer();

    cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
    checkCUDAErrorFn("copy dev_data to odata");

    cudaFree(dev_data);
    checkCUDAErrorFn("free dev_data");
}

/**
 * Performs stream compaction on idata on the GPU, storing the result into
 * odata. All zeroes are discarded.
 *
 * Pipeline: map-to-boolean, exclusive scan of the booleans (padded to the
 * next power of two), then scatter. The kept count is the last scan value
 * plus one if the final input element is itself non-zero. Only the kernels
 * are timed -- allocation and host<->device copies are excluded.
 *
 * @param n The number of elements in idata.
 * @param odata The array into which to store elements.
 * @param idata The array of elements to compact.
 * @returns The number of elements remaining after compaction (0 when n <= 0).
 */
int compact(int n, int *odata, const int *idata) {
    if (n <= 0) return 0; // guard: code below reads idata[n - 1]

    int result = 0;
    int levels = ilog2ceil(n);
    int padded = 1 << levels; // power-of-two scan size
    int *dev_idata;
    int *dev_odata;
    int *dev_indices;
    int *dev_bools;
    dim3 gridsize((padded - 1) / blocksize + 1);
    dim3 gridsize_EXACT((n - 1) / blocksize + 1);

    cudaMalloc((void**)&dev_idata, n * sizeof(int));
    checkCUDAErrorFn("malloc dev_idata");

    cudaMalloc((void**)&dev_odata, n * sizeof(int));
    checkCUDAErrorFn("malloc dev_odata");

    cudaMalloc((void**)&dev_indices, padded * sizeof(int));
    checkCUDAErrorFn("malloc dev_indices");

    cudaMalloc((void**)&dev_bools, n * sizeof(int));
    checkCUDAErrorFn("malloc dev_bools");

    cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
    checkCUDAErrorFn("memcpy dev_idata");

    Common::kernMapToBoolean << <gridsize_EXACT, blocksize >> > (n, dev_bools, dev_idata);
    checkCUDAErrorFn("launch kernMapToBoolean");

    cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice);
    checkCUDAErrorFn("copy dev_bools to dev_indices");

    if (padded > n)
    {
        // zero the padding so it contributes nothing to the scan
        cudaMemset(dev_indices + n, 0, (padded - n) * sizeof(int));
        checkCUDAErrorFn("set dev_indices padding");
    }

    //start ticking
    timer().startGpuTimer();

    for (int i = 0; i < levels; i++)
    {
        kernUpSweep << <gridsize, blocksize >> > (padded, 1 << i, 1 << (i + 1), dev_indices);
#ifdef SYNC_GRID
        cudaDeviceSynchronize(); // cudaThreadSynchronize is deprecated
#endif
    }

    // exclusive scan: clear the root before the down-sweep
    cudaMemset((void*)(dev_indices + (padded - 1)), 0, sizeof(int));
    checkCUDAErrorFn("set dev_indices[padded-1]");

    for (int i = levels - 1; i >= 0; i--)
    {
        kernDownSweep << <gridsize, blocksize >> > (padded, 1 << i, 1 << (i + 1), dev_indices);
#ifdef SYNC_GRID
        cudaDeviceSynchronize();
#endif
    }

    Common::kernScatter << <gridsize_EXACT, blocksize >> > (n, dev_odata, dev_idata, dev_bools, dev_indices);
    checkCUDAErrorFn("launch kernScatter");

    //stop ticking
    timer().endGpuTimer();

    // exclusive scan excludes the last element, so add it back if kept
    cudaMemcpy(&result, dev_indices + (n - 1), sizeof(int), cudaMemcpyDeviceToHost);
    checkCUDAErrorFn("copy count back");
    result += (int)(idata[n - 1] != 0);

    cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
    checkCUDAErrorFn("copy dev_odata to odata");

    cudaFree(dev_idata);
    checkCUDAErrorFn("free dev_idata");

    cudaFree(dev_odata);
    checkCUDAErrorFn("free dev_odata");

    cudaFree(dev_indices);
    checkCUDAErrorFn("free dev_indices");

    cudaFree(dev_bools);
    checkCUDAErrorFn("free dev_bools");
    return result;
}
}
}
Loading