Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
int idx = blockIdx.x * blockDim.x + threadIdx.x;

if (idx >= n) {
return;
}

bools[idx] = idata[idx] != 0 ? 1 : 0;
}

/**
Expand All @@ -33,6 +40,14 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= n) {
return;
}

if (idata[idx] != 0) {
odata[bools[idx]] = idata[idx];
}
}

}
Expand Down
46 changes: 41 additions & 5 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ namespace StreamCompaction {
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
// TODO -> DONE
odata[0] = 0;
for (int i = 1; i < n; i++) {
odata[i] = odata[i - 1] + idata[i - 1];
}
timer().endCpuTimer();
}

Expand All @@ -30,9 +34,17 @@ namespace StreamCompaction {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
// TODO -> DONE

int num = 0;
for (int i = 0; i < n; i++) {
if (idata[i] != 0) {
odata[num] = idata[i];
num++;
}
}
timer().endCpuTimer();
return -1;
return num;
}

/**
Expand All @@ -42,9 +54,33 @@ namespace StreamCompaction {
*/
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
// TODO -> DONE
const int size = n;
int* temp = new int[size];

//mapping
for (int i = 0; i < n; i++) {
temp[i] = (idata[i] != 0) ? 1 : 0;
}

// scanning
int* scannedArray = new int[size];
scannedArray[0] = 0;
for (int i = 1; i < n; i++) {
scannedArray[i] = scannedArray[i - 1] + temp[i - 1];
}

// Scatter
int count = 0;
for (int i = 0; i < n; i++) {
if (temp[i] == 1) {
odata[scannedArray[i]] = idata[i];
count++;
}
}

timer().endCpuTimer();
return -1;
return count;
}
}
}
117 changes: 111 additions & 6 deletions stream_compaction/efficient.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@
#include "common.h"
#include "efficient.h"

#define blockSize 256
#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)

int* dev_data;
int* dev_oData;
int* dev_scanData;
int* dev_boolData;

namespace StreamCompaction {
namespace Efficient {
using StreamCompaction::Common::PerformanceTimer;
Expand All @@ -12,13 +20,71 @@ namespace StreamCompaction {
return timer;
}



__global__ void kern_UpSweep(int n, int* arr, int pow) {
int index = blockIdx.x * blockDim.x + threadIdx.x;

if (index >= n) {
return;
}

if (index % (2 * pow) == 0) {
arr[index + 2 * pow - 1] += arr[index + pow - 1];
}

}

__global__ void kern_SetRoot(int n, int* arr) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index >= n) {
return;
}

arr[n - 1] = 0;
}

__global__ void kern_DownSweep(int n, int* arr, int pow) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index >= n) {
return;
}

if (index % (2 * pow) == 0) {
int temp = arr[index + pow - 1];
arr[index + pow - 1] = arr[index + 2 * pow - 1];
arr[index + 2 * pow - 1] += temp;
}
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int* odata, const int* idata) {
int blocks = ceil((float)n / (float)blockSize);
int logN = ilog2ceil(n);
const int len = (int)powf(2, logN);

cudaMalloc((void**)&dev_data, sizeof(int) * (int)powf(2, logN));
cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice);

timer().startGpuTimer();
// TODO

for (int d = 0; d <= logN - 1; d++) {
kern_UpSweep << <blocks, blockSize >> > (len, dev_data, (int)powf(2, d));
}

kern_SetRoot << <1, 1 >> > (len, dev_data);

for (int d = logN - 1; d >= 0; d--) {
kern_DownSweep << <blocks, blockSize >> > (len, dev_data, (int)powf(2, d));
}

timer().endGpuTimer();

cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost);

cudaFree(dev_data);
}

/**
Expand All @@ -31,10 +97,49 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {

int numBlocks = ceil((float)n / (float)blockSize);
int logN = ilog2ceil(n);
const int len = (int)powf(2, logN);

cudaMalloc((void**)&dev_data, sizeof(int) * len);
cudaMalloc((void**)&dev_boolData, sizeof(int) * len);
cudaMalloc((void**)&dev_oData, sizeof(int) * n);
cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice);


timer().startGpuTimer();
// TODO

// TODO -> DONE
StreamCompaction::Common::kernMapToBoolean << <numBlocks, blockSize >> > (len, dev_boolData, dev_data);

for (int d = 0; d <= logN - 1; d++) {
kern_UpSweep << <numBlocks, blockSize >> > (len, dev_boolData, (int)powf(2, d));
}

kern_SetRoot << <1, 1 >> > (len, dev_boolData);

for (int d = logN - 1; d >= 0; d--) {
kern_DownSweep << <numBlocks, blockSize >> > (len, dev_boolData, (int)powf(2, d));
}

StreamCompaction::Common::kernScatter << <numBlocks, blockSize >> > (n, dev_oData, dev_data, dev_boolData, nullptr);

timer().endGpuTimer();
return -1;

int* finalBoolArr = new int[n];
cudaMemcpy(odata, dev_oData, sizeof(int) * n, cudaMemcpyDeviceToHost);
cudaMemcpy(finalBoolArr, dev_boolData, sizeof(int) * n, cudaMemcpyDeviceToHost);

cudaFree(dev_data);
cudaFree(dev_boolData);
cudaFree(dev_oData);

if (idata[n - 1] == 0) {
return finalBoolArr[n - 1];
}

return finalBoolArr[n - 1] + 1;
}
}
}
55 changes: 55 additions & 0 deletions stream_compaction/naive.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
#include "common.h"
#include "naive.h"

#define blockSize 256
#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)

int* dev_idata;
int* dev_odata;

namespace StreamCompaction {
namespace Naive {
using StreamCompaction::Common::PerformanceTimer;
Expand All @@ -12,14 +18,63 @@ namespace StreamCompaction {
return timer;
}
// TODO: __global__
__global__ void kern_NaiveScan(int n, int* odata, int* idata, int pow) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= n) {
return;
}

if (idx >= pow) {
odata[idx] = idata[idx - pow] + idata[idx];
}
else {
odata[idx] = idata[idx];
}
}

__global__ void kern_Exclusive(int n, int* odata, int* idata) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= n) {
return;
}

if (idx == 0) {
odata[idx] = 0;
}
else {
odata[idx] = idata[idx - 1];
}
}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {


int blocks = ceil((float)n / (float)blockSize);

cudaMalloc((void**)&dev_idata, sizeof(int) * n);
cudaMalloc((void**)&dev_odata, sizeof(int) * n);
cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);

timer().startGpuTimer();
// TODO
int logVal = ilog2ceil(n);
for (int d = 1; d <= logVal; d++) {
kern_NaiveScan <<<blocks, blockSize >>> (n, dev_odata, dev_idata, (int)powf(2, d - 1));
if (d < logVal) {
int* tempPtr = dev_odata;
dev_odata = dev_idata;
dev_idata = tempPtr;
}
}
kern_Exclusive <<<blocks, blockSize >>> (n, dev_idata, dev_odata);

timer().endGpuTimer();
cudaMemcpy(odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost);

cudaFree(dev_idata);
cudaFree(dev_odata);
}
}
}
11 changes: 11 additions & 0 deletions stream_compaction/thrust.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
#include "common.h"
#include "thrust.h"

int* dev_inData;
int* dev_outData;

namespace StreamCompaction {
namespace Thrust {
using StreamCompaction::Common::PerformanceTimer;
Expand All @@ -22,6 +25,14 @@ namespace StreamCompaction {
// TODO use `thrust::exclusive_scan`
// example: for device_vectors dv_in and dv_out:
// thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());

//DONE
thrust::host_vector<int> hostVec(n);
thrust::copy(idata, idata + n, hostVec.begin());
thrust::device_vector<int> devVec = hostVec;
thrust::device_vector<int> outVec(n);
thrust::exclusive_scan(devVec.begin(), devVec.end(), outVec.begin());
thrust::copy(outVec.begin(), outVec.end(), odata);
timer().endGpuTimer();
}
}
Expand Down