diff --git a/README.md b/README.md
index d63a6a1..1f3a09d 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,56 @@
+# Project 1 - Flocking Simulation
+
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Weiqi Chen
+  * [LinkedIn](https://www.linkedin.com/in/weiqi-ricky-chen-2b04b2ab/)
+* Tested on: Windows 10, i7-8750H @ 2.20GHz, 16GB, GTX 1050 2GB
+
+## Screenshots
+| 5,000 Boids | 80,000 Boids |
+|--|--|
+| ![Number of boids: 5k](images/5K.gif) | ![Number of boids: 80k](images/80k.gif) |
+
+## Performance Analysis
+### 1. Number of Boids
+The plot below shows FPS vs. the number of boids for the three implementations.
+
+![](images/part1.png)
+
+* As the number of boids increases, performance decreases for all implementations.
+* The coherent grid performs best at large boid counts.
+* Turning on the visualization decreases performance.
+* The improvement from the uniform grid to the coherent uniform grid becomes more pronounced as the number of boids increases.
+* The naive method is only competitive with a small number of boids.
+
+For each boid, the naive method checks every other boid, while the uniform grid methods only check the boids in a handful of neighboring cells, so computation
+time drops sharply.
+
+### 2. Block Size
+The plot below shows FPS vs. block size with visualization turned off.
+
+![](images/part2.png)
+
+* Performance increases significantly when the block size goes from 16 to 32.
+* From block size 32 to 256, performance is roughly flat.
+
+Since the warp size is 32, a block size smaller than 32 leaves part of every warp inactive. At block sizes of 32 and above, full warps are used for parallel computation. (A short launch-configuration sketch is included at the end of this README.)
+
+### 3. Coherent Uniform Grid
+The coherent uniform grid outperforms the scattered uniform grid, and the gap widens as the number of boids grows. This is because:
+* `dev_pos` and `dev_vel` are reordered so that boids in the same cell occupy contiguous memory.
+* `dev_particleArrayIndices` no longer has to be read from global memory during the neighbor search, removing one level of indirection.
+
+### 4. Cell Width
+Halving the cell width improves performance because less volume has to be searched. Assuming a neighborhood distance of 1:
+* With the full cell width of 2, each boid checks 8 cells covering a volume of 4³ = 64.
+* With the half cell width of 1, each boid checks 27 cells covering a volume of only 3³ = 27.
-### (TODO: Your README)
+Using the coherent uniform grid implementation, we have:
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+
+| | Full Width (8 cells) | Half Width (27 cells) |
+|--|--|--|
+| 5k Boids | 1431 FPS | 1521 FPS |
+| 15k Boids | 1011 FPS | 1132 FPS |
+| 50k Boids | 547 FPS | 556 FPS |
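+
+As a rough sanity check on the cell-width numbers above, the worst-case number of cells a boid's neighborhood can overlap follows directly from the neighborhood radius `R` and the cell width `W`. The helper below is only an illustration and is not part of the submitted `kernel.cu`:
+
+```cpp
+// Hypothetical helper: worst-case cells touched per axis by a neighborhood of
+// radius R when the uniform grid uses cells of width W.
+int cellsPerAxis(float R, float W) {
+  return static_cast<int>(2.0f * R / W) + 1;  // R = 1: W = 2 -> 2, W = 1 -> 3
+}
+// Scanned volume per boid is (cellsPerAxis(R, W) * W)^3:
+//   W = 2R: (2 * 2)^3 = 64  (8 cells)
+//   W =  R: (3 * 1)^3 = 27  (27 cells)
+```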
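+
+Relating to the block-size results in section 2, every step function in `kernel.cu` uses the same launch configuration: the grid size is rounded up so that each boid gets one thread. The snippet below only restates that pattern for reference:
+
+```cuda
+// Same launch pattern as Boids::stepSimulation* in kernel.cu.
+dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+// blockSize = 16 leaves every 32-thread warp half empty;
+// blockSize >= 32 fills whole warps, matching the flat FPS curve from 32 to 256.
+```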
diff --git a/images/5K.gif b/images/5K.gif
new file mode 100644
index 0000000..e9bb005
Binary files /dev/null and b/images/5K.gif differ
diff --git a/images/80k.gif b/images/80k.gif
new file mode 100644
index 0000000..d68bf09
Binary files /dev/null and b/images/80k.gif differ
diff --git a/images/part1.png b/images/part1.png
new file mode 100644
index 0000000..d014bd8
Binary files /dev/null and b/images/part1.png differ
diff --git a/images/part2.png b/images/part2.png
new file mode 100644
index 0000000..2893c98
Binary files /dev/null and b/images/part2.png differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..75e2fde 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -85,6 +85,8 @@ int *dev_gridCellEndIndices; // to this cell?
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_pos_coherent;
+glm::vec3 *dev_vel_coherent;

 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -169,6 +171,27 @@ void Boids::initSimulation(int N) {
  gridMinimum.z -= halfGridWidth;

  // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+ cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+ cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+ cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+ cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+ dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices);
+ dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices);
+
+ cudaMalloc((void**)&dev_pos_coherent, N * sizeof(glm::vec3));
+ checkCUDAErrorWithLine("cudaMalloc dev_pos_coherent failed!");
+
+ cudaMalloc((void**)&dev_vel_coherent, N * sizeof(glm::vec3));
+ checkCUDAErrorWithLine("cudaMalloc dev_vel_coherent failed!");
+
  cudaDeviceSynchronize();
}
@@ -231,20 +254,64 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
*/
__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
+  // Rule 2: boids try to stay a distance d away from each other
  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+ glm::vec3 perceivedCen = glm::vec3(0.0f, 0.0f, 0.0f);
+ glm::vec3 c = glm::vec3(0.0f, 0.0f, 0.0f);
+ glm::vec3 perceivedVel = glm::vec3(0.0f, 0.0f, 0.0f);
+ glm::vec3 change = glm::vec3(0.0f, 0.0f, 0.0f);
+ glm::vec3 thisPos = pos[iSelf];
+ glm::vec3 thisVel = vel[iSelf];
+ int neighbor1 = 0;
+ int neighbor2 = 0;
+ for (int i = 0; i < N; i++) {
+   if (iSelf == i) {
+     continue;
+   }
+   float dist = glm::distance(pos[i], thisPos);
+   if (dist < rule1Distance) {
+     neighbor1++;
+     perceivedCen += pos[i];
+   }
+   if (dist < rule2Distance) {
+     c -= pos[i] - thisPos;
+   }
+   if (dist < rule3Distance) {
+     neighbor2++;
+     perceivedVel += vel[i];
+   }
+ }
+ if (neighbor1 > 0) {
+   perceivedCen /= neighbor1;
+   change += (perceivedCen - thisPos) * rule1Scale;
+ }
+ if (neighbor2 > 0) {
+   perceivedVel /= neighbor2;
+   change += perceivedVel * rule3Scale;
+ }
+ change += c * rule2Scale;
+ return change;
}

/**
* TODO-1.2 implement basic flocking
-* For each of the `N` bodies, update its position based on its current velocity.
+* For each of the `N` bodies, update its velocity based on its current velocity and position.
*/
__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
  // Compute a new velocity based on pos and vel1
  // Clamp the speed
  // Record the new velocity into vel2. Question: why NOT vel1?
+ int index = threadIdx.x + (blockIdx.x * blockDim.x);
+ if (index >= N) {
+   return;
+ }
+ glm::vec3 newVel = computeVelocityChange(N, index, pos, vel1) + vel1[index];
+ float speed = glm::length(newVel);
+ if (speed > maxSpeed) {
+   newVel = newVel / speed * maxSpeed;
+ }
+ vel2[index] = newVel;
}

/**
@@ -289,6 +356,16 @@ __global__ void kernComputeIndices(int N, int gridResolution,
  // - Label each boid with the index of its grid cell.
  // - Set up a parallel array of integer indices as pointers to the actual
  //   boid data in pos and vel1/vel2
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= N) {
+   return;
+ }
+ indices[index] = index;
+ int x = floor((pos[index].x - gridMin.x) * inverseCellWidth);
+ int y = floor((pos[index].y - gridMin.y) * inverseCellWidth);
+ int z = floor((pos[index].z - gridMin.z) * inverseCellWidth);
+ gridIndices[index] = gridIndex3Dto1D(x, y, z, gridResolution);
+
}

// LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +383,29 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
  // Identify the start point of each cell in the gridIndices array.
  // This is basically a parallel unrolling of a loop that goes
  // "this index doesn't match the one before it, must be a new cell!"
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= N) {
+   return;
+ }
+ int current = particleGridIndices[index];
+ if (index == 0) {
+   gridCellStartIndices[current] = 0;
+ }
+ else if (index < N - 1) {
+   int previous = particleGridIndices[index - 1];
+   if (previous != current) {
+     gridCellStartIndices[current] = index;
+     gridCellEndIndices[previous] = index - 1;
+   }
+ }
+ else {
+   int previous = particleGridIndices[index - 1];
+   if (previous != current) {
+     gridCellStartIndices[current] = index;
+     gridCellEndIndices[previous] = index - 1;
+   }
+   gridCellEndIndices[current] = index;
+ }
}

__global__ void kernUpdateVelNeighborSearchScattered(
@@ -322,6 +422,83 @@ __global__ void kernUpdateVelNeighborSearchScattered(
  // - Access each boid in the cell and compute velocity change from
  //   the boids rules, if this boid is within the neighborhood distance.
  // - Clamp the speed change before putting the new speed in vel2
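+ // Implementation note: gridCellWidth is set to twice the largest rule distance
+ // in Boids::initSimulation, so the +/- cellWidth/2 search box below spans at
+ // most 2 cells per axis (up to 8 cells); imax/imin clamp the range to the grid.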
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= N) {
+   return;
+ }
+ glm::vec3 thisPos = pos[index];
+ glm::vec3 perceivedCen(0.0f, 0.0f, 0.0f);
+ glm::vec3 c(0.0f, 0.0f, 0.0f);
+ glm::vec3 perceivedVel(0.0f, 0.0f, 0.0f);
+ glm::vec3 newVel(0.0f, 0.0f, 0.0f);
+
+ float halfCellWidth = cellWidth / 2;
+ int left = imax((int)floor((thisPos.x - gridMin.x - halfCellWidth) * inverseCellWidth), 0);
+ int right = imin((int)floor((thisPos.x - gridMin.x + halfCellWidth) * inverseCellWidth), gridResolution - 1);
+ int down = imax((int)floor((thisPos.y - gridMin.y - halfCellWidth) * inverseCellWidth), 0);
+ int top = imin((int)floor((thisPos.y - gridMin.y + halfCellWidth) * inverseCellWidth), gridResolution - 1);
+ int back = imax((int)floor((thisPos.z - gridMin.z - halfCellWidth) * inverseCellWidth), 0);
+ int front = imin((int)floor((thisPos.z - gridMin.z + halfCellWidth) * inverseCellWidth), gridResolution - 1);
+ int neighbor1 = 0;
+ int neighbor2 = 0;
+ for (int i = back; i <= front; i++) {
+   for (int j = down; j <= top; j++) {
+     for (int k = left; k <= right; k++) {
+       int idx = gridIndex3Dto1D(k, j, i, gridResolution);
+       int start = gridCellStartIndices[idx];
+       if (start < 0) {
+         continue;
+       }
+       int end = gridCellEndIndices[idx];
+       for (int m = start; m <= end; m++) {
+         int boid = particleArrayIndices[m];
+         if (boid == index) { continue; }
+         glm::vec3 thatPos = pos[boid];
+         float dist = glm::distance(thisPos, thatPos);
+         // rule 1
+         if (dist < rule1Distance) {
+           neighbor1++;
+           perceivedCen += thatPos;
+         }
+         // rule 2
+         if (dist < rule2Distance) {
+           c -= thatPos - thisPos;
+         }
+         // rule 3
+         if (dist < rule3Distance) {
+           neighbor2++;
+           perceivedVel += vel1[boid];
+         }
+       }
+     }
+   }
+ }
+
+ if (neighbor1 > 0) {
+   perceivedCen /= neighbor1;
+   newVel += (perceivedCen - thisPos) * rule1Scale;
+ }
+ if (neighbor2 > 0) {
+   perceivedVel /= neighbor2;
+   newVel += perceivedVel * rule3Scale;
+ }
+ newVel += c * rule2Scale + vel1[index];
+ float speed = glm::length(newVel);
+ if (speed > maxSpeed) {
+   newVel = newVel / speed * maxSpeed;
+ }
+ vel2[index] = newVel;
+}
+
+__global__ void kernReorder(int N,
+ int *particleArrayIndices, glm::vec3 *pos,
+ glm::vec3 *vel, glm::vec3 *pos_coherent, glm::vec3 *vel_coherent) {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= N) {
+   return;
+ }
+ pos_coherent[index] = pos[particleArrayIndices[index]];
+ vel_coherent[index] = vel[particleArrayIndices[index]];
}

__global__ void kernUpdateVelNeighborSearchCoherent(
@@ -341,6 +518,74 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
  // - Access each boid in the cell and compute velocity change from
  //   the boids rules, if this boid is within the neighborhood distance.
  // - Clamp the speed change before putting the new speed in vel2
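+ // Implementation note: pos and vel1 here are the reordered (coherent) buffers
+ // filled by kernReorder, so the boids of a cell are contiguous and are indexed
+ // directly with m; no particleArrayIndices lookup is needed in this kernel.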
+
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= N) {
+   return;
+ }
+ glm::vec3 thisPos = pos[index];
+ glm::vec3 perceivedCen(0.0f, 0.0f, 0.0f);
+ glm::vec3 c(0.0f, 0.0f, 0.0f);
+ glm::vec3 perceivedVel(0.0f, 0.0f, 0.0f);
+ glm::vec3 newVel(0.0f, 0.0f, 0.0f);
+
+ float halfCellWidth = cellWidth / 2;
+ int left = imax((int)floor((thisPos.x - gridMin.x - halfCellWidth) * inverseCellWidth), 0);
+ int right = imin((int)floor((thisPos.x - gridMin.x + halfCellWidth) * inverseCellWidth), gridResolution - 1);
+ int down = imax((int)floor((thisPos.y - gridMin.y - halfCellWidth) * inverseCellWidth), 0);
+ int top = imin((int)floor((thisPos.y - gridMin.y + halfCellWidth) * inverseCellWidth), gridResolution - 1);
+ int back = imax((int)floor((thisPos.z - gridMin.z - halfCellWidth) * inverseCellWidth), 0);
+ int front = imin((int)floor((thisPos.z - gridMin.z + halfCellWidth) * inverseCellWidth), gridResolution - 1);
+ int neighbor1 = 0;
+ int neighbor2 = 0;
+ for (int i = back; i <= front; i++) {
+   for (int j = down; j <= top; j++) {
+     for (int k = left; k <= right; k++) {
+       int idx = gridIndex3Dto1D(k, j, i, gridResolution);
+       int start = gridCellStartIndices[idx];
+       if (start < 0) {
+         continue;
+       }
+       int end = gridCellEndIndices[idx];
+       for (int m = start; m <= end; m++) {
+         if (m == index) {
+           continue;
+         }
+         glm::vec3 thatPos = pos[m];
+         float dist = glm::distance(thisPos, thatPos);
+         // rule 1
+         if (dist < rule1Distance) {
+           neighbor1++;
+           perceivedCen += thatPos;
+         }
+         // rule 2
+         if (dist < rule2Distance) {
+           c -= thatPos - thisPos;
+         }
+         // rule 3
+         if (dist < rule3Distance) {
+           neighbor2++;
+           perceivedVel += vel1[m];
+         }
+       }
+     }
+   }
+ }
+
+ if (neighbor1 > 0) {
+   perceivedCen /= neighbor1;
+   newVel += (perceivedCen - thisPos) * rule1Scale;
+ }
+ if (neighbor2 > 0) {
+   perceivedVel /= neighbor2;
+   newVel += perceivedVel * rule3Scale;
+ }
+ newVel += c * rule2Scale + vel1[index];
+ float speed = glm::length(newVel);
+ if (speed > maxSpeed) {
+   newVel = newVel / speed * maxSpeed;
+ }
+ vel2[index] = newVel;
}

/**
@@ -349,6 +594,14 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
void Boids::stepSimulationNaive(float dt) {
  // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
  // TODO-1.2 ping-pong the velocity buffers
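+ // kernUpdateVelocityBruteForce reads vel1 and writes vel2; kernUpdatePos then
+ // advances the positions with vel2, and the final copy moves vel2 back into
+ // vel1 for the next step (a pointer swap would avoid this extra copy).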
+
+ dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+ kernUpdateVelocityBruteForce <<<fullBlocksPerGrid, blockSize>>> (numObjects, dev_pos, dev_vel1, dev_vel2);
+ checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+ kernUpdatePos <<<fullBlocksPerGrid, blockSize>>> (numObjects, dt, dev_pos, dev_vel2);
+ checkCUDAErrorWithLine("kernUpdatePos failed!");
+ cudaMemcpy(dev_vel1, dev_vel2, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
+
}

void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +617,35 @@ void Boids::stepSimulationScatteredGrid(float dt) {
  // - Perform velocity updates using neighbor search
  // - Update positions
  // - Ping-pong buffers as needed
+ dim3 fullBlocksPerGrid1((gridCellCount + blockSize - 1) / blockSize);
+ kernResetIntBuffer <<<fullBlocksPerGrid1, blockSize>>> (gridCellCount, dev_gridCellStartIndices, -1);
+ checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+ kernResetIntBuffer <<<fullBlocksPerGrid1, blockSize>>> (gridCellCount, dev_gridCellEndIndices, -1);
+ checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+
+ dim3 fullBlocksPerGrid2((numObjects + blockSize - 1) / blockSize);
+ kernComputeIndices <<<fullBlocksPerGrid2, blockSize>>> (numObjects, gridSideCount, gridMinimum,
+   gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+ checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+ thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+ kernIdentifyCellStartEnd <<<fullBlocksPerGrid2, blockSize>>> (numObjects, dev_particleGridIndices,
+   dev_gridCellStartIndices, dev_gridCellEndIndices);
+ checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
+ kernUpdateVelNeighborSearchScattered <<<fullBlocksPerGrid2, blockSize>>> (
+   numObjects, gridSideCount, gridMinimum,
+   gridInverseCellWidth, gridCellWidth,
+   dev_gridCellStartIndices, dev_gridCellEndIndices,
+   dev_particleArrayIndices,
+   dev_pos, dev_vel1, dev_vel2);
+ checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
+
+ kernUpdatePos <<<fullBlocksPerGrid2, blockSize>>> (numObjects, dt, dev_pos, dev_vel2);
+ checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+ cudaMemcpy(dev_vel1, dev_vel2, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
}

void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +664,39 @@ void Boids::stepSimulationCoherentGrid(float dt) {
  // - Perform velocity updates using neighbor search
  // - Update positions
  // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
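+ // Extra steps vs. the scattered grid: after sorting, kernReorder gathers pos/vel1
+ // into dev_pos_coherent/dev_vel_coherent, the neighbor-search kernel indexes those
+ // buffers directly, and kernUpdatePos writes into dev_pos_coherent, so both the
+ // velocities and the positions are copied back at the end of the step.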
+ dim3 fullBlocksPerGrid1((gridCellCount + blockSize - 1) / blockSize);
+ kernResetIntBuffer <<<fullBlocksPerGrid1, blockSize>>> (gridCellCount, dev_gridCellStartIndices, -1);
+ checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+ kernResetIntBuffer <<<fullBlocksPerGrid1, blockSize>>> (gridCellCount, dev_gridCellEndIndices, -1);
+ checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+
+ dim3 fullBlocksPerGrid2((numObjects + blockSize - 1) / blockSize);
+ kernComputeIndices <<<fullBlocksPerGrid2, blockSize>>> (numObjects, gridSideCount, gridMinimum,
+   gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+ checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+ thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+ kernIdentifyCellStartEnd <<<fullBlocksPerGrid2, blockSize>>> (numObjects, dev_particleGridIndices,
+   dev_gridCellStartIndices, dev_gridCellEndIndices);
+ checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+ kernReorder <<<fullBlocksPerGrid2, blockSize>>> (numObjects, dev_particleArrayIndices, dev_pos,
+   dev_vel1, dev_pos_coherent, dev_vel_coherent);
+ checkCUDAErrorWithLine("kernReorder failed!");
+
+ kernUpdateVelNeighborSearchCoherent <<<fullBlocksPerGrid2, blockSize>>> (
+   numObjects, gridSideCount, gridMinimum,
+   gridInverseCellWidth, gridCellWidth,
+   dev_gridCellStartIndices, dev_gridCellEndIndices,
+   dev_pos_coherent, dev_vel_coherent, dev_vel2);
+ checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!");
+
+ kernUpdatePos <<<fullBlocksPerGrid2, blockSize>>> (numObjects, dt, dev_pos_coherent, dev_vel2);
+ checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+ cudaMemcpy(dev_vel1, dev_vel2, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
+ cudaMemcpy(dev_pos, dev_pos_coherent, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+
}

void Boids::endSimulation() {
@@ -390,6 +705,13 @@ void Boids::endSimulation() {
  cudaFree(dev_pos);

  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+ cudaFree(dev_particleGridIndices);
+ cudaFree(dev_particleArrayIndices);
+ cudaFree(dev_gridCellStartIndices);
+ cudaFree(dev_gridCellEndIndices);
+
+ cudaFree(dev_pos_coherent);
+ cudaFree(dev_vel_coherent);
}

void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..ddd0e3b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,8 +14,8 @@
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1

 // LOOK-1.2 - change this to adjust particle count in the simulation
 const int N_FOR_VIS = 5000;