diff --git a/README.md b/README.md
index d63a6a1..d12a1dd 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,31 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Nicholas Magarino
+  * [https://www.linkedin.com/in/nicholasmagarino/](), [https://www.nickmagarino.com/]()
+* Tested on: Windows 10 Home Version 1803, i5-8300H @ 2.30GHz 8GB, GeForce GTX 1050 8GB (MSI GL63 8RC Laptop)
 
-### (TODO: Your README)
+# FLOCKING ZOOMIES
+
+### READ-ME
+
+![](/images/boidsGif.gif)
+
+Boids are particles that follow three rules: cohesion, separation, and alignment, and check their neighboring boids to compute their velocity on the next frame of the simulation.
+
+**Performance Analysis**
+
+![](/images/visual.png)
+
+![](/images/novisual.png)
+
+For each implementation, increasing the number of boids generally decreases the framerate as more neighbor boid data accesses are done per boid.
+
+![](/images/blocksize.png)
+
+Increasing the CUDA kernels' block size generally improved performance, as doing so allows more threads to run in parallel on the GPU.
+
+In all, the coherent grid had greater performance than any other implementation, as expected, as this method allowed the most data to be continguous in memory, thus decreasing the time needed to access the next piece of data.
+
+Decreasing cell width reduces performance, as it adds more overall cells to check.  In general, more cells results in more memory access that may not be contiguous, reducing performance.
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/images/blocksize.png b/images/blocksize.png
new file mode 100644
index 0000000..1f18288
Binary files /dev/null and b/images/blocksize.png differ
diff --git a/images/boidsGif.gif b/images/boidsGif.gif
new file mode 100644
index 0000000..26522d1
Binary files /dev/null and b/images/boidsGif.gif differ
diff --git a/images/novisual.png b/images/novisual.png
new file mode 100644
index 0000000..04165d9
Binary files /dev/null and b/images/novisual.png differ
diff --git a/images/visual.png b/images/visual.png
new file mode 100644
index 0000000..c40f6c0
Binary files /dev/null and b/images/visual.png differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..cf27775 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -86,6 +86,11 @@ int *dev_gridCellEndIndices;   // to this cell?
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
 
+glm::vec3* shuffledPos;
+glm::vec3* shuffledVec1;
+glm::vec3* shuffledVec2;
+
+
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
 int gridCellCount;
@@ -169,6 +174,32 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+
+  cudaMalloc((void**)&shuffledPos, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc shuffledPos failed!");
+
+  cudaMalloc((void**)&shuffledVec1, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc shuffledVel1 failed!");
+
+  cudaMalloc((void**)&shuffledVec2, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc shuffledVel2 failed!");
+
+
+
+
   cudaDeviceSynchronize();
 }
 
@@ -230,10 +261,59 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * in the `pos` and `vel` arrays.
 */
 __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+ 
+	glm::vec3 rule1(0.f);
+	glm::vec3 rule2(0.f);
+	glm::vec3 rule3(0.f);
+	glm::vec3 boid = pos[iSelf];
+  
+	glm::vec3 perceivedCenter(0.f);
+	glm::vec3 c(0.f);
+	glm::vec3 perceivedVelocity(0.f);
+
+	// How many boids fall in range for rules 1 and 3
+	float count1 = 0.0f;
+	float count3 = 0.0f;
+
+	for (int i = 0; i < N; i++) {
+		glm::vec3 b = pos[i];
+
+		if ((glm::distance(boid, b) < rule1Distance)) {
+			count1++;
+			perceivedCenter += b;
+		}
+
+		if (i != iSelf && (glm::distance(boid, b) < rule2Distance)) {
+			c -= (b - boid);
+		}
+
+		if (i != iSelf && (glm::distance(boid, b) < rule3Distance)) {
+			count3++;
+			perceivedVelocity += vel[i];
+		}
+	}
+
+	// Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+
+	if (count1 != 0) {
+		perceivedCenter /= (count1);
+	}
+
+	rule1 = (perceivedCenter - boid) * rule1Scale;
+
+	// Rule 2: boids try to stay a distance d away from each other
+
+	rule2 = c * rule2Scale;
+
+	// Rule 3: boids try to match the speed of surrounding boids
+
+	if (count3 != 0) {
+		perceivedVelocity /= (count3);
+	}
+
+	rule3 = perceivedVelocity * rule3Scale;
+	
+	return (rule1 + rule2 + rule3);
 }
 
 /**
@@ -245,6 +325,20 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   // Compute a new velocity based on pos and vel1
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+	if (index < N) {
+
+		glm::vec3 newVel = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+
+		if (glm::abs(glm::length(newVel)) > maxSpeed) {
+			vel2[index] = glm::normalize(newVel) * maxSpeed;
+		}
+		else {
+			vel2[index] = newVel;
+		}
+	}
 }
 
 /**
@@ -289,6 +383,20 @@ __global__ void kernComputeIndices(int N, int gridResolution,
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+
+	// Sort outside of this kernel function
+
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index < N) {
+		glm::vec3 boidPos = pos[index];
+
+		// retain this index to dev_pos, dev_vel's even once sorted
+		indices[index] = index;
+
+		// find the cell this boid position occupies
+		glm::vec3 cellID = glm::floor(((boidPos - gridMin) * inverseCellWidth));
+		gridIndices[index] = gridIndex3Dto1D(cellID.x, cellID.y, cellID.z, gridResolution);
+	}
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +414,28 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+
+	// assumes grid indices are sorted
+
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index < N) {
+
+		int currBoid = particleGridIndices[index];
+		int prevBoid = particleGridIndices[index - 1];
+		int nextBoid = particleGridIndices[index + 1];
+
+		// look at each element in gridIndices, see if it's a start or end,
+		// and assign to the currBoid's value as index of start and end
+
+		if (currBoid != prevBoid) {
+			gridCellStartIndices[currBoid] = index;
+		}
+
+		if (currBoid != nextBoid) {
+			gridCellEndIndices[currBoid] = index;
+		}
+
+	}
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -322,6 +452,92 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index < N) {
+		glm::vec3 boid = pos[index];
+
+		glm::vec3 cellID3D = glm::floor(((boid - gridMin) * inverseCellWidth));
+		int cellID = gridIndex3Dto1D(cellID3D.x, cellID3D.y, cellID3D.z, gridResolution);
+
+		// data for rules
+		glm::vec3 rule1(0.f);
+		glm::vec3 rule2(0.f);
+		glm::vec3 rule3(0.f);
+
+		glm::vec3 perceivedCenter(0.f);
+		glm::vec3 c(0.f);
+		glm::vec3 perceivedVelocity(0.f);
+
+		// How many boids fall in range for rules 1 and 3
+		float count1 = 0.0f;
+		float count3 = 0.0f;
+
+		// Iterate through each neighbor cell (and current cell)
+		for (int i = -1; i < 2; i++) {
+			for (int j = -1; j < 2; j++) {
+				for (int k = -1; k < 2; k++) {
+
+					glm::vec3 neighborID3D = cellID3D + glm::vec3(i, j, k);
+					int neighborID = gridIndex3Dto1D(neighborID3D.x, neighborID3D.y, neighborID3D.z, gridResolution);
+
+
+					// What if these values are not filled out?
+					int startID = gridCellStartIndices[neighborID];
+					int endID = gridCellEndIndices[neighborID];
+
+					for (int l = startID; l < endID + 1; l++) {
+						int boidDataIdx = particleArrayIndices[l];
+						glm::vec3 b = pos[boidDataIdx];
+
+						// do rules for this boid
+						if ((glm::distance(boid, b) < rule1Distance)) {
+							count1++;
+							perceivedCenter += b;
+						}
+
+						if (boidDataIdx != index && (glm::distance(boid, b) < rule2Distance)) {
+							c -= (b - boid);
+						}
+
+						if (boidDataIdx != index && (glm::distance(boid, b) < rule3Distance)) {
+							count3++;
+							perceivedVelocity += vel1[boidDataIdx];
+						}
+
+					}
+
+				}
+			}
+		}
+
+		// Compute rules
+
+		if (count1 != 0) {
+			perceivedCenter /= (count1);
+		}
+
+		rule1 = (perceivedCenter - boid) * rule1Scale;
+
+		rule2 = c * rule2Scale;
+
+		if (count3 != 0) {
+			perceivedVelocity /= (count3);
+		}
+
+		rule3 = perceivedVelocity * rule3Scale;
+
+
+		glm::vec3 newVel = vel1[index] + (rule1 + rule2 + rule3);
+
+		if (glm::abs(glm::length(newVel)) > maxSpeed) {
+			vel2[index] = glm::normalize(newVel) * maxSpeed;
+		}
+		else {
+			vel2[index] = newVel;
+		}
+	}
+
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -341,6 +557,94 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+
+
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index < N) {
+		glm::vec3 boid = pos[index];
+
+		glm::vec3 cellID3D = glm::floor(((boid - gridMin) * inverseCellWidth));
+		int cellID = gridIndex3Dto1D(cellID3D.x, cellID3D.y, cellID3D.z, gridResolution);
+
+		// data for rules
+		glm::vec3 rule1(0.f);
+		glm::vec3 rule2(0.f);
+		glm::vec3 rule3(0.f);
+
+		glm::vec3 perceivedCenter(0.f);
+		glm::vec3 c(0.f);
+		glm::vec3 perceivedVelocity(0.f);
+
+		// How many boids fall in range for rules 1 and 3
+		float count1 = 0.0f;
+		float count3 = 0.0f;
+
+		// Iterate through each neighbor cell (and current cell)
+		for (int i = -1; i < 2; i++) {
+			for (int j = -1; j < 2; j++) {
+				for (int k = -1; k < 2; k++) {
+
+					glm::vec3 neighborID3D = cellID3D + glm::vec3(i, j, k);
+					int neighborID = gridIndex3Dto1D(neighborID3D.x, neighborID3D.y, neighborID3D.z, gridResolution);
+
+
+					// What if these values are not filled out?
+					int startID = gridCellStartIndices[neighborID];
+					int endID = gridCellEndIndices[neighborID];
+
+					for (int l = startID; l < endID + 1; l++) {
+						// reduce memory indirection
+						int boidDataIdx = l;
+						glm::vec3 b = pos[l];
+
+						// do rules for this boid
+						if ((glm::distance(boid, b) < rule1Distance)) {
+							count1++;
+							perceivedCenter += b;
+						}
+
+						if (boidDataIdx != index && (glm::distance(boid, b) < rule2Distance)) {
+							c -= (b - boid);
+						}
+
+						if (boidDataIdx != index && (glm::distance(boid, b) < rule3Distance)) {
+							count3++;
+							perceivedVelocity += vel1[boidDataIdx];
+						}
+
+					}
+
+				}
+			}
+		}
+
+		// Compute rules
+
+		if (count1 != 0) {
+			perceivedCenter /= (count1);
+		}
+
+		rule1 = (perceivedCenter - boid) * rule1Scale;
+
+		rule2 = c * rule2Scale;
+
+		if (count3 != 0) {
+			perceivedVelocity /= (count3);
+		}
+
+		rule3 = perceivedVelocity * rule3Scale;
+
+
+		glm::vec3 newVel = vel1[index] + (rule1 + rule2 + rule3);
+
+		if (glm::abs(glm::length(newVel)) > maxSpeed) {
+			vel2[index] = glm::normalize(newVel) * maxSpeed;
+		}
+		else {
+			vel2[index] = newVel;
+		}
+	}
+
 }
 
 /**
@@ -349,6 +653,16 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+
+	dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+	kernUpdatePos<<<fullBlocksPerGrid, blockSize>>> (numObjects, dt, dev_pos, dev_vel2);
+	kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>> (numObjects, dev_pos, dev_vel1, dev_vel2);
+	
+	glm::vec3* temp;
+	temp = dev_vel1;
+	dev_vel1 = dev_vel2;
+	
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +678,45 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+
+	dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+	dim3 fullBlocksPerGridCellIdx((gridCellCount + blockSize - 1) / blockSize);
+
+	kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, 
+		dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+	
+	// sort
+	dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices);
+	dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices);
+	thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+	kernResetIntBuffer<<<fullBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellStartIndices, INT16_MAX);
+	kernResetIntBuffer<<<fullBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellEndIndices, INT16_MAX);
+
+	kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+	kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+		dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+
+	kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel2);
+
+	glm::vec3* temp;
+	temp = dev_vel1;
+	dev_vel1 = dev_vel2;
+	dev_vel2 = temp;
+}
+
+__global__ void kernShufflePosVel(int N, int* gridCellIndices, glm::vec3* pos, glm::vec3* vel1, glm::vec3* vel2,
+	glm::vec3* shuffPos, glm::vec3* shuffVel1, glm::vec3* shuffVel2) {
+
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index < N) {
+
+		int boid = gridCellIndices[index];
+		shuffPos[index] = pos[boid];
+		shuffVel1[index] = vel1[boid];
+	}
+
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +735,39 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+
+	dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+	dim3 fullBlocksPerGridCellIdx((gridCellCount + blockSize - 1) / blockSize);
+
+	kernComputeIndices << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+		dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+	// sort
+	dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices);
+	dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices);
+	thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+	kernResetIntBuffer << <fullBlocksPerGrid, blockSize >> > (gridCellCount, dev_gridCellStartIndices, INT16_MAX);
+	kernResetIntBuffer << <fullBlocksPerGrid, blockSize >> > (gridCellCount, dev_gridCellEndIndices, INT16_MAX);
+
+	kernIdentifyCellStartEnd << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+	// rearrange pos, vel data to be in line with grid/array indices
+	kernShufflePosVel << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2,
+		shuffledPos, shuffledVec1, shuffledVec2);
+
+	kernUpdateVelNeighborSearchCoherent << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+		dev_gridCellStartIndices, dev_gridCellEndIndices, shuffledPos, shuffledVec1, dev_vel2);
+
+	glm::vec3* temp;
+	temp = dev_pos;
+	dev_pos = shuffledPos;
+	shuffledPos = temp;
+
+	kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel2);
+
+	dev_vel1 = dev_vel2;
+
 }
 
 void Boids::endSimulation() {
@@ -390,6 +776,14 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_gridCellEndIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+
+  cudaFree(shuffledPos);
+  cudaFree(shuffledVec1);
+  cudaFree(shuffledVec2);
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..fafcc7b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,11 +14,11 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 50000;
 const float DT = 0.2f;
 
 /**