diff --git a/README.md b/README.md
index d63a6a1..4cd9707 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,31 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+ ![FLOCKUP](/images/flockup.PNG)
+ ![Flocking Boiiiiiids](/images/flocking.gif)
 
-### (TODO: Your README)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+* Davis Polito
+  *  [https://github.com/davispolito/Project0-Getting-Started/blob/master]()
+* Tested on: Windows 10, i7-8750H @ 2.20GHz 16GB, GTX 1060
+
+ **For each implementation, how does changing the number of boids affect performance? Why do you think this is?**
+
+ ![Graph of Num Boid vs. fps (visualize)](/images/boidgraph.PNG)
+ ![Graph of Num Boid vs. fps (no-visual)](/images/novisual.PNG)
+
+For the Naive implementation low boid counts outperform the more efficient implementations due to reduced overhead for cell checking and array management. Scattered implementation performs significantly better as the boids increase. Coherent implementation performs even better than scattered due to the contiguous memory accesses.o
+
+**For each implementation, how does changing the block count and block size affect performance? Why do you think this is?**
+
+ ![Graphing Block Size vs. fps](/images/blockgraph.PNG)
+Block count has little affect on the algorithms. Due to warp size of 32 that is used I kept all of the blocks on multiples of 32 when i did testing such that we didn't have any unused threads.Performance improvements were not of note. 
+
+**For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?**
+There were massive improvements at higher boid counts as expected. I expected this becuase the contiguous accesses are much better and quicker due to "cache coherency".
+
+
+**Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!**
+
+The larger the cell width the higher the chance that checked boids are not within the neighborhood distance of the current boid. This causes slowdown as we get to more cells.
+
diff --git a/images/blockgraph.PNG b/images/blockgraph.PNG
new file mode 100644
index 0000000..b584247
Binary files /dev/null and b/images/blockgraph.PNG differ
diff --git a/images/boidgraph.PNG b/images/boidgraph.PNG
new file mode 100644
index 0000000..8c94ce2
Binary files /dev/null and b/images/boidgraph.PNG differ
diff --git a/images/flocking.gif b/images/flocking.gif
new file mode 100644
index 0000000..2f67130
Binary files /dev/null and b/images/flocking.gif differ
diff --git a/images/flockup.PNG b/images/flockup.PNG
new file mode 100644
index 0000000..e65663a
Binary files /dev/null and b/images/flockup.PNG differ
diff --git a/images/novisual.PNG b/images/novisual.PNG
new file mode 100644
index 0000000..981b339
Binary files /dev/null and b/images/novisual.PNG differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..1c02f3b 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -37,8 +37,7 @@ void checkCUDAError(const char *msg, int line = -1) {
 *****************/
 
 /*! Block size used for CUDA kernel launch. */
-#define blockSize 128
-
+#define blockSize 256
 // LOOK-1.2 Parameters for the boids algorithm.
 // These worked well in our reference implementation.
 #define rule1Distance 5.0f
@@ -60,6 +59,7 @@ void checkCUDAError(const char *msg, int line = -1) {
 
 int numObjects;
 dim3 threadsPerBlock(blockSize);
+dim3 fullBlocksPerGrid((numObjects + blockSize - 1 / blockSize));
 
 // LOOK-1.2 - These buffers are here to hold all your boid information.
 // These get allocated for you in Boids::initSimulation.
@@ -85,6 +85,8 @@ int *dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_pos_copy;
+glm::vec3 *dev_vel_copy;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -138,7 +140,7 @@ __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, flo
 */
 void Boids::initSimulation(int N) {
   numObjects = N;
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+  //dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
 
   // LOOK-1.2 - This is basic CUDA memory management and error checking.
   // Don't forget to cudaFree in  Boids::endSimulation.
@@ -169,6 +171,23 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  //TODO thrust support
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices);
+
+  cudaMalloc((void**)&dev_particleGridIndices,  N * sizeof(int));
+  dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices);
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, N * sizeof(int));
+
+  cudaMalloc((void**)&dev_gridCellEndIndices,   N * sizeof(int));
+
+  cudaMalloc((void**)&dev_pos_copy, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("posCopy failed!");
+
+  cudaMalloc((void**)&dev_vel_copy, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("celcopy failed!");
+
   cudaDeviceSynchronize();
 }
 
@@ -200,6 +219,7 @@ __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float
     vbo[4 * index + 0] = vel[index].x + 0.3f;
     vbo[4 * index + 1] = vel[index].y + 0.3f;
     vbo[4 * index + 2] = vel[index].z + 0.3f;
+	
     vbo[4 * index + 3] = 1.0f;
   }
 }
@@ -208,7 +228,7 @@ __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float
 * Wrapper for call to the kernCopyboidsToVBO CUDA kernel.
 */
 void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) {
-  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  //dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
 
   kernCopyPositionsToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_pos, vbodptr_positions, scene_scale);
   kernCopyVelocitiesToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
@@ -233,18 +253,80 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+	glm::vec3 curr_boid_pos = pos[iSelf];
+	glm::vec3 p_center_r1(0);
+
+	glm::vec3 vel_r1(0);
+	glm::vec3 vel_r2(0);
+	glm::vec3 vel_r3(0);
+
+	int num_neighbors1 = 0;
+	int num_neighbors3 = 0;
+
+	
+	for (int i = 0; i < N; i++) {
+		if (i == iSelf) {
+			continue;
+		}
+		glm::vec3 neighbor_boid_pos = pos[i];
+		glm::vec3 neighbor_boid_vel = vel[i];
+		
+		float dist = glm::distance(curr_boid_pos, neighbor_boid_pos);
+
+		//1
+		if (dist < rule1Distance) {
+			p_center_r1 += neighbor_boid_pos;
+			num_neighbors1++;
+		}
+		//2
+		if (dist < rule2Distance) {
+			vel_r2 += (curr_boid_pos - neighbor_boid_pos);
+		}
+		//3
+		if (dist < rule3Distance) {
+			vel_r3 += neighbor_boid_vel;
+			num_neighbors3++;
+		}
+	}
+	//1
+	if (num_neighbors1) {
+		p_center_r1 /= num_neighbors1;
+		vel_r1 = (p_center_r1 - curr_boid_pos) * rule1Scale;
+	}
+	//2
+	vel_r2 *= rule2Scale;
+    //3
+	if (num_neighbors3) {
+		vel_r3 /= num_neighbors3;
+		vel_r3 *= rule3Scale;
+	}
+	return vel_r1 + vel_r2 + vel_r3;
 }
 
 /**
 * TODO-1.2 implement basic flocking
 * For each of the `N` bodies, update its position based on its current velocity.
 */
-__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
-  glm::vec3 *vel1, glm::vec3 *vel2) {
+__global__ void kernUpdateVelocityBruteForce(int N, 
+	glm::vec3 *pos,
+    glm::vec3 *vel1, 
+	glm::vec3 *vel2) {
+
+	int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (idx >= N) {
+		return;
+	}
   // Compute a new velocity based on pos and vel1
+	glm::vec3 new_vel = vel1[idx] + computeVelocityChange(N, idx, pos, vel1);
   // Clamp the speed
+	if (glm::length(new_vel) > maxSpeed) {
+		vel2[idx] = glm::normalize(new_vel) * maxSpeed;
+	}
+	else {
+		vel2[idx] = new_vel;
+	}
   // Record the new velocity into vel2. Question: why NOT vel1?
+
 }
 
 /**
@@ -282,6 +364,8 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
   return x + y * gridResolution + z * gridResolution * gridResolution;
 }
 
+//indices is buffer containing pointer to data (array indices)
+//gridindices is buffer containing grid indices
 __global__ void kernComputeIndices(int N, int gridResolution,
   glm::vec3 gridMin, float inverseCellWidth,
   glm::vec3 *pos, int *indices, int *gridIndices) {
@@ -289,6 +373,15 @@ __global__ void kernComputeIndices(int N, int gridResolution,
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N) {
+		return;
+	}
+
+	glm::ivec3 gridIndex3D = glm::floor((pos[index] - gridMin) * inverseCellWidth);
+	gridIndices[index] = gridIndex3Dto1D(gridIndex3D.x, gridIndex3D.y, gridIndex3D.z, gridResolution);
+	indices[index] = index;
+
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +399,103 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N) {
+		return;
+	}
+	if (index ) {
+		int curr_cell_index = particleGridIndices[index];
+		int prev_cell_index = particleGridIndices[index - 1];
+
+		if (curr_cell_index != prev_cell_index) {
+			gridCellEndIndices[prev_cell_index] = index;
+			gridCellStartIndices[curr_cell_index] = index;
+
+		}
+		if (index == N - 1) {
+			gridCellEndIndices[curr_cell_index] = N;
+		}
+		
+
+	}
+	else if (index == 0 ){
+		gridCellStartIndices[particleGridIndices[0]] = 0;
+	}
+
+	
+}
+
+__device__ glm::vec3 computeVelocityCoherent(int gridResolution, int index, 
+	int *gridCellStartIndices, 
+	int *gridCellEndIndices, 
+	glm::vec3 *min_cell_index, 
+	glm::vec3 *max_cell_index, 
+	glm::vec3 *pos,
+	glm::vec3 *vel1,
+	glm::vec3 *boid_pos) {
+
+	glm::vec3 curr_boid_pos = *boid_pos;
+	glm::vec3 p_center_r1(0);
+
+	glm::vec3 vel_r1(0);
+	glm::vec3 vel_r2(0);
+	glm::vec3 vel_r3(0);
+
+	int num_neighbors1 = 0;
+	int num_neighbors3 = 0;
+
+	for (int x = min_cell_index->x; x <= max_cell_index->x; x++) {
+		for (int y = min_cell_index->y; y <= max_cell_index->y; y++) {
+			for (int z = min_cell_index->z; z <= max_cell_index->z; z++) {
+				int neighbor_cell_index = gridIndex3Dto1D(x, y, z, gridResolution);
+				int start_index = gridCellStartIndices[neighbor_cell_index];
+				int end_index = gridCellEndIndices[neighbor_cell_index];
+				
+				//ignore cells with no boids as set by kernResetIntBuffer in stepSimulation
+				if (start_index == -1 || end_index == -1) {	continue;}
+				for (int i = start_index; i < end_index; i++) {
+					if (i != index) {
+						glm::vec3 neighbor_boid_pos = pos[i];
+						glm::vec3 neighbor_boid_vel = vel1[i];
+
+
+						float dist = glm::distance(curr_boid_pos, neighbor_boid_pos);
+
+						//1
+						if (dist < rule1Distance) {
+							p_center_r1 += neighbor_boid_pos;
+							num_neighbors1++;
+						}
+						//2
+						if (dist < rule2Distance) {
+							vel_r2 += (curr_boid_pos - neighbor_boid_pos);
+						}
+						//3
+						if (dist < rule3Distance) {
+							vel_r3 += neighbor_boid_vel;
+							num_neighbors3++;
+						}
+					}
+				}
+			}
+		}
+
+
+	}
+	//1
+	if (num_neighbors1) {
+		p_center_r1 /= num_neighbors1;
+		vel_r1 = (p_center_r1 - curr_boid_pos) * rule1Scale;
+	}
+	//2
+	vel_r2 *= rule2Scale;
+	//3
+	if (num_neighbors3) {
+		vel_r3 /= num_neighbors3;
+		vel_r3 *= rule3Scale;
+	}
+
+	return vel_r1 + vel_r2 + vel_r3;
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -322,25 +512,201 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N) {
+		return;
+	}
+
+	glm::vec3  boid_pos = pos[index];
+	glm::ivec3 boid_cell_index = glm::floor((boid_pos - gridMin) * inverseCellWidth);
+
+	//determine neighboring cells
+	glm::vec3 neighborhood_distance(rule1Distance);
+	glm::vec3 max_cell_index = ((boid_pos - gridMin) + neighborhood_distance) * inverseCellWidth;
+	glm::vec3 max = glm::clamp(max_cell_index, glm::vec3(0), glm::vec3(gridResolution));
+	glm::vec3 min_cell_index = ((boid_pos - gridMin) - neighborhood_distance) * inverseCellWidth;
+	glm::vec3 min = glm::clamp(min_cell_index, glm::vec3(0), glm::vec3(gridResolution));
+	
+	glm::vec3 curr_boid_pos = boid_pos;
+	glm::vec3 p_center_r1(0);
+
+	glm::vec3 vel_r1(0);
+	glm::vec3 vel_r2(0);
+	glm::vec3 vel_r3(0);
+
+	int num_neighbors1 = 0;
+	int num_neighbors3 = 0;
+
+	for (int x = min.x; x <= max.x; x++) {
+		for (int y = min.y; y <= max.y; y++) {
+			for (int z = min.z; z <= max.z; z++) {
+				int neighbor_cell_index = gridIndex3Dto1D(x, y, z, gridResolution);
+				int start_index = gridCellStartIndices[neighbor_cell_index];
+				int end_index = gridCellEndIndices[neighbor_cell_index];
+
+				//ignore cells with no boids as set by kernResetIntBuffer in stepSimulation
+				if (start_index == -1 || end_index == -1) { continue; }
+				for (int i = start_index; i < end_index; i++) {
+					int neighbor_boid_index = particleArrayIndices[i];
+					if (neighbor_boid_index != index) {
+						glm::vec3 neighbor_boid_pos = pos[neighbor_boid_index];
+						glm::vec3 neighbor_boid_vel = vel1[neighbor_boid_index];
+
+
+						float dist = glm::distance(curr_boid_pos, neighbor_boid_pos);
+
+						//1
+						if (dist < rule1Distance) {
+							p_center_r1 += neighbor_boid_pos;
+							num_neighbors1++;
+						}
+						//2
+						if (dist < rule2Distance) {
+							vel_r2 += (curr_boid_pos - neighbor_boid_pos);
+						}
+						//3
+						if (dist < rule3Distance) {
+							vel_r3 += neighbor_boid_vel;
+							num_neighbors3++;
+						}
+					}
+				}
+			}
+		}
+
+
+	}
+	//1
+	if (num_neighbors1) {
+		p_center_r1 /= num_neighbors1;
+		vel_r1 = (p_center_r1 - curr_boid_pos) * rule1Scale;
+	}
+	//2
+	vel_r2 *= rule2Scale;
+	//3
+	if (num_neighbors3) {
+		vel_r3 /= num_neighbors3;
+		vel_r3 *= rule3Scale;
+	}
+
+	//Compute Velocity
+	//glm::vec3 new_vel = vel1[index] + computeVelocityScattered(gridResolution, index, gridCellStartIndices, gridCellEndIndices, particleArrayIndices, &min, &max, pos, vel1, &boid_pos);
+	glm::vec3 new_vel = vel1[index] + vel_r1 + vel_r2 + vel_r3;
+	// Clamp the speed
+	if (glm::length(new_vel) > maxSpeed) {
+		vel2[index] = glm::normalize(new_vel) * maxSpeed;
+	}
+	else {
+		vel2[index] = new_vel;
+	}
+
+
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
-  // except with one less level of indirection.
-  // This should expect gridCellStartIndices and gridCellEndIndices to refer
-  // directly to pos and vel1.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  //   DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+	int N, int gridResolution, glm::vec3 gridMin,
+	float inverseCellWidth, float cellWidth,
+	int *gridCellStartIndices, int *gridCellEndIndices,
+	glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+	// TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
+	// except with one less level of indirection.
+	// This should expect gridCellStartIndices and gridCellEndIndices to refer
+	// directly to pos and vel1.
+	// - Identify the grid cell that this particle is in
+	// - Identify which cells may contain neighbors. This isn't always 8.
+	// - For each cell, read the start/end indices in the boid pointer array.
+	//   DIFFERENCE: For best results, consider what order the cells should be
+	//   checked in to maximize the memory benefits of reordering the boids data.
+	// - Access each boid in the cell and compute velocity change from
+	//   the boids rules, if this boid is within the neighborhood distance.
+	// - Clamp the speed change before putting the new speed in vel2
+
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N) {
+		return;
+	}
+
+	glm::vec3  boid_pos = pos[index];
+	glm::ivec3 boid_cell_index = glm::floor((boid_pos - gridMin) * inverseCellWidth);
+
+	//determine neighboring cells
+	glm::vec3 neighborhood_distance(rule1Distance);
+	glm::vec3 max_cell_index = ((boid_pos - gridMin) + neighborhood_distance) * inverseCellWidth;
+	glm::vec3 max = glm::clamp(max_cell_index, glm::vec3(0), glm::vec3(gridResolution));
+	glm::vec3 min_cell_index = ((boid_pos - gridMin) - neighborhood_distance) * inverseCellWidth;
+	glm::vec3 min = glm::clamp(min_cell_index, glm::vec3(0), glm::vec3(gridResolution));
+
+	glm::vec3 curr_boid_pos = boid_pos;
+	glm::vec3 p_center_r1(0);
+
+	glm::vec3 vel_r1(0);
+	glm::vec3 vel_r2(0);
+	glm::vec3 vel_r3(0);
+
+	int num_neighbors1 = 0;
+	int num_neighbors3 = 0;
+	
+	for (int z = min.z; z <= max.z; z++) {
+		for (int y = min.y; y <= max.y; y++) {
+			for (int x = min.x; x <= max.x; x++) {
+				int neighbor_cell_index = gridIndex3Dto1D(x, y, z, gridResolution);
+				int start_index = gridCellStartIndices[neighbor_cell_index];
+				int end_index = gridCellEndIndices[neighbor_cell_index];
+
+				//ignore cells with no boids as set by kernResetIntBuffer in stepSimulation
+				if (start_index == -1 || end_index == -1) { continue; }
+				for (int i = start_index; i < end_index; i++) {
+					if (i != index) {
+						glm::vec3 neighbor_boid_pos = pos[i];
+						glm::vec3 neighbor_boid_vel = vel1[i];
+
+
+						float dist = glm::distance(curr_boid_pos, neighbor_boid_pos);
+
+						//1
+						if (dist < rule1Distance) {
+							p_center_r1 += neighbor_boid_pos;
+							num_neighbors1++;
+						}
+						//2
+						if (dist < rule2Distance) {
+							vel_r2 += (curr_boid_pos - neighbor_boid_pos);
+						}
+						//3
+						if (dist < rule3Distance) {
+							vel_r3 += neighbor_boid_vel;
+							num_neighbors3++;
+						}
+					}
+				}
+			}
+		}
+
+
+	}
+	//1
+	if (num_neighbors1) {
+		p_center_r1 /= num_neighbors1;
+		vel_r1 = (p_center_r1 - curr_boid_pos) * rule1Scale;
+	}
+	//2
+	vel_r2 *= rule2Scale;
+	//3
+	if (num_neighbors3) {
+		vel_r3 /= num_neighbors3;
+		vel_r3 *= rule3Scale;
+	}
+	//Compute Velocity
+	//glm::vec3 new_vel = vel1[index] + computeVelocityCoherent(gridResolution, index, gridCellStartIndices, gridCellEndIndices, &min, &max, pos, vel1, &boid_pos);
+	glm::vec3 new_vel = vel1[index] + vel_r1 + vel_r2 + vel_r3;
+	// Clamp the speed
+	if (glm::length(new_vel) > maxSpeed) {
+		vel2[index] = glm::normalize(new_vel) * maxSpeed;
+	}
+	else {
+		vel2[index] = new_vel;
+	}
+
 }
 
 /**
@@ -348,7 +714,11 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 */
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+	//dim3 fullBlocksPerGrid((numObjects + blockSize - 1 / blockSize));
+	kernUpdateVelocityBruteForce <<<fullBlocksPerGrid, threadsPerBlock >> > (numObjects, dev_pos, dev_vel1, dev_vel2);
+	kernUpdatePos<<<fullBlocksPerGrid, threadsPerBlock >>>(numObjects, dt, dev_pos, dev_vel1);
   // TODO-1.2 ping-pong the velocity buffers
+	cudaMemcpy(dev_vel1, dev_vel2, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +734,35 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+	//Setup grid and array buffers
+	kernComputeIndices <<<fullBlocksPerGrid, threadsPerBlock>>> (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+	
+	thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+	//Ensure that we only traverse cells with  boids
+	dim3 fullBlocksPerCell((gridCellCount + blockSize - 1) / blockSize);
+	kernResetIntBuffer <<<fullBlocksPerCell, threadsPerBlock>>> (gridCellCount, dev_gridCellStartIndices, -1);
+	kernResetIntBuffer <<<fullBlocksPerCell, threadsPerBlock>>> (gridCellCount, dev_gridCellEndIndices, -1);
+
+	//find start and endpoints then compute vel
+	kernIdentifyCellStartEnd <<<fullBlocksPerGrid,threadsPerBlock>>> (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+	kernUpdateVelNeighborSearchScattered <<<fullBlocksPerGrid, threadsPerBlock>>> (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, 
+		dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+
+	kernUpdatePos << <fullBlocksPerGrid, threadsPerBlock >>> (numObjects, dt, dev_pos, dev_vel2);
+
+	cudaMemcpy(dev_vel1, dev_vel2, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+
+}
+
+__global__ void kernRemoveIndirection(int N, int *particleArrayIndices, glm::vec3 *pos, glm::vec3* vel, glm::vec3 *pos_copy, glm::vec3 *vel_copy) {
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N) {
+		return;
+	}
+
+	pos_copy[index] = pos[particleArrayIndices[index]];
+	vel_copy[index] = vel[particleArrayIndices[index]];
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +781,59 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+	kernComputeIndices << <fullBlocksPerGrid, threadsPerBlock >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+		dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+	checkCUDAErrorWithLine("kernComputeIndices  failed!");
+
+	thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+	kernRemoveIndirection << <fullBlocksPerGrid, threadsPerBlock >> > (numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_pos_copy, dev_vel_copy);
+	checkCUDAErrorWithLine("kernRemoveIndireciton failed!");
+
+	//Ensure that we only traverse cells with  boids
+	dim3 fullBlocksPerCell((gridCellCount + blockSize - 1) / blockSize);
+	kernResetIntBuffer << <fullBlocksPerCell, threadsPerBlock >> > (gridCellCount, dev_gridCellStartIndices, -1);
+	checkCUDAErrorWithLine("kernResetIntBuffer STartfailed!");
+	kernResetIntBuffer << <fullBlocksPerCell, threadsPerBlock >> > (gridCellCount, dev_gridCellEndIndices, -1);
+	checkCUDAErrorWithLine("kernResetIntBufferfailed!");
+
+	//find start and endpoints then compute vel
+	kernIdentifyCellStartEnd << <fullBlocksPerGrid, threadsPerBlock >> > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+	checkCUDAErrorWithLine("kernIdentifyCellStartEnd vel back failed!");
+	//remove Indirection
+
+	cudaDeviceSynchronize();
+
+	/*glm::vec3* temp1;
+	glm::vec3* temp2;
+	temp1 = dev_vel1;
+	temp2 = dev_pos;
+	dev_pos = dev_pos_copy;
+	dev_vel1 = dev_vel_copy;
+	dev_pos_copy = temp2;
+	dev_vel_copy = temp1;*/
+
+
+	cudaMemcpy(dev_vel1, dev_vel_copy, numObjects* sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
+	checkCUDAErrorWithLine("memcpy vel back failed!");
+	cudaMemcpy(dev_pos,  dev_pos_copy, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+	checkCUDAErrorWithLine("memcpy pos back failed!");
+
+	kernUpdateVelNeighborSearchCoherent << <fullBlocksPerGrid, threadsPerBlock >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, 
+		gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_pos, dev_vel1, dev_vel2);
+	checkCUDAErrorWithLine("kernUpdateVel failed!");
+
+	kernUpdatePos << <fullBlocksPerGrid, threadsPerBlock >> > (numObjects, dt, dev_pos, dev_vel2);
+	checkCUDAErrorWithLine("kernupdatepos failed!");
+	/*cudaDeviceSynchronize();
+	cudaMemcpy(dev_vel1, dev_vel_copy, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+	checkCUDAErrorWithLine("memcpy vel back failed!");
+	cudaMemcpy(dev_pos, dev_pos_copy, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+	checkCUDAErrorWithLine("memcpy pos back failed!");*/
+	cudaMemcpy(dev_vel1, dev_vel2, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+	checkCUDAErrorWithLine("ping pong failed!");
+	
+
+
 }
 
 void Boids::endSimulation() {
@@ -390,6 +842,12 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_gridCellEndIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_pos_copy);
+  cudaFree(dev_vel_copy);
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..fafcc7b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,11 +14,11 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 50000;
 const float DT = 0.2f;
 
 /**