diff --git a/README.md b/README.md
index 41b91f0..337e251 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,58 @@
 CUDA Rasterizer
 ===============
 
-[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md)
-
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Ishan Ranade
+* Tested on personal computer: Gigabyte Aero 14, Windows 10, i7-7700HQ, GTX 1060
+
+## Demo
+
+![](renders/demo.gif)
+
+## Introduction
+
+I implemented a GPU-based rasterization pipeline with a number of additional features. As a brief overview, the pipeline consists of several stages: vertex transformation of a set of user-defined vertices, primitive assembly to arrange those vertices into shapes, rasterization with a depth test that keeps only the closest fragment at each pixel, fragment shading to determine the final color of each pixel, and finally rendering to the screen. The extra features I chose to implement were supersampled antialiasing, UV texture mapping with bilinear texture filtering and perspective-correct texture coordinates, correct color interpolation between points on a primitive, and backface culling.
+
+## Features
+
+![](renders/truck.JPG)
+
+![](renders/texture1.JPG)
+
+- UV texture mapping with bilinear texture filtering and perspective-correct texture coordinates (see the sketch below)
+
+<br>
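+
+The bilinear lookup works roughly as in the sketch below: the interpolated UV is scaled to texel space, the four surrounding texels are fetched, and they are blended by the fractional offsets. This is an illustrative standalone helper (the hypothetical `fetchTexel`/`sampleBilinear` names are not from the code; the real logic is inlined in the rasterization kernel in `src/rasterize.cu`), assuming tightly packed 3-byte RGB texels of type `TextureData`.
+
+```cuda
+__device__ glm::vec3 fetchTexel(const TextureData *tex, int texWidth, int x, int y) {
+    // 3 bytes (R, G, B) per texel, rows tightly packed
+    int offset = (x + y * texWidth) * 3;
+    return glm::vec3(tex[offset], tex[offset + 1], tex[offset + 2]);
+}
+
+__device__ glm::vec3 sampleBilinear(const TextureData *tex, int texWidth, int texHeight, glm::vec2 uv) {
+    // Convert normalized UV to texel space and split into integer / fractional parts
+    float u = uv.x * texWidth;
+    float v = uv.y * texHeight;
+    int u0 = glm::floor(u);
+    int v0 = glm::floor(v);
+    float uFract = u - u0;
+    float vFract = v - v0;
+
+    // Blend the four neighboring texels horizontally, then vertically,
+    // and rescale from [0, 255] bytes to [0, 1] color
+    glm::vec3 top    = glm::mix(fetchTexel(tex, texWidth, u0,     v0),
+                                fetchTexel(tex, texWidth, u0 + 1, v0),     uFract);
+    glm::vec3 bottom = glm::mix(fetchTexel(tex, texWidth, u0,     v0 + 1),
+                                fetchTexel(tex, texWidth, u0 + 1, v0 + 1), uFract);
+    return glm::mix(top, bottom, vFract) / 255.f;
+}
+```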
+
+![](renders/cow.JPG)
+
+- Correct color interpolation between points on a primitive (see the sketch below)
+
+<br>
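+
+Color (and normal) interpolation uses the standard perspective-correct barycentric scheme: attributes are weighted by 1/z in eye space and rescaled by the perspective-correct depth. The sketch below mirrors the `perspectiveCorrectZ` and `perspectiveCorrectInterpolation` helpers added in `src/rasterize.cu` (the combined function name here is illustrative):
+
+```cuda
+// Interpolate per-vertex values across a triangle with perspective correction.
+// eyePos: eye-space positions of the three vertices
+// values: per-vertex attributes (e.g. colors or normals)
+// bary:   barycentric weights of the fragment in screen space
+__device__ glm::vec3 interpolatePerspectiveCorrect(const glm::vec3 eyePos[3],
+                                                   const glm::vec3 values[3],
+                                                   const glm::vec3 &bary) {
+    // Perspective-correct depth: 1 / sum(bary_i / z_i)
+    float z = 1.f / (bary.x / eyePos[0].z + bary.y / eyePos[1].z + bary.z / eyePos[2].z);
+
+    // Weight each attribute by bary_i / z_i, then rescale by the corrected depth
+    glm::vec3 sum = bary.x * values[0] / eyePos[0].z
+                  + bary.y * values[1] / eyePos[1].z
+                  + bary.z * values[2] / eyePos[2].z;
+    return sum * z;
+}
+```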
+
+![](renders/aa2.JPG)
+
+- Supersampled antialiasing (see the sketch below)
+
+<br>
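+
+Supersampled antialiasing renders into a framebuffer that is `antialiasing` times larger in each dimension (2x by default, so four fragments per final pixel) and averages each block of subpixels when writing to the PBO. A minimal sketch of the resolve step, in the spirit of the modified `sendImageToPBO` kernel (the helper name is illustrative):
+
+```cuda
+// Average an antialiasing x antialiasing block of supersampled pixels into
+// one output pixel. ssWidth is the width of the supersampled framebuffer.
+__device__ glm::vec3 resolveSupersample(const glm::vec3 *image, int x, int y,
+                                        int antialiasing, int ssWidth) {
+    glm::vec3 color(0.f);
+    for (int i = 0; i < antialiasing; ++i) {
+        for (int j = 0; j < antialiasing; ++j) {
+            int idx = (antialiasing * x + i) + (antialiasing * y + j) * ssWidth;
+            color += glm::clamp(image[idx], 0.f, 1.f);
+        }
+    }
+    return color / (float)(antialiasing * antialiasing);
+}
+```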
+
+![](renders/noaa.JPG)
+
+- No antialiasing (for comparison)
+
+<br>
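+
+Backface culling, whose effect on runtime is measured in the performance analysis below, is done before rasterization by partitioning the primitive list with `thrust::partition` and a predicate like the `cullpredicate` functor added in `src/rasterize.cu`. A sketch (functor name changed for clarity):
+
+```cuda
+// Keep only primitives whose eye-space face normal points toward the camera;
+// the small -0.1 tolerance keeps triangles that are nearly edge-on.
+struct IsFrontFacing {
+    __device__ bool operator()(const Primitive &p) const {
+        glm::vec3 normal = glm::cross(p.v[1].eyePos - p.v[0].eyePos,
+                                      p.v[2].eyePos - p.v[0].eyePos);
+        return glm::dot(normal, glm::vec3(0, 0, 1)) > -0.1f;
+    }
+};
+
+// Usage, keeping the front-facing primitives at the start of the array:
+// Primitive *end = thrust::partition(thrust::device, dev_primitives_copy,
+//                                    dev_primitives_copy + totalNumPrimitives, IsFrontFacing());
+```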
+ +## Performance Analysis + +The rasterization step of the pipeline seemed to take the longest time, and this is because of the heavy computation in calculating which pixels overlap which triangles. I implemented backface culling to try to reduce the computation time, and the graphs can be seen below comparing runtimes with and without this optimization. + +Antialiasing also increased the runtime as the size of the PBO quadrupled in size so the computations became 4 times as expensive. This step was necessary though because without it you can easily see jagged edges along diagonal lines as can be seen in the above images. + +![](renders/graph1.JPG) + +![](renders/graph2.JPG) -### (TODO: Your README) -*DO NOT* leave the README to the last minute! It is a crucial part of the -project, and we will not be able to grade you without a good README. ### Credits diff --git a/graphs.pptx b/graphs.pptx new file mode 100644 index 0000000..4e108b6 Binary files /dev/null and b/graphs.pptx differ diff --git a/renders/aa.JPG b/renders/aa.JPG new file mode 100644 index 0000000..a60e8d4 Binary files /dev/null and b/renders/aa.JPG differ diff --git a/renders/aa2.JPG b/renders/aa2.JPG new file mode 100644 index 0000000..ed8aca1 Binary files /dev/null and b/renders/aa2.JPG differ diff --git a/renders/cow.JPG b/renders/cow.JPG new file mode 100644 index 0000000..7d0bab5 Binary files /dev/null and b/renders/cow.JPG differ diff --git a/renders/demo.gif b/renders/demo.gif new file mode 100644 index 0000000..b5c3010 Binary files /dev/null and b/renders/demo.gif differ diff --git a/renders/graph1.JPG b/renders/graph1.JPG new file mode 100644 index 0000000..bf067ec Binary files /dev/null and b/renders/graph1.JPG differ diff --git a/renders/graph2.JPG b/renders/graph2.JPG new file mode 100644 index 0000000..2e81e22 Binary files /dev/null and b/renders/graph2.JPG differ diff --git a/renders/noaa.JPG b/renders/noaa.JPG new file mode 100644 index 0000000..8e9ab25 Binary files /dev/null and b/renders/noaa.JPG differ diff --git a/renders/texture1.JPG b/renders/texture1.JPG new file mode 100644 index 0000000..fabd6ba Binary files /dev/null and b/renders/texture1.JPG differ diff --git a/renders/truck.JPG b/renders/truck.JPG new file mode 100644 index 0000000..6039beb Binary files /dev/null and b/renders/truck.JPG differ diff --git a/shaders/README.md b/shaders/README.md new file mode 100644 index 0000000..e69de29 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a57f69f..d9247c3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -6,5 +6,5 @@ set(SOURCE_FILES cuda_add_library(src ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_60 ) diff --git a/src/rasterize.cu b/src/rasterize.cu index 1262a09..cf71b33 100644 --- a/src/rasterize.cu +++ b/src/rasterize.cu @@ -17,6 +17,9 @@ #include "rasterize.h" #include #include +#include +#include +#include namespace { @@ -28,7 +31,7 @@ namespace { typedef unsigned char BufferByte; - enum PrimitiveType{ + enum PrimitiveType { Point = 1, Line = 2, Triangle = 3 @@ -41,12 +44,12 @@ namespace { // The attributes listed below might be useful, // but always feel free to modify on your own - glm::vec3 eyePos; // eye space position used for shading - glm::vec3 eyeNor; // eye space normal used for shading, cuz normal will go wrong after perspective transformation - // glm::vec3 col; - glm::vec2 texcoord0; - TextureData* dev_diffuseTex = NULL; - // int texWidth, texHeight; + glm::vec3 eyePos; // eye space position used for shading + glm::vec3 eyeNor; // eye space normal 
used for shading, cuz normal will go wrong after perspective transformation + glm::vec3 color; + glm::vec2 texcoord0; + TextureData* dev_diffuseTex = NULL; + int texWidth, texHeight; // ... }; @@ -62,10 +65,8 @@ namespace { // The attributes listed below might be useful, // but always feel free to modify on your own - // glm::vec3 eyePos; // eye space position used for shading - // glm::vec3 eyeNor; - // VertexAttributeTexcoord texcoord0; - // TextureData* dev_diffuseTex; + glm::vec3 eyePos; // eye space position used for shading + glm::vec3 eyeNor; // ... }; @@ -100,77 +101,117 @@ namespace { static std::map> mesh2PrimitivesMap; - -static int width = 0; -static int height = 0; - static int totalNumPrimitives = 0; static Primitive *dev_primitives = NULL; +static Primitive *dev_primitives_copy = NULL; static Fragment *dev_fragmentBuffer = NULL; static glm::vec3 *dev_framebuffer = NULL; -static int * dev_depth = NULL; // you might need this buffer when doing depth test +static float * dev_depth = NULL; // you might need this buffer when doing depth test + +static int *mutex; + +static int width = 0; +static int height = 0; +static int originalWidth = 0; +static int originalHeight = 0; +static int antialiasing = 2; /** * Kernel that writes the image to the OpenGL PBO directly. */ -__global__ -void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) { - int x = (blockIdx.x * blockDim.x) + threadIdx.x; - int y = (blockIdx.y * blockDim.y) + threadIdx.y; - int index = x + (y * w); - - if (x < w && y < h) { - glm::vec3 color; - color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0; - color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0; - color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0; - // Each thread writes one pixel location in the texture (textel) - pbo[index].w = 0; - pbo[index].x = color.x; - pbo[index].y = color.y; - pbo[index].z = color.z; - } +__global__ +void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image, int antialiasing, int width) { + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + int index = x + (y * w); + + if (x < w && y < h) { + + glm::vec3 color; + for (int i = 0; i < antialiasing; ++i) { + for (int j = 0; j < antialiasing; ++j) { + int tempIndex = ((antialiasing * x) + i) + (((antialiasing * y) + j) * width); + color.x += glm::clamp(image[tempIndex].x, 0.0f, 1.0f) * 255.0; + color.y += glm::clamp(image[tempIndex].y, 0.0f, 1.0f) * 255.0; + color.z += glm::clamp(image[tempIndex].z, 0.0f, 1.0f) * 255.0; + } + } + + color /= antialiasing * antialiasing; + + // Each thread writes one pixel location in the texture (textel) + pbo[index].w = 0; + pbo[index].x = color.x; + pbo[index].y = color.y; + pbo[index].z = color.z; + } } -/** +/** * Writes fragment colors to the framebuffer */ __global__ void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) { - int x = (blockIdx.x * blockDim.x) + threadIdx.x; - int y = (blockIdx.y * blockDim.y) + threadIdx.y; - int index = x + (y * w); + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + int index = x + (y * w); + + const int numLights = 2; + glm::vec3 lights[numLights] = { glm::normalize(glm::vec3(-1,-1,-1)), glm::normalize(glm::vec3(1,1,1)) }; + + if (x < w && y < h) { + framebuffer[index] = glm::vec3(0, 0, 0); - if (x < w && y < h) { - framebuffer[index] = fragmentBuffer[index].color; + for (int i = 0; i < numLights; ++i) { + float lambert = 
glm::clamp(glm::dot(fragmentBuffer[index].eyeNor, lights[i]), 0.f, 1.f); + + framebuffer[index] += fragmentBuffer[index].color * lambert; + } // TODO: add your fragment shader code here - } + } } +struct cullpredicate +{ + __device__ + bool operator()(const Primitive &primitive) + { + glm::vec3 normal = glm::cross(primitive.v[1].eyePos - primitive.v[0].eyePos, primitive.v[2].eyePos - primitive.v[0].eyePos); + + return glm::dot(normal, glm::vec3(0,0,1)) > -0.1; + } +}; + /** * Called once at the beginning of the program to allocate memory. */ void rasterizeInit(int w, int h) { - width = w; - height = h; + originalWidth = w; + originalHeight = h; + width = antialiasing * originalWidth; + height = antialiasing * originalHeight; + cudaFree(dev_fragmentBuffer); cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment)); cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment)); - cudaFree(dev_framebuffer); - cudaMalloc(&dev_framebuffer, width * height * sizeof(glm::vec3)); - cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3)); - + cudaFree(dev_framebuffer); + cudaMalloc(&dev_framebuffer, width * height * sizeof(glm::vec3)); + cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3)); + cudaFree(dev_depth); - cudaMalloc(&dev_depth, width * height * sizeof(int)); + cudaMalloc(&dev_depth, width * height * sizeof(float)); + + cudaMalloc((void **)&mutex, width * height * sizeof(int)); + cudaMemset(mutex, 0, width * height * sizeof(int)); checkCUDAError("rasterizeInit"); } __global__ -void initDepth(int w, int h, int * depth) +void initDepth(int w, int h, float * depth) { int x = (blockIdx.x * blockDim.x) + threadIdx.x; int y = (blockIdx.y * blockDim.y) + threadIdx.y; @@ -178,7 +219,7 @@ void initDepth(int w, int h, int * depth) if (x < w && y < h) { int index = x + (y * w); - depth[index] = INT_MAX; + depth[index] = 1.0; } } @@ -187,9 +228,9 @@ void initDepth(int w, int h, int * depth) * kern function with support for stride to sometimes replace cudaMemcpy * One thread is responsible for copying one component */ -__global__ +__global__ void _deviceBufferCopy(int N, BufferByte* dev_dst, const BufferByte* dev_src, int n, int byteStride, int byteOffset, int componentTypeByteSize) { - + // Attribute (vec3 position) // component (3 * float) // byte (4 * byte) @@ -202,20 +243,20 @@ void _deviceBufferCopy(int N, BufferByte* dev_dst, const BufferByte* dev_src, in int offset = i - count * n; // which component of the attribute for (int j = 0; j < componentTypeByteSize; j++) { - - dev_dst[count * componentTypeByteSize * n - + offset * componentTypeByteSize + + dev_dst[count * componentTypeByteSize * n + + offset * componentTypeByteSize + j] - = + = - dev_src[byteOffset - + count * (byteStride == 0 ? componentTypeByteSize * n : byteStride) - + offset * componentTypeByteSize + dev_src[byteOffset + + count * (byteStride == 0 ? 
componentTypeByteSize * n : byteStride) + + offset * componentTypeByteSize + j]; } } - + } @@ -235,7 +276,7 @@ void _nodeMatrixTransform( } glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) { - + glm::mat4 curMatrix(1.0); const std::vector &m = n.matrix; @@ -247,7 +288,8 @@ glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) { curMatrix[i][j] = (float)m.at(4 * i + j); } } - } else { + } + else { // no matrix, use rotation, scale, translation if (n.translation.size() > 0) { @@ -275,12 +317,12 @@ glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) { return curMatrix; } -void traverseNode ( +void traverseNode( std::map & n2m, const tinygltf::Scene & scene, const std::string & nodeString, const glm::mat4 & parentMatrix - ) +) { const tinygltf::Node & n = scene.nodes.at(nodeString); glm::mat4 M = parentMatrix * getMatrixFromNodeMatrixVector(n); @@ -537,7 +579,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { size_t s = image.image.size() * sizeof(TextureData); cudaMalloc(&dev_diffuseTex, s); cudaMemcpy(dev_diffuseTex, &image.image.at(0), s, cudaMemcpyHostToDevice); - + diffuseTexWidth = image.width; diffuseTexHeight = image.height; @@ -554,7 +596,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { // ---------Node hierarchy transform-------- cudaDeviceSynchronize(); - + dim3 numBlocksNodeTransform((numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); _nodeMatrixTransform << > > ( numVertices, @@ -584,7 +626,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { diffuseTexHeight, dev_vertexOut //VertexOut - }); + }); totalNumPrimitives += numPrimitives; @@ -595,21 +637,23 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { } // for each node } - + + printf("Num primitives: %u\n", totalNumPrimitives); // 3. 
Malloc for dev_primitives { cudaMalloc(&dev_primitives, totalNumPrimitives * sizeof(Primitive)); + cudaMalloc(&dev_primitives_copy, totalNumPrimitives * sizeof(Primitive)); } - + // Finally, cudaFree raw dev_bufferViews { std::map::const_iterator it(bufferViewDevPointers.begin()); std::map::const_iterator itEnd(bufferViewDevPointers.end()); - - //bufferViewDevPointers + + //bufferViewDevPointers for (; it != itEnd; it++) { cudaFree(it->second); @@ -623,11 +667,11 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { -__global__ +__global__ void _vertexTransformAndAssembly( - int numVertices, - PrimitiveDevBufPointers primitive, - glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal, + int numVertices, + PrimitiveDevBufPointers primitive, + glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal, int width, int height) { // vertex id @@ -638,18 +682,45 @@ void _vertexTransformAndAssembly( // Multiply the MVP matrix for each vertex position, this will transform everything into clipping space // Then divide the pos by its w element to transform into NDC space // Finally transform x and y to viewport space + glm::vec3 &position = primitive.dev_position[vid]; + glm::vec3 &normal = primitive.dev_normal[vid]; + + VertexOut &vertex = primitive.dev_verticesOut[vid]; + + glm::vec4 projected = MVP * glm::vec4(position, 1); + projected /= projected.w; + + vertex.pos = glm::vec4((projected.x + 1.f) * width * 0.5f, (1.f - projected.y) * height * 0.5f, (projected.z + 1.f) * 0.5f, 1.0); + vertex.eyePos = glm::vec3(MV * glm::vec4(position, 1)); + vertex.eyeNor = glm::normalize(MV_normal * normal); + + // Give the vertex a random color or texture color + if (primitive.dev_diffuseTex == NULL) { + vertex.dev_diffuseTex = NULL; + thrust::default_random_engine rng = thrust::default_random_engine(utilhash(vid + 11)); + thrust::uniform_real_distribution u01(0, 1); + vertex.color = glm::vec3(1.0, u01(rng), u01(rng)); + } + else { + vertex.dev_diffuseTex = primitive.dev_diffuseTex; + vertex.texcoord0 = primitive.dev_texcoord0[vid]; + vertex.texWidth = primitive.diffuseTexWidth; + vertex.texHeight = primitive.diffuseTexHeight; + } // TODO: Apply vertex assembly here // Assemble all attribute arraies into the primitive array - + } } + + static int curPrimitiveBeginId = 0; -__global__ +__global__ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_primitives, PrimitiveDevBufPointers primitive) { // index id @@ -660,30 +731,144 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_ // TODO: uncomment the following code for a start // This is primitive assembly for triangles - //int pid; // id for cur primitives vector - //if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) { - // pid = iid / (int)primitive.primitiveType; - // dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] - // = primitive.dev_verticesOut[primitive.dev_indices[iid]]; - //} + int pid; // id for cur primitives vector + if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) { + pid = iid / (int)primitive.primitiveType; + dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] + = primitive.dev_verticesOut[primitive.dev_indices[iid]]; + } // TODO: other primitive types (point, line) } - + +} + +__host__ __device__ static +float perspectiveCorrectZ(const glm::vec3 vertices[3], const glm::vec3 &barycentric) { + float sum = (barycentric[0] / vertices[0][2]) + (barycentric[1] / vertices[1][2]) + (barycentric[2] / vertices[2][2]); + return 1.0 / 
sum; } +__host__ __device__ static +glm::vec3 perspectiveCorrectInterpolation(const glm::vec3 vertices[3], const float &z, const glm::vec3 values[3], const glm::vec3 &barycentric) { + glm::vec3 sum = (barycentric[0] * values[0] / vertices[0][2]) + + (barycentric[1] * values[1] / vertices[1][2]) + + (barycentric[2] * values[2] / vertices[2][2]); + return sum * z; +} + +/** +* Rasterization +*/ +/** + * Rasterization kernel. + */ +__global__ +void rasterizeKernel(int numPrimitives, int width, int height, Fragment *fragmentBuffer, Primitive* dev_primitives, float* depth, int* mutex) { + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + int idx = x + (y * width); + + if (idx < numPrimitives) { + Primitive &primitive = dev_primitives[idx]; + glm::vec3 tri[3] = { glm::vec3(primitive.v[0].pos), glm::vec3(primitive.v[1].pos), glm::vec3(primitive.v[2].pos) }; + + AABB aabb = getAABBForTriangle(tri); + aabb.min.x = glm::min((float)width - 1, glm::max(0.f, aabb.min.x)); + aabb.min.y = glm::min((float)height - 1, glm::max(0.f, aabb.min.y)); + aabb.max.x = glm::max(0.f, glm::min((float)width - 1, aabb.max.x)); + aabb.max.y = glm::max(0.f, glm::min((float)height - 1, aabb.max.y)); + + for (int col = aabb.min.x; col <= aabb.max.x; ++col) { + for (int row = aabb.min.y; row <= aabb.max.y; ++row) { + glm::vec2 point = glm::vec2(col, row); + int fragmentIndex = glm::min(width*height - 1, glm::max(0, col + (row * width))); + + glm::vec3 bary = calculateBarycentricCoordinate(tri, point); + + if (isBarycentricCoordInBounds(bary)) { + bool isSet; + do { + isSet = (atomicCAS(&mutex[fragmentIndex], 0, 1) == 0); + if (isSet) { + Fragment &fragment = fragmentBuffer[fragmentIndex]; + + // Only set this fragments attributes if closest depth + float fragmentDepth = getZAtCoordinate(bary, tri); + + if (fragmentDepth < depth[fragmentIndex]) { + depth[fragmentIndex] = fragmentDepth; + + // Perspective correct z + glm::vec3 eyeTri[3] = { glm::vec3(primitive.v[0].eyePos), glm::vec3(primitive.v[1].eyePos), glm::vec3(primitive.v[2].eyePos) }; + float perspZ = perspectiveCorrectZ (eyeTri, bary); + + // Calculate normal + glm::vec3 normals[3] = { primitive.v[0].eyeNor, primitive.v[1].eyeNor, primitive.v[2].eyeNor }; + fragment.eyeNor = glm::normalize( perspectiveCorrectInterpolation (eyeTri, perspZ, normals, bary)); + + if (primitive.v[0].dev_diffuseTex) { + glm::vec3 uv[3] = { glm::vec3(primitive.v[0].texcoord0, 0), glm::vec3(primitive.v[1].texcoord0, 0), glm::vec3(primitive.v[2].texcoord0, 0) }; + + glm::vec2 final_uv = bary[0] * primitive.v[0].texcoord0 + bary[1] * primitive.v[1].texcoord0 + bary[2] * primitive.v[2].texcoord0; + float u = final_uv.x * primitive.v[0].texWidth; + float v = final_uv.y * primitive.v[0].texHeight; + + int uInt = glm::floor(u); + int vInt = glm::floor(v); + + TextureData* texture = primitive.v[0].dev_diffuseTex; + + float u_fract = u - glm::floor(u); + float v_fract = v - glm::floor(v); + + int col_00_offset = (uInt + (vInt * primitive.v[0].texWidth)) * 3; + glm::vec3 col_00 = glm::vec3(texture[col_00_offset], texture[col_00_offset + 1], texture[col_00_offset + 2]); + + int col_10_offset = (uInt + 1 + (vInt * primitive.v[0].texWidth)) * 3; + glm::vec3 col_10 = glm::vec3(texture[col_10_offset], texture[col_10_offset + 1], texture[col_10_offset + 2]); + + int col_01_offset = (uInt + ((vInt + 1) * primitive.v[0].texWidth)) * 3; + glm::vec3 col_01 = glm::vec3(texture[col_01_offset], texture[col_01_offset + 1], texture[col_01_offset + 2]); + + int 
col_11_offset = (uInt + 1 + ((vInt + 1) * primitive.v[0].texWidth)) * 3; + glm::vec3 col_11 = glm::vec3(texture[col_11_offset], texture[col_11_offset + 1], texture[col_11_offset + 2]); + + glm::vec3 col_interp1 = glm::mix(col_00, col_10, u_fract); + glm::vec3 col_interp2 = glm::mix(col_01, col_11, u_fract); + + fragment.color = glm::mix(col_interp1, col_interp2, v_fract) / 255.f; + } + else { + glm::vec3 colors[3] = { primitive.v[0].color, primitive.v[1].color, primitive.v[2].color }; + fragment.color = perspectiveCorrectInterpolation (eyeTri, perspZ, colors, bary); + } + } + } + if (isSet) { + mutex[fragmentIndex] = 0; + } + } while (!isSet); + } + } + } + } +} /** * Perform rasterization. */ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) { - int sideLength2d = 8; - dim3 blockSize2d(sideLength2d, sideLength2d); - dim3 blockCount2d((width - 1) / blockSize2d.x + 1, + int sideLength2d = 8; + dim3 blockSize2d(sideLength2d, sideLength2d); + dim3 blockCount2d((width - 1) / blockSize2d.x + 1, (height - 1) / blockSize2d.y + 1); + cudaMemcpy(dev_primitives_copy, dev_primitives, totalNumPrimitives * sizeof(Primitive), cudaMemcpyDeviceToDevice); + + // Execute your rasterization pipeline here // (See README for rasterization pipeline outline.) @@ -702,14 +887,14 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); - _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height); + _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> > (p->numVertices, *p, MVP, MV, MV_normal, width, height); checkCUDAError("Vertex Processing"); cudaDeviceSynchronize(); _primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> > - (p->numIndices, - curPrimitiveBeginId, - dev_primitives, - *p); + (p->numIndices, + curPrimitiveBeginId, + dev_primitives_copy, + *p); checkCUDAError("Primitive Assembly"); curPrimitiveBeginId += p->numPrimitives; @@ -718,20 +903,27 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g checkCUDAError("Vertex Processing and Primitive Assembly"); } - + cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment)); - initDepth << > >(width, height, dev_depth); - - // TODO: rasterize + initDepth << > > (width, height, dev_depth); + // Cull all the primitives facing away + Primitive *end = thrust::partition(thrust::device, dev_primitives_copy, dev_primitives_copy + totalNumPrimitives, cullpredicate()); + cudaDeviceSynchronize(); - // Copy depthbuffer colors into framebuffer - render << > >(width, height, dev_fragmentBuffer, dev_framebuffer); + int primitivesLeft = end - dev_primitives_copy; + + // TODO: rasterize + rasterizeKernel << > > (primitivesLeft, width, height, dev_fragmentBuffer, dev_primitives_copy, dev_depth, mutex); + + + // Copy depthbuffer colors into framebuffer + render << > > (width, height, dev_fragmentBuffer, dev_framebuffer); checkCUDAError("fragment shader"); - // Copy framebuffer into OpenGL buffer for OpenGL previewing - sendImageToPBO<<>>(pbo, width, height, dev_framebuffer); - checkCUDAError("copy render result to pbo"); + // Copy framebuffer into OpenGL buffer for OpenGL previewing + sendImageToPBO << > > (pbo, originalWidth, originalHeight, dev_framebuffer, 
antialiasing, width); + checkCUDAError("copy render result to pbo"); } /** @@ -739,7 +931,7 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g */ void rasterizeFree() { - // deconstruct primitives attribute/indices device buffer + // deconstruct primitives attribute/indices device buffer auto it(mesh2PrimitivesMap.begin()); auto itEnd(mesh2PrimitivesMap.end()); @@ -753,24 +945,27 @@ void rasterizeFree() { cudaFree(p->dev_verticesOut); - + //TODO: release other attributes and materials } } //////////// - cudaFree(dev_primitives); - dev_primitives = NULL; + cudaFree(dev_primitives); + dev_primitives = NULL; cudaFree(dev_fragmentBuffer); dev_fragmentBuffer = NULL; - cudaFree(dev_framebuffer); - dev_framebuffer = NULL; + cudaFree(dev_framebuffer); + dev_framebuffer = NULL; cudaFree(dev_depth); dev_depth = NULL; - checkCUDAError("rasterize Free"); -} + cudaFree(mutex); + mutex = NULL; + + checkCUDAError("rasterize Free"); +} \ No newline at end of file diff --git a/src/rasterizeTools.h b/src/rasterizeTools.h index 46c701e..555a1ab 100644 --- a/src/rasterizeTools.h +++ b/src/rasterizeTools.h @@ -13,8 +13,8 @@ #include struct AABB { - glm::vec3 min; - glm::vec3 max; + glm::vec3 min; + glm::vec3 max; }; /** @@ -22,7 +22,7 @@ struct AABB { */ __host__ __device__ static glm::vec3 multiplyMV(glm::mat4 m, glm::vec4 v) { - return glm::vec3(m * v); + return glm::vec3(m * v); } // CHECKITOUT @@ -31,16 +31,16 @@ glm::vec3 multiplyMV(glm::mat4 m, glm::vec4 v) { */ __host__ __device__ static AABB getAABBForTriangle(const glm::vec3 tri[3]) { - AABB aabb; - aabb.min = glm::vec3( - min(min(tri[0].x, tri[1].x), tri[2].x), - min(min(tri[0].y, tri[1].y), tri[2].y), - min(min(tri[0].z, tri[1].z), tri[2].z)); - aabb.max = glm::vec3( - max(max(tri[0].x, tri[1].x), tri[2].x), - max(max(tri[0].y, tri[1].y), tri[2].y), - max(max(tri[0].z, tri[1].z), tri[2].z)); - return aabb; + AABB aabb; + aabb.min = glm::vec3( + min(min(tri[0].x, tri[1].x), tri[2].x), + min(min(tri[0].y, tri[1].y), tri[2].y), + min(min(tri[0].z, tri[1].z), tri[2].z)); + aabb.max = glm::vec3( + max(max(tri[0].x, tri[1].x), tri[2].x), + max(max(tri[0].y, tri[1].y), tri[2].y), + max(max(tri[0].z, tri[1].z), tri[2].z)); + return aabb; } // CHECKITOUT @@ -49,7 +49,7 @@ AABB getAABBForTriangle(const glm::vec3 tri[3]) { */ __host__ __device__ static float calculateSignedArea(const glm::vec3 tri[3]) { - return 0.5 * ((tri[2].x - tri[0].x) * (tri[1].y - tri[0].y) - (tri[1].x - tri[0].x) * (tri[2].y - tri[0].y)); + return 0.5 * ((tri[2].x - tri[0].x) * (tri[1].y - tri[0].y) - (tri[1].x - tri[0].x) * (tri[2].y - tri[0].y)); } // CHECKITOUT @@ -58,11 +58,11 @@ float calculateSignedArea(const glm::vec3 tri[3]) { */ __host__ __device__ static float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c, const glm::vec3 tri[3]) { - glm::vec3 baryTri[3]; - baryTri[0] = glm::vec3(a, 0); - baryTri[1] = glm::vec3(b, 0); - baryTri[2] = glm::vec3(c, 0); - return calculateSignedArea(baryTri) / calculateSignedArea(tri); + glm::vec3 baryTri[3]; + baryTri[0] = glm::vec3(a, 0); + baryTri[1] = glm::vec3(b, 0); + baryTri[2] = glm::vec3(c, 0); + return calculateSignedArea(baryTri) / calculateSignedArea(tri); } // CHECKITOUT @@ -71,10 +71,10 @@ float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c, */ __host__ __device__ static glm::vec3 calculateBarycentricCoordinate(const glm::vec3 tri[3], glm::vec2 point) { - float beta = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), point, 
glm::vec2(tri[2].x, tri[2].y), tri); - float gamma = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), glm::vec2(tri[1].x, tri[1].y), point, tri); - float alpha = 1.0 - beta - gamma; - return glm::vec3(alpha, beta, gamma); + float beta = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), point, glm::vec2(tri[2].x, tri[2].y), tri); + float gamma = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), glm::vec2(tri[1].x, tri[1].y), point, tri); + float alpha = 1.0 - beta - gamma; + return glm::vec3(alpha, beta, gamma); } // CHECKITOUT @@ -83,9 +83,9 @@ glm::vec3 calculateBarycentricCoordinate(const glm::vec3 tri[3], glm::vec2 point */ __host__ __device__ static bool isBarycentricCoordInBounds(const glm::vec3 barycentricCoord) { - return barycentricCoord.x >= 0.0 && barycentricCoord.x <= 1.0 && - barycentricCoord.y >= 0.0 && barycentricCoord.y <= 1.0 && - barycentricCoord.z >= 0.0 && barycentricCoord.z <= 1.0; + return barycentricCoord.x >= 0.0 && barycentricCoord.x <= 1.0 && + barycentricCoord.y >= 0.0 && barycentricCoord.y <= 1.0 && + barycentricCoord.z >= 0.0 && barycentricCoord.z <= 1.0; } // CHECKITOUT @@ -95,7 +95,25 @@ bool isBarycentricCoordInBounds(const glm::vec3 barycentricCoord) { */ __host__ __device__ static float getZAtCoordinate(const glm::vec3 barycentricCoord, const glm::vec3 tri[3]) { - return -(barycentricCoord.x * tri[0].z - + barycentricCoord.y * tri[1].z - + barycentricCoord.z * tri[2].z); + return (barycentricCoord.x * tri[0].z + + barycentricCoord.y * tri[1].z + + barycentricCoord.z * tri[2].z); } + +__host__ __device__ static +int convert2Dto1D(const int x, const int y, const int width) { + return x + (y * width); +} + +/** +* Handy-dandy hash function that provides seeds for random number generation. +*/ +__host__ __device__ inline unsigned int utilhash(unsigned int a) { + a = (a + 0x7ed55d16) + (a << 12); + a = (a ^ 0xc761c23c) ^ (a >> 19); + a = (a + 0x165667b1) + (a << 5); + a = (a + 0xd3a2646c) ^ (a << 9); + a = (a + 0xfd7046c5) + (a << 3); + a = (a ^ 0xb55a4f09) ^ (a >> 16); + return a; +} \ No newline at end of file