diff --git a/.cproject b/.cproject
index 8757ba5..5788666 100644
--- a/.cproject
+++ b/.cproject
@@ -45,50 +45,6 @@
 			<storageModule moduleId="com.nvidia.cuda.ide.build.project.ICudaProjectConfiguration"/>
 			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
 		</cconfiguration>
-		<cconfiguration id="com.nvidia.cuda.ide.toolchain.on_path.1673801701.1348522543">
-			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.nvidia.cuda.ide.toolchain.on_path.1673801701.1348522543" moduleId="org.eclipse.cdt.core.settings" name="DebugFast">
-				<externalSettings/>
-				<extensions>
-					<extension id="com.nvidia.cuda.ide.cubin" point="org.eclipse.cdt.core.BinaryParser"/>
-					<extension id="com.nvidia.cuda.ide.elf" point="org.eclipse.cdt.core.BinaryParser"/>
-					<extension id="com.nvidia.cuda.ide.macho" point="org.eclipse.cdt.core.BinaryParser"/>
-					<extension id="nvcc.errorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-				</extensions>
-			</storageModule>
-			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
-				<configuration buildProperties="" description="" id="com.nvidia.cuda.ide.toolchain.on_path.1673801701.1348522543" name="DebugFast" parent="org.eclipse.cdt.build.core.emptycfg">
-					<folderInfo id="com.nvidia.cuda.ide.toolchain.on_path.1673801701.1348522543." name="/" resourcePath="">
-						<toolChain id="com.nvidia.cuda.ide.toolchain.on_path.1158742694" name="CUDA Toolkit on the path" superClass="com.nvidia.cuda.ide.toolchain.on_path">
-							<targetPlatform archList="all" binaryParser="com.nvidia.cuda.ide.elf;com.nvidia.cuda.ide.macho;com.nvidia.cuda.ide.cubin" id="com.nvidia.cuda.ide.targetPlatform.1389426403" isAbstract="false" name="Debug Platform" osList="linux,macosx" superClass="com.nvidia.cuda.ide.targetPlatform"/>
-							<builder id="com.nvidia.cuda.ide.builder.1491667439" incrementalBuildTarget="${ConfigName}" keepEnvironmentInBuildfile="false" managedBuildOn="false" name="CUDA Toolkit 7.5 Builder" superClass="com.nvidia.cuda.ide.builder"/>
-							<tool id="nvcc.compiler.base.822438225" name="NVCC Compiler" superClass="nvcc.compiler.base">
-								<option id="nvcc.compiler.pic.1384251557" name="Position Independent Code (-fPIC)" superClass="nvcc.compiler.pic"/>
-								<inputType id="nvcc.compiler.input.cu.793886351" superClass="nvcc.compiler.input.cu"/>
-								<inputType id="nvcc.compiler.input.cpp.1191158573" superClass="nvcc.compiler.input.cpp"/>
-								<inputType id="nvcc.compiler.input.c.1507569216" superClass="nvcc.compiler.input.c"/>
-							</tool>
-							<tool id="nvcc.linker.base.864810484" name="NVCC Linker" superClass="nvcc.linker.base">
-								<inputType id="nvcc.linker.input.1359608795" superClass="nvcc.linker.input">
-									<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
-									<additionalInput kind="additionalinput" paths="$(LIBS)"/>
-								</inputType>
-							</tool>
-							<tool id="nvcc.archiver.base.535863149" name="NVCC Archiver" superClass="nvcc.archiver.base"/>
-							<tool id="com.nvidia.host.assembler.918142547" name="Host Assembler" superClass="com.nvidia.host.assembler">
-								<inputType id="cdt.managedbuild.tool.gnu.assembler.input.868159639" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
-							</tool>
-						</toolChain>
-					</folderInfo>
-				</configuration>
-			</storageModule>
-			<storageModule moduleId="com.nvidia.cuda.ide.build.project.ICudaProjectConfiguration"/>
-			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
-		</cconfiguration>
 		<cconfiguration id="com.nvidia.cuda.ide.toolchain.on_path.1673801701.1040373476">
 			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.nvidia.cuda.ide.toolchain.on_path.1673801701.1040373476" moduleId="org.eclipse.cdt.core.settings" name="DebugNDEBUG">
 				<externalSettings/>
@@ -191,4 +147,5 @@
 		</configuration>
 	</storageModule>
 	<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
+	<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
 </cproject>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dff84f8..c9100fb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.1)
 project(cis565_rasterizer)
 
 set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fpermissive")
 
 # Set up include and lib paths
 include_directories(.)
@@ -48,9 +49,9 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 set(CMAKE_CXX_FLAGS_DEBUG                 "-O0 -g")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO        "-O2 -g")
 set(CMAKE_CXX_FLAGS_RELEASE               "-O3    -DNDEBUG")
-list(APPEND CUDA_NVCC_FLAGS_DEBUG          -O0 -g -G)
-list(APPEND CUDA_NVCC_FLAGS_RELWITHDEBINFO -O2 -g -lineinfo)
-list(APPEND CUDA_NVCC_FLAGS_RELEASE        -O3    -DNDEBUG)
+list(APPEND CUDA_NVCC_FLAGS_DEBUG          -O0 -g -G -std=c++11)
+list(APPEND CUDA_NVCC_FLAGS_RELWITHDEBINFO -O2 -g -lineinfo -std=c++11)
+list(APPEND CUDA_NVCC_FLAGS_RELEASE        -O3    -DNDEBUG -std=c++11)
 if (WIN32)
     set(CUDA_PROPAGATE_HOST_FLAGS ON)
     set(CMAKE_CXX_FLAGS                       "/MD /EHsc /D _CRT_SECURE_NO_WARNINGS")
@@ -95,6 +96,7 @@ target_link_libraries(${CMAKE_PROJECT_NAME}
     util
     #stream_compaction  # TODO: uncomment if using your own stream compaction
     ${CORELIBS}
+    X11
     )
 
 add_custom_command(
diff --git a/README.md b/README.md
index 41b91f0..e8e915a 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,102 @@
 CUDA Rasterizer
-===============
-
-[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md)
+======================
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
 
-* (TODO) YOUR NAME HERE
-	* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Edward Atter
+  * [LinkedIn](https://www.linkedin.com/in/atter/)
+  * Tested on: Linux Mint 18.3 Sylvia (4.13.0-41-generic), Ryzen 7 2700x @ 3.7 ghz (base clock) 16GB, GTX 1070 TI 8GB GDDR5 (Personal)
+  * CUDA 9
+
+![](img/intro.png)
+
+## Overview
+
+A rasterizer takes 3D objects (defined using vector graphics) and converts it into a 2D representation to be viewed on a computer monitor. Rasterization is the technology behind all major video games as it is much faster than path tracing. 
+
+![](img/box-demo.gif)
+
+_Note:_ The screen door affect oberserved is due to the conversion from `mp4` to `gif` and **not** an affect of the rasterizer.
+
+This project implements a a rasterizer with the following features:
+ - Vertex shading
+ - Primitive assembly support
+ - Rasterization
+ - Fragment shading
+ - Depth buffer for storing depth testing fragments
+ - Fragment-to-depth buffer with mutex lock to prevent race conditions
+ - Lambert fragment shader
+ - Supersampling Anti-Aliasing (SSAA) using the ["random" algorithm](https://en.wikipedia.org/wiki/Supersampling#Supersampling_patterns)
+ - Point cloud drawing mode
+ - Line drawing mode
+ - Back-face culling
+
+All features may be toggled by changing the defined constants at the start of `src/rasterize.cu`.
+
+## Features
+
+#### Lambert shading
+
+Lambert shading has no noticable affect on performance. The image on the left is without shading, while the image on the right has Lambert shading enabled. The left image has simple normal coloring while the image on the right uses Lambert shading. 
+
+![](img/lambert-comparison.png)
+
+
+#### Anti-Aliasing via Supersampling
+
+This technique requires upscaling the image, using one of several methods to "average out" the pixels, and setting this new value in the final image. Specifically, this implementation uses the "random" algorithm. 
+
+![](https://upload.wikimedia.org/wikipedia/commons/thumb/a/a3/Supersampling_-_Random.svg/180px-Supersampling_-_Random.svg.png)
+
+A list of nearby pixels is generated (the size of which depends on `OPTION_SSAA_GRID_SIZE`). Then multiple random samples are taken from the nearby pixels. The average of the samples represents the final color in the image. 
+
+SSAA has an significant, negative impact on performance. This is unsuprising, since a much larger image must be generated first, typically with four times as many pixels. However, images can look much more realistic. Humans are accustomed to seeing objects in a continuous space in real life, not broken down into discrete pixels. Anti-aliasing aids to more accurately represent a continuous space. 
+
+![](img/ssaa-comparison.png)
+
+The performance impact is further illustrated by the graph below. The flower model is used in this test with a `GRID_SIZE` of 2.
+
+![](img/ssaa-graph.png)
+
+The graph below compares the percentage of time spent in each pipeline stage with and without SSAA. 
+
+![](img/ssaa-pipelines-graph.png)
+
+It is not surprising to see the PBO stage greatly increase. The random sampling takes place in this stage, allowing us to reduce the supersampled image back to the original resolution. I was surprised to see the render stage decrease. It is important to keep in mind, however, that this is representing the relative percentage of overall time, not the total time spent in each stage. Further analysis into the profile shows the render time did, in fact, increase with SSAA enabled even though the relative percent of time spent decreased as shown in the graph. This is why the other minor pipeline stages are greatly reduced. With SSAA enabled, the render and PBO stage dominate even more than usual, while the actual time spent in seconds of pipelines such as assembly remain constant.
+
+The benefits of SSAA can be achieved with much higher performance by implementing [MSAA](https://en.wikipedia.org/wiki/Multisample_anti-aliasing). This special case of supersampling only upscales edges of the objects, where anti aliasing provides the largest benefit. Since the entire image is no longer being upscaled, the result is comparable to SSAA with significantly better performance. 
+
+#### Alternate Drawing Modes
+
+In addition to the typical `triangle` mode, `line` and `point` modes are also available. If enabled, the original vector graphic is reduced to solely its vertices (point mode). Line mode takes this one step further, connecting each vertex without filling triangle. Triangle mode uses the normal value for coloring, while all other modes are statically defined to be white. Without the need to fill the image, significantly fewer pixels need to be processed. This results in both point and line modes achieving much higher FPS than the normal triangle mode.
+
+![](img/points-cow.png) ![](img/lines-cow.png)
+
+The graph below shows the difference in FPS across each of the three drawing modes while drawing the cow model. 
+
+![](img/draw-mode-graph.png)
+
+Line drawing may be further improved by calculating each pixel in parallel. Currently, one line is calculated per thread. Utilizing shared memory instead of global memory in all drawing modes is also likely to yield a significant performance benefit. 
+
+#### Back-face culling
+
+In a naive approach, all triangles are rendered whether they are visible in the field of view or not. Back-face culling is meant to improve performance by only rendering the shapes visible to the camera. 
+
+![](img/culling-graph.png)
+
+Analysis shows that, at least for this implementation, back-face culling is not guaranteed to improve performance. Like most things in computer science, it depends. Performance actually gets worse with culling enabled when rendering the box model. This is likely because the box has only a small number of large fragments on screen at any given time. The overhead for the culling calculation is simply not woth it. The conclusion is very different for the cow model however. Unlike the box, the cow has many small triangles, only about half of which are visible at a fixed perspective. The cow model is more representative of typical applications, where the detail is high, requiring many small shapes. Thus, it's expected as a whole back-face culling will improve performance in real life applications. 
+
+As with the other features, shared memory would likely help significantly here. As well as some sort of caching for the objects that are not in view. 
 
-### (TODO: Your README)
+## Methodology
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+When testing performance, the model being rendered was first zoomed to fill the screen. Once the proper size was reached, the FPS was calculated as the average over the next 10 seconds. All models used for testing and shown in the screenshots are available in the `gltf` directory. Unless otherwise specified, all toggleable features except the one being tested were disabled before each performance test. 
 
+## Resources & Credits
 
-### Credits
+ - [glTF Sample Models](https://github.com/KhronosGroup/glTF/blob/master/sampleModels/README.md)
+ - [OpenGL.org, Lambert lighting](https://www.opengl.org/sdk/docs/tutorials/ClockworkCoders/lighting.php)
+ - [tinygltfloader](https://github.com/syoyo/tinygltfloader) by [@soyoyo](https://github.com/syoyo)
+ - [University of Pennsylvania, CIS 565, Skeleton Project](https://github.com/CIS565-Fall-2018/Project4-CUDA-Rasterizer)
+ - [Wikipedia, Supersampling](https://en.wikipedia.org/wiki/Supersampling#Supersampling_patterns)
 
-* [tinygltfloader](https://github.com/syoyo/tinygltfloader) by [@soyoyo](https://github.com/syoyo)
-* [glTF Sample Models](https://github.com/KhronosGroup/glTF/blob/master/sampleModels/README.md)
diff --git a/cis565_rasterizer.launch b/cis565_rasterizer - box.launch
similarity index 93%
rename from cis565_rasterizer.launch
rename to cis565_rasterizer - box.launch
index 2c211dc..6d936bc 100644
--- a/cis565_rasterizer.launch
+++ b/cis565_rasterizer - box.launch	
@@ -7,10 +7,10 @@
 <intAttribute key="org.eclipse.cdt.launch.ATTR_BUILD_BEFORE_LAUNCH_ATTR" value="2"/>
 <stringAttribute key="org.eclipse.cdt.launch.DEBUGGER_ID" value="com.nvidia.cuda.ide.debug.cudagdb"/>
 <stringAttribute key="org.eclipse.cdt.launch.DEBUGGER_START_MODE" value="run"/>
-<stringAttribute key="org.eclipse.cdt.launch.PROGRAM_ARGUMENTS" value="objs/tri.obj"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROGRAM_ARGUMENTS" value="gltfs/box/box.gltf"/>
 <stringAttribute key="org.eclipse.cdt.launch.PROGRAM_NAME" value="build/cis565_rasterizer"/>
 <stringAttribute key="org.eclipse.cdt.launch.PROJECT_ATTR" value="Project4-CUDA-Rasterizer"/>
-<stringAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_ID_ATTR" value="com.nvidia.cuda.ide.toolchain.on_path.1673801701.1348522543"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_ID_ATTR" value=""/>
 <listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
 <listEntry value="/Project4-CUDA-Rasterizer"/>
 </listAttribute>
diff --git a/cis565_rasterizer - cow.launch b/cis565_rasterizer - cow.launch
new file mode 100644
index 0000000..4220045
--- /dev/null
+++ b/cis565_rasterizer - cow.launch	
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<launchConfiguration type="org.eclipse.cdt.launch.applicationLaunchType">
+<stringAttribute key="bad_container_name" value="/Project4-CUDA-Rasterize"/>
+<stringAttribute key="org.eclipse.cdt.debug.mi.core.DEBUG_NAME" value="${cuda_tk_bin:/Project4-CUDA-Rasterizer}/cuda-gdb"/>
+<stringAttribute key="org.eclipse.cdt.debug.mi.core.GDB_INIT" value=".cuda-gdbinit"/>
+<stringAttribute key="org.eclipse.cdt.debug.mi.core.protocol" value="mi"/>
+<intAttribute key="org.eclipse.cdt.launch.ATTR_BUILD_BEFORE_LAUNCH_ATTR" value="2"/>
+<stringAttribute key="org.eclipse.cdt.launch.DEBUGGER_ID" value="com.nvidia.cuda.ide.debug.cudagdb"/>
+<stringAttribute key="org.eclipse.cdt.launch.DEBUGGER_START_MODE" value="run"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROGRAM_ARGUMENTS" value="gltfs/cow/cow.gltf"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROGRAM_NAME" value="build/cis565_rasterizer"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROJECT_ATTR" value="Project4-CUDA-Rasterizer"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_ID_ATTR" value=""/>
+<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
+<listEntry value="/Project4-CUDA-Rasterizer"/>
+</listAttribute>
+<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
+<listEntry value="4"/>
+</listAttribute>
+<stringAttribute key="process_factory_id" value="org.eclipse.cdt.dsf.gdb.GdbProcessFactory"/>
+</launchConfiguration>
diff --git a/cis565_rasterizer - flower.launch b/cis565_rasterizer - flower.launch
new file mode 100644
index 0000000..3e39070
--- /dev/null
+++ b/cis565_rasterizer - flower.launch	
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<launchConfiguration type="org.eclipse.cdt.launch.applicationLaunchType">
+<stringAttribute key="bad_container_name" value="/Project4-CUDA-Rasterize"/>
+<stringAttribute key="org.eclipse.cdt.debug.mi.core.DEBUG_NAME" value="${cuda_tk_bin:/Project4-CUDA-Rasterizer}/cuda-gdb"/>
+<stringAttribute key="org.eclipse.cdt.debug.mi.core.GDB_INIT" value=".cuda-gdbinit"/>
+<stringAttribute key="org.eclipse.cdt.debug.mi.core.protocol" value="mi"/>
+<intAttribute key="org.eclipse.cdt.launch.ATTR_BUILD_BEFORE_LAUNCH_ATTR" value="2"/>
+<stringAttribute key="org.eclipse.cdt.launch.DEBUGGER_ID" value="com.nvidia.cuda.ide.debug.cudagdb"/>
+<stringAttribute key="org.eclipse.cdt.launch.DEBUGGER_START_MODE" value="run"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROGRAM_ARGUMENTS" value="gltfs/flower/flower.gltf"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROGRAM_NAME" value="build/cis565_rasterizer"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROJECT_ATTR" value="Project4-CUDA-Rasterizer"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_ID_ATTR" value=""/>
+<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
+<listEntry value="/Project4-CUDA-Rasterizer"/>
+</listAttribute>
+<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
+<listEntry value="4"/>
+</listAttribute>
+<stringAttribute key="process_factory_id" value="org.eclipse.cdt.dsf.gdb.GdbProcessFactory"/>
+</launchConfiguration>
diff --git a/cis565_rasterizer - triangle.launch b/cis565_rasterizer - triangle.launch
new file mode 100644
index 0000000..ad6c0c3
--- /dev/null
+++ b/cis565_rasterizer - triangle.launch	
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<launchConfiguration type="org.eclipse.cdt.launch.applicationLaunchType">
+<stringAttribute key="bad_container_name" value="/Project4-CUDA-Rasterize"/>
+<stringAttribute key="org.eclipse.cdt.debug.mi.core.DEBUG_NAME" value="${cuda_tk_bin:/Project4-CUDA-Rasterizer}/cuda-gdb"/>
+<stringAttribute key="org.eclipse.cdt.debug.mi.core.GDB_INIT" value=".cuda-gdbinit"/>
+<stringAttribute key="org.eclipse.cdt.debug.mi.core.protocol" value="mi"/>
+<intAttribute key="org.eclipse.cdt.launch.ATTR_BUILD_BEFORE_LAUNCH_ATTR" value="2"/>
+<stringAttribute key="org.eclipse.cdt.launch.DEBUGGER_ID" value="com.nvidia.cuda.ide.debug.cudagdb"/>
+<stringAttribute key="org.eclipse.cdt.launch.DEBUGGER_START_MODE" value="run"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROGRAM_ARGUMENTS" value="gltfs/triangle/triangle.gltf"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROGRAM_NAME" value="build/cis565_rasterizer"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROJECT_ATTR" value="Project4-CUDA-Rasterizer"/>
+<stringAttribute key="org.eclipse.cdt.launch.PROJECT_BUILD_CONFIG_ID_ATTR" value=""/>
+<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
+<listEntry value="/Project4-CUDA-Rasterizer"/>
+</listAttribute>
+<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
+<listEntry value="4"/>
+</listAttribute>
+<stringAttribute key="process_factory_id" value="org.eclipse.cdt.dsf.gdb.GdbProcessFactory"/>
+</launchConfiguration>
diff --git a/external/lib/linux/libglfw3.a b/external/lib/linux/libglfw3.a
index da7ab6c..17b4438 100644
Binary files a/external/lib/linux/libglfw3.a and b/external/lib/linux/libglfw3.a differ
diff --git a/img/box-demo.gif b/img/box-demo.gif
new file mode 100644
index 0000000..8041c91
Binary files /dev/null and b/img/box-demo.gif differ
diff --git a/img/culling-graph.png b/img/culling-graph.png
new file mode 100644
index 0000000..05c5bf0
Binary files /dev/null and b/img/culling-graph.png differ
diff --git a/img/draw-mode-graph.png b/img/draw-mode-graph.png
new file mode 100644
index 0000000..44544ce
Binary files /dev/null and b/img/draw-mode-graph.png differ
diff --git a/img/intro.png b/img/intro.png
new file mode 100644
index 0000000..3aaf79b
Binary files /dev/null and b/img/intro.png differ
diff --git a/img/lambert-comparison.png b/img/lambert-comparison.png
new file mode 100644
index 0000000..717fc82
Binary files /dev/null and b/img/lambert-comparison.png differ
diff --git a/img/lines-cow.png b/img/lines-cow.png
new file mode 100644
index 0000000..e480908
Binary files /dev/null and b/img/lines-cow.png differ
diff --git a/img/points-cow.png b/img/points-cow.png
new file mode 100644
index 0000000..f88467a
Binary files /dev/null and b/img/points-cow.png differ
diff --git a/img/ssaa-comparison.png b/img/ssaa-comparison.png
new file mode 100644
index 0000000..777f989
Binary files /dev/null and b/img/ssaa-comparison.png differ
diff --git a/img/ssaa-graph.png b/img/ssaa-graph.png
new file mode 100644
index 0000000..bb3c390
Binary files /dev/null and b/img/ssaa-graph.png differ
diff --git a/img/ssaa-pipelines-graph.png b/img/ssaa-pipelines-graph.png
new file mode 100644
index 0000000..ef6143e
Binary files /dev/null and b/img/ssaa-pipelines-graph.png differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a57f69f..d9247c3 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -6,5 +6,5 @@ set(SOURCE_FILES
 
 cuda_add_library(src
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_60
     )
diff --git a/src/rasterize.cu b/src/rasterize.cu
index 1262a09..2625539 100644
--- a/src/rasterize.cu
+++ b/src/rasterize.cu
@@ -2,8 +2,8 @@
  * @file      rasterize.cu
  * @brief     CUDA-accelerated rasterization pipeline.
  * @authors   Skeleton code: Yining Karl Li, Kai Ninomiya, Shuai Shao (Shrek)
- * @date      2012-2016
- * @copyright University of Pennsylvania & STUDENT
+ * @date      2012-2018
+ * @copyright University of Pennsylvania & Edward Atter
  */
 
 #include <cmath>
@@ -18,83 +18,97 @@
 #include <glm/gtc/quaternion.hpp>
 #include <glm/gtc/matrix_transform.hpp>
 
+#include <math.h>
+
+#define MODE_TRIANGLE	0
+#define MODE_POINT		1
+#define MODE_LINE		2
+
+#define OPTION_ENABLE_LAMBERT			0
+#define OPTION_ENABLE_SSAA				0
+#define OPTION_SSAA_GRID_SIZE 			2
+#define OPTION_ENABLE_BACK_FACE_CULLING	0
+#define OPTION_SELECT_MODE				MODE_TRIANGLE
+
+#define SSAA_GRID_AREA				OPTION_SSAA_GRID_SIZE * OPTION_SSAA_GRID_SIZE
+
 namespace {
 
-	typedef unsigned short VertexIndex;
-	typedef glm::vec3 VertexAttributePosition;
-	typedef glm::vec3 VertexAttributeNormal;
-	typedef glm::vec2 VertexAttributeTexcoord;
-	typedef unsigned char TextureData;
-
-	typedef unsigned char BufferByte;
-
-	enum PrimitiveType{
-		Point = 1,
-		Line = 2,
-		Triangle = 3
-	};
-
-	struct VertexOut {
-		glm::vec4 pos;
-
-		// TODO: add new attributes to your VertexOut
-		// The attributes listed below might be useful, 
-		// but always feel free to modify on your own
-
-		 glm::vec3 eyePos;	// eye space position used for shading
-		 glm::vec3 eyeNor;	// eye space normal used for shading, cuz normal will go wrong after perspective transformation
-		// glm::vec3 col;
-		 glm::vec2 texcoord0;
-		 TextureData* dev_diffuseTex = NULL;
-		// int texWidth, texHeight;
-		// ...
-	};
-
-	struct Primitive {
-		PrimitiveType primitiveType = Triangle;	// C++ 11 init
-		VertexOut v[3];
-	};
-
-	struct Fragment {
-		glm::vec3 color;
-
-		// TODO: add new attributes to your Fragment
-		// The attributes listed below might be useful, 
-		// but always feel free to modify on your own
-
-		// glm::vec3 eyePos;	// eye space position used for shading
-		// glm::vec3 eyeNor;
-		// VertexAttributeTexcoord texcoord0;
-		// TextureData* dev_diffuseTex;
-		// ...
-	};
-
-	struct PrimitiveDevBufPointers {
-		int primitiveMode;	//from tinygltfloader macro
-		PrimitiveType primitiveType;
-		int numPrimitives;
-		int numIndices;
-		int numVertices;
-
-		// Vertex In, const after loaded
-		VertexIndex* dev_indices;
-		VertexAttributePosition* dev_position;
-		VertexAttributeNormal* dev_normal;
-		VertexAttributeTexcoord* dev_texcoord0;
-
-		// Materials, add more attributes when needed
-		TextureData* dev_diffuseTex;
-		int diffuseTexWidth;
-		int diffuseTexHeight;
-		// TextureData* dev_specularTex;
-		// TextureData* dev_normalTex;
-		// ...
-
-		// Vertex Out, vertex used for rasterization, this is changing every frame
-		VertexOut* dev_verticesOut;
-
-		// TODO: add more attributes when needed
-	};
+typedef unsigned short VertexIndex;
+typedef glm::vec3 VertexAttributePosition;
+typedef glm::vec3 VertexAttributeNormal;
+typedef glm::vec2 VertexAttributeTexcoord;
+typedef unsigned char TextureData;
+
+typedef unsigned char BufferByte;
+
+enum PrimitiveType{
+	Point = 1,
+	Line = 2,
+	Triangle = 3
+};
+
+struct VertexOut {
+	glm::vec4 pos;
+
+	// TODO: add new attributes to your VertexOut
+	// The attributes listed below might be useful,
+	// but always feel free to modify on your own
+
+	glm::vec3 eyePos;	// eye space position used for shading
+	glm::vec3 eyeNor;	// eye space normal used for shading, cuz normal will go wrong after perspective transformation
+	// glm::vec3 col;
+	glm::vec2 texcoord0;
+	TextureData* dev_diffuseTex = NULL;
+	// int texWidth, texHeight;
+	// ...
+};
+
+struct Primitive {
+	PrimitiveType primitiveType = Triangle;	// C++ 11 init
+	VertexOut v[3];
+};
+
+struct Fragment {
+	glm::vec3 color;
+
+	// TODO: add new attributes to your Fragment
+	// The attributes listed below might be useful,
+	// but always feel free to modify on your own
+
+	glm::vec3 eyePos;	// eye space position used for shading
+	glm::vec3 eyeNor;
+	// VertexAttributeTexcoord texcoord0;
+	// TextureData* dev_diffuseTex;
+	// ...
+};
+
+struct PrimitiveDevBufPointers {
+	int primitiveMode;	//from tinygltfloader macro
+	PrimitiveType primitiveType;
+	int numPrimitives;
+	int numIndices;
+	int numVertices;
+
+	// Vertex In, const after loaded
+	VertexIndex* dev_indices;
+	VertexAttributePosition* dev_position;
+	VertexAttributeNormal* dev_normal;
+	VertexAttributeTexcoord* dev_texcoord0;
+
+	// Materials, add more attributes when needed
+	TextureData* dev_diffuseTex;
+	int diffuseTexWidth;
+	int diffuseTexHeight;
+	// TextureData* dev_specularTex;
+	// TextureData* dev_normalTex;
+	// ...
+
+	// Vertex Out, vertex used for rasterization, this is changing every frame
+	VertexOut* dev_verticesOut;
+
+	// TODO: add more attributes when needed
+};
 
 }
 
@@ -103,6 +117,8 @@ static std::map<std::string, std::vector<PrimitiveDevBufPointers>> mesh2Primitiv
 
 static int width = 0;
 static int height = 0;
+static int trueWidth = 0;
+static int trueHeight = 0;
 
 static int totalNumPrimitives = 0;
 static Primitive *dev_primitives = NULL;
@@ -111,61 +127,164 @@ static glm::vec3 *dev_framebuffer = NULL;
 
 static int * dev_depth = NULL;	// you might need this buffer when doing depth test
 
+//Additional globals
+static int * dev_mutex = NULL; //int []
+
+// Generates a random float between A and B
+// From https://stackoverflow.com/a/24537113/3421536
+// See also: https://stackoverflow.com/a/25034092/3421536
+__device__
+int generateRandomInt(int A, int B, float randu_f) {
+	//float randu_f = curand_uniform(state);
+	randu_f *= (B-A+0.999999); // You should not use (B-A+1)*
+	randu_f += A;
+	int randu_int = __float2int_rz(randu_f);
+	//printf("RAND: %i <--%f \n", randu_int, randu_f);
+	if (randu_int > B || randu_int < A) {
+		printf("WARN: generateRandomInt out of bounds! %i -> [%i, %i]\n", randu_int, A, B);
+	}
+	return randu_int;
+}
+
+/**
+ * From https://github.com/CIS565-Fall-2018/Project3-CUDA-Path-Tracer
+ * Handy-dandy hash function that provides seeds for random number generation.
+ */
+__host__ __device__ inline unsigned int utilhash(unsigned int a) {
+	a = (a + 0x7ed55d16) + (a << 12);
+	a = (a ^ 0xc761c23c) ^ (a >> 19);
+	a = (a + 0x165667b1) + (a << 5);
+	a = (a + 0xd3a2646c) ^ (a << 9);
+	a = (a + 0xfd7046c5) + (a << 3);
+	a = (a ^ 0xb55a4f09) ^ (a >> 16);
+	return a;
+}
+
+// From https://github.com/CIS565-Fall-2018/Project3-CUDA-Path-Tracer
+__host__ __device__
+thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) {
+	int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index);
+	return thrust::default_random_engine(h);
+}
+
 /**
  * Kernel that writes the image to the OpenGL PBO directly.
  */
 __global__ 
-void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * w);
-
-    if (x < w && y < h) {
-        glm::vec3 color;
-        color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0;
-        color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0;
-        color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0;
-        // Each thread writes one pixel location in the texture (textel)
-        pbo[index].w = 0;
-        pbo[index].x = color.x;
-        pbo[index].y = color.y;
-        pbo[index].z = color.z;
-    }
+void sendImageToPBO(uchar4 *pbo, int w, int h, int trueWidth, int trueHeight, glm::vec3 *image) {
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int index = x + (y * trueWidth);
+
+	if (x >= trueWidth || y >= trueHeight) { return; }
+
+	glm::vec3 color;
+	color.x = 0;
+	color.y = 0;
+	color.z = 0;
+#if OPTION_ENABLE_SSAA
+	//Random SSAA
+	thrust::default_random_engine rng =
+			makeSeededRandomEngine(0, index, 1);
+	thrust::uniform_real_distribution<float> u01(0, 1);
+
+	//Possible samples
+	glm::vec3 samples[SSAA_GRID_AREA];
+	int i = 0;
+	for (int xOffset = 0; xOffset < OPTION_SSAA_GRID_SIZE; xOffset ++) {
+		for (int yOffset = 0; yOffset < OPTION_SSAA_GRID_SIZE; yOffset ++) {
+			int xIdx = xOffset + x * OPTION_SSAA_GRID_SIZE;
+			int yIdx = (yOffset + y * OPTION_SSAA_GRID_SIZE) * w;
+			int imageColorIdx = xIdx + yIdx;
+			samples[i] = image[imageColorIdx];
+			i++;
+		}
+	}
+
+	// Generate random samples and add randomly selected pixels
+	for (int i = 0; i < SSAA_GRID_AREA; i++){
+		int randIdx = generateRandomInt(0, SSAA_GRID_AREA - 1, u01(rng));
+		color.x += glm::clamp(samples[randIdx].x, 0.0f, 1.0f) * 255.0;
+		color.y += glm::clamp(samples[randIdx].y, 0.0f, 1.0f) * 255.0;
+		color.z += glm::clamp(samples[randIdx].z, 0.0f, 1.0f) * 255.0;
+	}
+
+	//Take the average
+	color /= (float)SSAA_GRID_AREA;
+
+#else
+	color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0;
+	color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0;
+	color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0;
+#endif
+	// Each thread writes one pixel location in the texture (textel)
+	//if(color.x != 0 || color.y != 0 || color.z != 0) printf("COLOR: %f, %f, %f\n", color.x, color.y, color.z);
+	pbo[index].w = 0;
+	pbo[index].x = color.x;
+	pbo[index].y = color.y;
+	pbo[index].z = color.z;
 }
 
 /** 
-* Writes fragment colors to the framebuffer
-*/
+ * Writes fragment colors to the framebuffer
+ */
 __global__
-void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * w);
-
-    if (x < w && y < h) {
-        framebuffer[index] = fragmentBuffer[index].color;
+void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *frameBuffer) {
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int index = x + (y * w);
 
+	if (x < w && y < h) {
+		frameBuffer[index] = fragmentBuffer[index].color;
+#if OPTION_SELECT_MODE == MODE_TRIANGLE
 		// TODO: add your fragment shader code here
-
-    }
+#if OPTION_ENABLE_LAMBERT
+		// Adapted from https://www.opengl.org/sdk/docs/tutorials/ClockworkCoders/lighting.php
+		glm::vec3 v = fragmentBuffer[index].eyePos;
+		glm::vec3 N = fragmentBuffer[index].eyeNor;
+		glm::vec3 lightSource(1, 1, 1);
+		glm::vec3 L = glm::normalize(lightSource - v);
+		float Idiff = glm::dot(L, N);
+		Idiff = glm::clamp(Idiff, 0.0f, 1.0f);
+		frameBuffer[index] = Idiff * fragmentBuffer[index].color;
+#endif
+//End MODE_TRIANGLE
+#endif
+	}
 }
 
 /**
  * Called once at the beginning of the program to allocate memory.
  */
 void rasterizeInit(int w, int h) {
-    width = w;
-    height = h;
+	width = w;
+	height = h;
+	trueWidth = width;
+	trueHeight = height;
+
+#if OPTION_ENABLE_SSAA
+	width *= OPTION_SSAA_GRID_SIZE;
+	height *= OPTION_SSAA_GRID_SIZE;
+#endif
+
 	cudaFree(dev_fragmentBuffer);
 	cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment));
 	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
-    cudaFree(dev_framebuffer);
-    cudaMalloc(&dev_framebuffer,   width * height * sizeof(glm::vec3));
-    cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
-    
+	cudaFree(dev_framebuffer);
+	cudaMalloc(&dev_framebuffer,   width * height * sizeof(glm::vec3));
+	cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
+
 	cudaFree(dev_depth);
 	cudaMalloc(&dev_depth, width * height * sizeof(int));
 
+	// Additional vars
+	// Why free before malloc?
+	cudaFree(dev_mutex);
+	const int devMutexSize = sizeof(int) * width * height;
+	cudaMalloc(&dev_mutex, devMutexSize);
+	// Initialize empty, since cuda does not have calloc
+	cudaMemset(dev_mutex, 0, devMutexSize);
+
 	checkCUDAError("rasterizeInit");
 }
 
@@ -184,12 +303,12 @@ void initDepth(int w, int h, int * depth)
 
 
 /**
-* kern function with support for stride to sometimes replace cudaMemcpy
-* One thread is responsible for copying one component
-*/
+ * kern function with support for stride to sometimes replace cudaMemcpy
+ * One thread is responsible for copying one component
+ */
 __global__ 
 void _deviceBufferCopy(int N, BufferByte* dev_dst, const BufferByte* dev_src, int n, int byteStride, int byteOffset, int componentTypeByteSize) {
-	
+
 	// Attribute (vec3 position)
 	// component (3 * float)
 	// byte (4 * byte)
@@ -202,29 +321,29 @@ void _deviceBufferCopy(int N, BufferByte* dev_dst, const BufferByte* dev_src, in
 		int offset = i - count * n;	// which component of the attribute
 
 		for (int j = 0; j < componentTypeByteSize; j++) {
-			
+
 			dev_dst[count * componentTypeByteSize * n 
-				+ offset * componentTypeByteSize 
-				+ j]
+			        + offset * componentTypeByteSize
+			        + j]
 
-				= 
+			        =
 
-			dev_src[byteOffset 
-				+ count * (byteStride == 0 ? componentTypeByteSize * n : byteStride) 
-				+ offset * componentTypeByteSize 
-				+ j];
+			        		dev_src[byteOffset
+			        		        + count * (byteStride == 0 ? componentTypeByteSize * n : byteStride)
+			        		        + offset * componentTypeByteSize
+			        		        + j];
 		}
 	}
-	
+
 
 }
 
 __global__
 void _nodeMatrixTransform(
-	int numVertices,
-	VertexAttributePosition* position,
-	VertexAttributeNormal* normal,
-	glm::mat4 MV, glm::mat3 MV_normal) {
+		int numVertices,
+		VertexAttributePosition* position,
+		VertexAttributeNormal* normal,
+		glm::mat4 MV, glm::mat3 MV_normal) {
 
 	// vertex id
 	int vid = (blockIdx.x * blockDim.x) + threadIdx.x;
@@ -235,7 +354,7 @@ void _nodeMatrixTransform(
 }
 
 glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) {
-	
+
 	glm::mat4 curMatrix(1.0);
 
 	const std::vector<double> &m = n.matrix;
@@ -276,11 +395,11 @@ glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) {
 }
 
 void traverseNode (
-	std::map<std::string, glm::mat4> & n2m,
-	const tinygltf::Scene & scene,
-	const std::string & nodeString,
-	const glm::mat4 & parentMatrix
-	) 
+		std::map<std::string, glm::mat4> & n2m,
+		const tinygltf::Scene & scene,
+		const std::string & nodeString,
+		const glm::mat4 & parentMatrix
+)
 {
 	const tinygltf::Node & n = scene.nodes.at(nodeString);
 	glm::mat4 M = parentMatrix * getMatrixFromNodeMatrixVector(n);
@@ -303,9 +422,9 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 	// 1. copy all `bufferViews` to device memory
 	{
 		std::map<std::string, tinygltf::BufferView>::const_iterator it(
-			scene.bufferViews.begin());
+				scene.bufferViews.begin());
 		std::map<std::string, tinygltf::BufferView>::const_iterator itEnd(
-			scene.bufferViews.end());
+				scene.bufferViews.end());
 
 		for (; it != itEnd; it++) {
 			const std::string key = it->first;
@@ -396,13 +515,13 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 					dim3 numBlocks((numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
 					cudaMalloc(&dev_indices, byteLength);
 					_deviceBufferCopy << <numBlocks, numThreadsPerBlock >> > (
-						numIndices,
-						(BufferByte*)dev_indices,
-						dev_bufferView,
-						n,
-						indexAccessor.byteStride,
-						indexAccessor.byteOffset,
-						componentTypeByteSize);
+							numIndices,
+							(BufferByte*)dev_indices,
+							dev_bufferView,
+							n,
+							indexAccessor.byteStride,
+							indexAccessor.byteOffset,
+							componentTypeByteSize);
 
 
 					checkCUDAError("Set Index Buffer");
@@ -499,13 +618,13 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 						cudaMalloc(dev_attribute, byteLength);
 
 						_deviceBufferCopy << <numBlocks, numThreadsPerBlock >> > (
-							n * numVertices,
-							*dev_attribute,
-							dev_bufferView,
-							n,
-							accessor.byteStride,
-							accessor.byteOffset,
-							componentTypeByteSize);
+								n * numVertices,
+								*dev_attribute,
+								dev_bufferView,
+								n,
+								accessor.byteStride,
+								accessor.byteOffset,
+								componentTypeByteSize);
 
 						std::string msg = "Set Attribute Buffer: " + it->first;
 						checkCUDAError(msg.c_str());
@@ -537,7 +656,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 									size_t s = image.image.size() * sizeof(TextureData);
 									cudaMalloc(&dev_diffuseTex, s);
 									cudaMemcpy(dev_diffuseTex, &image.image.at(0), s, cudaMemcpyHostToDevice);
-									
+
 									diffuseTexWidth = image.width;
 									diffuseTexHeight = image.height;
 
@@ -554,14 +673,14 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 
 					// ---------Node hierarchy transform--------
 					cudaDeviceSynchronize();
-					
+
 					dim3 numBlocksNodeTransform((numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
 					_nodeMatrixTransform << <numBlocksNodeTransform, numThreadsPerBlock >> > (
-						numVertices,
-						dev_position,
-						dev_normal,
-						matrix,
-						matrixNormal);
+							numVertices,
+							dev_position,
+							dev_normal,
+							matrix,
+							matrixNormal);
 
 					checkCUDAError("Node hierarchy transformation");
 
@@ -595,21 +714,21 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 		} // for each node
 
 	}
-	
+
 
 	// 3. Malloc for dev_primitives
 	{
 		cudaMalloc(&dev_primitives, totalNumPrimitives * sizeof(Primitive));
 	}
-	
+
 
 	// Finally, cudaFree raw dev_bufferViews
 	{
 
 		std::map<std::string, BufferByte*>::const_iterator it(bufferViewDevPointers.begin());
 		std::map<std::string, BufferByte*>::const_iterator itEnd(bufferViewDevPointers.end());
-			
-			//bufferViewDevPointers
+
+		//bufferViewDevPointers
 
 		for (; it != itEnd; it++) {
 			cudaFree(it->second);
@@ -625,10 +744,10 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 
 __global__ 
 void _vertexTransformAndAssembly(
-	int numVertices, 
-	PrimitiveDevBufPointers primitive, 
-	glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal, 
-	int width, int height) {
+		int numVertices,
+		PrimitiveDevBufPointers primitive,
+		glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal,
+		int width, int height) {
 
 	// vertex id
 	int vid = (blockIdx.x * blockDim.x) + threadIdx.x;
@@ -638,10 +757,25 @@ void _vertexTransformAndAssembly(
 		// Multiply the MVP matrix for each vertex position, this will transform everything into clipping space
 		// Then divide the pos by its w element to transform into NDC space
 		// Finally transform x and y to viewport space
+		glm::vec4 vPosition = glm::vec4(primitive.dev_position[vid], 1.0f);
+		glm::vec3 vNormal = primitive.dev_normal[vid];
+		// Order of multiplication is important here!
+		glm::vec4 clipPosition = MVP * vPosition;
+
+		clipPosition = clipPosition / clipPosition.w;
+
+		clipPosition.x = ( width * (clipPosition.x / clipPosition.w + 1.0f) / 2);
+		clipPosition.y = ( height * (1 - (clipPosition.y / clipPosition.w)) / 2);
+
+		glm::vec3 eyePos = glm::vec3(vPosition * MV);
+		glm::vec3 eyeNor = glm::normalize(vNormal * MV_normal);
 
 		// TODO: Apply vertex assembly here
 		// Assemble all attribute arraies into the primitive array
-		
+		primitive.dev_verticesOut[vid].eyePos = eyePos;
+		primitive.dev_verticesOut[vid].eyeNor = eyeNor;
+		primitive.dev_verticesOut[vid].pos = clipPosition;
+
 	}
 }
 
@@ -657,32 +791,171 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_
 
 	if (iid < numIndices) {
 
-		// TODO: uncomment the following code for a start
 		// This is primitive assembly for triangles
+		int pid;	// id for cur primitives vector
+		if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
+			pid = iid / (int)primitive.primitiveType;
+			dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
+			                                            = primitive.dev_verticesOut[primitive.dev_indices[iid]];
+		}
+	}
+}
 
-		//int pid;	// id for cur primitives vector
-		//if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
-		//	pid = iid / (int)primitive.primitiveType;
-		//	dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
-		//		= primitive.dev_verticesOut[primitive.dev_indices[iid]];
-		//}
+__device__
+void xyz(glm::vec4 * v4, glm::vec3 * v3) {
+	v3->x = v4->x;
+	v3->y = v4->y;
+	v3->z = v4->z;
+}
 
+__global__
+void kernRasterizePrimitive (
+		int N,
+		Primitive * dev_primitives, Fragment * dev_fragmentBuffer,
+		int * dev_depth, int * dev_mutex, int width, int height) {
+	int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+	if (idx >= N) { return; }
+
+#if OPTION_ENABLE_BACK_FACE_CULLING
+	//https://en.wikipedia.org/wiki/Back-face_culling
+	//Convert from vec4 for vec3
+	glm::vec3 p0(0);
+	glm::vec3 p1(0);
+	glm::vec3 p2(0);
+	xyz(&dev_primitives[idx].v[0].pos, &p0);
+	xyz(&dev_primitives[idx].v[1].pos, &p1);
+	xyz(&dev_primitives[idx].v[2].pos, &p2);
+
+	//Back culling calculation
+	glm::vec3 _N = glm::cross(p1 - p0, p2 - p0);
+	float backface_check = glm::dot(p0 - dev_primitives[idx].v[0].eyePos, _N);
+	if (backface_check > 0) {
+		return;
+	}
+#endif
+
+	//Triangle, defined by three points, used to get AABB
+	glm::vec3 points[3];
+	points[0] = glm::vec3(dev_primitives[idx].v[0].pos);
+	points[1] = glm::vec3(dev_primitives[idx].v[1].pos);
+	points[2] = glm::vec3(dev_primitives[idx].v[2].pos);
+	AABB aabb = getAABBForTriangle(points);
+
+	//Get bounds, max of screen, img
+	// x = width; y = height;
+	int widthStart = max(0, (int) aabb.min.x);
+	int widthEnd = min(width, (int) aabb.max.x);
+	int heightStart = max(0, (int) aabb.min.y);
+	int heightEnd = min(height, (int) aabb.max.y);
+
+#if OPTION_SELECT_MODE == MODE_TRIANGLE
+	// Process each visible pixel
+	for (int h = heightStart; h <= heightEnd; h++) {
+		for (int w = widthStart; w <= widthEnd; w++) {
+			int fragmentIdx = width * h + w;
+			glm::vec3 barycentricCoordinate =
+					calculateBarycentricCoordinate(points, glm::vec2(w, h));
+			if (isBarycentricCoordInBounds(barycentricCoordinate)) {
+				// Wait for mutex lock
+				int isSet;
+				do {
+					isSet = (atomicCAS(&dev_mutex[idx], 0, 1));
+					if(!isSet) { continue; }
+
+					float depth = getZAtCoordinate(barycentricCoordinate, points) * INT_MAX * -1;
+
+					if (depth < dev_depth[fragmentIdx]) {
+						//Update the fragment
+						dev_depth[fragmentIdx] = depth;
+						dev_fragmentBuffer[fragmentIdx].eyePos =
+								dev_primitives[idx].v[0].eyePos * barycentricCoordinate[0] +
+								dev_primitives[idx].v[1].eyePos * barycentricCoordinate[1] +
+								dev_primitives[idx].v[2].eyePos * barycentricCoordinate[2];
+						dev_fragmentBuffer[fragmentIdx].eyeNor =
+								dev_primitives[idx].v[0].eyeNor * barycentricCoordinate[0] +
+								dev_primitives[idx].v[1].eyeNor * barycentricCoordinate[1] +
+								dev_primitives[idx].v[2].eyeNor * barycentricCoordinate[2];
+						dev_fragmentBuffer[fragmentIdx].color = dev_fragmentBuffer[fragmentIdx].eyeNor;
+					}
+				} while(!isSet);
+				dev_mutex[idx] = 0;
+			}
+		}
+	}
+#elif OPTION_SELECT_MODE == MODE_POINT
+	//Iterate thru triangle, generating point at each vertex
+	for (int i = 0; i < 3; i++) {
+		int x = dev_primitives[idx].v[i].pos.x;
+		int y = dev_primitives[idx].v[i].pos.y;
+		int pointIdx = y * width + x;
+
+		//Set to static (white) color
+		dev_fragmentBuffer[pointIdx].color = glm::vec3(1, 1, 1);
+	}
+#elif OPTION_SELECT_MODE == MODE_LINE
+	for (int i = 0; i < 3; i++) {
+		int j = (i + 1) % 2;
+		//Use vecs not ints so we can calculate diffY
+		glm::vec4 origin = dev_primitives[idx].v[i].pos;
+		glm::vec4 dest = dev_primitives[idx].v[j].pos;
+		//Flip to ensure origin < dest
+		if (dest.x < origin.x) {
+			glm::vec4 tmp = origin;
+			origin = dest;
+			dest = tmp;
+		}
+
+		//Calculate travel distance
+		int diffX = dest.x - origin.x;
+		//Prevent divide by 0
+		if (diffX == 0) {
+			diffX = 1;
+		}
+		int diffY = dest.y - origin.y;
+
+		int last = origin.y;
+		for (int x = origin.x; x <= dest.x; x++) {
+			int y = diffY * (x - origin.x) / diffX + origin.y;
+
+			//Same flipping as before, this time sorted by y
+			int originY = y;
+			int destY = last;
+			if (destY < originY) {
+				int tmpy = originY;
+				originY = destY;
+				destY = tmpy;
+			}
 
-		// TODO: other primitive types (point, line)
+			for (int y2 = originY; y2 <= destY; y2++) {
+				// Prevent memory access exception, OOB
+				if (x > widthEnd || x < widthStart || y2 > heightEnd || y2 < heightStart) {
+					continue;
+				}
+
+				int pointIdx = y2 * width + x;
+				//Set to static (white) color
+				dev_fragmentBuffer[pointIdx].color = glm::vec3(1, 1, 1);
+			}
+			last = y;
+		}
 	}
-	
+#else
+	printf("ERROR: Invalid mode selected\n");
+#endif
 }
 
 
-
 /**
  * Perform rasterization.
  */
 void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) {
-    int sideLength2d = 8;
-    dim3 blockSize2d(sideLength2d, sideLength2d);
-    dim3 blockCount2d((width  - 1) / blockSize2d.x + 1,
-		(height - 1) / blockSize2d.y + 1);
+	int sideLength2d = 8;
+	dim3 blockSize2d(sideLength2d, sideLength2d);
+	dim3 blockCount2d((width  - 1) / blockSize2d.x + 1,
+			(height - 1) / blockSize2d.y + 1);
+	dim3 trueBlockCount2d((trueWidth  - 1) / blockSize2d.x + 1,
+			(trueHeight - 1) / blockSize2d.y + 1);
 
 	// Execute your rasterization pipeline here
 	// (See README for rasterization pipeline outline.)
@@ -706,10 +979,10 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
 				checkCUDAError("Vertex Processing");
 				cudaDeviceSynchronize();
 				_primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> >
-					(p->numIndices, 
-					curPrimitiveBeginId, 
-					dev_primitives, 
-					*p);
+				(p->numIndices,
+						curPrimitiveBeginId,
+						dev_primitives,
+						*p);
 				checkCUDAError("Primitive Assembly");
 
 				curPrimitiveBeginId += p->numPrimitives;
@@ -718,20 +991,27 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
 
 		checkCUDAError("Vertex Processing and Primitive Assembly");
 	}
-	
+
 	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
 	initDepth << <blockCount2d, blockSize2d >> >(width, height, dev_depth);
-	
-	// TODO: rasterize
 
-
-
-    // Copy depthbuffer colors into framebuffer
+	// rasterize
+	// Copied from code above, why 128?
+	dim3 numThreadsPerBlock(128, 1, 1);
+	int primitiveBlockCount = (numThreadsPerBlock.x + totalNumPrimitives - 1) / numThreadsPerBlock.x;
+	// Launch primitive kernel
+	kernRasterizePrimitive <<< primitiveBlockCount, numThreadsPerBlock >>>(
+			totalNumPrimitives,
+			dev_primitives, dev_fragmentBuffer, dev_depth, dev_mutex,
+			width, height);
+
+	// Copy depthbuffer colors into framebuffer
 	render << <blockCount2d, blockSize2d >> >(width, height, dev_fragmentBuffer, dev_framebuffer);
 	checkCUDAError("fragment shader");
-    // Copy framebuffer into OpenGL buffer for OpenGL previewing
-    sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width, height, dev_framebuffer);
-    checkCUDAError("copy render result to pbo");
+	// Copy framebuffer into OpenGL buffer for OpenGL previewing
+	//printf("TW, TH = %i, %i || %i, %i", trueWidth, trueHeight, width, height);
+	sendImageToPBO<<<trueBlockCount2d, blockSize2d>>>(pbo, width, height, trueWidth, trueHeight, dev_framebuffer);
+	checkCUDAError("copy render result to pbo");
 }
 
 /**
@@ -739,7 +1019,7 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
  */
 void rasterizeFree() {
 
-    // deconstruct primitives attribute/indices device buffer
+	// deconstruct primitives attribute/indices device buffer
 
 	auto it(mesh2PrimitivesMap.begin());
 	auto itEnd(mesh2PrimitivesMap.end());
@@ -753,24 +1033,27 @@ void rasterizeFree() {
 
 			cudaFree(p->dev_verticesOut);
 
-			
+
 			//TODO: release other attributes and materials
 		}
 	}
 
 	////////////
 
-    cudaFree(dev_primitives);
-    dev_primitives = NULL;
+	cudaFree(dev_primitives);
+	dev_primitives = NULL;
 
 	cudaFree(dev_fragmentBuffer);
 	dev_fragmentBuffer = NULL;
 
-    cudaFree(dev_framebuffer);
-    dev_framebuffer = NULL;
+	cudaFree(dev_framebuffer);
+	dev_framebuffer = NULL;
 
 	cudaFree(dev_depth);
 	dev_depth = NULL;
 
-    checkCUDAError("rasterize Free");
+	cudaFree(dev_mutex);
+	dev_mutex = NULL;
+
+	checkCUDAError("rasterize Free");
 }