diff --git a/2014-10-27-1018-28.mp4 b/2014-10-27-1018-28.mp4
new file mode 100644
index 0000000..6a1904e
Binary files /dev/null and b/2014-10-27-1018-28.mp4 differ
diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
new file mode 100644
index 0000000..ae0896a
--- /dev/null
+++ b/INSTRUCTIONS.md
@@ -0,0 +1,184 @@
+-------------------------------------------------------------------------------
+CIS565: Project 4: CUDA Rasterizer
+-------------------------------------------------------------------------------
+Fall 2014
+-------------------------------------------------------------------------------
+Due Monday 10/27/2014 @ 12 PM
+-------------------------------------------------------------------------------
+
+-------------------------------------------------------------------------------
+NOTE:
+-------------------------------------------------------------------------------
+This project requires an NVIDIA graphics card with CUDA capability! Any card with CUDA compute capability 1.1 or higher will work fine for this project. For a full list of CUDA capable cards and their compute capability, please consult: http://developer.nvidia.com/cuda/cuda-gpus. If you do not have an NVIDIA graphics card in the machine you are working on, feel free to use any machine in the SIG Lab or in Moore100 labs. All machines in the SIG Lab and Moore100 are equipped with CUDA capable NVIDIA graphics cards. If this too proves to be a problem, please contact Patrick or Karl as soon as possible.
+
+-------------------------------------------------------------------------------
+INTRODUCTION:
+-------------------------------------------------------------------------------
+In this project, you will implement a simplified CUDA based implementation of a standard rasterized graphics pipeline, similar to the OpenGL pipeline. In this project, you will implement vertex shading, primitive assembly, perspective transformation, rasterization, fragment shading, and write the resulting fragments to a framebuffer. More information about the rasterized graphics pipeline can be found in the class slides and in your notes from CIS560.
+
+The basecode provided includes an OBJ loader and much of the mundane I/O and bookkeeping code. The basecode also includes some functions that you may find useful, described below. The core rasterization pipeline is left for you to implement.
+
+You MAY NOT use ANY raycasting/raytracing AT ALL in this project, EXCEPT in the fragment shader step. One of the purposes of this project is to see how a rasterization pipeline can generate graphics WITHOUT the need for raycasting! Raycasting may only be used in the fragment shader effect for interesting shading results, but is absolutely not allowed in any other stages of the pipeline.
+
+Also, you MAY NOT use OpenGL ANYWHERE in this project, aside from the given OpenGL code for drawing Pixel Buffer Objects to the screen. Use of OpenGL for any pipeline stage instead of your own custom implementation will result in an incomplete project.
+
+Finally, note that while this basecode is meant to serve as a strong starting point for a CUDA rasterizer, you are not required to use this basecode if you wish, and you may also change any part of the basecode specification as you please, so long as the final rendered result is correct.
+
+-------------------------------------------------------------------------------
+CONTENTS:
+-------------------------------------------------------------------------------
+The Project4 root directory contains the following subdirectories:
+	
+* src/ contains the source code for the project. Both the Windows Visual Studio solution and the OSX makefile reference this folder for all source; the base source code compiles on OSX and Windows without modification.
+* objs/ contains example obj test files: cow.obj, cube.obj, tri.obj.
+* renders/ contains an example render of the given example cow.obj file with a z-depth fragment shader. 
+* windows/ contains a Windows Visual Studio 2010 project and all dependencies needed for building and running on Windows 7.
+
+The Windows and OSX versions of the project build and run exactly the same way as in Project0, Project1, and Project2.
+
+-------------------------------------------------------------------------------
+REQUIREMENTS:
+-------------------------------------------------------------------------------
+In this project, you are given code for:
+
+* A library for loading/reading standard Alias/Wavefront .obj format mesh files and converting them to OpenGL style VBOs/IBOs
+* A suggested order of kernels with which to implement the graphics pipeline
+* Working code for CUDA-GL interop
+
+You will need to implement the following stages of the graphics pipeline and features:
+
+* Vertex Shading
+* Primitive Assembly with support for triangle VBOs/IBOs
+* Perspective Transformation
+* Rasterization through either a scanline or a tiled approach
+* Fragment Shading
+* A depth buffer for storing and depth testing fragments
+* Fragment to framebuffer writing
+* A simple lighting/shading scheme, such as Lambert or Blinn-Phong, implemented in the fragment shader
+
+You are also required to implement at least 3 of the following features:
+
+* Additional pipeline stages. Each one of these stages can count as 1 feature:
+   * Geometry shader
+   * Transformation feedback
+   * Back-face culling
+   * Scissor test
+   * Stencil test
+   * Blending
+
+IMPORTANT: For each of these stages implemented, you must also add a section to your README stating what the expected performance impact of that pipeline stage is, and real performance comparisons between your rasterizer with that stage and without.
+
+* Correct color interpolation between points on a primitive
+* Texture mapping WITH texture filtering and perspective correct texture coordinates
+* Support for additional primitives. Each one of these can count as HALF of a feature.
+   * Lines
+   * Line strips
+   * Triangle fans
+   * Triangle strips
+   * Points
+* Anti-aliasing
+* Order-independent translucency using a k-buffer
+* MOUSE BASED interactive camera support. Interactive camera support based only on the keyboard is not acceptable for this feature.
+
+-------------------------------------------------------------------------------
+BASE CODE TOUR:
+-------------------------------------------------------------------------------
+You will be working primarily in two files: rasterizeKernels.cu and rasterizeTools.h. Within these files, areas that you need to complete are marked with a TODO comment. Areas that are useful for, and serve as hints for, optional features are marked with TODO (Optional). Functions that are useful for reference are marked with the comment LOOK.
+
+* rasterizeKernels.cu contains the core rasterization pipeline. 
+	* A suggested sequence of kernels exists in this file, but you may choose to alter the order of this sequence or merge entire kernels if you see fit. For example, if you decide that doing so has benefits, you can choose to merge the vertex shader and primitive assembly kernels, or merge the perspective transform into another kernel. There is not necessarily a right sequence of kernels (although there are wrong sequences, such as placing fragment shading before vertex shading), and you may choose any sequence you want. Please document in your README what sequence you choose and why.
+	* The provided kernels have had their input parameters removed beyond basic inputs such as the framebuffer. You will have to decide what inputs should go into each stage of the pipeline, and what outputs there should be. 
+
+* rasterizeTools.h contains various useful tools, including a number of barycentric coordinate related functions that you may find useful in implementing scanline based rasterization...
+	* A few pre-made structs are included for you to use, such as fragment and triangle. A simple rasterizer can be implemented with these structs as is. However, as with any part of the basecode, you may choose to modify, add to, use as-is, or outright ignore them as you see fit.
+	* If you do choose to add to the fragment struct, be sure to include in your README a rationale for why. 
+
+You will also want to familiarize yourself with:
+
+* main.cpp, which contains code that transfers VBOs/CBOs/IBOs to the rasterization pipeline. Interactive camera work will also have to be implemented in this file if you choose that feature.
+* utilities.h, which serves as a kitchen-sink of useful functions
+
+-------------------------------------------------------------------------------
+SOME RESOURCES:
+-------------------------------------------------------------------------------
+The following resources may be useful for this project:
+
+* High-Performance Software Rasterization on GPUs
+	* Paper (HPG 2011): http://www.tml.tkk.fi/~samuli/publications/laine2011hpg_paper.pdf
+	* Code: http://code.google.com/p/cudaraster/ Note that looking over this code for reference with regard to the paper is fine, but we most likely will not grant any requests to actually incorporate any of this code into your project.
+	* Slides: http://bps11.idav.ucdavis.edu/talks/08-gpuSoftwareRasterLaineAndPantaleoni-BPS2011.pdf
+* The Direct3D 10 System (SIGGRAPH 2006) - for those interested in doing geometry shaders and transform feedback.
+	* http://133.11.9.3/~takeo/course/2006/media/papers/Direct3D10_siggraph2006.pdf
+* Multi-Fragment Effects on the GPU using the k-Buffer - for those who want to do a k-buffer
+	* http://www.inf.ufrgs.br/~comba/papers/2007/kbuffer_preprint.pdf
+* FreePipe: A Programmable, Parallel Rendering Architecture for Efficient Multi-Fragment Effects (I3D 2010)
+	* https://sites.google.com/site/hmcen0921/cudarasterizer
+* Writing A Software Rasterizer In Javascript:
+	* Part 1: http://simonstechblog.blogspot.com/2012/04/software-rasterizer-part-1.html
+	* Part 2: http://simonstechblog.blogspot.com/2012/04/software-rasterizer-part-2.html
+
+-------------------------------------------------------------------------------
+NOTES ON GLM:
+-------------------------------------------------------------------------------
+This project uses GLM, the GL Math library, for linear algebra. You need to know two important points on how GLM is used in this project:
+
+* In this project, indices in GLM vectors (such as vec3, vec4), are accessed via swizzling. So, instead of v[0], v.x is used, and instead of v[1], v.y is used, and so on and so forth.
+* GLM Matrix operations work fine on NVIDIA Fermi cards and later, but pre-Fermi cards do not play nice with GLM matrices. As such, in this project, GLM matrices are replaced with a custom matrix struct, called a cudaMat4, found in cudaMat4.h. A custom function for multiplying glm::vec4s and cudaMat4s is provided as multiplyMV() in intersections.h.
+
+-------------------------------------------------------------------------------
+README
+-------------------------------------------------------------------------------
+All students must replace or augment the contents of this Readme.md in a clear 
+manner with the following:
+
+* A brief description of the project and the specific features you implemented.
+* At least one screenshot of your project running.
+* A 30 second or longer video of your project running.  To create the video you
+  can use http://www.microsoft.com/expression/products/Encoder4_Overview.aspx 
+* A performance evaluation (described in detail below).
+
+-------------------------------------------------------------------------------
+PERFORMANCE EVALUATION
+-------------------------------------------------------------------------------
+The performance evaluation is where you will investigate how to make your CUDA
+programs more efficient using the skills you've learned in class. You must have
+performed at least one experiment on your code to investigate the positive or
+negative effects on performance. 
+
+We encourage you to get creative with your tweaks. Consider places in your code
+that could be considered bottlenecks and try to improve them. 
+
+Each student should provide no more than a one page summary of their
+optimizations along with tables and/or graphs to visually explain any
+performance differences.
+
+-------------------------------------------------------------------------------
+THIRD PARTY CODE POLICY
+-------------------------------------------------------------------------------
+* Use of any third-party code must be approved by asking on Piazza.  If it is approved, all students are welcome to use it.  Generally, we approve use of third-party code that is not a core part of the project.  For example, for the ray tracer, we would approve using a third-party library for loading models, but would not approve copying and pasting a CUDA function for doing refraction.
+* Third-party code must be credited in README.md.
+* Using third-party code without approval, including using another student's code, is an academic integrity violation, and will result in you receiving an F for the semester.
+
+-------------------------------------------------------------------------------
+SELF-GRADING
+-------------------------------------------------------------------------------
+* On the submission date, email your grade, on a scale of 0 to 100, to Liam, harmoli+cis565@seas.upenn.edu, with a one paragraph explanation.  Be concise and realistic.  Recall that we reserve 30 points as a sanity check to adjust your grade.  Your actual grade will be (0.7 * your grade) + (0.3 * our grade).  We hope to only use this in extreme cases when your grade does not realistically reflect your work - it is either too high or too low.  In most cases, we plan to give you the exact grade you suggest.
+* Projects are not weighted evenly, e.g., Project 0 doesn't count as much as the path tracer.  We will determine the weighting at the end of the semester based on the size of each project.
+
+---
+SUBMISSION
+---
+As with the previous project, you should fork this project and work inside of
+your fork. Upon completion, commit your finished project back to your fork, and
+make a pull request to the master repository.  You should include a README.md
+file in the root directory detailing the following
+
+* A brief description of the project and specific features you implemented
+* At least one screenshot of your project running.
+* A link to a video of your rasterizer running.
+* Instructions for building and running your project if they differ from the
+  base code.
+* A performance writeup as detailed above.
+* A list of all third-party code used.
+* This Readme file edited as described above in the README section.
+
diff --git a/README.md b/README.md
index ae0896a..db82f37 100644
--- a/README.md
+++ b/README.md
@@ -3,182 +3,83 @@ CIS565: Project 4: CUDA Rasterizer
 -------------------------------------------------------------------------------
 Fall 2014
 -------------------------------------------------------------------------------
-Due Monday 10/27/2014 @ 12 PM
+Jiatong He
 -------------------------------------------------------------------------------
+Base code by Karl Li
 
--------------------------------------------------------------------------------
-NOTE:
--------------------------------------------------------------------------------
-This project requires an NVIDIA graphics card with CUDA capability! Any card with CUDA compute capability 1.1 or higher will work fine for this project. For a full list of CUDA capable cards and their compute capability, please consult: http://developer.nvidia.com/cuda/cuda-gpus. If you do not have an NVIDIA graphics card in the machine you are working on, feel free to use any machine in the SIG Lab or in Moore100 labs. All machines in the SIG Lab and Moore100 are equipped with CUDA capable NVIDIA graphics cards. If this too proves to be a problem, please contact Patrick or Karl as soon as possible.
+![cow render](https://raw.githubusercontent.com/JivingTechnostic/Project4-Rasterizer/master/renders/2.png)
 
--------------------------------------------------------------------------------
-INTRODUCTION:
--------------------------------------------------------------------------------
-In this project, you will implement a simplified CUDA based implementation of a standard rasterized graphics pipeline, similar to the OpenGL pipeline. In this project, you will implement vertex shading, primitive assembly, perspective transformation, rasterization, fragment shading, and write the resulting fragments to a framebuffer. More information about the rasterized graphics pipeline can be found in the class slides and in your notes from CIS560.
+Implemented Features:
+---------------------
+### Vertex Shader
+Implemented a basic vertex shader that takes the vertices, and a model-view-projection matrix, and transforms each vertex into clip space.
 
-The basecode provided includes an OBJ loader and much of the mundane I/O and bookkeeping code. The basecode also includes some functions that you may find useful, described below. The core rasterization pipeline is left for you to implement.
+### Primitive Assembly for Triangles
+Assembles triangles from vertex, color, and normal buffer objects using the index buffer.  Triangles store color, normal, and position for each vertex.  The position is in clip space, but the normal is in world space.
 
-You MAY NOT use ANY raycasting/raytracing AT ALL in this project, EXCEPT in the fragment shader step. One of the purposes of this project is to see how a rasterization pipeline can generate graphics WITHOUT the need for raycasting! Raycasting may only be used in the fragment shader effect for interesting shading results, but is absolutely not allowed in any other stages of the pipeline.
+### Backface Culling and Clipping
+Simple triangle backface culling using a calculated normal from the vertices (since the stored normal is in world space), and clipping that removes triangles that are outside of the (1, 1) to (-1, -1) box.
 
-Also, you MAY NOT use OpenGL ANYWHERE in this project, aside from the given OpenGL code for drawing Pixel Buffer Objects to the screen. Use of OpenGL for any pipeline stage instead of your own custom implementation will result in an incomplete project.
+### Rasterization
+Rasterization implemented as a scanline algorithm.  This section currently takes the most time, and large triangles (in screen space) will slow down the program significantly or even crash it.  For every triangle, we begin by sorting the vertices from top to bottom.  Then, starting from the top, we render it in two steps--top to middle, and middle to bottom (note that either of these may have 0 height, if the top and middle, or middle and bottom are at the same height).
 
-Finally, note that while this basecode is meant to serve as a strong starting point for a CUDA rasterizer, you are not required to use this basecode if you wish, and you may also change any part of the basecode specification as you please, so long as the final rendered result is correct.
+#### Color & Normal Interpolation
+![color interpolation](https://raw.githubusercontent.com/JivingTechnostic/Project4-Rasterizer/master/renders/3.png)
 
--------------------------------------------------------------------------------
-CONTENTS:
--------------------------------------------------------------------------------
-The Project4 root directory contains the following subdirectories:
-	
-* src/ contains the source code for the project. Both the Windows Visual Studio solution and the OSX makefile reference this folder for all source; the base source code compiles on OSX and Windows without modification.
-* objs/ contains example obj test files: cow.obj, cube.obj, tri.obj.
-* renders/ contains an example render of the given example cow.obj file with a z-depth fragment shader. 
-* windows/ contains a Windows Visual Studio 2010 project and all dependencies needed for building and running on Windows 7.
+I use double linear interpolation to calculate the appropriate depth, color, and normal for each fragment.  I did not use the provided code for barycentric coordinates.  Instead, I LERP first along the edges to find a left fragment and right fragment, then LERP between them to fill in the shape.
+![lerp diagram](https://raw.githubusercontent.com/JivingTechnostic/Project4-Rasterizer/master/renders/TLERP.png)
 
-The Windows and OSX versions of the project build and run exactly the same way as in Project0, Project1, and Project2.
+I am fairly certain that this method gives the correct color, though it might favor colors horizontally.  I will have to check later.  Normal interpolation comes for free as well, but the OBJs I am using have uniform normals on each face, so it doesn't change anything.
 
--------------------------------------------------------------------------------
-REQUIREMENTS:
--------------------------------------------------------------------------------
-In this project, you are given code for:
-
-* A library for loading/reading standard Alias/Wavefront .obj format mesh files and converting them to OpenGL style VBOs/IBOs
-* A suggested order of kernels with which to implement the graphics pipeline
-* Working code for CUDA-GL interop
-
-You will need to implement the following stages of the graphics pipeline and features:
-
-* Vertex Shading
-* Primitive Assembly with support for triangle VBOs/IBOs
-* Perspective Transformation
-* Rasterization through either a scanline or a tiled approach
-* Fragment Shading
-* A depth buffer for storing and depth testing fragments
-* Fragment to framebuffer writing
-* A simple lighting/shading scheme, such as Lambert or Blinn-Phong, implemented in the fragment shader
-
-You are also required to implement at least 3 of the following features:
-
-* Additional pipeline stages. Each one of these stages can count as 1 feature:
-   * Geometry shader
-   * Transformation feedback
-   * Back-face culling
-   * Scissor test
-   * Stencil test
-   * Blending
-
-IMPORTANT: For each of these stages implemented, you must also add a section to your README stating what the expected performance impact of that pipeline stage is, and real performance comparisons between your rasterizer with that stage and without.
-
-* Correct color interpolation between points on a primitive
-* Texture mapping WITH texture filtering and perspective correct texture coordinates
-* Support for additional primitices. Each one of these can count as HALF of a feature.
-   * Lines
-   * Line strips
-   * Triangle fans
-   * Triangle strips
-   * Points
-* Anti-aliasing
-* Order-independent translucency using a k-buffer
-* MOUSE BASED interactive camera support. Interactive camera support based only on the keyboard is not acceptable for this feature.
+### Fragment Shading (Blinn-Phong Shader)
+Simple fragment shader that takes in a light, fragments, and the inverse of the model-view-projection matrix.  This inverse is multiplied with the position of the fragment in order to get the position in world-space.  The world-space coordinate is then used, along with the normal and light position, to calculate shading using a Blinn-Phong shader.  Objects do not obscure each other yet.  So long as a plane has a normal towards the light, it will be lit.
 
--------------------------------------------------------------------------------
-BASE CODE TOUR:
--------------------------------------------------------------------------------
-You will be working primarily in two files: rasterizeKernel.cu, and rasterizerTools.h. Within these files, areas that you need to complete are marked with a TODO comment. Areas that are useful to and serve as hints for optional features are marked with TODO (Optional). Functions that are useful for reference are marked with the comment LOOK.
+### Mouse Control
+Use the mouse to rotate, pan, and zoom the camera.  LMB rotates, RMB pans, and middle mouse button zooms.  Note that zooming too far in will cause the code to crash due to excessively large triangles.
 
-* rasterizeKernels.cu contains the core rasterization pipeline. 
-	* A suggested sequence of kernels exists in this file, but you may choose to alter the order of this sequence or merge entire kernels if you see fit. For example, if you decide that doing has benefits, you can choose to merge the vertex shader and primitive assembly kernels, or merge the perspective transform into another kernel. There is not necessarily a right sequence of kernels (although there are wrong sequences, such as placing fragment shading before vertex shading), and you may choose any sequence you want. Please document in your README what sequence you choose and why.
-	* The provided kernels have had their input parameters removed beyond basic inputs such as the framebuffer. You will have to decide what inputs should go into each stage of the pipeline, and what outputs there should be. 
+### Key Control
+Pressing 'z' will draw faces.  Pressing 'c' will draw only vertices.  'x' is supposed to draw a wireframe but it's not done yet.
+![vertices](https://raw.githubusercontent.com/JivingTechnostic/Project4-Rasterizer/master/renders/6.png)
+Pressing 'a' will use Blinn-Phong lighting.  's' will color by normals.  'd' will color by depth (not really working).
+![normal colors](https://raw.githubusercontent.com/JivingTechnostic/Project4-Rasterizer/master/renders/7.png)
 
-* rasterizeTools.h contains various useful tools, including a number of barycentric coordinate related functions that you may find useful in implementing scanline based rasterization...
-	* A few pre-made structs are included for you to use, such as fragment and triangle. A simple rasterizer can be implemented with these structs as is. However, as with any part of the basecode, you may choose to modify, add to, use as-is, or outright ignore them as you see fit.
-	* If you do choose to add to the fragment struct, be sure to include in your README a rationale for why. 
+Missing Features
+----------------
+### Depth buffer testing
+I did not have time to do proper depth checking, so you can see depth errors such as in the following image, where the cow's tail is visible through its body.
+![depth errors](https://raw.githubusercontent.com/JivingTechnostic/Project4-Rasterizer/master/renders/4.png)
 
-You will also want to familiarize yourself with:
+Each fragment does have a depth value, it's just a matter of setting up atomics and locking the fragment properly.
 
-* main.cpp, which contains code that transfers VBOs/CBOs/IBOs to the rasterization pipeline. Interactive camera work will also have to be implemented in this file if you choose that feature.
-* utilities.h, which serves as a kitchen-sink of useful functions
+Back-Face Culling and Clipping Performance Analysis
+--------------------------------------
+### Expectations
+I don't actually expect this to improve performance by much.  I don't think that the bottleneck is due to too many triangles. Rather, it's caused by large triangles.  Large triangles cause a single thread to take longer, and mess everything up.  There should be a small performance impact, but not huge.
 
--------------------------------------------------------------------------------
-SOME RESOURCES:
--------------------------------------------------------------------------------
-The following resources may be useful for this project:
-
-* High-Performance Software Rasterization on GPUs
-	* Paper (HPG 2011): http://www.tml.tkk.fi/~samuli/publications/laine2011hpg_paper.pdf
-	* Code: http://code.google.com/p/cudaraster/ Note that looking over this code for reference with regard to the paper is fine, but we most likely will not grant any requests to actually incorporate any of this code into your project.
-	* Slides: http://bps11.idav.ucdavis.edu/talks/08-gpuSoftwareRasterLaineAndPantaleoni-BPS2011.pdf
-* The Direct3D 10 System (SIGGRAPH 2006) - for those interested in doing geometry shaders and transform feedback.
-	* http://133.11.9.3/~takeo/course/2006/media/papers/Direct3D10_siggraph2006.pdf
-* Multi-Fragment Eﬀects on the GPU using the k-Buﬀer - for those who want to do a k-buffer
-	* http://www.inf.ufrgs.br/~comba/papers/2007/kbuffer_preprint.pdf
-* FreePipe: A Programmable, Parallel Rendering Architecture for Efficient Multi-Fragment Effects (I3D 2010)
-	* https://sites.google.com/site/hmcen0921/cudarasterizer
-* Writing A Software Rasterizer In Javascript:
-	* Part 1: http://simonstechblog.blogspot.com/2012/04/software-rasterizer-part-1.html
-	* Part 2: http://simonstechblog.blogspot.com/2012/04/software-rasterizer-part-2.html
+However, since I did not implement a proper z-buffer, having back-face culling means that there will be less incorrect z-fighting due to race conditions, since there won't be any back face to fight with the front.
 
--------------------------------------------------------------------------------
-NOTES ON GLM:
--------------------------------------------------------------------------------
-This project uses GLM, the GL Math library, for linear algebra. You need to know two important points on how GLM is used in this project:
+### Performance Impact
+##### Without Culling and Clipping: 24-25 fps
+##### With Culling: 25-26 fps
+##### With Culling and Clipping (flank of cow shown): 27-28 fps
+##### With Culling and Clipping (head of cow shown): 29-31 fps
 
-* In this project, indices in GLM vectors (such as vec3, vec4), are accessed via swizzling. So, instead of v[0], v.x is used, and instead of v[1], v.y is used, and so on and so forth.
-* GLM Matrix operations work fine on NVIDIA Fermi cards and later, but pre-Fermi cards do not play nice with GLM matrices. As such, in this project, GLM matrices are replaced with a custom matrix struct, called a cudaMat4, found in cudaMat4.h. A custom function for multiplying glm::vec4s and cudaMat4s is provided as multiplyMV() in intersections.h.
+So as you can see, it does produce some level of speedup, but not much.  In addition, clipping gives more speedup if I hide the body offscreen and keep the head (which has more triangles, but they are smaller), which leads me back to the point I was making about triangle size causing the bottleneck.
 
--------------------------------------------------------------------------------
-README
--------------------------------------------------------------------------------
-All students must replace or augment the contents of this Readme.md in a clear 
-manner with the following:
+Performance Evaluation--A Better Linear Interpolation?
+------------------------------------------------------
+The current bottleneck in the code is rasterizationKernel (though the fragment shader and clearDepthBuffer take up considerable time as well).  When a single triangle takes up a significant part of the screen (maybe 10%), the program slows to a crawl and can crash.  This is caused by a single thread trying to process a large number of fragments.  The image below shows an example of the runtime of my code while zoomed out (top), and zoomed in (bottom).
+![triangle size runtime comparison](https://raw.githubusercontent.com/JivingTechnostic/Project4-Rasterizer/master/renders/rasterization_comparison.png)
 
-* A brief description of the project and the specific features you implemented.
-* At least one screenshot of your project running.
-* A 30 second or longer video of your project running.  To create the video you
-  can use http://www.microsoft.com/expression/products/Encoder4_Overview.aspx 
-* A performance evaluation (described in detail below).
+As such, I will be addressing the rasterization kernel for improving performance.
 
--------------------------------------------------------------------------------
-PERFORMANCE EVALUATION
--------------------------------------------------------------------------------
-The performance evaluation is where you will investigate how to make your CUDA
-programs more efficient using the skills you've learned in class. You must have
-performed at least one experiment on your code to investigate the positive or
-negative effects on performance. 
+As mentioned above, I use linear interpolation to calculate coordinates and to interpolate colors and normals for my geometry rather than using barycentric coordinates.  However, I am recalculating the interpolation for every fragment.  Since it's linear, each step should have a constant change.  What if I replaced the calculations with dNorm, dCol, dPos values, and added those to the current left, right, or center points?  This would add several variables to the kernel, but should require fewer calculations per triangle and speed up the processing time for large triangles.
 
-We encourage you to get creative with your tweaks. Consider places in your code
-that could be considered bottlenecks and try to improve them. 
+#### Results
+To put things simply, it didn't work.  There were so many values to keep track of, and so much going on between them, that I don't think it was any faster to take out a few adds and multiplies.  In the end, the number of operations turned out to be nearly equal.  I had hoped that, since you're fetching a constant value and adding it to the same address over and over again, it would save time by avoiding deeper memory accesses and requiring fewer calculations, but I was wrong.
 
-Each student should provide no more than a one page summary of their
-optimizations along with tables and or graphs to visually explain any
-performance differences.
+Sampled over 10 frames, the performance compares as such:
 
--------------------------------------------------------------------------------
-THIRD PARTY CODE POLICY
--------------------------------------------------------------------------------
-* Use of any third-party code must be approved by asking on Piazza.  If it is approved, all students are welcome to use it.  Generally, we approve use of third-party code that is not a core part of the project.  For example, for the ray tracer, we would approve using a third-party library for loading models, but would not approve copying and pasting a CUDA function for doing refraction.
-* Third-party code must be credited in README.md.
-* Using third-party code without its approval, including using another student's code, is an academic integrity violation, and will result in you receiving an F for the semester.
+![rasterization runtime graph](https://raw.githubusercontent.com/JivingTechnostic/Project4-Rasterizer/master/renders/5.png)
 
--------------------------------------------------------------------------------
-SELF-GRADING
--------------------------------------------------------------------------------
-* On the submission date, email your grade, on a scale of 0 to 100, to Liam, harmoli+cis565@seas.upenn.edu, with a one paragraph explanation.  Be concise and realistic.  Recall that we reserve 30 points as a sanity check to adjust your grade.  Your actual grade will be (0.7 * your grade) + (0.3 * our grade).  We hope to only use this in extreme cases when your grade does not realistically reflect your work - it is either too high or too low.  In most cases, we plan to give you the exact grade you suggest.
-* Projects are not weighted evenly, e.g., Project 0 doesn't count as much as the path tracer.  We will determine the weighting at the end of the semester based on the size of each project.
-
----
-SUBMISSION
----
-As with the previous project, you should fork this project and work inside of
-your fork. Upon completion, commit your finished project back to your fork, and
-make a pull request to the master repository.  You should include a README.md
-file in the root directory detailing the following
-
-* A brief description of the project and specific features you implemented
-* At least one screenshot of your project running.
-* A link to a video of your raytracer running.
-* Instructions for building and running your project if they differ from the
-  base code.
-* A performance writeup as detailed above.
-* A list of all third-party code used.
-* This Readme file edited as described above in the README section.
 
diff --git a/external/include/objUtil/obj.h b/external/include/objUtil/obj.h
index a8ae205..1edf0ca 100644
--- a/external/include/objUtil/obj.h
+++ b/external/include/objUtil/obj.h
@@ -56,6 +56,7 @@ class obj{
 	float* getBoundingBox();    //returns vbo-formatted bounding box
 	float getTop();
 	void setColor(glm::vec3);
+  void randomizeColors();
 	glm::vec3 getColor();
 	float* getVBO();
 	float* getCBO();
@@ -72,6 +73,7 @@ class obj{
 	vector<glm::vec4>* getNormals();
 	vector<glm::vec4>* getTextureCoords();
     vector<float*>* getFaceBoxes();
+
 };
 
 #endif
\ No newline at end of file
diff --git a/external/src/objUtil/obj.cpp b/external/src/objUtil/obj.cpp
index 22a33aa..bc31c18 100644
--- a/external/src/objUtil/obj.cpp
+++ b/external/src/objUtil/obj.cpp
@@ -4,6 +4,7 @@
 #include <objUtil/obj.h>
 #include <iostream>
 #include <limits>
+#include <random>
 
 #define EPSILON std::numeric_limits<double>::epsilon()
 
@@ -93,7 +94,7 @@ void obj::buildVBOs(){
 	for(int i=0; i<IBOvec.size(); i++){
 		ibo[i] = IBOvec[i];
 	}
-	setColor(glm::vec3(.4,.4,.4));
+	setColor(glm::vec3(.5,.5,.4));
 }
 
 void obj::compareMaxMin(float x, float y, float z){
diff --git a/objs/tri.obj b/objs/tri.obj
index fb38e35..78ad5fe 100644
--- a/objs/tri.obj
+++ b/objs/tri.obj
@@ -1,6 +1,6 @@
-v 0 0 0
-v 0.1 0 0
-v 0 0.1 0
+v -.3 0.3 0
+v 0 -0 0
+v 0.3 0.2 0
 
 vn 0 0 1
 
diff --git a/renders/1.png b/renders/1.png
new file mode 100644
index 0000000..fa72115
Binary files /dev/null and b/renders/1.png differ
diff --git a/renders/2.png b/renders/2.png
new file mode 100644
index 0000000..72fef22
Binary files /dev/null and b/renders/2.png differ
diff --git a/renders/3.png b/renders/3.png
new file mode 100644
index 0000000..960d6c3
Binary files /dev/null and b/renders/3.png differ
diff --git a/renders/4.png b/renders/4.png
new file mode 100644
index 0000000..19bad49
Binary files /dev/null and b/renders/4.png differ
diff --git a/renders/5.png b/renders/5.png
new file mode 100644
index 0000000..500a832
Binary files /dev/null and b/renders/5.png differ
diff --git a/renders/6.png b/renders/6.png
new file mode 100644
index 0000000..ac1548f
Binary files /dev/null and b/renders/6.png differ
diff --git a/renders/7.png b/renders/7.png
new file mode 100644
index 0000000..0fb9726
Binary files /dev/null and b/renders/7.png differ
diff --git a/renders/TLERP.png b/renders/TLERP.png
new file mode 100644
index 0000000..04fbb7f
Binary files /dev/null and b/renders/TLERP.png differ
diff --git a/renders/rasterization_comparison.png b/renders/rasterization_comparison.png
new file mode 100644
index 0000000..e12a9bf
Binary files /dev/null and b/renders/rasterization_comparison.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 13d8e67..0466dd8 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -29,10 +29,24 @@ int main(int argc, char** argv){
     return 0;
   }
 
+  drawmode = FACES;
+  colormode = COLOR;
   frame = 0;
   seconds = time (NULL);
   fpstracker = 0;
 
+  //------------------------------
+  //camera setup
+  //------------------------------
+  camera.pos = glm::vec3(0, 0, 5);
+  camera.view = glm::vec3(0, 0, -1);
+  camera.up = glm::vec3(0, 1, 0);
+  camera.right = glm::normalize(glm::cross(camera.view, camera.up));
+  camera.fovY = 45;
+  camera.aspect = 1.0;
+  camera.zNear = .01;
+  camera.zFar = 20;
+
   // Launch CUDA/GL
   if (init(argc, argv)) {
     // GLFW main loop
@@ -51,14 +65,14 @@ void mainLoop() {
 
     if(seconds2-seconds >= 1){
 
-        fps = fpstracker/(seconds2-seconds);
-        fpstracker = 0;
-        seconds = seconds2;
+      fps = fpstracker/(seconds2-seconds);
+      fpstracker = 0;
+      seconds = seconds2;
     }
 
     string title = "CIS565 Rasterizer | " + utilityCore::convertIntToString((int)fps) + " FPS";
-		glfwSetWindowTitle(window, title.c_str());
-    
+    glfwSetWindowTitle(window, title.c_str());
+
     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
     glBindTexture(GL_TEXTURE_2D, displayImage);
     glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
@@ -84,17 +98,20 @@ void runCuda(){
   vbo = mesh->getVBO();
   vbosize = mesh->getVBOsize();
 
-  float newcbo[] = {0.0, 1.0, 0.0, 
-                    0.0, 0.0, 1.0, 
-                    1.0, 0.0, 0.0};
-  cbo = newcbo;
-  cbosize = 9;
+  cbo = mesh->getCBO();
+  cbosize = mesh->getCBOsize();
+
+  nbo = mesh->getNBO();
+  nbosize = mesh->getNBOsize();
 
   ibo = mesh->getIBO();
   ibosize = mesh->getIBOsize();
 
+  glm::mat4 matProj = glm::perspective(camera.fovY, camera.aspect, camera.zNear, camera.zFar);
+  glm::mat4 matView = glm::lookAt(camera.pos, camera.pos + camera.view, camera.up);
+
   cudaGLMapBufferObject((void**)&dptr, pbo);
-  cudaRasterizeCore(dptr, glm::vec2(width, height), frame, vbo, vbosize, cbo, cbosize, ibo, ibosize);
+  cudaRasterizeCore(dptr, glm::vec2(width, height), frame, vbo, vbosize, nbo, nbosize, cbo, cbosize, ibo, ibosize, matView, matProj, drawmode, colormode);
   cudaGLUnmapBufferObject(pbo);
 
   vbo = NULL;
@@ -105,7 +122,7 @@ void runCuda(){
   fpstracker++;
 
 }
-  
+
 //-------------------------------
 //----------SETUP STUFF----------
 //-------------------------------
@@ -114,18 +131,20 @@ bool init(int argc, char* argv[]) {
   glfwSetErrorCallback(errorCallback);
 
   if (!glfwInit()) {
-      return false;
+    return false;
   }
 
   width = 800;
   height = 800;
   window = glfwCreateWindow(width, height, "CIS 565 Pathtracer", NULL, NULL);
   if (!window){
-      glfwTerminate();
-      return false;
+    glfwTerminate();
+    return false;
   }
   glfwMakeContextCurrent(window);
   glfwSetKeyCallback(window, keyCallback);
+  glfwSetCursorPosCallback(window, cursorCallback);
+  glfwSetMouseButtonCallback(window, mouseButtonCallback);
 
   // Set up GL context
   glewExperimental = GL_TRUE;
@@ -138,7 +157,7 @@ bool init(int argc, char* argv[]) {
   initTextures();
   initCuda();
   initPBO();
-  
+
   GLuint passthroughProgram;
   passthroughProgram = initShader();
 
@@ -153,7 +172,7 @@ void initPBO(){
   int num_texels = width*height;
   int num_values = num_texels * 4;
   int size_tex_data = sizeof(GLubyte) * num_values;
-    
+
   // Generate a buffer ID called a PBO (Pixel Buffer Object)
   glGenBuffers(1, &pbo);
 
@@ -175,48 +194,48 @@ void initCuda(){
 }
 
 void initTextures(){
-    glGenTextures(1, &displayImage);
-    glBindTexture(GL_TEXTURE_2D, displayImage);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_BGRA,
-        GL_UNSIGNED_BYTE, NULL);
+  glGenTextures(1, &displayImage);
+  glBindTexture(GL_TEXTURE_2D, displayImage);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_BGRA,
+    GL_UNSIGNED_BYTE, NULL);
 }
 
 void initVAO(void){
-    GLfloat vertices[] =
-    { 
-        -1.0f, -1.0f, 
-         1.0f, -1.0f, 
-         1.0f,  1.0f, 
-        -1.0f,  1.0f, 
-    };
-
-    GLfloat texcoords[] = 
-    { 
-        1.0f, 1.0f,
-        0.0f, 1.0f,
-        0.0f, 0.0f,
-        1.0f, 0.0f
-    };
-
-    GLushort indices[] = { 0, 1, 3, 3, 1, 2 };
-
-    GLuint vertexBufferObjID[3];
-    glGenBuffers(3, vertexBufferObjID);
-    
-    glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[0]);
-    glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
-    glVertexAttribPointer((GLuint)positionLocation, 2, GL_FLOAT, GL_FALSE, 0, 0); 
-    glEnableVertexAttribArray(positionLocation);
-
-    glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[1]);
-    glBufferData(GL_ARRAY_BUFFER, sizeof(texcoords), texcoords, GL_STATIC_DRAW);
-    glVertexAttribPointer((GLuint)texcoordsLocation, 2, GL_FLOAT, GL_FALSE, 0, 0);
-    glEnableVertexAttribArray(texcoordsLocation);
-
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vertexBufferObjID[2]);
-    glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
+  GLfloat vertices[] =
+  { 
+    -1.0f, -1.0f, 
+    1.0f, -1.0f, 
+    1.0f,  1.0f, 
+    -1.0f,  1.0f, 
+  };
+
+  GLfloat texcoords[] = 
+  { 
+    1.0f, 1.0f,
+    0.0f, 1.0f,
+    0.0f, 0.0f,
+    1.0f, 0.0f
+  };
+
+  GLushort indices[] = { 0, 1, 3, 3, 1, 2 };
+
+  GLuint vertexBufferObjID[3];
+  glGenBuffers(3, vertexBufferObjID);
+
+  glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[0]);
+  glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
+  glVertexAttribPointer((GLuint)positionLocation, 2, GL_FLOAT, GL_FALSE, 0, 0); 
+  glEnableVertexAttribArray(positionLocation);
+
+  glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[1]);
+  glBufferData(GL_ARRAY_BUFFER, sizeof(texcoords), texcoords, GL_STATIC_DRAW);
+  glVertexAttribPointer((GLuint)texcoordsLocation, 2, GL_FLOAT, GL_FALSE, 0, 0);
+  glEnableVertexAttribArray(texcoordsLocation);
+
+  glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vertexBufferObjID[2]);
+  glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
 }
 
 
@@ -224,7 +243,7 @@ GLuint initShader() {
   const char *attribLocations[] = { "Position", "Tex" };
   GLuint program = glslUtility::createDefaultProgram(attribLocations, 2);
   GLint location;
-  
+
   glUseProgram(program);
   if ((location = glGetUniformLocation(program, "u_image")) != -1)
   {
@@ -247,25 +266,25 @@ void deletePBO(GLuint* pbo){
   if (pbo) {
     // unregister this buffer object with CUDA
     cudaGLUnregisterBufferObject(*pbo);
-    
+
     glBindBuffer(GL_ARRAY_BUFFER, *pbo);
     glDeleteBuffers(1, pbo);
-    
+
     *pbo = (GLuint)NULL;
   }
 }
 
 void deleteTexture(GLuint* tex){
-    glDeleteTextures(1, tex);
-    *tex = (GLuint)NULL;
+  glDeleteTextures(1, tex);
+  *tex = (GLuint)NULL;
 }
- 
+
 void shut_down(int return_code){
   kernelCleanup();
   cudaDeviceReset();
-  #ifdef __APPLE__
+#ifdef __APPLE__
   glfwTerminate();
-  #endif
+#endif
   exit(return_code);
 }
 
@@ -274,11 +293,67 @@ void shut_down(int return_code){
 //------------------------------
 
 void errorCallback(int error, const char* description){
-    fputs(description, stderr);
+  fputs(description, stderr);
 }
 
 void keyCallback(GLFWwindow* window, int key, int scancode, int action, int mods){
-    if(key == GLFW_KEY_ESCAPE && action == GLFW_PRESS){
-        glfwSetWindowShouldClose(window, GL_TRUE);
+  if(key == GLFW_KEY_ESCAPE && action == GLFW_PRESS){
+    glfwSetWindowShouldClose(window, GL_TRUE);
+  } else if(key == GLFW_KEY_P && action == GLFW_PRESS) {
+    // print current image, with iteration_#
+  } else if(key == GLFW_KEY_Z && action == GLFW_PRESS) {
+    drawmode = FACES;
+  } else if(key == GLFW_KEY_X && action == GLFW_PRESS) {
+    drawmode = WIREFRAME;
+  } else if(key == GLFW_KEY_C && action == GLFW_PRESS) {
+    drawmode = VERTICES;
+  } else if(key == GLFW_KEY_A && action == GLFW_PRESS) {
+    colormode = COLOR;
+  } else if(key == GLFW_KEY_S && action == GLFW_PRESS) {
+    colormode = NORMAL;
+  } else if(key == GLFW_KEY_D && action == GLFW_PRESS) {
+    colormode = DEPTH;
+  }
+}
+
+// I apologize for the tank controls (rotate) but glm::rotate is refusing to work for some reason.
+// FREE MOTION BLURS YE
+void cursorCallback(GLFWwindow* window, double x, double y){
+  if (mouse.state == LEFT_MOUSE) {	// rotate
+    glm::mat4 rotateX = glm::rotate(glm::mat4(), (float)(mouse.x - x), camera.up);
+    glm::mat4 rotateY = glm::rotate(glm::mat4(), (float)(mouse.y - y), camera.right);
+    camera.pos = glm::vec3(rotateY * rotateX * glm::vec4(camera.pos, 1));
+    camera.view = glm::vec3(rotateY * rotateX * glm::vec4(camera.view, 1));
+    camera.up = glm::vec3(rotateY * glm::vec4(camera.up, 1));
+    camera.right = glm::cross(camera.view, camera.up);
+  } else if (mouse.state == RIGHT_MOUSE) {	// pan
+    camera.pos += (float)(-(mouse.y - y) / 200) * camera.up +
+                  (float)((mouse.x - x) / 200) * camera.right;
+  } else if (mouse.state == MIDDLE_MOUSE) {	// zoom
+    camera.pos += (float)((mouse.y - y) / 100) * camera.view;
+  }
+  mouse.x = x;
+  mouse.y = y;
+}
+
+void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods) {
+  if (button == GLFW_MOUSE_BUTTON_1) {
+    if (action == GLFW_PRESS) {
+      mouse.state = LEFT_MOUSE;
+    } else if (mouse.state == LEFT_MOUSE){
+      mouse.state = NONE;
+    }
+  } else if (button == GLFW_MOUSE_BUTTON_2) {
+    if (action == GLFW_PRESS) {
+      mouse.state = RIGHT_MOUSE;
+    } else if (mouse.state == RIGHT_MOUSE){
+      mouse.state = NONE;
     }
+  } else if (button == GLFW_MOUSE_BUTTON_3) {
+    if (action == GLFW_PRESS) {
+      mouse.state = MIDDLE_MOUSE;
+    } else if (mouse.state == MIDDLE_MOUSE){
+      mouse.state = NONE;
+    }
+  }
 }
\ No newline at end of file
diff --git a/src/main.h b/src/main.h
index 8999110..caa390e 100644
--- a/src/main.h
+++ b/src/main.h
@@ -18,6 +18,7 @@
 #include <stdlib.h>
 #include <string>
 #include <time.h>
+#include "glm/gtc/matrix_transform.hpp"
 
 
 #include "rasterizeKernels.h"
@@ -45,11 +46,19 @@ obj* mesh;
 
 float* vbo;
 int vbosize;
+float* nbo;
+int nbosize;
 float* cbo;
 int cbosize;
 int* ibo;
 int ibosize;
 
+struct Camera {
+  glm::vec3 pos, up, view, right;
+  float fovY, aspect, zNear, zFar;
+};
+Camera camera;
+
 //-------------------------------
 //----------CUDA STUFF-----------
 //-------------------------------
@@ -99,5 +108,39 @@ void deleteTexture(GLuint* tex);
 void mainLoop();
 void errorCallback(int error, const char *description);
 void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods);
+void cursorCallback(GLFWwindow* window, double x, double y);
+void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods);
 
+//------------------------------
+//-----USER INTERACTION---------
+//------------------------------
+enum mouseState {
+	NONE,
+	LEFT_MOUSE,
+	MIDDLE_MOUSE,
+	RIGHT_MOUSE
+};
+
+struct Mouse {
+	float x, y;
+	mouseState state;
+};
+
+Mouse mouse;
+
+enum drawMode {
+  FACES,
+  WIREFRAME,
+  VERTICES
+};
+
+drawMode drawmode;
+
+enum colorMode {
+  COLOR,
+  NORMAL,
+  DEPTH
+};
+
+colorMode colormode;
 #endif
\ No newline at end of file
diff --git a/src/rasterizeKernels.cu b/src/rasterizeKernels.cu
index 10b0000..08e908f 100644
--- a/src/rasterizeKernels.cu
+++ b/src/rasterizeKernels.cu
@@ -5,12 +5,16 @@
 #include <cuda.h>
 #include <cmath>
 #include <thrust/random.h>
+#include <thrust/remove.h>
+#include <thrust/device_ptr.h>
+#include <thrust/count.h>
 #include "rasterizeKernels.h"
 #include "rasterizeTools.h"
 
 glm::vec3* framebuffer;
 fragment* depthbuffer;
 float* device_vbo;
+float* device_nbo;
 float* device_cbo;
 int* device_ibo;
 triangle* primitives;
@@ -23,20 +27,28 @@ void checkCUDAError(const char *msg) {
   }
 } 
 
+__host__ __device__ void screenToNDC(int x, int resolution, float* ndcX) {  // writes the NDC coordinate for pixel index x to *ndcX
+  *ndcX = - 2 * (x / (float)resolution - 0.5f);  // axis is flipped: x = 0 -> +1, x = resolution -> -1
+}
+
+__host__ __device__ void ndcToScreen(float ndcX, int resolution, int* x) {  // writes the pixel index for NDC coordinate ndcX to *x
+  *x = -(ndcX - 1) * resolution / 2;  // flipped axis: ndc +1 -> 0, ndc -1 -> resolution; the float-to-int conversion truncates toward zero
+}
+
 //Handy dandy little hashing function that provides seeds for random number generation
 __host__ __device__ unsigned int hash(unsigned int a){
-    a = (a+0x7ed55d16) + (a<<12);
-    a = (a^0xc761c23c) ^ (a>>19);
-    a = (a+0x165667b1) + (a<<5);
-    a = (a+0xd3a2646c) ^ (a<<9);
-    a = (a+0xfd7046c5) + (a<<3);
-    a = (a^0xb55a4f09) ^ (a>>16);
-    return a;
+  a = (a+0x7ed55d16) + (a<<12);
+  a = (a^0xc761c23c) ^ (a>>19);
+  a = (a+0x165667b1) + (a<<5);
+  a = (a+0xd3a2646c) ^ (a<<9);
+  a = (a+0xfd7046c5) + (a<<3);
+  a = (a^0xb55a4f09) ^ (a>>16);
+  return a;
 }
 
 //Writes a given fragment to a fragment buffer at a given location
 __host__ __device__ void writeToDepthbuffer(int x, int y, fragment frag, fragment* depthbuffer, glm::vec2 resolution){
-  if(x<resolution.x && y<resolution.y){
+  if(x>=0 && x<resolution.x && y>=0 && y<resolution.y){  // pixel row/column 0 is valid; reject only out-of-range coordinates
     int index = (y*resolution.x) + x;
     depthbuffer[index] = frag;
   }
@@ -73,106 +85,349 @@ __host__ __device__ glm::vec3 getFromFramebuffer(int x, int y, glm::vec3* frameb
 
 //Kernel that clears a given pixel buffer with a given color
 __global__ void clearImage(glm::vec2 resolution, glm::vec3* image, glm::vec3 color){
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * resolution.x);
-    if(x<=resolution.x && y<=resolution.y){
-      image[index] = color;
-    }
+  int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+  int index = x + (y * resolution.x);
+  if(x<=resolution.x && y<=resolution.y){
+    image[index] = color;
+  }
 }
 
 //Kernel that clears a given fragment buffer with a given fragment
 __global__ void clearDepthBuffer(glm::vec2 resolution, fragment* buffer, fragment frag){
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * resolution.x);
-    if(x<=resolution.x && y<=resolution.y){
-      fragment f = frag;
-      f.position.x = x;
-      f.position.y = y;
-      buffer[index] = f;
-    }
+  int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+  int index = x + (y * resolution.x);
+  if(x<=resolution.x && y<=resolution.y){
+    fragment f = frag;
+    f.position.x = x;
+    f.position.y = y;
+    buffer[index] = f;
+  }
 }
 
 //Kernel that writes the image to the OpenGL PBO directly. 
 __global__ void sendImageToPBO(uchar4* PBOpos, glm::vec2 resolution, glm::vec3* image){
-  
+
   int x = (blockIdx.x * blockDim.x) + threadIdx.x;
   int y = (blockIdx.y * blockDim.y) + threadIdx.y;
   int index = x + (y * resolution.x);
-  
+
   if(x<=resolution.x && y<=resolution.y){
 
-      glm::vec3 color;      
-      color.x = image[index].x*255.0;
-      color.y = image[index].y*255.0;
-      color.z = image[index].z*255.0;
+    glm::vec3 color;      
+    color.x = image[index].x*255.0;
+    color.y = image[index].y*255.0;
+    color.z = image[index].z*255.0;
 
-      if(color.x>255){
-        color.x = 255;
-      }
+    if(color.x>255){
+      color.x = 255;
+    }
 
-      if(color.y>255){
-        color.y = 255;
-      }
+    if(color.y>255){
+      color.y = 255;
+    }
 
-      if(color.z>255){
-        color.z = 255;
-      }
-      
-      // Each thread writes one pixel location in the texture (textel)
-      PBOpos[index].w = 0;
-      PBOpos[index].x = color.x;     
-      PBOpos[index].y = color.y;
-      PBOpos[index].z = color.z;
+    if(color.z>255){
+      color.z = 255;
+    }
+
+    // Each thread writes one pixel location in the texture (textel)
+    PBOpos[index].w = 0;
+    PBOpos[index].x = color.x;     
+    PBOpos[index].y = color.y;
+    PBOpos[index].z = color.z;
   }
 }
 
 //TODO: Implement a vertex shader
-__global__ void vertexShadeKernel(float* vbo, int vbosize){
+__global__ void vertexShadeKernel(float* vbo, int vbosize, glm::mat4 mvp){
   int index = (blockIdx.x * blockDim.x) + threadIdx.x;
   if(index<vbosize/3){
+    glm::vec4 p (vbo[index * 3], vbo[index * 3 + 1], vbo[index * 3 + 2], 1);
+    p = mvp * p;
+    vbo[index * 3] = p.x / p.w;
+    vbo[index * 3 + 1] = p.y / p.w;
+    vbo[index * 3 + 2] = p.z / p.w;
   }
 }
 
 //TODO: Implement primative assembly
-__global__ void primitiveAssemblyKernel(float* vbo, int vbosize, float* cbo, int cbosize, int* ibo, int ibosize, triangle* primitives){
+__global__ void primitiveAssemblyKernel(float* vbo, int vbosize, float* nbo, int nbosize, float* cbo, int cbosize, int* ibo, int ibosize, triangle* primitives){
   int index = (blockIdx.x * blockDim.x) + threadIdx.x;
   int primitivesCount = ibosize/3;
   if(index<primitivesCount){
+    triangle tri;
+    int i = ibo[index * 3] * 3;
+    tri.p0 = glm::vec3(vbo[i], vbo[i + 1], vbo[i + 2]);
+    tri.n0 = glm::vec3(nbo[i], nbo[i + 1], nbo[i + 2]);
+    tri.c0 = glm::vec3(cbo[i], cbo[i + 1], cbo[i + 2]);
+    i = ibo[index * 3 + 1] * 3;
+    tri.p1 = glm::vec3(vbo[i], vbo[i + 1], vbo[i + 2]);
+    tri.n1 = glm::vec3(nbo[i], nbo[i + 1], nbo[i + 2]);
+    tri.c1 = glm::vec3(cbo[i], cbo[i + 1], cbo[i + 2]);
+    i = ibo[index * 3 + 2] * 3;
+    tri.p2 = glm::vec3(vbo[i], vbo[i + 1], vbo[i + 2]);
+    tri.n2 = glm::vec3(nbo[i], nbo[i + 1], nbo[i + 2]);
+    tri.c2 = glm::vec3(cbo[i], cbo[i + 1], cbo[i + 2]);
+    /*glm::vec3 normal = glm::normalize(glm::cross(tri.p1 - tri.p0, tri.p2 - tri.p0) +
+                                      glm::cross(tri.p2 - tri.p1, tri.p0 - tri.p1) +
+                                      glm::cross(tri.p0 - tri.p2, tri.p1 - tri.p2));
+    tri.n0 = tri.n1 = tri.n2 = normal;*/
+    primitives[index] = tri;
+  }
+}
+
+__global__ void backfaceCullingKernel(triangle* primitives, int primitivesCount) {
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if(index<primitivesCount){
+
   }
 }
 
 //TODO: Implement a rasterization method, such as scanline.
-__global__ void rasterizationKernel(triangle* primitives, int primitivesCount, fragment* depthbuffer, glm::vec2 resolution){
+__global__ void rasterizationKernel(triangle* primitives, int primitivesCount, fragment* depthbuffer, glm::vec2 resolution, int mode){
   int index = (blockIdx.x * blockDim.x) + threadIdx.x;
   if(index<primitivesCount){
+    if (mode == 0) {
+      // Draw Faces
+      triangle t = primitives[index];
+      point top, middle, bottom;
+      top.position = t.p0;
+      middle.position = t.p1;
+      bottom.position = t.p2;
+      top.color = t.c0;
+      middle.color = t.c1;
+      bottom.color = t.c2;
+      top.normal = t.n0;
+      middle.normal = t.n1;
+      bottom.normal = t.n2;
+
+      point temp;
+
+      // Do a basic bubble sort
+      for (int i = 0; i < 2; i++) {
+        if (bottom.position.y > middle.position.y) {
+          temp = bottom;
+          bottom = middle;
+          middle = temp;
+        }
+        if (middle.position.y > top.position.y) {
+          temp = middle;
+          middle = top;
+          top = temp;
+        }
+      }
+
+      // Ignore triangle if it's outside
+      // TODO: move this to a clipper later, with the x and z clipping as well.
+      if (top.position.y < -1 || bottom.position.y > 1) {
+        return;
+      }
+
+      // "left" and "right" are relative to each other, not top.
+      point pointLeft, pointRight;    // used for interpolation
+
+      if (bottom.position.x > middle.position.x) {  // top->middle is on the left
+        pointLeft = middle;
+        pointRight = bottom;
+      } else {        // top->bottom is on left
+        pointLeft = bottom;
+        pointRight = middle;
+      }
+    
+      float currNDCx = top.position.x;
+      float currNDCy = top.position.y;
+      int currY, currX;
+      ndcToScreen(currNDCx, resolution.x, &currX);
+      ndcToScreen(currNDCy, resolution.y, &currY);
+
+      while (currNDCy > middle.position.y && currNDCy > -1) {
+        // only perform these operations if the current y coordinate is in the screen.
+        if (currNDCy <= 1) {
+          // interpolate along the edges
+          float tLeft = (top.position.y - currNDCy) / (top.position.y - pointLeft.position.y);
+          if (top.position.y == pointLeft.position.y) {
+            tLeft = 0;
+          }
+          float tRight = (top.position.y - currNDCy) / (top.position.y - pointRight.position.y);
+          if (top.position.y == pointRight.position.y) {
+            tRight = 0;
+          }
+
+          // Would saving 1-tleft and 1-tright into variables save 1 cycle per statement (if they fit in registers?)
+          glm::vec3 cLeft, cRight, pLeft, pRight, nLeft, nRight;
+          pLeft = (1 - tLeft) * top.position + tLeft * pointLeft.position;
+          nLeft = (1 - tLeft) * top.normal + tLeft * pointLeft.normal;
+          cLeft = (1 - tLeft) * top.color + tLeft * pointLeft.color;
+          pRight = (1 - tRight) * top.position + tRight * pointRight.position;
+          nRight = (1 - tRight) * top.normal + tRight * pointRight.normal;
+          cRight = (1 - tRight) * top.color + tRight * pointRight.color;
+          int rBound = 0;
+          int lBound = 0;
+          ndcToScreen(pRight.x, resolution.x, &rBound);
+          ndcToScreen(pLeft.x, resolution.x, &lBound);
+          for (currX = lBound; currX >= rBound; currX--) {
+            if (currX >= 0 && currX < resolution.x) {
+              screenToNDC(currX, resolution.x, &currNDCx);
+              // interpolate color, normal, and position
+              float t = (currX - lBound) / (float) (rBound - lBound);
+              if (rBound == lBound) {
+                t = 0;
+              }
+              fragment frag;
+              frag.position = (1 - t) * pLeft + t * pRight;
+              frag.normal = (1 - t) * nLeft + t * nRight;
+              frag.color = (1 - t) * cLeft + t * cRight;
+              writeToDepthbuffer(currX, currY, frag, depthbuffer, resolution);
+            }
+          }
+        }
+        currY++;
+        screenToNDC(currY, resolution.y, &currNDCy);
+      }
+    
+      if (middle.position.x < top.position.x) {
+        pointLeft = middle;
+        pointRight = top;
+      } else {
+        pointLeft = top;
+        pointRight = middle;
+      }
+
+      while (currNDCy > bottom.position.y && currNDCy > -1) {
+        // only perform these operations if the current y coordinate is in the screen.
+        if (currNDCy <= 1) {
+          // interpolate along the edges
+          float tLeft = (pointLeft.position.y - currNDCy) / (pointLeft.position.y - bottom.position.y);
+          if (pointLeft.position.y == bottom.position.y) {
+            tLeft = 0;
+          }
+          float tRight = (pointRight.position.y - currNDCy) / (pointRight.position.y - bottom.position.y);
+          if (pointRight.position.y == bottom.position.y) {
+            tRight = 0;
+          }
+
+          // Would saving 1-tleft and 1-tright into variables save 1 cycle per statement (if they fit in registers?)
+          glm::vec3 cLeft, cRight, pLeft, pRight, nLeft, nRight;
+          pLeft = (1 - tLeft) * pointLeft.position + tLeft * bottom.position;
+          nLeft = (1 - tLeft) * pointLeft.normal + tLeft * bottom.normal;
+          cLeft = (1 - tLeft) * pointLeft.color + tLeft * bottom.color;
+          pRight = (1 - tRight) * pointRight.position + tRight * bottom.position;
+          nRight = (1 - tRight) * pointRight.normal + tRight * bottom.normal;
+          cRight = (1 - tRight) * pointRight.color + tRight * bottom.color;
+
+          int rBound = 0;
+          int lBound = 0;
+          ndcToScreen(pRight.x, resolution.x, &rBound);
+          ndcToScreen(pLeft.x, resolution.x, &lBound);
+          for (currX = lBound; currX >= rBound; currX--) {
+            if (currX >= 0 && currX < resolution.x) {
+              screenToNDC(currX, resolution.x, &currNDCx);
+              // interpolate color, normal, and position
+              float t = (currX - lBound) / (float) (rBound - lBound);
+              if (rBound == lBound) {
+                t = 0;
+              }
+              fragment frag;
+              frag.position = (1 - t) * pLeft + t * pRight;
+              frag.normal = (1 - t) * nLeft + t * nRight;
+              frag.color = (1 - t) * cLeft + t * cRight;
+              writeToDepthbuffer(currX, currY, frag, depthbuffer, resolution);
+            }
+          }
+        }
+        currY++;
+        screenToNDC(currY, resolution.y, &currNDCy);
+      }
+    } else if (mode == 1) { // draw wireframe
+      
+    } else if (mode == 2) { // draw vertices
+      int x, y;
+      ndcToScreen(primitives[index].p0.x, resolution.x, &x);
+      ndcToScreen(primitives[index].p0.y, resolution.y, &y);
+            
+      fragment frag;
+      frag.color = primitives[index].c0;
+      frag.normal = primitives[index].n0;
+      frag.position = primitives[index].p0;
+      writeToDepthbuffer(x, y, frag, depthbuffer, resolution);
+    
+      ndcToScreen(primitives[index].p1.x, resolution.x, &x);
+      ndcToScreen(primitives[index].p1.y, resolution.y, &y);
+            
+      frag.color = primitives[index].c1;
+      frag.normal = primitives[index].n1;
+      frag.position = primitives[index].p1;
+      writeToDepthbuffer(x, y, frag, depthbuffer, resolution);
+    
+      ndcToScreen(primitives[index].p2.x, resolution.x, &x);
+      ndcToScreen(primitives[index].p2.y, resolution.y, &y);
+            
+      frag.color = primitives[index].c2;
+      frag.normal = primitives[index].n2;
+      frag.position = primitives[index].p2;
+      writeToDepthbuffer(x, y, frag, depthbuffer, resolution);
+    }
   }
 }
 
 //TODO: Implement a fragment shader
-__global__ void fragmentShadeKernel(fragment* depthbuffer, glm::vec2 resolution){
+// Scales each covered fragment's .color by a Lambertian (N.L) diffuse term; does nothing unless drawmode == 0.
+// The fragment position is un-projected through matVPinv (with homogeneous divide) before the light direction is computed.
+__global__ void fragmentShadeKernel(fragment* depthbuffer, glm::vec2 resolution, light light, glm::mat4 matVPinv, int drawmode){
   int x = (blockIdx.x * blockDim.x) + threadIdx.x;
   int y = (blockIdx.y * blockDim.y) + threadIdx.y;
   int index = x + (y * resolution.x);
-  if(x<=resolution.x && y<=resolution.y){
+  if(x<resolution.x && y<resolution.y && drawmode == 0){  // strict bounds: x == resolution.x or y == resolution.y is outside the buffer
+    fragment f = depthbuffer[index];
+    if (f.position.z > 0) { //ignore all the empty space (z = -10000)
+      glm::vec4 origPos = matVPinv * glm::vec4(f.position,1);
+      float diffuse = glm::dot(f.normal, glm::normalize(light.position - glm::vec3(origPos / origPos.w)));  // divide by w to leave homogeneous space
+      if (diffuse < 0) {
+        diffuse = 0;
+      }
+      depthbuffer[index].color *= light.color * diffuse;
+    }
   }
 }
 
 //Writes fragment colors to the framebuffer
-__global__ void render(glm::vec2 resolution, fragment* depthbuffer, glm::vec3* framebuffer){
+__global__ void render(glm::vec2 resolution, fragment* depthbuffer, glm::vec3* framebuffer, int colormode){
 
   int x = (blockIdx.x * blockDim.x) + threadIdx.x;
   int y = (blockIdx.y * blockDim.y) + threadIdx.y;
   int index = x + (y * resolution.x);
 
   if(x<=resolution.x && y<=resolution.y){
-    framebuffer[index] = depthbuffer[index].color;
+    if (colormode == 0) {
+      // Color
+      framebuffer[index] = depthbuffer[index].color;
+    } else if (colormode == 1) {
+      // Normal
+      framebuffer[index] = depthbuffer[index].normal;
+      //framebuffer[index] = glm::normalize(glm::vec3(depthbuffer[index].normal.r, depthbuffer[index].normal.g, 0));
+    } else if (colormode == 2) {
+      // Distance
+      framebuffer[index] = glm::vec3(depthbuffer[index].position.z);
+    }
   }
 }
 
+// Predicate returning true for triangles to discard: back-facing, or entirely outside the [-1, 1] NDC square in x/y.
+struct clippingOrBackface {
+  __host__ __device__ bool operator() (const triangle t) {
+    // Only the sign of the face normal's z matters, so one un-normalized cross product is enough.
+    float normalZ = glm::cross(t.p1 - t.p0, t.p2 - t.p0).z;
+    // A triangle is off-screen only if all three vertices lie beyond the SAME screen edge.
+    bool outX = (t.p0.x > 1 && t.p1.x > 1 && t.p2.x > 1) || (t.p0.x < -1 && t.p1.x < -1 && t.p2.x < -1);
+    bool outY = (t.p0.y > 1 && t.p1.y > 1 && t.p2.y > 1) || (t.p0.y < -1 && t.p1.y < -1 && t.p2.y < -1);
+    return (normalZ < 0) || outX || outY;
+  }
+};
+
 // Wrapper for the __global__ call that sets up the kernel calls and does a ton of memory management
-void cudaRasterizeCore(uchar4* PBOpos, glm::vec2 resolution, float frame, float* vbo, int vbosize, float* cbo, int cbosize, int* ibo, int ibosize){
+void cudaRasterizeCore(uchar4* PBOpos, glm::vec2 resolution, float frame, float* vbo, int vbosize, float* nbo, int nbosize, float* cbo, int cbosize, int* ibo, int ibosize, glm::mat4 view, glm::mat4 projection, int drawmode, int colormode){
 
   // set up crucial magic
   int tileSize = 8;
@@ -182,14 +437,14 @@ void cudaRasterizeCore(uchar4* PBOpos, glm::vec2 resolution, float frame, float*
   //set up framebuffer
   framebuffer = NULL;
   cudaMalloc((void**)&framebuffer, (int)resolution.x*(int)resolution.y*sizeof(glm::vec3));
-  
+
   //set up depthbuffer
   depthbuffer = NULL;
   cudaMalloc((void**)&depthbuffer, (int)resolution.x*(int)resolution.y*sizeof(fragment));
 
   //kernel launches to black out accumulated/unaccumlated pixel buffers and clear our scattering states
   clearImage<<<fullBlocksPerGrid, threadsPerBlock>>>(resolution, framebuffer, glm::vec3(0,0,0));
-  
+
   fragment frag;
   frag.color = glm::vec3(0,0,0);
   frag.normal = glm::vec3(0,0,0);
@@ -210,6 +465,10 @@ void cudaRasterizeCore(uchar4* PBOpos, glm::vec2 resolution, float frame, float*
   cudaMalloc((void**)&device_vbo, vbosize*sizeof(float));
   cudaMemcpy( device_vbo, vbo, vbosize*sizeof(float), cudaMemcpyHostToDevice);
 
+  device_nbo = NULL;
+  cudaMalloc((void**)&device_nbo, nbosize*sizeof(float));
+  cudaMemcpy( device_nbo, nbo, nbosize*sizeof(float), cudaMemcpyHostToDevice);
+
   device_cbo = NULL;
   cudaMalloc((void**)&device_cbo, cbosize*sizeof(float));
   cudaMemcpy( device_cbo, cbo, cbosize*sizeof(float), cudaMemcpyHostToDevice);
@@ -217,35 +476,61 @@ void cudaRasterizeCore(uchar4* PBOpos, glm::vec2 resolution, float frame, float*
   tileSize = 32;
   int primitiveBlocks = ceil(((float)vbosize/3)/((float)tileSize));
 
+  glm::mat4 matVP = projection * view;
+
+  //----------------------------
+  //light setup
+  //----------------------------
+  light light;
+  light.color = glm::vec3(1, 1, 1);
+  light.position = glm::vec3(5, 5, 5);
+
   //------------------------------
   //vertex shader
   //------------------------------
-  vertexShadeKernel<<<primitiveBlocks, tileSize>>>(device_vbo, vbosize);
+  vertexShadeKernel<<<primitiveBlocks, tileSize>>>(device_vbo, vbosize, matVP);
 
   cudaDeviceSynchronize();
   //------------------------------
   //primitive assembly
   //------------------------------
   primitiveBlocks = ceil(((float)ibosize/3)/((float)tileSize));
-  primitiveAssemblyKernel<<<primitiveBlocks, tileSize>>>(device_vbo, vbosize, device_cbo, cbosize, device_ibo, ibosize, primitives);
+  primitiveAssemblyKernel<<<primitiveBlocks, tileSize>>>(device_vbo, vbosize, device_nbo, nbosize, device_cbo, cbosize, device_ibo, ibosize, primitives);
 
   cudaDeviceSynchronize();
+
+  float numPrimitives = ibosize / 3;
+  if (drawmode == 0) {
+    //------------------------------
+    //ez backface culling and clipping
+    //------------------------------
+	  thrust::device_ptr<triangle> primitivesStart(primitives);
+
+	  float numRemoved = thrust::count_if(primitivesStart, primitivesStart + ibosize / 3, clippingOrBackface());
+	  thrust::remove_if(primitivesStart, primitivesStart + ibosize / 3, clippingOrBackface());
+	  numPrimitives = ibosize / 3 - numRemoved;
+    primitiveBlocks = ceil(((float)numPrimitives)/((float)tileSize));
+
+    cudaDeviceSynchronize();
+  }
+
   //------------------------------
   //rasterization
   //------------------------------
-  rasterizationKernel<<<primitiveBlocks, tileSize>>>(primitives, ibosize/3, depthbuffer, resolution);
+  rasterizationKernel<<<primitiveBlocks, tileSize>>>(primitives, numPrimitives, depthbuffer, resolution, drawmode);
 
   cudaDeviceSynchronize();
   //------------------------------
   //fragment shader
   //------------------------------
-  fragmentShadeKernel<<<fullBlocksPerGrid, threadsPerBlock>>>(depthbuffer, resolution);
+  glm::mat4 matMVPinv = glm::inverse(matVP);
+  fragmentShadeKernel<<<fullBlocksPerGrid, threadsPerBlock>>>(depthbuffer, resolution, light, matMVPinv, drawmode);
 
   cudaDeviceSynchronize();
   //------------------------------
   //write fragments to framebuffer
   //------------------------------
-  render<<<fullBlocksPerGrid, threadsPerBlock>>>(resolution, depthbuffer, framebuffer);
+  render<<<fullBlocksPerGrid, threadsPerBlock>>>(resolution, depthbuffer, framebuffer, colormode);
   sendImageToPBO<<<fullBlocksPerGrid, threadsPerBlock>>>(PBOpos, resolution, framebuffer);
 
   cudaDeviceSynchronize();
@@ -258,6 +543,7 @@ void cudaRasterizeCore(uchar4* PBOpos, glm::vec2 resolution, float frame, float*
 void kernelCleanup(){
   cudaFree( primitives );
   cudaFree( device_vbo );
+  cudaFree( device_nbo );
   cudaFree( device_cbo );
   cudaFree( device_ibo );
   cudaFree( framebuffer );
diff --git a/src/rasterizeKernels.h b/src/rasterizeKernels.h
index 784be17..1d0a18f 100644
--- a/src/rasterizeKernels.h
+++ b/src/rasterizeKernels.h
@@ -9,8 +9,9 @@
 #include <cuda.h>
 #include <cmath>
 #include "glm/glm.hpp"
+#include "glm/gtc/matrix_transform.hpp"
 
 void kernelCleanup();
-void cudaRasterizeCore(uchar4* pos, glm::vec2 resolution, float frame, float* vbo, int vbosize, float* cbo, int cbosize, int* ibo, int ibosize);
+void cudaRasterizeCore(uchar4* PBOpos, glm::vec2 resolution, float frame, float* vbo, int vbosize, float* nbo, int nbosize, float* cbo, int cbosize, int* ibo, int ibosize, glm::mat4 view, glm::mat4 projection, int drawmode, int colormode);
 
 #endif //RASTERIZEKERNEL_H
diff --git a/src/rasterizeTools.h b/src/rasterizeTools.h
index e9b5dcc..48760a4 100644
--- a/src/rasterizeTools.h
+++ b/src/rasterizeTools.h
@@ -10,20 +10,28 @@
 #include "cudaMat4.h"
 
 struct triangle {
-  glm::vec3 p0;
-  glm::vec3 p1;
-  glm::vec3 p2;
-  glm::vec3 c0;
-  glm::vec3 c1;
-  glm::vec3 c2;
+  glm::vec3 p0, p1, p2;  // vertex positions; clippingOrBackface tests their x/y against [-1, 1]
+  glm::vec3 n0, n1, n2;  // presumably per-vertex normals filled from the nbo -- confirm in primitive assembly
+  glm::vec3 c0, c1, c2;  // presumably per-vertex colors filled from the cbo -- confirm in primitive assembly
 };
 
-struct fragment{
+struct point {  // color/normal/position triple; NOTE(review): no use of this type is visible here -- confirm it is needed
   glm::vec3 color;
   glm::vec3 normal;
   glm::vec3 position;
 };
 
+struct fragment {  // element type of the depth buffer (one per pixel); render() shows one field per colormode
+  glm::vec3 color;     // written to the framebuffer when colormode == 0
+  glm::vec3 normal;    // written to the framebuffer when colormode == 1
+  glm::vec3 position;  // position.z is written to all three channels when colormode == 2
+};
+
+struct light {  // light description passed by value to fragmentShadeKernel
+  glm::vec3 position;  // cudaRasterizeCore sets this to (5, 5, 5)
+  glm::vec3 color;     // cudaRasterizeCore sets this to (1, 1, 1)
+};
+
 //Multiplies a cudaMat4 matrix and a vec4
 __host__ __device__ glm::vec3 multiplyMV(cudaMat4 m, glm::vec4 v){
   glm::vec3 r(1,1,1);
diff --git a/windows/PROJ4_Rasterizer/PROJ4_Rasterizer/PROJ4_Rasterizer.vcxproj b/windows/PROJ4_Rasterizer/PROJ4_Rasterizer/PROJ4_Rasterizer.vcxproj
index f640485..7a6dcc4 100644
--- a/windows/PROJ4_Rasterizer/PROJ4_Rasterizer/PROJ4_Rasterizer.vcxproj
+++ b/windows/PROJ4_Rasterizer/PROJ4_Rasterizer/PROJ4_Rasterizer.vcxproj
@@ -28,7 +28,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 5.5.props" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
@@ -87,6 +87,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 5.5.targets" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.targets" />
   </ImportGroup>
 </Project>
\ No newline at end of file