diff --git a/README.md b/README.md
index 41b91f0..57de14f 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,85 @@
CUDA Rasterizer
===============
-[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md)
+* Zach Corse
+ * LinkedIn: https://www.linkedin.com/in/wzcorse/
+ * Personal Website: https://wzcorse.com
+ * Twitter: @ZachCorse
+* Tested on: Windows 10, i7-6700HQ @ 2.60GHz 32GB, NVIDIA GeForce GTX 970M (personal computer)
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
+## README
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+
-### (TODO: Your README)
+Introduction
+------------
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+This is a CUDA rasterizer that implements the standard rasterization pipeline, which consists of the following stages:
+
+* Vertex shading
+* Primitive assembly
+* Rasterization
+* Fragments to depth buffer
+* Fragment shading
+* Fragment to framebuffer writing
+
+For rasterization, I implemented a naive parallelization per triangle: for each triangle, every pixel in the triangle's bounding box is tested (using barycentric coordinates), and a fragment is added to the scene fragment buffer only if it passes the depth test. The depth buffer itself is guarded by a mutex lock buffer, which ensures that a particular index in the depth buffer is not written to or read from while another thread is writing to that index.
+
+As will be shown in the performance section below, a tile-based rendering system would be more efficient in some cases than parallelization by triangle primitive.
+
+Features
+------------
+
+The features currently included in my rasterizer are listed below. See the following sections for sample renders and performance analysis!
+
+1. Basic Rasterizing
+ * Rasterizes per triangle primitive
+ * Depth buffer regulated by mutex lock buffer
+2. Textures
+ * Includes bilinear interpolation sampling
+3. Color interpolation
+4. Lambertian Shading
+5. Anti-Aliasing
+
+Textures
+------------
+
+
+
+*Shading includes textures*
+
+Textures are drawn using perspective-correct interpolation. Because the interpolated UV coordinates are not discrete, each texture sample is computed from the neighboring pixel values of the texture using bilinear interpolation.
+
+Color Interpolation
+------------
+
+
+
+*Shading includes vertex color interpolation when no texture is specified.*
+
+Anti-Aliasing
+------------
+
+
+
+*No AA. Jagged edges can be seen along the edges of the duck.*
+
+
+
+*With AA X2. Jagged edges are smoothed.*
+
+
+
+We see here that, although anti-aliasing does provide a smoother picture, it comes with an associated drop in FPS.
+
+Performance
+------------
+
+From the graph below, which depicts the percentage of computational time each step in the rasterization pipeline takes, we can infer a few limitations of my implementation. First of all, we see that large triangles take a disproportionate amount of time to rasterize (the checkerboard is only two triangles). Because my implementation loops over all primitives and, for each primitive, tests all pixels in that primitive's bounding box, larger triangles will have more pixel misses, and the pixels that do fall within the triangle must be drawn serially. Conversely, the duck, although it has over 4,000 triangles, has relatively small triangles, which draw quickly in parallel.
+
+Now consider the duck's timing proportion. Here, the camera, in its default position, is zoomed out, such that the duck takes up a relatively small proportion of the screen. However, while this means that fewer pixels need to be checked against each primitive, it also means that there will be proportionally fewer fragments to shade. Anti-aliasing X2, which quadruples the number of fragments, also quadruples the number of screen-space pixels that need to be checked in triangle bounding boxes, so this shouldn't affect the relative timing between rasterization and shading. I must therefore conclude that texture reading, bilinear interpolation, and Lambertian shading, together, are relatively costly.
+
+
### Credits
diff --git a/graphs/AA_graph.png b/graphs/AA_graph.png
new file mode 100644
index 0000000..1131ddc
Binary files /dev/null and b/graphs/AA_graph.png differ
diff --git a/graphs/data.xml b/graphs/data.xml
new file mode 100644
index 0000000..3970a04
--- /dev/null
+++ b/graphs/data.xml
@@ -0,0 +1,210 @@
+
+
+
+
+
+
+
+
+
+| Function Name | Grid
+Dimensions | Block
+Dimensions | Start Time
+(μs) | Duration
+(μs) | Occupancy | Registers
+per Thread | Static Shared
+Memory per
+Block (bytes) | Dynamic Shared
+Memory per
+Block (bytes) | Cache
+Configuration
+Executed | Global
+Caching
+Requested | Global
+Caching
+Executed | Local Memory
+per Thread
+(bytes) | Device
+Name | Context
+ID | Stream
+ID | Process
+Name | Occupancy [0]:
+Allocated Warps
+Per Block | Occupancy [0]:
+Allocated Registers
+Per Block | Occupancy [0]:
+Allocated Shared Memory
+Per Block | Occupancy [0]:
+Max Block Limit
+Warps | Occupancy [0]:
+Max Block Limit
+Registers | Occupancy [0]:
+Max Block Limit
+Shared Memory | Occupancy [0]:
+Block Limit Reason | Instruction Stats [2]:
+GPU Issued IPC | Instruction Stats [2]:
+GPU Executed IPC | Instruction Stats [2]:
+GPU SM Activity | Instruction Stats [2]:
+GPU SM Average IPW | Instruction Stats [2]:
+GPU Serialization | Achieved Occupancy [1]:
+Achieved Occupancy |
+
+| _deviceBufferCopy | {99, 1, 1} | {128, 1, 1} | 269080.834 | 12.8 | 1 | 15 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 2048 | 0 | 16 | 32 | 2147483647 | Warps | 0.86298407705917 | 0.845685079614704 | 0.714354813543455 | 97.7727272727273 | 0.0200455580865604 | 0.580820838076396 |
+
+| _deviceBufferCopy | {57, 1, 1} | {128, 1, 1} | 413704.898 | 5.44 | 1 | 15 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 2048 | 0 | 16 | 32 | 2147483647 | Warps | 0.475913129318855 | 0.466910167818361 | 0.73320787492762 | 103.723684210526 | 0.0189172370877411 | 0.339507459446616 |
+
+| _deviceBufferCopy | {57, 1, 1} | {128, 1, 1} | 555153.25 | 5.312 | 1 | 15 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 2048 | 0 | 16 | 32 | 2147483647 | Warps | 0.470593968693347 | 0.461003138462738 | 0.735048001146296 | 103.723684210526 | 0.0203802659376165 | 0.338303652538174 |
+
+| _deviceBufferCopy | {38, 1, 1} | {128, 1, 1} | 695772.674 | 5.152 | 1 | 15 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 2048 | 0 | 16 | 32 | 2147483647 | Warps | 0.321085336370357 | 0.315011288936842 | 0.731817517180874 | 103.723684210526 | 0.0189172370877411 | 0.226726013383506 |
+
+| _nodeMatrixTransform | {19, 1, 1} | {128, 1, 1} | 836581.762 | 5.568 | 1 | 20 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 3072 | 0 | 16 | 21 | 2147483647 | Warps | 0.142143887059634 | 0.134138205608281 | 0.689421364985163 | 82.0131578947368 | 0.0563209689629069 | 0.114977829328735 |
+
+| _vertexTransformAndAssembly | {19, 1, 1} | {128, 1, 1} | 980123.17 | 11.552 | 1 | 26 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.174054758800522 | 0.162814304339728 | 0.817697228144989 | 230.039473684211 | 0.0645799892990904 | 0.11114078796763 |
+
+| _primitiveAssembly | {99, 1, 1} | {128, 1, 1} | 1120466.018 | 38.304 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.115557250009464 | 0.11323458632792 | 0.895828892549172 | 105.752525252525 | 0.0200996794346819 | 0.583518889917815 |
+
+| initDepth | {200, 200, 1} | {8, 8, 1} | 1259768.29 | 143.424 | 1 | 8 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 512 | 0 | 32 | 128 | 2147483647 | Warps, Blocks | 1.30642572328904 | 1.30533658307753 | 0.986535520738316 | 24 | 0.000833679398751667 | 0.781278960750559 |
+
+| _rasterize_tris | {33, 1, 1} | {128, 1, 1} | 1401854.626 | 326.752 | 0.1875 | 142 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 18432 | 0 | 16 | 3 | 2147483647 | Registers | 0.503053161465813 | 0.502931061078438 | 0.674139744156822 | 8768.4696969697 | 0.000242718656254238 | 0.101232220989451 |
+
+| render | {200, 200, 1} | {8, 8, 1} | 1557381.634 | 1886.208 | 0.5625 | 53 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 3584 | 0 | 32 | 18 | 2147483647 | Registers | 0.363815022072252 | 0.363740384823104 | 0.999368286315479 | 88.8185 | 0.000205151641958886 | 0.484978208360749 |
+
+| sendImageToPBO | {200, 200, 1} | {8, 8, 1} | 1703430.626 | 687.872 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 2048 | 0 | 32 | 32 | 2147483647 | Warps, Registers, Blocks | 0.927412316662304 | 0.927167259847483 | 0.995620745325279 | 82.4798375 | 0.00026423717953477 | 0.75495835098774 |
+
+| _vertexTransformAndAssembly | {19, 1, 1} | {128, 1, 1} | 1914064.418 | 11.584 | 1 | 26 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.175270783513856 | 0.163951798190088 | 0.817941244151262 | 230.039473684211 | 0.0645799892990904 | 0.111558923714915 |
+
+| _primitiveAssembly | {99, 1, 1} | {128, 1, 1} | 2057898.85 | 39.104 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.114616960350061 | 0.112284295535226 | 0.893177191848073 | 105.752525252525 | 0.0203518293253486 | 0.584685815328522 |
+
+| initDepth | {200, 200, 1} | {8, 8, 1} | 2199326.466 | 143.36 | 1 | 8 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 512 | 0 | 32 | 128 | 2147483647 | Warps, Blocks | 1.30544263346395 | 1.3043407372786 | 0.986825415979513 | 24 | 0.000844078596102667 | 0.783281883928917 |
+
+| _rasterize_tris | {33, 1, 1} | {128, 1, 1} | 2348178.338 | 319.744 | 0.1875 | 142 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 18432 | 0 | 16 | 3 | 2147483647 | Registers | 0.504485154769425 | 0.504363169015619 | 0.67695660765213 | 8770.37878787879 | 0.000241802464657979 | 0.101051494347002 |
+
+| render | {200, 200, 1} | {8, 8, 1} | 2508212.162 | 1885.056 | 0.5625 | 53 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 3584 | 0 | 32 | 18 | 2147483647 | Registers | 0.363750421744663 | 0.363676974706785 | 0.99888229655287 | 88.8185 | 0.000201916021227213 | 0.482364211446398 |
+
+| sendImageToPBO | {200, 200, 1} | {8, 8, 1} | 2654513.986 | 690.208 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 2048 | 0 | 32 | 32 | 2147483647 | Warps, Registers, Blocks | 0.926378776895369 | 0.926080955050936 | 0.99616009999259 | 82.4798375 | 0.000321490357789074 | 0.778320699223218 |
+
+| _vertexTransformAndAssembly | {19, 1, 1} | {128, 1, 1} | 2801809.57 | 11.52 | 1 | 26 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.173973750349065 | 0.162738527413199 | 0.815284207330955 | 230.039473684211 | 0.0645799892990904 | 0.111523404313634 |
+
+| _primitiveAssembly | {99, 1, 1} | {128, 1, 1} | 2944743.938 | 51.52 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.116215782705739 | 0.1138505789312 | 0.894600773402729 | 105.752525252525 | 0.0203518293253486 | 0.583825450146154 |
+
+| initDepth | {200, 200, 1} | {8, 8, 1} | 3085938.178 | 143.52 | 1 | 8 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 512 | 0 | 32 | 128 | 2147483647 | Warps, Blocks | 1.30639619858195 | 1.30520791554217 | 0.987029979333852 | 24 | 0.00090958856207994 | 0.778739854232955 |
+
+| _rasterize_tris | {33, 1, 1} | {128, 1, 1} | 3230410.018 | 322.176 | 0.1875 | 142 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 18432 | 0 | 16 | 3 | 2147483647 | Registers | 0.506254518512946 | 0.506132056698071 | 0.677341442533009 | 8766.92424242424 | 0.000241897722187377 | 0.101677145381831 |
+
+| render | {200, 200, 1} | {8, 8, 1} | 3391765.122 | 1886.656 | 0.5625 | 53 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 3584 | 0 | 32 | 18 | 2147483647 | Registers | 0.36346732366392 | 0.36339214416135 | 0.998976255336367 | 88.8185 | 0.000206839783592118 | 0.48615563432843 |
+
+| sendImageToPBO | {200, 200, 1} | {8, 8, 1} | 3540035.458 | 686.432 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 2048 | 0 | 32 | 32 | 2147483647 | Warps, Registers, Blocks | 0.930332241882126 | 0.930066826132246 | 0.996170911848128 | 82.4798375 | 0.000285291359292877 | 0.756484483628558 |
+
+| _vertexTransformAndAssembly | {19, 1, 1} | {128, 1, 1} | 3700068.578 | 22.688 | 1 | 26 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.175552299368801 | 0.164215133754133 | 0.795516700291415 | 230.039473684211 | 0.0645799892990904 | 0.111182808279518 |
+
+| _primitiveAssembly | {99, 1, 1} | {128, 1, 1} | 3837571.714 | 47.968 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.11571838458207 | 0.113355348635773 | 0.893273369118429 | 105.752525252525 | 0.0204205749573109 | 0.585364484487167 |
+
+| initDepth | {200, 200, 1} | {8, 8, 1} | 3979133.474 | 143.552 | 1 | 8 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 512 | 0 | 32 | 128 | 2147483647 | Warps, Blocks | 1.30471047283971 | 1.30358002453718 | 0.986924911885713 | 24 | 0.000866436137412088 | 0.777236894760645 |
+
+| _rasterize_tris | {33, 1, 1} | {128, 1, 1} | 4120905.41 | 325.664 | 0.1875 | 142 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 18432 | 0 | 16 | 3 | 2147483647 | Registers | 0.504966372587629 | 0.504833354485392 | 0.677549096668578 | 8769.26515151515 | 0.000263419723487021 | 0.101391960032109 |
+
+| render | {200, 200, 1} | {8, 8, 1} | 4278789.09 | 1882.944 | 0.5625 | 53 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 3584 | 0 | 32 | 18 | 2147483647 | Registers | 0.363210712391492 | 0.36313548377469 | 0.998734777440165 | 88.8185 | 0.000207121139976744 | 0.486818149632364 |
+
+| sendImageToPBO | {200, 200, 1} | {8, 8, 1} | 4427674.018 | 691.904 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 2048 | 0 | 32 | 32 | 2147483647 | Warps, Registers, Blocks | 0.931281130459775 | 0.931014174487334 | 0.994810844043188 | 82.4798375 | 0.000286654548997095 | 0.758627577765074 |
+
+| _vertexTransformAndAssembly | {19, 1, 1} | {128, 1, 1} | 4586458.85 | 11.424 | 1 | 26 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.175651291304838 | 0.164307732792 | 0.81038842345773 | 230.039473684211 | 0.0645799892990904 | 0.111276727251733 |
+
+| _primitiveAssembly | {99, 1, 1} | {128, 1, 1} | 4730588.578 | 39.008 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.1161691224619 | 0.113786235772645 | 0.896196459444323 | 105.752525252525 | 0.0205122207928897 | 0.582753447692479 |
+
+| initDepth | {200, 200, 1} | {8, 8, 1} | 4876308.93 | 143.584 | 1 | 8 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 512 | 0 | 32 | 128 | 2147483647 | Warps, Blocks | 1.3079230920539 | 1.30680074079267 | 0.986840001880671 | 24 | 0.000858117169160445 | 0.773479600289426 |
+
+| _rasterize_tris | {33, 1, 1} | {128, 1, 1} | 5020291.202 | 405.856 | 0.1875 | 142 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 18432 | 0 | 16 | 3 | 2147483647 | Registers | 0.49333035558098 | 0.493205732071651 | 0.625015602147618 | 9803.9696969697 | 0.00025261674640446 | 0.0996624718136 |
+
+| render | {200, 200, 1} | {8, 8, 1} | 5176772.514 | 1884.896 | 0.5625 | 53 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 3584 | 0 | 32 | 18 | 2147483647 | Registers | 0.36351287326389 | 0.363442911956224 | 0.998955454293914 | 88.89795 | 0.000192458954859307 | 0.484056560260082 |
+
+| sendImageToPBO | {200, 200, 1} | {8, 8, 1} | 5326698.53 | 685.824 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 2048 | 0 | 32 | 32 | 2147483647 | Warps, Registers, Blocks | 0.927292534485503 | 0.927048354517842 | 0.994841228243049 | 82.4806625 | 0.000263325712846795 | 0.774238981355545 |
+
+| _vertexTransformAndAssembly | {19, 1, 1} | {128, 1, 1} | 5486604.674 | 11.648 | 1 | 26 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.173639176119736 | 0.16242555998402 | 0.811619665208868 | 230.039473684211 | 0.0645799892990904 | 0.111282691996236 |
+
+| _primitiveAssembly | {99, 1, 1} | {128, 1, 1} | 5628228.418 | 47.168 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.116148524912647 | 0.113784689957234 | 0.891605901305749 | 105.752525252525 | 0.0203518293253486 | 0.582163121415903 |
+
+| initDepth | {200, 200, 1} | {8, 8, 1} | 5774274.882 | 143.552 | 1 | 8 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 512 | 0 | 32 | 128 | 2147483647 | Warps, Blocks | 1.30558975236977 | 1.30446125750065 | 0.986932732539427 | 24 | 0.00086435640833633 | 0.780488055935126 |
+
+| _rasterize_tris | {33, 1, 1} | {128, 1, 1} | 5921743.586 | 654.272 | 0.1875 | 142 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 18432 | 0 | 16 | 3 | 2147483647 | Registers | 0.458657755661932 | 0.45855202639675 | 0.455602138356812 | 10744.0303030303 | 0.000230518864832056 | 0.0926911417449346 |
+
+| render | {200, 200, 1} | {8, 8, 1} | 6079092.866 | 1885.632 | 0.5625 | 53 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 3584 | 0 | 32 | 18 | 2147483647 | Registers | 0.362860213381505 | 0.362786134297251 | 0.998783037794609 | 88.7020125 | 0.000204153229044707 | 0.484539905338922 |
+
+| sendImageToPBO | {200, 200, 1} | {8, 8, 1} | 6228437.858 | 686.464 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 2048 | 0 | 32 | 32 | 2147483647 | Warps, Registers, Blocks | 0.929535358224039 | 0.929276510358985 | 0.994909086833038 | 82.481375 | 0.000278470165512485 | 0.738158342349256 |
+
+| _vertexTransformAndAssembly | {19, 1, 1} | {128, 1, 1} | 6382768.258 | 13.184 | 1 | 26 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.173487668359154 | 0.162283836592996 | 0.814047151277014 | 230.039473684211 | 0.0645799892990904 | 0.110556126463275 |
+
+| _primitiveAssembly | {99, 1, 1} | {128, 1, 1} | 6523928.866 | 45.376 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.116198666532577 | 0.113825822952709 | 0.89204005431093 | 105.752525252525 | 0.0204205749573109 | 0.58194043495837 |
+
+| initDepth | {200, 200, 1} | {8, 8, 1} | 6668118.242 | 143.456 | 1 | 8 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 512 | 0 | 32 | 128 | 2147483647 | Warps, Blocks | 1.30563705073658 | 1.30452684403702 | 0.986909583456267 | 24 | 0.000850318010609721 | 0.77922992982104 |
+
+| _rasterize_tris | {33, 1, 1} | {128, 1, 1} | 6817917.378 | 651.904 | 0.1875 | 142 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 18432 | 0 | 16 | 3 | 2147483647 | Registers | 0.459179273908334 | 0.459083784936825 | 0.454007480178995 | 10744.5151515152 | 0.000207955752655313 | 0.0924428655354529 |
+
+| render | {200, 200, 1} | {8, 8, 1} | 6976368.258 | 1885.696 | 0.5625 | 53 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 3584 | 0 | 32 | 18 | 2147483647 | Registers | 0.363022189397157 | 0.362947974971684 | 0.999068201897935 | 88.7020125 | 0.000204434956433234 | 0.483385836019774 |
+
+| sendImageToPBO | {200, 200, 1} | {8, 8, 1} | 7126343.042 | 691.776 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 2048 | 0 | 32 | 32 | 2147483647 | Warps, Registers, Blocks | 0.931502994388036 | 0.931235979793261 | 0.995279772339087 | 82.481375 | 0.000286649207124051 | 0.772856491247948 |
+
+| _vertexTransformAndAssembly | {19, 1, 1} | {128, 1, 1} | 7282935.682 | 11.52 | 1 | 26 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.175990357724649 | 0.164624902306048 | 0.801925545571245 | 230.039473684211 | 0.0645799892990904 | 0.111629441412581 |
+
+| _primitiveAssembly | {99, 1, 1} | {128, 1, 1} | 7423423.554 | 38.912 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.115558883800399 | 0.113214994403863 | 0.889113765834195 | 105.752525252525 | 0.0202830740437478 | 0.581825974338674 |
+
+| initDepth | {200, 200, 1} | {8, 8, 1} | 7568577.538 | 143.552 | 1 | 8 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 512 | 0 | 32 | 128 | 2147483647 | Warps, Blocks | 1.30524030013659 | 1.30411482187625 | 0.98702283423392 | 24 | 0.000862276670602506 | 0.779747015877279 |
+
+| _rasterize_tris | {33, 1, 1} | {128, 1, 1} | 7716841.378 | 654.912 | 0.1875 | 142 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 18432 | 0 | 16 | 3 | 2147483647 | Registers | 0.459790335622519 | 0.459687234750311 | 0.453431359577552 | 10741.2121212121 | 0.00022423453522104 | 0.0923711830600096 |
+
+| render | {200, 200, 1} | {8, 8, 1} | 7879058.914 | 1886.176 | 0.5625 | 53 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 3584 | 0 | 32 | 18 | 2147483647 | Registers | 0.362579180310165 | 0.362506639759231 | 0.999144957561782 | 88.7020125 | 0.000200068164068857 | 0.481828407766568 |
+
+| sendImageToPBO | {200, 200, 1} | {8, 8, 1} | 8029371.01 | 683.424 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 2048 | 0 | 32 | 32 | 2147483647 | Warps, Registers, Blocks | 0.926242422005649 | 0.925968778543995 | 0.995638147452664 | 82.481375 | 0.000295433954603164 | 0.7768287461036 |
+
+| _vertexTransformAndAssembly | {19, 1, 1} | {128, 1, 1} | 8186013.058 | 14.208 | 1 | 26 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.176397304490628 | 0.165005568454235 | 0.810727676180274 | 230.039473684211 | 0.0645799892990904 | 0.111806475108994 |
+
+| _primitiveAssembly | {99, 1, 1} | {128, 1, 1} | 8328090.402 | 38.976 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.115746494356042 | 0.11338818897851 | 0.885775613967767 | 105.752525252525 | 0.0203747456080844 | 0.583423653891078 |
+
+| initDepth | {200, 200, 1} | {8, 8, 1} | 8468803.906 | 143.712 | 1 | 8 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 512 | 0 | 32 | 128 | 2147483647 | Warps, Blocks | 1.3073168309153 | 1.30618820267143 | 0.987170171185268 | 24 | 0.00086331654055168 | 0.782551117262802 |
+
+| _rasterize_tris | {33, 1, 1} | {128, 1, 1} | 8618424.354 | 654.656 | 0.1875 | 142 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 18432 | 0 | 16 | 3 | 2147483647 | Registers | 0.463106124172719 | 0.463007812359657 | 0.450372603013774 | 10739.2803030303 | 0.00021228787081613 | 0.092216246879781 |
+
+| render | {200, 200, 1} | {8, 8, 1} | 8775178.05 | 1886.624 | 0.5625 | 53 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 3584 | 0 | 32 | 18 | 2147483647 | Registers | 0.362880292301227 | 0.362807947093003 | 0.998802040193112 | 88.7020125 | 0.000199363839147337 | 0.48478798899915 |
+
+| sendImageToPBO | {200, 200, 1} | {8, 8, 1} | 8923491.33 | 687.232 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 2048 | 0 | 32 | 32 | 2147483647 | Warps, Registers, Blocks | 0.927765989327423 | 0.927498500205501 | 0.995753185950326 | 82.481375 | 0.000288315291785529 | 0.752960923920567 |
+
+| _vertexTransformAndAssembly | {19, 1, 1} | {128, 1, 1} | 9082697.09 | 11.744 | 1 | 26 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.174053137892178 | 0.162812788109628 | 0.817020467168835 | 230.039473684211 | 0.0645799892990904 | 0.110981528687737 |
+
+| _primitiveAssembly | {99, 1, 1} | {128, 1, 1} | 9221597.25 | 38.688 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.115678952210857 | 0.113324673918926 | 0.896223898333859 | 105.752525252525 | 0.0203518293253486 | 0.582956583474044 |
+
+| initDepth | {200, 200, 1} | {8, 8, 1} | 9363803.778 | 143.232 | 1 | 8 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 512 | 0 | 32 | 128 | 2147483647 | Warps, Blocks | 1.30588224101164 | 1.3047786157658 | 0.986798551502146 | 24 | 0.000845118503932091 | 0.778370307878584 |
+
+| _rasterize_tris | {33, 1, 1} | {128, 1, 1} | 9511359.234 | 656.32 | 0.1875 | 142 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 18432 | 0 | 16 | 3 | 2147483647 | Registers | 0.460226212675217 | 0.460116866922975 | 0.455086256159309 | 10742.9166666667 | 0.000237591317553134 | 0.0922243475722061 |
+
+| render | {200, 200, 1} | {8, 8, 1} | 9669576.514 | 1884.928 | 0.5625 | 53 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 3584 | 0 | 32 | 18 | 2147483647 | Registers | 0.363539269690931 | 0.363463310860515 | 0.998935968273476 | 88.7020125 | 0.000208942573056637 | 0.482617185879635 |
+
+| sendImageToPBO | {200, 200, 1} | {8, 8, 1} | 9820362.722 | 683.584 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 2048 | 0 | 32 | 32 | 2147483647 | Warps, Registers, Blocks | 0.92372544524711 | 0.923468075357456 | 0.995888142765155 | 82.481375 | 0.000278621630462268 | 0.758441811288614 |
+
+| _vertexTransformAndAssembly | {19, 1, 1} | {128, 1, 1} | 9970328.45 | 24.608 | 1 | 26 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.175633134426538 | 0.164290748484706 | 0.813259457393963 | 230.039473684211 | 0.0645799892990904 | 0.111500828516587 |
+
+| _primitiveAssembly | {99, 1, 1} | {128, 1, 1} | 10113022.53 | 38.528 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.116594973964229 | 0.114230069474184 | 0.893192837129979 | 105.752525252525 | 0.0202830740437478 | 0.58302719413925 |
+
+| initDepth | {200, 200, 1} | {8, 8, 1} | 10261029.186 | 143.456 | 1 | 8 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 512 | 0 | 32 | 128 | 2147483647 | Warps, Blocks | 1.30387165807331 | 1.3027432923987 | 0.987135470388876 | 24 | 0.000865396273956464 | 0.774493731424646 |
+
+| _rasterize_tris | {33, 1, 1} | {128, 1, 1} | 10411357.922 | 653.856 | 0.1875 | 142 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 18432 | 0 | 16 | 3 | 2147483647 | Registers | 0.464229002976853 | 0.464137367226343 | 0.449384437316438 | 10743.9924242424 | 0.000197393419890348 | 0.0922061909048464 |
+
+| render | {200, 200, 1} | {8, 8, 1} | 10570921.442 | 1886.816 | 0.5625 | 53 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 3584 | 0 | 32 | 18 | 2147483647 | Registers | 0.363411372703308 | 0.36333769301365 | 0.999078394761746 | 88.7020125 | 0.000202744589720469 | 0.482583280075321 |
+
+| sendImageToPBO | {200, 200, 1} | {8, 8, 1} | 10723421.282 | 688.256 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 2048 | 0 | 32 | 32 | 2147483647 | Warps, Registers, Blocks | 0.928251171528411 | 0.927984386090423 | 0.995252711876269 | 82.481375 | 0.000287406519022236 | 0.762500189990782 |
+
+| _vertexTransformAndAssembly | {19, 1, 1} | {128, 1, 1} | 10887967.81 | 11.52 | 1 | 26 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.176131330455925 | 0.164756771019847 | 0.807503234152652 | 230.039473684211 | 0.0645799892990904 | 0.111228946861229 |
+
+| _primitiveAssembly | {99, 1, 1} | {128, 1, 1} | 11035042.082 | 44.448 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 4096 | 0 | 16 | 16 | 2147483647 | Warps, Registers | 0.115804645189955 | 0.113455770996958 | 0.89360625574977 | 105.752525252525 | 0.0202830740437478 | 0.582381305423004 |
+
+| initDepth | {200, 200, 1} | {8, 8, 1} | 11177136.546 | 143.488 | 1 | 8 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 512 | 0 | 32 | 128 | 2147483647 | Warps, Blocks | 1.30669812699956 | 1.30555848324023 | 0.986513409447657 | 24 | 0.000872155347727713 | 0.777465583614206 |
+
+| _rasterize_tris | {33, 1, 1} | {128, 1, 1} | 11319894.274 | 652.864 | 0.1875 | 142 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 4 | 18432 | 0 | 16 | 3 | 2147483647 | Registers | 0.460588506523447 | 0.460495639128995 | 0.45343997220165 | 10743.7121212121 | 0.000201627685314173 | 0.0922890749128351 |
+
+| render | {200, 200, 1} | {8, 8, 1} | 11479944.834 | 1887.52 | 0.5625 | 53 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 3584 | 0 | 32 | 18 | 2147483647 | Registers | 0.363262709888734 | 0.363188497462984 | 0.999024777094099 | 88.7020125 | 0.000204294092758817 | 0.485356409064251 |
+
+| sendImageToPBO | {200, 200, 1} | {8, 8, 1} | 11627693.218 | 688.992 | 1 | 28 | 0 | 0 | PREFER_SHARED | OFF | OFF | 0 | GeForce GTX 970M | 1 | 1 | cis565_rasterizer.exe | 2 | 2048 | 0 | 32 | 32 | 2147483647 | Warps, Registers, Blocks | 0.92939090215555 | 0.929112950168869 | 0.995431520462426 | 82.481375 | 0.000299068977365902 | 0.771455499632248 |
+
+
+
+
diff --git a/graphs/pipeline_timing.png b/graphs/pipeline_timing.png
new file mode 100644
index 0000000..7a40c2e
Binary files /dev/null and b/graphs/pipeline_timing.png differ
diff --git a/images/AA.PNG b/images/AA.PNG
new file mode 100644
index 0000000..b40e71b
Binary files /dev/null and b/images/AA.PNG differ
diff --git a/images/checkerboard.PNG b/images/checkerboard.PNG
new file mode 100644
index 0000000..d20df39
Binary files /dev/null and b/images/checkerboard.PNG differ
diff --git a/images/cow.PNG b/images/cow.PNG
new file mode 100644
index 0000000..22cfcca
Binary files /dev/null and b/images/cow.PNG differ
diff --git a/images/duck.PNG b/images/duck.PNG
new file mode 100644
index 0000000..1af70b8
Binary files /dev/null and b/images/duck.PNG differ
diff --git a/images/duck.gif b/images/duck.gif
new file mode 100644
index 0000000..c8330f5
Binary files /dev/null and b/images/duck.gif differ
diff --git a/images/no_AA.PNG b/images/no_AA.PNG
new file mode 100644
index 0000000..ff956ec
Binary files /dev/null and b/images/no_AA.PNG differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a57f69f..ce4f1b6 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -6,5 +6,5 @@ set(SOURCE_FILES
cuda_add_library(src
${SOURCE_FILES}
- OPTIONS -arch=sm_20
+ OPTIONS -arch=sm_50
)
diff --git a/src/rasterize.cu b/src/rasterize.cu
index 1262a09..450f6e9 100644
--- a/src/rasterize.cu
+++ b/src/rasterize.cu
@@ -17,6 +17,7 @@
#include "rasterize.h"
#include
#include
+#include
namespace {
@@ -43,10 +44,10 @@ namespace {
glm::vec3 eyePos; // eye space position used for shading
glm::vec3 eyeNor; // eye space normal used for shading, cuz normal will go wrong after perspective transformation
- // glm::vec3 col;
+ glm::vec3 col;
glm::vec2 texcoord0;
TextureData* dev_diffuseTex = NULL;
- // int texWidth, texHeight;
+ int texWidth, texHeight;
// ...
};
@@ -62,10 +63,11 @@ namespace {
// The attributes listed below might be useful,
// but always feel free to modify on your own
- // glm::vec3 eyePos; // eye space position used for shading
- // glm::vec3 eyeNor;
- // VertexAttributeTexcoord texcoord0;
- // TextureData* dev_diffuseTex;
+ glm::vec3 eyePos; // eye space position used for shading
+ glm::vec3 eyeNor;
+ VertexAttributeTexcoord texcoord0;
+ TextureData* dev_diffuseTex;
+ int texWidth, texHeight;
// ...
};
@@ -100,9 +102,10 @@ namespace {
static std::map> mesh2PrimitivesMap;
-
static int width = 0;
static int height = 0;
+static int baseWidth = 0;
+static int baseHeight = 0;
static int totalNumPrimitives = 0;
static Primitive *dev_primitives = NULL;
@@ -110,21 +113,34 @@ static Fragment *dev_fragmentBuffer = NULL;
static glm::vec3 *dev_framebuffer = NULL;
static int * dev_depth = NULL; // you might need this buffer when doing depth test
+static int * dev_mutex = NULL; // mutex buffer for locking depth buffer write
/**
* Kernel that writes the image to the OpenGL PBO directly.
*/
__global__
-void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
+void sendImageToPBO(uchar4 *pbo, int w, int h, int antialias, glm::vec3 *image) {
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
int index = x + (y * w);
if (x < w && y < h) {
glm::vec3 color;
- color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0;
- color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0;
- color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0;
+ for (int i = 0; i < antialias; ++i)
+ {
+ int AA_x = x * antialias + i;
+ for (int j = 0; j < antialias; ++j)
+ {
+ int AA_y = y * antialias + j;
+ int AA_index = AA_x + AA_y * antialias * w;
+ color += image[AA_index];
+ }
+ }
+ color /= (antialias * antialias);
+
+ color.x = glm::clamp(color.x, 0.0f, 1.0f) * 255.0;
+ color.y = glm::clamp(color.y, 0.0f, 1.0f) * 255.0;
+ color.z = glm::clamp(color.z, 0.0f, 1.0f) * 255.0;
// Each thread writes one pixel location in the texture (textel)
pbo[index].w = 0;
pbo[index].x = color.x;
@@ -136,6 +152,36 @@ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
/**
* Writes fragment colors to the framebuffer
*/
+
+/**
+ * Fetches one RGB texel from a texture stored as three consecutive channel values per texel.
+ *
+ * texture  texel data
+ * w        texture width in texels (row stride)
+ * u, v     texel column / row; any fractional part is dropped
+ *
+ * Returns the three channels, each divided by 255.
+ * No bounds checking is done here: the caller must pass an in-range (u, v).
+ */
+__device__
+glm::vec3 getColor(TextureData* texture, int w, float u, float v)
+{
+    // Convert to integers *before* forming the offset: a float sum is only exact up to 2^24,
+    // and a fractional v multiplied by w would land on the wrong texel.
+    const int col = (int)u;
+    const int row = (int)v;
+    const int base = (col + row * w) * 3;
+    return glm::vec3(texture[base], texture[base + 1], texture[base + 2]) / 255.f;
+}
+
+/**
+ * Samples the fragment's diffuse texture with bilinear filtering.
+ *
+ * Reads frag.texcoord0, frag.texWidth, frag.texHeight and frag.dev_diffuseTex;
+ * the caller must make sure dev_diffuseTex is not NULL.
+ * The four texels around the sample point are blended by the sample's fractional
+ * position; texel indices are clamped to the texture edge.
+ *
+ * Returns the filtered colour.
+ */
+__device__
+glm::vec3 bilinearInterpolate(Fragment& frag)
+{
+    // source: https://en.wikipedia.org/wiki/Bilinear_filtering
+    // the half-texel shift puts texel centres on integer coordinates
+    float u = frag.texcoord0.x * frag.texWidth - 0.5f;
+    float v = frag.texcoord0.y * frag.texHeight - 0.5f;
+    int u_min = (int)glm::floor(u);
+    int v_min = (int)glm::floor(v);
+    float u_alpha = u - u_min;
+    float v_alpha = v - v_min;
+    float u_opposite = 1.f - u_alpha;
+    float v_opposite = 1.f - v_alpha;
+
+    // Clamp the 2x2 footprint to the texture: u_min can be -1 and u_min + 1 can equal texWidth
+    // (same for v), which would otherwise read outside the texel array.
+    int u_0 = glm::min(frag.texWidth - 1, glm::max(0, u_min));
+    int u_1 = glm::min(frag.texWidth - 1, glm::max(0, u_min + 1));
+    int v_0 = glm::min(frag.texHeight - 1, glm::max(0, v_min));
+    int v_1 = glm::min(frag.texHeight - 1, glm::max(0, v_min + 1));
+
+    glm::vec3 col_00 = getColor(frag.dev_diffuseTex, frag.texWidth, u_0, v_0);
+    glm::vec3 col_10 = getColor(frag.dev_diffuseTex, frag.texWidth, u_1, v_0);
+    glm::vec3 col_01 = getColor(frag.dev_diffuseTex, frag.texWidth, u_0, v_1);
+    glm::vec3 col_11 = getColor(frag.dev_diffuseTex, frag.texWidth, u_1, v_1);
+
+    // blend along u first, then along v
+    glm::vec3 interp_color = (
+        (col_00 * u_opposite + col_10 * u_alpha) * v_opposite +
+        (col_01 * u_opposite + col_11 * u_alpha) * v_alpha);
+    return interp_color;
+}
+
__global__
void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
@@ -143,10 +189,17 @@ void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
int index = x + (y * w);
if (x < w && y < h) {
- framebuffer[index] = fragmentBuffer[index].color;
-
- // TODO: add your fragment shader code here
-
+        // Fragment shader: one thread per pixel of the w x h fragment buffer.
+        // Work on a local copy; the texture lookup below replaces frag.color.
+        Fragment frag = fragmentBuffer[index];
+#if TEXTURE == 1
+        // textured fragments take their base colour from the diffuse texture
+        if (frag.dev_diffuseTex != NULL)
+        {
+            frag.color = bilinearInterpolate(frag);
+        }
+#endif // TEXTURE
+#if RAST_TRIS == 1
+        // Lambertian shading, light @(1,1,1)
+        // Clamp at zero: a surface facing away from the light must contribute black, not a
+        // negative colour (negative values would darken neighbours once samples are averaged).
+        float lambert = glm::max(0.f, glm::dot(frag.eyeNor, glm::normalize(glm::vec3(1.f) - frag.eyePos)));
+        frag.color *= lambert;
+#endif
+        // single write of the final colour to global memory
+        framebuffer[index] = frag.color;
}
}
@@ -154,8 +207,10 @@ void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
* Called once at the beginning of the program to allocate memory.
*/
void rasterizeInit(int w, int h) {
- width = w;
- height = h;
+    // Internal buffers are ANTIALIAS times larger per axis than the w x h output;
+    // baseWidth/baseHeight keep the output size.
+    width = w * ANTIALIAS;
+    height = h * ANTIALIAS;
+    baseWidth = w;
+    baseHeight = h;
cudaFree(dev_fragmentBuffer);
cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment));
cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
@@ -165,6 +220,8 @@ void rasterizeInit(int w, int h) {
cudaFree(dev_depth);
cudaMalloc(&dev_depth, width * height * sizeof(int));
+    cudaFree(dev_mutex);
+    cudaMalloc(&dev_mutex, width * height * sizeof(int));
+    // cudaMalloc does not clear memory. The rasterizer takes a pixel lock with
+    // atomicCAS(&mutex[i], 0, 1), so every lock must start at 0 or that pixel can never be acquired.
+    cudaMemset(dev_mutex, 0, width * height * sizeof(int));
checkCUDAError("rasterizeInit");
}
@@ -621,8 +678,6 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
}
-
-
__global__
void _vertexTransformAndAssembly(
int numVertices,
@@ -634,19 +689,30 @@ void _vertexTransformAndAssembly(
int vid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (vid < numVertices) {
- // TODO: Apply vertex transformation here
// Multiply the MVP matrix for each vertex position, this will transform everything into clipping space
// Then divide the pos by its w element to transform into NDC space
// Finally transform x and y to viewport space
+        glm::vec4 clip_pos = MVP * glm::vec4(primitive.dev_position[vid], 1.f); // to clipping space
+        glm::vec4 ndc_pos = clip_pos / clip_pos.w; // perspective divide
+        // NDC x,y in [-1, 1] map to [0, width] x [0, height]; (1 - y) flips the vertical axis.
+        // float literals (0.5f) keep the arithmetic in single precision on the device.
+        glm::vec4 ss_pos = glm::vec4((ndc_pos.x + 1.f) * width * 0.5f, (1.f - ndc_pos.y) * height * 0.5f, ndc_pos.z, ndc_pos.w); // to screen space
+
// TODO: Apply vertex assembly here
// Assemble all attribute arraies into the primitive array
-
+
+        primitive.dev_verticesOut[vid].pos = ss_pos;
+        // eye-space position and normal are kept for shading
+        primitive.dev_verticesOut[vid].eyePos = glm::vec3(MV * glm::vec4(primitive.dev_position[vid], 1.f));
+        primitive.dev_verticesOut[vid].eyeNor = glm::normalize(MV_normal * primitive.dev_normal[vid]);
+        // per-vertex colour: the normalized screen-space position
+        primitive.dev_verticesOut[vid].col = glm::normalize(glm::vec3(ss_pos));
+#if TEXTURE == 1
+        // NOTE(review): dev_texcoord0 is presumably NULL for meshes without texture coordinates -
+        // confirm against rasterizeSetBuffers. Such vertices are marked untextured instead of
+        // dereferencing the missing array.
+        if (primitive.dev_texcoord0 != NULL) {
+            primitive.dev_verticesOut[vid].texcoord0 = primitive.dev_texcoord0[vid];
+            primitive.dev_verticesOut[vid].dev_diffuseTex = primitive.dev_diffuseTex;
+            primitive.dev_verticesOut[vid].texWidth = primitive.diffuseTexWidth;
+            primitive.dev_verticesOut[vid].texHeight = primitive.diffuseTexHeight;
+        } else {
+            primitive.dev_verticesOut[vid].dev_diffuseTex = NULL;
+        }
+#endif
}
}
-
-
static int curPrimitiveBeginId = 0;
__global__
@@ -657,28 +723,132 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_
if (iid < numIndices) {
- // TODO: uncomment the following code for a start
// This is primitive assembly for triangles
- //int pid; // id for cur primitives vector
- //if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
- // pid = iid / (int)primitive.primitiveType;
- // dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
- // = primitive.dev_verticesOut[primitive.dev_indices[iid]];
- //}
+ int pid; // id for cur primitives vector
+ if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
+ pid = iid / (int)primitive.primitiveType;
+ dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
+ = primitive.dev_verticesOut[primitive.dev_indices[iid]];
+ }
+ // other primitive types (point, line)
+ else if (primitive.primitiveMode == TINYGLTF_MODE_LINE) {
+ pid = iid / (int)primitive.primitiveType;
+ dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
+ = primitive.dev_verticesOut[primitive.dev_indices[iid]];
+ }
+ else if (primitive.primitiveMode == TINYGLTF_MODE_POINTS) {
+ pid = iid / (int)primitive.primitiveType;
+ dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
+ = primitive.dev_verticesOut[primitive.dev_indices[iid]];
+ }
+ }
+
+}
+
+/**
+ * Rasterizes triangles, one thread per primitive.
+ *
+ * Launch: 1D grid; pid = blockIdx.x * blockDim.x + threadIdx.x indexes `primitives`,
+ * so the grid must supply at least numPrimitives threads.
+ *
+ * numPrimitives   number of entries in `primitives`
+ * primitives      assembled primitives; v[0..2] are read as the triangle's vertices
+ * fragmentBuffer  w * h fragments, overwritten wherever this triangle passes the depth test
+ * depth           w * h integer depth buffer; the smaller value wins
+ *                 (presumably reset by initDepth before each launch - confirm)
+ * mutex           w * h per-pixel locks guarding depth/fragmentBuffer; every entry must be 0 on entry
+ * w, h            raster size in pixels
+ */
+__global__
+void _rasterize_tris(int numPrimitives, Primitive* primitives, Fragment *fragmentBuffer, int *depth, int *mutex, int w, int h)
+{
+    int pid = (blockIdx.x * blockDim.x) + threadIdx.x;
+    // valid ids are 0 .. numPrimitives - 1; ">" would let thread numPrimitives read past the array
+    if (pid >= numPrimitives) return;
+
+    Primitive prim = primitives[pid];
+    VertexOut v0 = prim.v[0];
+    VertexOut v1 = prim.v[1];
+    VertexOut v2 = prim.v[2];
+
+    // this primitive's triangle
+    glm::vec3 tri[3] = {glm::vec3(v0.pos), glm::vec3(v1.pos), glm::vec3(v2.pos)};
+
+    // this primitive's bounding box
+    AABB boundingBox = getAABBForTriangle(tri);
+    // bounding box limits
+    // clamp to window
+    int xMin = glm::min(w - 1, glm::max(0, (int)boundingBox.min.x));
+    int yMin = glm::min(h - 1, glm::max(0, (int)boundingBox.min.y));
+    int xMax = glm::max(0, glm::min(w - 1, (int)boundingBox.max.x));
+    int yMax = glm::max(0, glm::min(h - 1, (int)boundingBox.max.y));
- // TODO: other primitive types (point, line)
+    // loop over all pixels in the bounding box
+    for (int i = xMin; i <= xMax; ++i)
+    {
+        for (int j = yMin; j <= yMax; ++j)
+        {
+            // this pixel in barycentric coordinates wrt this primitive
+            glm::vec3 barycentricCoordinate = calculateBarycentricCoordinate(tri, glm::vec2(i, j));
+
+            // is this pixel in this primitive?
+            if (isBarycentricCoordInBounds(barycentricCoordinate))
+            {
+                int fragIdx = j * w + i;
+                // depth of this fragment as an integer; it depends only on this thread's data,
+                // so it is computed before taking the lock to keep the critical section short
+                int d = -getZAtCoordinate(barycentricCoordinate, tri) * INT_MAX;
+                // read/write from/to depth & frame buffer using mutex buffer
+                bool isSet;
+                do {
+                    // try to take this pixel's lock (0 = free, 1 = held)
+                    isSet = (atomicCAS(&mutex[fragIdx], 0, 1) == 0);
+                    if (isSet) {
+                        // if this fragment is shallower than previous fragment, draw
+                        if (d < depth[fragIdx])
+                        {
+                            depth[fragIdx] = d;
+                            Fragment& frag = fragmentBuffer[fragIdx];
+                            glm::mat3 verts = glm::mat3(v0.eyePos, v1.eyePos, v2.eyePos);
+                            glm::mat3 norms = glm::mat3(v0.eyeNor, v1.eyeNor, v2.eyeNor);
+                            // interpolated fragment position and normal
+                            frag.eyePos = verts * barycentricCoordinate;
+                            frag.eyeNor = norms * barycentricCoordinate;
+                            // interpolated fragment color
+                            glm::mat3 cols = glm::mat3(v0.col, v1.col, v2.col);
+                            frag.color = cols * barycentricCoordinate;
+#if TEXTURE == 1
+                            // fragment texture (taken from the first vertex)
+                            frag.dev_diffuseTex = v0.dev_diffuseTex;
+                            frag.texWidth = v0.texWidth;
+                            frag.texHeight = v0.texHeight;
+                            // perspective-correct depth
+                            float persp_z = 1.f / (
+                                barycentricCoordinate[0] / v0.eyePos.z +
+                                barycentricCoordinate[1] / v1.eyePos.z +
+                                barycentricCoordinate[2] / v2.eyePos.z
+                                );
+                            // perspective-correct texture coordinate
+                            frag.texcoord0 = persp_z * (
+                                barycentricCoordinate[0] * v0.texcoord0 / v0.eyePos.z +
+                                barycentricCoordinate[1] * v1.texcoord0 / v1.eyePos.z +
+                                barycentricCoordinate[2] * v2.texcoord0 / v2.eyePos.z);
+#endif // TEXTURE
+                        }
+                        // make the depth/fragment writes visible before any other thread can
+                        // observe the lock as free, then release it atomically
+                        __threadfence();
+                        atomicExch(&mutex[fragIdx], 0);
+                    }
+                } while (!isSet);
+            }
+        }
}
-
}
+/**
+ * Line rasterization kernel - not implemented: the body is empty, so a launch
+ * leaves fragmentBuffer, depth and mutex untouched.
+ * The parameter list mirrors _rasterize_tris.
+ */
+__global__
+void _rasterize_lines(int numPrimitives, Primitive* primitives, Fragment *fragmentBuffer, int *depth, int *mutex, int w, int h)
+{
+
+}
+
+/**
+ * Point rasterization kernel - not implemented: the body is empty, so a launch
+ * leaves fragmentBuffer, depth and mutex untouched.
+ * The parameter list mirrors _rasterize_tris.
+ */
+__global__
+void _rasterize_points(int numPrimitives, Primitive* primitives, Fragment *fragmentBuffer, int *depth, int *mutex, int w, int h)
+{
+
+}
/**
* Perform rasterization.
*/
void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) {
+
int sideLength2d = 8;
dim3 blockSize2d(sideLength2d, sideLength2d);
dim3 blockCount2d((width - 1) / blockSize2d.x + 1,
@@ -702,10 +872,10 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
- _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height);
+ _vertexTransformAndAssembly <<< numBlocksForVertices, numThreadsPerBlock >>>(p->numVertices, *p, MVP, MV, MV_normal, width, height);
checkCUDAError("Vertex Processing");
cudaDeviceSynchronize();
- _primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> >
+ _primitiveAssembly <<>>
(p->numIndices,
curPrimitiveBeginId,
dev_primitives,
@@ -720,17 +890,28 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
}
cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
- initDepth << > >(width, height, dev_depth);
+ initDepth <<>>(width, height, dev_depth);
// TODO: rasterize
-
-
+ dim3 numThreadsPerBlock(128);
+ dim3 numBlocksForPrimitives((totalNumPrimitives + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+
+#if RAST_TRIS == 1
+ _rasterize_tris <<>> (totalNumPrimitives, dev_primitives, dev_fragmentBuffer, dev_depth, dev_mutex, width, height);
+#endif
+#if RAST_LINES == 1
+ _rasterize_lines << > > (totalNumPrimitives, dev_primitives, dev_fragmentBuffer, dev_depth, dev_mutex, width, height);
+#endif
+#if RAST_POINTS == 1
+ _rasterize_points << > > (totalNumPrimitives, dev_primitives, dev_fragmentBuffer, dev_depth, dev_mutex, width, height);
+#endif
// Copy depthbuffer colors into framebuffer
- render << > >(width, height, dev_fragmentBuffer, dev_framebuffer);
+ render <<>>(width, height, dev_fragmentBuffer, dev_framebuffer);
checkCUDAError("fragment shader");
+
// Copy framebuffer into OpenGL buffer for OpenGL previewing
- sendImageToPBO<<>>(pbo, width, height, dev_framebuffer);
+ sendImageToPBO <<>>(pbo, baseWidth, baseHeight, ANTIALIAS, dev_framebuffer);
checkCUDAError("copy render result to pbo");
}
@@ -772,5 +953,8 @@ void rasterizeFree() {
cudaFree(dev_depth);
dev_depth = NULL;
+ cudaFree(dev_mutex);
+ dev_mutex = NULL;
+
checkCUDAError("rasterize Free");
}
diff --git a/src/rasterize.h b/src/rasterize.h
index 560aae9..de3a1cd 100644
--- a/src/rasterize.h
+++ b/src/rasterize.h
@@ -11,6 +11,15 @@
#include
#include
#include
+#include
+#include
+#include
+
+#define RAST_TRIS 1
+#define RAST_LINES 0
+#define RAST_POINTS 0
+#define TEXTURE 1
+#define ANTIALIAS 2
namespace tinygltf{
class Scene;