diff --git a/README.md b/README.md index 41b91f0..57de14f 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,85 @@ CUDA Rasterizer =============== -[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md) +* Zach Corse + * LinkedIn: https://www.linkedin.com/in/wzcorse/ + * Personal Website: https://wzcorse.com + * Twitter: @ZachCorse +* Tested on: Windows 10, i7-6700HQ @ 2.60GHz 32GB, NVIDIA GeForce GTX 970M (personal computer) -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4** +## README -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +![gif](images/duck.gif) -### (TODO: Your README) +Introduction +------------ -*DO NOT* leave the README to the last minute! It is a crucial part of the -project, and we will not be able to grade you without a good README. +This is a CUDA rasterizer that includes the standard rasterization pipeline, including: + +* Vertex shading +* Primitive assembly +* Rasterization +* Fragments to depth buffer +* Fragment shading +* Fragment to framebuffer writing + +For rasterization, I implemented a naive parallelization per triangle, checking all fragments in a particular triangle's bounding box (in barycentric coordinates) before adding a particular fragment to the scene fragment buffer, assuming it clears the depth buffer. The depth buffer itself is regulated by a mutex lock buffer, which ensures that a particular index in the depth buffer is not being written to or read from while another thread is writing to that index. + +As will be shown in the performance section below, a tile-based rendering system would be more efficient in some cases than parallelization by triangle primitive. + +Features +------------ + +Present features included in my rasterizer. See below for sample renders and performance analysis! + +1. Basic Rasterizing + * Rasterizes per triangle primitive + * Depth buffer regulated by mutex lock buffer +2. Textures + * Includes bilinear interpolation sampling +3. Color interpolation +4. Lambertian Shading +5. Anti-Aliasing + +Textures +------------ + +![pic1](images/checkerboard.PNG) + +*Shading includes textures* + +Textures are drawn using perspective-correct interpolation. UV coordinates, which are not discrete, are interpolated using neighboring pixel values using bilinear interpolation. + +Color Interpolation +------------ + +![pic1](images/cow.PNG) + +*Shading includes vertex color interpolation when no texture is specified.* + +Anti-Aliasing +------------ + +![pic10](images/no_AA.PNG) + +*No AA. Jagged edges can be seen on edge of duck.* + +![pic11](images/AA.PNG) + +*With AA X 2. Jagged edges are smoothed.* + +![pic11](graphs/AA_graph.png) + +We see here that, although anti-aliasing does provide a smoother picture, it comes with an associated drop in FPS. + +Performance +------------ + +From the graph below, which depicts the percentage of computational time each step in the rasterization pipeline takes, we can divine a few limitations of my implementation. First of all, we see that large triangles take a disproportionate amount of time to rasterize (the checkerboard is only two triangles). Because my implementation loops over all primitives, and for each primitive, it tests all pixels in that primitive's bounding box, larger triangles will have more pixel misses, and for those pixels that do fall within the triangle, these must be drawn serially. Conversely, the duck, although it has over 4,000 triangles, has relatively smaller triangles, which draw quickly in parallel. + +Now consider the duck's timing proportion. Here, the camera, in its default position, is zoomed out, such that the duck takes a relatively small proportion of the screen. However, while this means that fewer pixels need to be checked against each primitive, it also means that there will be proportionally fewer fragments to shade. Anti-aliasing X2, which quadruples the number of fragments, also quadruples the number of screen-space pixels that need to be checked in triangle bounding boxes, so this shouldn't affect the relative timing between rasterization and shading. I must therefore conclude that texture reading, bilinear interpolation, and lambertian shading, together, are relatively costly. + +![pic11](graphs/pipeline_timing.png) ### Credits diff --git a/graphs/AA_graph.png b/graphs/AA_graph.png new file mode 100644 index 0000000..1131ddc Binary files /dev/null and b/graphs/AA_graph.png differ diff --git a/graphs/data.xml b/graphs/data.xml new file mode 100644 index 0000000..3970a04 --- /dev/null +++ b/graphs/data.xml @@ -0,0 +1,210 @@ + + + + + + + + + +Function NameGrid +DimensionsBlock +DimensionsStart Time +(μs)Duration +(μs)OccupancyRegisters +per ThreadStatic Shared +Memory per +Block (bytes)Dynamic Shared +Memory per +Block (bytes)Cache +Configuration +ExecutedGlobal +Caching +RequestedGlobal +Caching +ExecutedLocal Memory +per Thread +(bytes)Device +NameContext +IDStream +IDProcess +NameOccupancy [0]: +Allocated Warps +Per BlockOccupancy [0]: +Allocated Registers +Per BlockOccupancy [0]: +Allocated Shared Memory +Per BlockOccupancy [0]: +Max Block Limit +WarpsOccupancy [0]: +Max Block Limit +RegistersOccupancy [0]: +Max Block Limit +Shared MemoryOccupancy [0]: +Block Limit ReasonInstruction Stats [2]: +GPU Issued IPCInstruction Stats [2]: +GPU Executed IPCInstruction Stats [2]: +GPU SM ActivityInstruction Stats [2]: +GPU SM Average IPWInstruction Stats [2]: +GPU SerializationAchieved Occupancy [1]: +Achieved Occupancy + +_deviceBufferCopy{99, 1, 1}{128, 1, 1}269080.83412.811500PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe42048016322147483647Warps0.862984077059170.8456850796147040.71435481354345597.77272727272730.02004555808656040.580820838076396 + +_deviceBufferCopy{57, 1, 1}{128, 1, 1}413704.8985.4411500PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe42048016322147483647Warps0.4759131293188550.4669101678183610.73320787492762103.7236842105260.01891723708774110.339507459446616 + +_deviceBufferCopy{57, 1, 1}{128, 1, 1}555153.255.31211500PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe42048016322147483647Warps0.4705939686933470.4610031384627380.735048001146296103.7236842105260.02038026593761650.338303652538174 + +_deviceBufferCopy{38, 1, 1}{128, 1, 1}695772.6745.15211500PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe42048016322147483647Warps0.3210853363703570.3150112889368420.731817517180874103.7236842105260.01891723708774110.226726013383506 + +_nodeMatrixTransform{19, 1, 1}{128, 1, 1}836581.7625.56812000PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe43072016212147483647Warps0.1421438870596340.1341382056082810.68942136498516382.01315789473680.05632096896290690.114977829328735 + +_vertexTransformAndAssembly{19, 1, 1}{128, 1, 1}980123.1711.55212600PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1740547588005220.1628143043397280.817697228144989230.0394736842110.06457998929909040.11114078796763 + +_primitiveAssembly{99, 1, 1}{128, 1, 1}1120466.01838.30412800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1155572500094640.113234586327920.895828892549172105.7525252525250.02009967943468190.583518889917815 + +initDepth{200, 200, 1}{8, 8, 1}1259768.29143.4241800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe25120321282147483647Warps, Blocks1.306425723289041.305336583077530.986535520738316240.0008336793987516670.781278960750559 + +_rasterize_tris{33, 1, 1}{128, 1, 1}1401854.626326.7520.187514200PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe41843201632147483647Registers0.5030531614658130.5029310610784380.6741397441568228768.46969696970.0002427186562542380.101232220989451 + +render{200, 200, 1}{8, 8, 1}1557381.6341886.2080.56255300PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe23584032182147483647Registers0.3638150220722520.3637403848231040.99936828631547988.81850.0002051516419588860.484978208360749 + +sendImageToPBO{200, 200, 1}{8, 8, 1}1703430.626687.87212800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe22048032322147483647Warps, Registers, Blocks0.9274123166623040.9271672598474830.99562074532527982.47983750.000264237179534770.75495835098774 + +_vertexTransformAndAssembly{19, 1, 1}{128, 1, 1}1914064.41811.58412600PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1752707835138560.1639517981900880.817941244151262230.0394736842110.06457998929909040.111558923714915 + +_primitiveAssembly{99, 1, 1}{128, 1, 1}2057898.8539.10412800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1146169603500610.1122842955352260.893177191848073105.7525252525250.02035182932534860.584685815328522 + +initDepth{200, 200, 1}{8, 8, 1}2199326.466143.361800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe25120321282147483647Warps, Blocks1.305442633463951.30434073727860.986825415979513240.0008440785961026670.783281883928917 + +_rasterize_tris{33, 1, 1}{128, 1, 1}2348178.338319.7440.187514200PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe41843201632147483647Registers0.5044851547694250.5043631690156190.676956607652138770.378787878790.0002418024646579790.101051494347002 + +render{200, 200, 1}{8, 8, 1}2508212.1621885.0560.56255300PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe23584032182147483647Registers0.3637504217446630.3636769747067850.9988822965528788.81850.0002019160212272130.482364211446398 + +sendImageToPBO{200, 200, 1}{8, 8, 1}2654513.986690.20812800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe22048032322147483647Warps, Registers, Blocks0.9263787768953690.9260809550509360.9961600999925982.47983750.0003214903577890740.778320699223218 + +_vertexTransformAndAssembly{19, 1, 1}{128, 1, 1}2801809.5711.5212600PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1739737503490650.1627385274131990.815284207330955230.0394736842110.06457998929909040.111523404313634 + +_primitiveAssembly{99, 1, 1}{128, 1, 1}2944743.93851.5212800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1162157827057390.11385057893120.894600773402729105.7525252525250.02035182932534860.583825450146154 + +initDepth{200, 200, 1}{8, 8, 1}3085938.178143.521800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe25120321282147483647Warps, Blocks1.306396198581951.305207915542170.987029979333852240.000909588562079940.778739854232955 + +_rasterize_tris{33, 1, 1}{128, 1, 1}3230410.018322.1760.187514200PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe41843201632147483647Registers0.5062545185129460.5061320566980710.6773414425330098766.924242424240.0002418977221873770.101677145381831 + +render{200, 200, 1}{8, 8, 1}3391765.1221886.6560.56255300PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe23584032182147483647Registers0.363467323663920.363392144161350.99897625533636788.81850.0002068397835921180.48615563432843 + +sendImageToPBO{200, 200, 1}{8, 8, 1}3540035.458686.43212800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe22048032322147483647Warps, Registers, Blocks0.9303322418821260.9300668261322460.99617091184812882.47983750.0002852913592928770.756484483628558 + +_vertexTransformAndAssembly{19, 1, 1}{128, 1, 1}3700068.57822.68812600PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1755522993688010.1642151337541330.795516700291415230.0394736842110.06457998929909040.111182808279518 + +_primitiveAssembly{99, 1, 1}{128, 1, 1}3837571.71447.96812800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.115718384582070.1133553486357730.893273369118429105.7525252525250.02042057495731090.585364484487167 + +initDepth{200, 200, 1}{8, 8, 1}3979133.474143.5521800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe25120321282147483647Warps, Blocks1.304710472839711.303580024537180.986924911885713240.0008664361374120880.777236894760645 + +_rasterize_tris{33, 1, 1}{128, 1, 1}4120905.41325.6640.187514200PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe41843201632147483647Registers0.5049663725876290.5048333544853920.6775490966685788769.265151515150.0002634197234870210.101391960032109 + +render{200, 200, 1}{8, 8, 1}4278789.091882.9440.56255300PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe23584032182147483647Registers0.3632107123914920.363135483774690.99873477744016588.81850.0002071211399767440.486818149632364 + +sendImageToPBO{200, 200, 1}{8, 8, 1}4427674.018691.90412800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe22048032322147483647Warps, Registers, Blocks0.9312811304597750.9310141744873340.99481084404318882.47983750.0002866545489970950.758627577765074 + +_vertexTransformAndAssembly{19, 1, 1}{128, 1, 1}4586458.8511.42412600PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1756512913048380.1643077327920.81038842345773230.0394736842110.06457998929909040.111276727251733 + +_primitiveAssembly{99, 1, 1}{128, 1, 1}4730588.57839.00812800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.11616912246190.1137862357726450.896196459444323105.7525252525250.02051222079288970.582753447692479 + +initDepth{200, 200, 1}{8, 8, 1}4876308.93143.5841800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe25120321282147483647Warps, Blocks1.30792309205391.306800740792670.986840001880671240.0008581171691604450.773479600289426 + +_rasterize_tris{33, 1, 1}{128, 1, 1}5020291.202405.8560.187514200PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe41843201632147483647Registers0.493330355580980.4932057320716510.6250156021476189803.96969696970.000252616746404460.0996624718136 + +render{200, 200, 1}{8, 8, 1}5176772.5141884.8960.56255300PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe23584032182147483647Registers0.363512873263890.3634429119562240.99895545429391488.897950.0001924589548593070.484056560260082 + +sendImageToPBO{200, 200, 1}{8, 8, 1}5326698.53685.82412800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe22048032322147483647Warps, Registers, Blocks0.9272925344855030.9270483545178420.99484122824304982.48066250.0002633257128467950.774238981355545 + +_vertexTransformAndAssembly{19, 1, 1}{128, 1, 1}5486604.67411.64812600PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1736391761197360.162425559984020.811619665208868230.0394736842110.06457998929909040.111282691996236 + +_primitiveAssembly{99, 1, 1}{128, 1, 1}5628228.41847.16812800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1161485249126470.1137846899572340.891605901305749105.7525252525250.02035182932534860.582163121415903 + +initDepth{200, 200, 1}{8, 8, 1}5774274.882143.5521800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe25120321282147483647Warps, Blocks1.305589752369771.304461257500650.986932732539427240.000864356408336330.780488055935126 + +_rasterize_tris{33, 1, 1}{128, 1, 1}5921743.586654.2720.187514200PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe41843201632147483647Registers0.4586577556619320.458552026396750.45560213835681210744.03030303030.0002305188648320560.0926911417449346 + +render{200, 200, 1}{8, 8, 1}6079092.8661885.6320.56255300PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe23584032182147483647Registers0.3628602133815050.3627861342972510.99878303779460988.70201250.0002041532290447070.484539905338922 + +sendImageToPBO{200, 200, 1}{8, 8, 1}6228437.858686.46412800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe22048032322147483647Warps, Registers, Blocks0.9295353582240390.9292765103589850.99490908683303882.4813750.0002784701655124850.738158342349256 + +_vertexTransformAndAssembly{19, 1, 1}{128, 1, 1}6382768.25813.18412600PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1734876683591540.1622838365929960.814047151277014230.0394736842110.06457998929909040.110556126463275 + +_primitiveAssembly{99, 1, 1}{128, 1, 1}6523928.86645.37612800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1161986665325770.1138258229527090.89204005431093105.7525252525250.02042057495731090.58194043495837 + +initDepth{200, 200, 1}{8, 8, 1}6668118.242143.4561800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe25120321282147483647Warps, Blocks1.305637050736581.304526844037020.986909583456267240.0008503180106097210.77922992982104 + +_rasterize_tris{33, 1, 1}{128, 1, 1}6817917.378651.9040.187514200PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe41843201632147483647Registers0.4591792739083340.4590837849368250.45400748017899510744.51515151520.0002079557526553130.0924428655354529 + +render{200, 200, 1}{8, 8, 1}6976368.2581885.6960.56255300PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe23584032182147483647Registers0.3630221893971570.3629479749716840.99906820189793588.70201250.0002044349564332340.483385836019774 + +sendImageToPBO{200, 200, 1}{8, 8, 1}7126343.042691.77612800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe22048032322147483647Warps, Registers, Blocks0.9315029943880360.9312359797932610.99527977233908782.4813750.0002866492071240510.772856491247948 + +_vertexTransformAndAssembly{19, 1, 1}{128, 1, 1}7282935.68211.5212600PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1759903577246490.1646249023060480.801925545571245230.0394736842110.06457998929909040.111629441412581 + +_primitiveAssembly{99, 1, 1}{128, 1, 1}7423423.55438.91212800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1155588838003990.1132149944038630.889113765834195105.7525252525250.02028307404374780.581825974338674 + +initDepth{200, 200, 1}{8, 8, 1}7568577.538143.5521800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe25120321282147483647Warps, Blocks1.305240300136591.304114821876250.98702283423392240.0008622766706025060.779747015877279 + +_rasterize_tris{33, 1, 1}{128, 1, 1}7716841.378654.9120.187514200PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe41843201632147483647Registers0.4597903356225190.4596872347503110.45343135957755210741.21212121210.000224234535221040.0923711830600096 + +render{200, 200, 1}{8, 8, 1}7879058.9141886.1760.56255300PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe23584032182147483647Registers0.3625791803101650.3625066397592310.99914495756178288.70201250.0002000681640688570.481828407766568 + +sendImageToPBO{200, 200, 1}{8, 8, 1}8029371.01683.42412800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe22048032322147483647Warps, Registers, Blocks0.9262424220056490.9259687785439950.99563814745266482.4813750.0002954339546031640.7768287461036 + +_vertexTransformAndAssembly{19, 1, 1}{128, 1, 1}8186013.05814.20812600PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1763973044906280.1650055684542350.810727676180274230.0394736842110.06457998929909040.111806475108994 + +_primitiveAssembly{99, 1, 1}{128, 1, 1}8328090.40238.97612800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1157464943560420.113388188978510.885775613967767105.7525252525250.02037474560808440.583423653891078 + +initDepth{200, 200, 1}{8, 8, 1}8468803.906143.7121800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe25120321282147483647Warps, Blocks1.30731683091531.306188202671430.987170171185268240.000863316540551680.782551117262802 + +_rasterize_tris{33, 1, 1}{128, 1, 1}8618424.354654.6560.187514200PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe41843201632147483647Registers0.4631061241727190.4630078123596570.45037260301377410739.28030303030.000212287870816130.092216246879781 + +render{200, 200, 1}{8, 8, 1}8775178.051886.6240.56255300PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe23584032182147483647Registers0.3628802923012270.3628079470930030.99880204019311288.70201250.0001993638391473370.48478798899915 + +sendImageToPBO{200, 200, 1}{8, 8, 1}8923491.33687.23212800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe22048032322147483647Warps, Registers, Blocks0.9277659893274230.9274985002055010.99575318595032682.4813750.0002883152917855290.752960923920567 + +_vertexTransformAndAssembly{19, 1, 1}{128, 1, 1}9082697.0911.74412600PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1740531378921780.1628127881096280.817020467168835230.0394736842110.06457998929909040.110981528687737 + +_primitiveAssembly{99, 1, 1}{128, 1, 1}9221597.2538.68812800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1156789522108570.1133246739189260.896223898333859105.7525252525250.02035182932534860.582956583474044 + +initDepth{200, 200, 1}{8, 8, 1}9363803.778143.2321800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe25120321282147483647Warps, Blocks1.305882241011641.30477861576580.986798551502146240.0008451185039320910.778370307878584 + +_rasterize_tris{33, 1, 1}{128, 1, 1}9511359.234656.320.187514200PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe41843201632147483647Registers0.4602262126752170.4601168669229750.45508625615930910742.91666666670.0002375913175531340.0922243475722061 + +render{200, 200, 1}{8, 8, 1}9669576.5141884.9280.56255300PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe23584032182147483647Registers0.3635392696909310.3634633108605150.99893596827347688.70201250.0002089425730566370.482617185879635 + +sendImageToPBO{200, 200, 1}{8, 8, 1}9820362.722683.58412800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe22048032322147483647Warps, Registers, Blocks0.923725445247110.9234680753574560.99588814276515582.4813750.0002786216304622680.758441811288614 + +_vertexTransformAndAssembly{19, 1, 1}{128, 1, 1}9970328.4524.60812600PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1756331344265380.1642907484847060.813259457393963230.0394736842110.06457998929909040.111500828516587 + +_primitiveAssembly{99, 1, 1}{128, 1, 1}10113022.5338.52812800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1165949739642290.1142300694741840.893192837129979105.7525252525250.02028307404374780.58302719413925 + +initDepth{200, 200, 1}{8, 8, 1}10261029.186143.4561800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe25120321282147483647Warps, Blocks1.303871658073311.30274329239870.987135470388876240.0008653962739564640.774493731424646 + +_rasterize_tris{33, 1, 1}{128, 1, 1}10411357.922653.8560.187514200PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe41843201632147483647Registers0.4642290029768530.4641373672263430.44938443731643810743.99242424240.0001973934198903480.0922061909048464 + +render{200, 200, 1}{8, 8, 1}10570921.4421886.8160.56255300PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe23584032182147483647Registers0.3634113727033080.363337693013650.99907839476174688.70201250.0002027445897204690.482583280075321 + +sendImageToPBO{200, 200, 1}{8, 8, 1}10723421.282688.25612800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe22048032322147483647Warps, Registers, Blocks0.9282511715284110.9279843860904230.99525271187626982.4813750.0002874065190222360.762500189990782 + +_vertexTransformAndAssembly{19, 1, 1}{128, 1, 1}10887967.8111.5212600PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1761313304559250.1647567710198470.807503234152652230.0394736842110.06457998929909040.111228946861229 + +_primitiveAssembly{99, 1, 1}{128, 1, 1}11035042.08244.44812800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe44096016162147483647Warps, Registers0.1158046451899550.1134557709969580.89360625574977105.7525252525250.02028307404374780.582381305423004 + +initDepth{200, 200, 1}{8, 8, 1}11177136.546143.4881800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe25120321282147483647Warps, Blocks1.306698126999561.305558483240230.986513409447657240.0008721553477277130.777465583614206 + +_rasterize_tris{33, 1, 1}{128, 1, 1}11319894.274652.8640.187514200PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe41843201632147483647Registers0.4605885065234470.4604956391289950.4534399722016510743.71212121210.0002016276853141730.0922890749128351 + +render{200, 200, 1}{8, 8, 1}11479944.8341887.520.56255300PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe23584032182147483647Registers0.3632627098887340.3631884974629840.99902477709409988.70201250.0002042940927588170.485356409064251 + +sendImageToPBO{200, 200, 1}{8, 8, 1}11627693.218688.99212800PREFER_SHAREDOFFOFF0GeForce GTX 970M11cis565_rasterizer.exe22048032322147483647Warps, Registers, Blocks0.929390902155550.9291129501688690.99543152046242682.4813750.0002990689773659020.771455499632248 +
+ +
+
diff --git a/graphs/pipeline_timing.png b/graphs/pipeline_timing.png new file mode 100644 index 0000000..7a40c2e Binary files /dev/null and b/graphs/pipeline_timing.png differ diff --git a/images/AA.PNG b/images/AA.PNG new file mode 100644 index 0000000..b40e71b Binary files /dev/null and b/images/AA.PNG differ diff --git a/images/checkerboard.PNG b/images/checkerboard.PNG new file mode 100644 index 0000000..d20df39 Binary files /dev/null and b/images/checkerboard.PNG differ diff --git a/images/cow.PNG b/images/cow.PNG new file mode 100644 index 0000000..22cfcca Binary files /dev/null and b/images/cow.PNG differ diff --git a/images/duck.PNG b/images/duck.PNG new file mode 100644 index 0000000..1af70b8 Binary files /dev/null and b/images/duck.PNG differ diff --git a/images/duck.gif b/images/duck.gif new file mode 100644 index 0000000..c8330f5 Binary files /dev/null and b/images/duck.gif differ diff --git a/images/no_AA.PNG b/images/no_AA.PNG new file mode 100644 index 0000000..ff956ec Binary files /dev/null and b/images/no_AA.PNG differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a57f69f..ce4f1b6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -6,5 +6,5 @@ set(SOURCE_FILES cuda_add_library(src ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_50 ) diff --git a/src/rasterize.cu b/src/rasterize.cu index 1262a09..450f6e9 100644 --- a/src/rasterize.cu +++ b/src/rasterize.cu @@ -17,6 +17,7 @@ #include "rasterize.h" #include #include +#include namespace { @@ -43,10 +44,10 @@ namespace { glm::vec3 eyePos; // eye space position used for shading glm::vec3 eyeNor; // eye space normal used for shading, cuz normal will go wrong after perspective transformation - // glm::vec3 col; + glm::vec3 col; glm::vec2 texcoord0; TextureData* dev_diffuseTex = NULL; - // int texWidth, texHeight; + int texWidth, texHeight; // ... }; @@ -62,10 +63,11 @@ namespace { // The attributes listed below might be useful, // but always feel free to modify on your own - // glm::vec3 eyePos; // eye space position used for shading - // glm::vec3 eyeNor; - // VertexAttributeTexcoord texcoord0; - // TextureData* dev_diffuseTex; + glm::vec3 eyePos; // eye space position used for shading + glm::vec3 eyeNor; + VertexAttributeTexcoord texcoord0; + TextureData* dev_diffuseTex; + int texWidth, texHeight; // ... }; @@ -100,9 +102,10 @@ namespace { static std::map> mesh2PrimitivesMap; - static int width = 0; static int height = 0; +static int baseWidth = 0; +static int baseHeight = 0; static int totalNumPrimitives = 0; static Primitive *dev_primitives = NULL; @@ -110,21 +113,34 @@ static Fragment *dev_fragmentBuffer = NULL; static glm::vec3 *dev_framebuffer = NULL; static int * dev_depth = NULL; // you might need this buffer when doing depth test +static int * dev_mutex = NULL; // mutex buffer for locking depth buffer write /** * Kernel that writes the image to the OpenGL PBO directly. */ __global__ -void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) { +void sendImageToPBO(uchar4 *pbo, int w, int h, int antialias, glm::vec3 *image) { int x = (blockIdx.x * blockDim.x) + threadIdx.x; int y = (blockIdx.y * blockDim.y) + threadIdx.y; int index = x + (y * w); if (x < w && y < h) { glm::vec3 color; - color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0; - color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0; - color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0; + for (int i = 0; i < antialias; ++i) + { + int AA_x = x * antialias + i; + for (int j = 0; j < antialias; ++j) + { + int AA_y = y * antialias + j; + int AA_index = AA_x + AA_y * antialias * w; + color += image[AA_index]; + } + } + color /= (antialias * antialias); + + color.x = glm::clamp(color.x, 0.0f, 1.0f) * 255.0; + color.y = glm::clamp(color.y, 0.0f, 1.0f) * 255.0; + color.z = glm::clamp(color.z, 0.0f, 1.0f) * 255.0; // Each thread writes one pixel location in the texture (textel) pbo[index].w = 0; pbo[index].x = color.x; @@ -136,6 +152,36 @@ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) { /** * Writes fragment colors to the framebuffer */ + +__device__ +glm::vec3 getColor(TextureData* texture, int w, float u, float v) +{ + int idx = u + v * w; + return glm::vec3(texture[idx * 3], texture[idx * 3 + 1], texture[idx * 3 + 2]) / 255.f; +} + +__device__ +glm::vec3 bilinearInterpolate(Fragment& frag) +{ + // source: https://en.wikipedia.org/wiki/Bilinear_filtering + float u = frag.texcoord0.x * frag.texWidth - 0.5; + float v = frag.texcoord0.y * frag.texHeight - 0.5; + int u_min = glm::floor(u); + int v_min = glm::floor(v); + float u_alpha = u - u_min; + float v_alpha = v - v_min; + float u_opposite = 1.f - u_alpha; + float v_opposite = 1.f - v_alpha; + glm::vec3 col_00 = getColor(frag.dev_diffuseTex, frag.texWidth, u_min, v_min); + glm::vec3 col_10 = getColor(frag.dev_diffuseTex, frag.texWidth, u_min + 1, v_min); + glm::vec3 col_01 = getColor(frag.dev_diffuseTex, frag.texWidth, u_min, v_min + 1); + glm::vec3 col_11 = getColor(frag.dev_diffuseTex, frag.texWidth, u_min + 1, v_min + 1); + glm::vec3 interp_color = ( + (col_00 * u_opposite + col_10 * u_alpha) * v_opposite + + (col_01 * u_opposite + col_11 * u_alpha) * v_alpha); + return interp_color; +} + __global__ void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) { int x = (blockIdx.x * blockDim.x) + threadIdx.x; @@ -143,10 +189,17 @@ void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) { int index = x + (y * w); if (x < w && y < h) { - framebuffer[index] = fragmentBuffer[index].color; - - // TODO: add your fragment shader code here - + Fragment frag = fragmentBuffer[index]; +#if TEXTURE == 1 + if (frag.dev_diffuseTex != NULL) + { + frag.color = bilinearInterpolate(frag); + } +#endif // TEXTURE + framebuffer[index] = frag.color; +#if RAST_TRIS == 1 + framebuffer[index] *= glm::dot(frag.eyeNor, glm::normalize(glm::vec3(1.f) - frag.eyePos)); // Lambertian shading, light @(1,1,1) +#endif } } @@ -154,8 +207,10 @@ void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) { * Called once at the beginning of the program to allocate memory. */ void rasterizeInit(int w, int h) { - width = w; - height = h; + width = w * ANTIALIAS; + height = h * ANTIALIAS; + baseWidth = w; + baseHeight = h; cudaFree(dev_fragmentBuffer); cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment)); cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment)); @@ -165,6 +220,8 @@ void rasterizeInit(int w, int h) { cudaFree(dev_depth); cudaMalloc(&dev_depth, width * height * sizeof(int)); + cudaFree(dev_mutex); + cudaMalloc(&dev_mutex, width * height * sizeof(int)); checkCUDAError("rasterizeInit"); } @@ -621,8 +678,6 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { } - - __global__ void _vertexTransformAndAssembly( int numVertices, @@ -634,19 +689,30 @@ void _vertexTransformAndAssembly( int vid = (blockIdx.x * blockDim.x) + threadIdx.x; if (vid < numVertices) { - // TODO: Apply vertex transformation here // Multiply the MVP matrix for each vertex position, this will transform everything into clipping space // Then divide the pos by its w element to transform into NDC space // Finally transform x and y to viewport space + glm::vec4 clip_pos = MVP * glm::vec4(primitive.dev_position[vid], 1); // to clipping space + glm::vec4 ndc_pos = clip_pos / clip_pos[3]; // perspective divide + glm::vec4 ss_pos = glm::vec4((ndc_pos.x + 1.f) * width * 0.5, (1.f - ndc_pos.y) * height * 0.5, ndc_pos.z, ndc_pos[3]); // to screen space + // TODO: Apply vertex assembly here // Assemble all attribute arraies into the primitive array - + + primitive.dev_verticesOut[vid].pos = ss_pos; + primitive.dev_verticesOut[vid].eyePos = glm::vec3(MV * glm::vec4(primitive.dev_position[vid], 1)); + primitive.dev_verticesOut[vid].eyeNor = glm::normalize(MV_normal * primitive.dev_normal[vid]); + primitive.dev_verticesOut[vid].col = glm::normalize(glm::vec3(ss_pos)); +#if TEXTURE == 1 + primitive.dev_verticesOut[vid].texcoord0 = primitive.dev_texcoord0[vid]; + primitive.dev_verticesOut[vid].dev_diffuseTex = primitive.dev_diffuseTex; + primitive.dev_verticesOut[vid].texWidth = primitive.diffuseTexWidth; + primitive.dev_verticesOut[vid].texHeight = primitive.diffuseTexHeight; +#endif } } - - static int curPrimitiveBeginId = 0; __global__ @@ -657,28 +723,132 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_ if (iid < numIndices) { - // TODO: uncomment the following code for a start // This is primitive assembly for triangles - //int pid; // id for cur primitives vector - //if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) { - // pid = iid / (int)primitive.primitiveType; - // dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] - // = primitive.dev_verticesOut[primitive.dev_indices[iid]]; - //} + int pid; // id for cur primitives vector + if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) { + pid = iid / (int)primitive.primitiveType; + dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] + = primitive.dev_verticesOut[primitive.dev_indices[iid]]; + } + // other primitive types (point, line) + else if (primitive.primitiveMode == TINYGLTF_MODE_LINE) { + pid = iid / (int)primitive.primitiveType; + dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] + = primitive.dev_verticesOut[primitive.dev_indices[iid]]; + } + else if (primitive.primitiveMode == TINYGLTF_MODE_POINTS) { + pid = iid / (int)primitive.primitiveType; + dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] + = primitive.dev_verticesOut[primitive.dev_indices[iid]]; + } + } + +} + +__global__ +void _rasterize_tris(int numPrimitives, Primitive* primitives, Fragment *fragmentBuffer, int *depth, int *mutex, int w, int h) +{ + int pid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (pid > numPrimitives) return; + + Primitive prim = primitives[pid]; + VertexOut v0 = prim.v[0]; + VertexOut v1 = prim.v[1]; + VertexOut v2 = prim.v[2]; + + // this primitive's triangle + glm::vec3 tri[3] = {glm::vec3(v0.pos), glm::vec3(v1.pos), glm::vec3(v2.pos)}; + + // this primitive's bounding box + AABB boundingBox = getAABBForTriangle(tri); + // bounding box limits + // clamp to window + int xMin = glm::min(w - 1, glm::max(0, (int)boundingBox.min.x)); + int yMin = glm::min(h - 1, glm::max(0, (int)boundingBox.min.y)); + int xMax = glm::max(0, glm::min(w - 1, (int)boundingBox.max.x)); + int yMax = glm::max(0, glm::min(h - 1, (int)boundingBox.max.y)); - // TODO: other primitive types (point, line) + // loop over all pixels in the bounding box + for (int i = xMin; i <= xMax; ++i) + { + for (int j = yMin; j <= yMax; ++j) + { + // this pixel in barycentric coordinates wrt this primitive + glm::vec3 barycentricCoordinate = calculateBarycentricCoordinate(tri, glm::vec2(i, j)); + + // is this pixel in this primitive? + if (isBarycentricCoordInBounds(barycentricCoordinate)) + { + int fragIdx = j * w + i; + // read/write from/to depth & frame buffer using mutex buffer + bool isSet; + do { + isSet = (atomicCAS(&mutex[fragIdx], 0, 1) == 0); + if (isSet) { + int d = -getZAtCoordinate(barycentricCoordinate, tri) * INT_MAX; + // if this fragment is shallower than previous fragment, draw + if (d < depth[fragIdx]) + { + depth[fragIdx] = d; + Fragment& frag = fragmentBuffer[fragIdx]; + glm::mat3 verts = glm::mat3(v0.eyePos, v1.eyePos, v2.eyePos); + glm::mat3 norms = glm::mat3(v0.eyeNor, v1.eyeNor, v2.eyeNor); + // interpolated fragment position and normal + frag.eyePos = verts * barycentricCoordinate; + frag.eyeNor = norms * barycentricCoordinate; + // interpolated fragment color + glm::vec3 col_0 = v0.col; + glm::vec3 col_1 = v1.col; + glm::vec3 col_2 = v2.col; + glm::mat3 cols = glm::mat3(col_0, col_1, col_2); + frag.color = cols * barycentricCoordinate; +#if TEXTURE == 1 + // fragment texture + frag.dev_diffuseTex = v0.dev_diffuseTex; + frag.texWidth = v0.texWidth; + frag.texHeight = v0.texHeight; + // perspective-correct depth + float persp_z = 1.f / ( + barycentricCoordinate[0] / v0.eyePos.z + + barycentricCoordinate[1] / v1.eyePos.z + + barycentricCoordinate[2] / v2.eyePos.z + ); + frag.texcoord0 = persp_z * ( + barycentricCoordinate[0] * v0.texcoord0 / v0.eyePos.z + + barycentricCoordinate[1] * v1.texcoord0 / v1.eyePos.z + + barycentricCoordinate[2] * v2.texcoord0 / v2.eyePos.z); +#endif // TEXTURE + } + } + if (isSet) { + mutex[fragIdx] = 0; + } + } while (!isSet); + } + } } - } +__global__ +void _rasterize_lines(int numPrimitives, Primitive* primitives, Fragment *fragmentBuffer, int *depth, int *mutex, int w, int h) +{ + +} + +__global__ +void _rasterize_points(int numPrimitives, Primitive* primitives, Fragment *fragmentBuffer, int *depth, int *mutex, int w, int h) +{ + +} /** * Perform rasterization. */ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) { + int sideLength2d = 8; dim3 blockSize2d(sideLength2d, sideLength2d); dim3 blockCount2d((width - 1) / blockSize2d.x + 1, @@ -702,10 +872,10 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); - _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height); + _vertexTransformAndAssembly <<< numBlocksForVertices, numThreadsPerBlock >>>(p->numVertices, *p, MVP, MV, MV_normal, width, height); checkCUDAError("Vertex Processing"); cudaDeviceSynchronize(); - _primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> > + _primitiveAssembly <<>> (p->numIndices, curPrimitiveBeginId, dev_primitives, @@ -720,17 +890,28 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g } cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment)); - initDepth << > >(width, height, dev_depth); + initDepth <<>>(width, height, dev_depth); // TODO: rasterize - - + dim3 numThreadsPerBlock(128); + dim3 numBlocksForPrimitives((totalNumPrimitives + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); + +#if RAST_TRIS == 1 + _rasterize_tris <<>> (totalNumPrimitives, dev_primitives, dev_fragmentBuffer, dev_depth, dev_mutex, width, height); +#endif +#if RAST_LINES == 1 + _rasterize_lines << > > (totalNumPrimitives, dev_primitives, dev_fragmentBuffer, dev_depth, dev_mutex, width, height); +#endif +#if RAST_POINTS == 1 + _rasterize_points << > > (totalNumPrimitives, dev_primitives, dev_fragmentBuffer, dev_depth, dev_mutex, width, height); +#endif // Copy depthbuffer colors into framebuffer - render << > >(width, height, dev_fragmentBuffer, dev_framebuffer); + render <<>>(width, height, dev_fragmentBuffer, dev_framebuffer); checkCUDAError("fragment shader"); + // Copy framebuffer into OpenGL buffer for OpenGL previewing - sendImageToPBO<<>>(pbo, width, height, dev_framebuffer); + sendImageToPBO <<>>(pbo, baseWidth, baseHeight, ANTIALIAS, dev_framebuffer); checkCUDAError("copy render result to pbo"); } @@ -772,5 +953,8 @@ void rasterizeFree() { cudaFree(dev_depth); dev_depth = NULL; + cudaFree(dev_mutex); + dev_mutex = NULL; + checkCUDAError("rasterize Free"); } diff --git a/src/rasterize.h b/src/rasterize.h index 560aae9..de3a1cd 100644 --- a/src/rasterize.h +++ b/src/rasterize.h @@ -11,6 +11,15 @@ #include #include #include +#include +#include +#include + +#define RAST_TRIS 1 +#define RAST_LINES 0 +#define RAST_POINTS 0 +#define TEXTURE 1 +#define ANTIALIAS 2 namespace tinygltf{ class Scene;