diff --git a/README.md b/README.md
index 41b91f0..337e251 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,58 @@
 CUDA Rasterizer
 ===============
 
-[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md)
-
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Ishan Ranade
+* Tested on personal computer: Gigabyte Aero 14, Windows 10, i7-7700HQ, GTX 1060
+
+## Demo
+
+![](renders/demo.gif)
+
+## Introduction
+
+I implemented a GPU-based rasterization pipeline with a number of additional features. As a brief overview, the pipeline consists of several stages: vertex transformation of a set of user-defined vertices, primitive assembly to arrange those vertices into shapes, rasterization with a depth test that keeps only the closest fragment at each pixel, fragment shading to determine the final color of each pixel, and finally rendering to the screen. The extra features I chose to implement were supersampled antialiasing, UV texture mapping with bilinear texture filtering and perspective-correct texture coordinates, correct color interpolation between points on a primitive, and backface culling.
+
+## Features
+
+![](renders/truck.JPG)
+
+![](renders/texture1.JPG)
+
+- UV texture mapping with bilinear texture filtering and perspective-correct texture coordinates (see the sketch below)
+
+<br>
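+
+The bilinear lookup works roughly as in the sketch below: the interpolated UV is scaled to texel space, the four surrounding texels are fetched, and they are blended by the fractional offsets. This is an illustrative standalone helper (the hypothetical `fetchTexel`/`sampleBilinear` names are not from the code; the real logic is inlined in the rasterization kernel in `src/rasterize.cu`), assuming tightly packed 3-byte RGB texels of type `TextureData`.
+
+```cuda
+__device__ glm::vec3 fetchTexel(const TextureData *tex, int texWidth, int x, int y) {
+    // 3 bytes (R, G, B) per texel, rows tightly packed
+    int offset = (x + y * texWidth) * 3;
+    return glm::vec3(tex[offset], tex[offset + 1], tex[offset + 2]);
+}
+
+__device__ glm::vec3 sampleBilinear(const TextureData *tex, int texWidth, int texHeight, glm::vec2 uv) {
+    // Convert normalized UV to texel space and split into integer / fractional parts
+    float u = uv.x * texWidth;
+    float v = uv.y * texHeight;
+    int u0 = glm::floor(u);
+    int v0 = glm::floor(v);
+    float uFract = u - u0;
+    float vFract = v - v0;
+
+    // Blend the four neighboring texels horizontally, then vertically,
+    // and rescale from [0, 255] bytes to [0, 1] color
+    glm::vec3 top    = glm::mix(fetchTexel(tex, texWidth, u0,     v0),
+                                fetchTexel(tex, texWidth, u0 + 1, v0),     uFract);
+    glm::vec3 bottom = glm::mix(fetchTexel(tex, texWidth, u0,     v0 + 1),
+                                fetchTexel(tex, texWidth, u0 + 1, v0 + 1), uFract);
+    return glm::mix(top, bottom, vFract) / 255.f;
+}
+```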
+
+![](renders/cow.JPG)
+
+- Correct color interpolation between points on a primitive (see the sketch below)
+
+<br>
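+
+Color (and normal) interpolation uses the standard perspective-correct barycentric scheme: attributes are weighted by 1/z in eye space and rescaled by the perspective-correct depth. The sketch below mirrors the `perspectiveCorrectZ` and `perspectiveCorrectInterpolation` helpers added in `src/rasterize.cu` (the combined function name here is illustrative):
+
+```cuda
+// Interpolate per-vertex values across a triangle with perspective correction.
+// eyePos: eye-space positions of the three vertices
+// values: per-vertex attributes (e.g. colors or normals)
+// bary:   barycentric weights of the fragment in screen space
+__device__ glm::vec3 interpolatePerspectiveCorrect(const glm::vec3 eyePos[3],
+                                                   const glm::vec3 values[3],
+                                                   const glm::vec3 &bary) {
+    // Perspective-correct depth: 1 / sum(bary_i / z_i)
+    float z = 1.f / (bary.x / eyePos[0].z + bary.y / eyePos[1].z + bary.z / eyePos[2].z);
+
+    // Weight each attribute by bary_i / z_i, then rescale by the corrected depth
+    glm::vec3 sum = bary.x * values[0] / eyePos[0].z
+                  + bary.y * values[1] / eyePos[1].z
+                  + bary.z * values[2] / eyePos[2].z;
+    return sum * z;
+}
+```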
+
+![](renders/aa2.JPG)
+
+- Supersampled antialiasing (see the sketch below)
+
+<br>
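+
+Supersampled antialiasing renders into a framebuffer that is `antialiasing` times larger in each dimension (2x by default, so four fragments per final pixel) and averages each block of subpixels when writing to the PBO. A minimal sketch of the resolve step, in the spirit of the modified `sendImageToPBO` kernel (the helper name is illustrative):
+
+```cuda
+// Average an antialiasing x antialiasing block of supersampled pixels into
+// one output pixel. ssWidth is the width of the supersampled framebuffer.
+__device__ glm::vec3 resolveSupersample(const glm::vec3 *image, int x, int y,
+                                        int antialiasing, int ssWidth) {
+    glm::vec3 color(0.f);
+    for (int i = 0; i < antialiasing; ++i) {
+        for (int j = 0; j < antialiasing; ++j) {
+            int idx = (antialiasing * x + i) + (antialiasing * y + j) * ssWidth;
+            color += glm::clamp(image[idx], 0.f, 1.f);
+        }
+    }
+    return color / (float)(antialiasing * antialiasing);
+}
+```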
+
+![](renders/noaa.JPG)
+
+- No antialiasing (for comparison)
+
+<br>
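+
+Backface culling, whose effect on runtime is measured in the performance analysis below, is done before rasterization by partitioning the primitive list with `thrust::partition` and a predicate like the `cullpredicate` functor added in `src/rasterize.cu`. A sketch (functor name changed for clarity):
+
+```cuda
+// Keep only primitives whose eye-space face normal points toward the camera;
+// the small -0.1 tolerance keeps triangles that are nearly edge-on.
+struct IsFrontFacing {
+    __device__ bool operator()(const Primitive &p) const {
+        glm::vec3 normal = glm::cross(p.v[1].eyePos - p.v[0].eyePos,
+                                      p.v[2].eyePos - p.v[0].eyePos);
+        return glm::dot(normal, glm::vec3(0, 0, 1)) > -0.1f;
+    }
+};
+
+// Usage, keeping the front-facing primitives at the start of the array:
+// Primitive *end = thrust::partition(thrust::device, dev_primitives_copy,
+//                                    dev_primitives_copy + totalNumPrimitives, IsFrontFacing());
+```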
+ +## Performance Analysis + +The rasterization step of the pipeline seemed to take the longest time, and this is because of the heavy computation in calculating which pixels overlap which triangles. I implemented backface culling to try to reduce the computation time, and the graphs can be seen below comparing runtimes with and without this optimization. + +Antialiasing also increased the runtime as the size of the PBO quadrupled in size so the computations became 4 times as expensive. This step was necessary though because without it you can easily see jagged edges along diagonal lines as can be seen in the above images. + +![](renders/graph1.JPG) + +![](renders/graph2.JPG) -### (TODO: Your README) -*DO NOT* leave the README to the last minute! It is a crucial part of the -project, and we will not be able to grade you without a good README. ### Credits diff --git a/graphs.pptx b/graphs.pptx new file mode 100644 index 0000000..4e108b6 Binary files /dev/null and b/graphs.pptx differ diff --git a/renders/aa.JPG b/renders/aa.JPG new file mode 100644 index 0000000..a60e8d4 Binary files /dev/null and b/renders/aa.JPG differ diff --git a/renders/aa2.JPG b/renders/aa2.JPG new file mode 100644 index 0000000..ed8aca1 Binary files /dev/null and b/renders/aa2.JPG differ diff --git a/renders/cow.JPG b/renders/cow.JPG new file mode 100644 index 0000000..7d0bab5 Binary files /dev/null and b/renders/cow.JPG differ diff --git a/renders/demo.gif b/renders/demo.gif new file mode 100644 index 0000000..b5c3010 Binary files /dev/null and b/renders/demo.gif differ diff --git a/renders/graph1.JPG b/renders/graph1.JPG new file mode 100644 index 0000000..bf067ec Binary files /dev/null and b/renders/graph1.JPG differ diff --git a/renders/graph2.JPG b/renders/graph2.JPG new file mode 100644 index 0000000..2e81e22 Binary files /dev/null and b/renders/graph2.JPG differ diff --git a/renders/noaa.JPG b/renders/noaa.JPG new file mode 100644 index 0000000..8e9ab25 Binary files /dev/null and b/renders/noaa.JPG differ diff --git a/renders/texture1.JPG b/renders/texture1.JPG new file mode 100644 index 0000000..fabd6ba Binary files /dev/null and b/renders/texture1.JPG differ diff --git a/renders/truck.JPG b/renders/truck.JPG new file mode 100644 index 0000000..6039beb Binary files /dev/null and b/renders/truck.JPG differ diff --git a/shaders/README.md b/shaders/README.md new file mode 100644 index 0000000..e69de29 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a57f69f..d9247c3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -6,5 +6,5 @@ set(SOURCE_FILES cuda_add_library(src ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_60 ) diff --git a/src/rasterize.cu b/src/rasterize.cu index 1262a09..cf71b33 100644 --- a/src/rasterize.cu +++ b/src/rasterize.cu @@ -17,6 +17,9 @@ #include "rasterize.h" #include #include +#include +#include +#include namespace { @@ -28,7 +31,7 @@ namespace { typedef unsigned char BufferByte; - enum PrimitiveType{ + enum PrimitiveType { Point = 1, Line = 2, Triangle = 3 @@ -41,12 +44,12 @@ namespace { // The attributes listed below might be useful, // but always feel free to modify on your own - glm::vec3 eyePos; // eye space position used for shading - glm::vec3 eyeNor; // eye space normal used for shading, cuz normal will go wrong after perspective transformation - // glm::vec3 col; - glm::vec2 texcoord0; - TextureData* dev_diffuseTex = NULL; - // int texWidth, texHeight; + glm::vec3 eyePos; // eye space position used for shading + glm::vec3 eyeNor; // eye space normal 
used for shading, cuz normal will go wrong after perspective transformation + glm::vec3 color; + glm::vec2 texcoord0; + TextureData* dev_diffuseTex = NULL; + int texWidth, texHeight; // ... }; @@ -62,10 +65,8 @@ namespace { // The attributes listed below might be useful, // but always feel free to modify on your own - // glm::vec3 eyePos; // eye space position used for shading - // glm::vec3 eyeNor; - // VertexAttributeTexcoord texcoord0; - // TextureData* dev_diffuseTex; + glm::vec3 eyePos; // eye space position used for shading + glm::vec3 eyeNor; // ... }; @@ -100,77 +101,117 @@ namespace { static std::map> mesh2PrimitivesMap; - -static int width = 0; -static int height = 0; - static int totalNumPrimitives = 0; static Primitive *dev_primitives = NULL; +static Primitive *dev_primitives_copy = NULL; static Fragment *dev_fragmentBuffer = NULL; static glm::vec3 *dev_framebuffer = NULL; -static int * dev_depth = NULL; // you might need this buffer when doing depth test +static float * dev_depth = NULL; // you might need this buffer when doing depth test + +static int *mutex; + +static int width = 0; +static int height = 0; +static int originalWidth = 0; +static int originalHeight = 0; +static int antialiasing = 2; /** * Kernel that writes the image to the OpenGL PBO directly. */ -__global__ -void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) { - int x = (blockIdx.x * blockDim.x) + threadIdx.x; - int y = (blockIdx.y * blockDim.y) + threadIdx.y; - int index = x + (y * w); - - if (x < w && y < h) { - glm::vec3 color; - color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0; - color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0; - color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0; - // Each thread writes one pixel location in the texture (textel) - pbo[index].w = 0; - pbo[index].x = color.x; - pbo[index].y = color.y; - pbo[index].z = color.z; - } +__global__ +void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image, int antialiasing, int width) { + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + int index = x + (y * w); + + if (x < w && y < h) { + + glm::vec3 color; + for (int i = 0; i < antialiasing; ++i) { + for (int j = 0; j < antialiasing; ++j) { + int tempIndex = ((antialiasing * x) + i) + (((antialiasing * y) + j) * width); + color.x += glm::clamp(image[tempIndex].x, 0.0f, 1.0f) * 255.0; + color.y += glm::clamp(image[tempIndex].y, 0.0f, 1.0f) * 255.0; + color.z += glm::clamp(image[tempIndex].z, 0.0f, 1.0f) * 255.0; + } + } + + color /= antialiasing * antialiasing; + + // Each thread writes one pixel location in the texture (textel) + pbo[index].w = 0; + pbo[index].x = color.x; + pbo[index].y = color.y; + pbo[index].z = color.z; + } } -/** +/** * Writes fragment colors to the framebuffer */ __global__ void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) { - int x = (blockIdx.x * blockDim.x) + threadIdx.x; - int y = (blockIdx.y * blockDim.y) + threadIdx.y; - int index = x + (y * w); + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + int index = x + (y * w); + + const int numLights = 2; + glm::vec3 lights[numLights] = { glm::normalize(glm::vec3(-1,-1,-1)), glm::normalize(glm::vec3(1,1,1)) }; + + if (x < w && y < h) { + framebuffer[index] = glm::vec3(0, 0, 0); - if (x < w && y < h) { - framebuffer[index] = fragmentBuffer[index].color; + for (int i = 0; i < numLights; ++i) { + float lambert = 
glm::clamp(glm::dot(fragmentBuffer[index].eyeNor, lights[i]), 0.f, 1.f); + + framebuffer[index] += fragmentBuffer[index].color * lambert; + } // TODO: add your fragment shader code here - } + } } +struct cullpredicate +{ + __device__ + bool operator()(const Primitive &primitive) + { + glm::vec3 normal = glm::cross(primitive.v[1].eyePos - primitive.v[0].eyePos, primitive.v[2].eyePos - primitive.v[0].eyePos); + + return glm::dot(normal, glm::vec3(0,0,1)) > -0.1; + } +}; + /** * Called once at the beginning of the program to allocate memory. */ void rasterizeInit(int w, int h) { - width = w; - height = h; + originalWidth = w; + originalHeight = h; + width = antialiasing * originalWidth; + height = antialiasing * originalHeight; + cudaFree(dev_fragmentBuffer); cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment)); cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment)); - cudaFree(dev_framebuffer); - cudaMalloc(&dev_framebuffer, width * height * sizeof(glm::vec3)); - cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3)); - + cudaFree(dev_framebuffer); + cudaMalloc(&dev_framebuffer, width * height * sizeof(glm::vec3)); + cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3)); + cudaFree(dev_depth); - cudaMalloc(&dev_depth, width * height * sizeof(int)); + cudaMalloc(&dev_depth, width * height * sizeof(float)); + + cudaMalloc((void **)&mutex, width * height * sizeof(int)); + cudaMemset(mutex, 0, width * height * sizeof(int)); checkCUDAError("rasterizeInit"); } __global__ -void initDepth(int w, int h, int * depth) +void initDepth(int w, int h, float * depth) { int x = (blockIdx.x * blockDim.x) + threadIdx.x; int y = (blockIdx.y * blockDim.y) + threadIdx.y; @@ -178,7 +219,7 @@ void initDepth(int w, int h, int * depth) if (x < w && y < h) { int index = x + (y * w); - depth[index] = INT_MAX; + depth[index] = 1.0; } } @@ -187,9 +228,9 @@ void initDepth(int w, int h, int * depth) * kern function with support for stride to sometimes replace cudaMemcpy * One thread is responsible for copying one component */ -__global__ +__global__ void _deviceBufferCopy(int N, BufferByte* dev_dst, const BufferByte* dev_src, int n, int byteStride, int byteOffset, int componentTypeByteSize) { - + // Attribute (vec3 position) // component (3 * float) // byte (4 * byte) @@ -202,20 +243,20 @@ void _deviceBufferCopy(int N, BufferByte* dev_dst, const BufferByte* dev_src, in int offset = i - count * n; // which component of the attribute for (int j = 0; j < componentTypeByteSize; j++) { - - dev_dst[count * componentTypeByteSize * n - + offset * componentTypeByteSize + + dev_dst[count * componentTypeByteSize * n + + offset * componentTypeByteSize + j] - = + = - dev_src[byteOffset - + count * (byteStride == 0 ? componentTypeByteSize * n : byteStride) - + offset * componentTypeByteSize + dev_src[byteOffset + + count * (byteStride == 0 ? 
componentTypeByteSize * n : byteStride) + + offset * componentTypeByteSize + j]; } } - + } @@ -235,7 +276,7 @@ void _nodeMatrixTransform( } glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) { - + glm::mat4 curMatrix(1.0); const std::vector &m = n.matrix; @@ -247,7 +288,8 @@ glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) { curMatrix[i][j] = (float)m.at(4 * i + j); } } - } else { + } + else { // no matrix, use rotation, scale, translation if (n.translation.size() > 0) { @@ -275,12 +317,12 @@ glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) { return curMatrix; } -void traverseNode ( +void traverseNode( std::map & n2m, const tinygltf::Scene & scene, const std::string & nodeString, const glm::mat4 & parentMatrix - ) +) { const tinygltf::Node & n = scene.nodes.at(nodeString); glm::mat4 M = parentMatrix * getMatrixFromNodeMatrixVector(n); @@ -537,7 +579,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { size_t s = image.image.size() * sizeof(TextureData); cudaMalloc(&dev_diffuseTex, s); cudaMemcpy(dev_diffuseTex, &image.image.at(0), s, cudaMemcpyHostToDevice); - + diffuseTexWidth = image.width; diffuseTexHeight = image.height; @@ -554,7 +596,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { // ---------Node hierarchy transform-------- cudaDeviceSynchronize(); - + dim3 numBlocksNodeTransform((numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); _nodeMatrixTransform << > > ( numVertices, @@ -584,7 +626,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { diffuseTexHeight, dev_vertexOut //VertexOut - }); + }); totalNumPrimitives += numPrimitives; @@ -595,21 +637,23 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { } // for each node } - + + printf("Num primitives: %u\n", totalNumPrimitives); // 3. 
Malloc for dev_primitives { cudaMalloc(&dev_primitives, totalNumPrimitives * sizeof(Primitive)); + cudaMalloc(&dev_primitives_copy, totalNumPrimitives * sizeof(Primitive)); } - + // Finally, cudaFree raw dev_bufferViews { std::map::const_iterator it(bufferViewDevPointers.begin()); std::map::const_iterator itEnd(bufferViewDevPointers.end()); - - //bufferViewDevPointers + + //bufferViewDevPointers for (; it != itEnd; it++) { cudaFree(it->second); @@ -623,11 +667,11 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { -__global__ +__global__ void _vertexTransformAndAssembly( - int numVertices, - PrimitiveDevBufPointers primitive, - glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal, + int numVertices, + PrimitiveDevBufPointers primitive, + glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal, int width, int height) { // vertex id @@ -638,18 +682,45 @@ void _vertexTransformAndAssembly( // Multiply the MVP matrix for each vertex position, this will transform everything into clipping space // Then divide the pos by its w element to transform into NDC space // Finally transform x and y to viewport space + glm::vec3 &position = primitive.dev_position[vid]; + glm::vec3 &normal = primitive.dev_normal[vid]; + + VertexOut &vertex = primitive.dev_verticesOut[vid]; + + glm::vec4 projected = MVP * glm::vec4(position, 1); + projected /= projected.w; + + vertex.pos = glm::vec4((projected.x + 1.f) * width * 0.5f, (1.f - projected.y) * height * 0.5f, (projected.z + 1.f) * 0.5f, 1.0); + vertex.eyePos = glm::vec3(MV * glm::vec4(position, 1)); + vertex.eyeNor = glm::normalize(MV_normal * normal); + + // Give the vertex a random color or texture color + if (primitive.dev_diffuseTex == NULL) { + vertex.dev_diffuseTex = NULL; + thrust::default_random_engine rng = thrust::default_random_engine(utilhash(vid + 11)); + thrust::uniform_real_distribution u01(0, 1); + vertex.color = glm::vec3(1.0, u01(rng), u01(rng)); + } + else { + vertex.dev_diffuseTex = primitive.dev_diffuseTex; + vertex.texcoord0 = primitive.dev_texcoord0[vid]; + vertex.texWidth = primitive.diffuseTexWidth; + vertex.texHeight = primitive.diffuseTexHeight; + } // TODO: Apply vertex assembly here // Assemble all attribute arraies into the primitive array - + } } + + static int curPrimitiveBeginId = 0; -__global__ +__global__ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_primitives, PrimitiveDevBufPointers primitive) { // index id @@ -660,30 +731,144 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_ // TODO: uncomment the following code for a start // This is primitive assembly for triangles - //int pid; // id for cur primitives vector - //if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) { - // pid = iid / (int)primitive.primitiveType; - // dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] - // = primitive.dev_verticesOut[primitive.dev_indices[iid]]; - //} + int pid; // id for cur primitives vector + if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) { + pid = iid / (int)primitive.primitiveType; + dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] + = primitive.dev_verticesOut[primitive.dev_indices[iid]]; + } // TODO: other primitive types (point, line) } - + +} + +__host__ __device__ static +float perspectiveCorrectZ(const glm::vec3 vertices[3], const glm::vec3 &barycentric) { + float sum = (barycentric[0] / vertices[0][2]) + (barycentric[1] / vertices[1][2]) + (barycentric[2] / vertices[2][2]); + return 1.0 / 
sum; } +__host__ __device__ static +glm::vec3 perspectiveCorrectInterpolation(const glm::vec3 vertices[3], const float &z, const glm::vec3 values[3], const glm::vec3 &barycentric) { + glm::vec3 sum = (barycentric[0] * values[0] / vertices[0][2]) + + (barycentric[1] * values[1] / vertices[1][2]) + + (barycentric[2] * values[2] / vertices[2][2]); + return sum * z; +} + +/** +* Rasterization +*/ +/** + * Rasterization kernel. + */ +__global__ +void rasterizeKernel(int numPrimitives, int width, int height, Fragment *fragmentBuffer, Primitive* dev_primitives, float* depth, int* mutex) { + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + int idx = x + (y * width); + + if (idx < numPrimitives) { + Primitive &primitive = dev_primitives[idx]; + glm::vec3 tri[3] = { glm::vec3(primitive.v[0].pos), glm::vec3(primitive.v[1].pos), glm::vec3(primitive.v[2].pos) }; + + AABB aabb = getAABBForTriangle(tri); + aabb.min.x = glm::min((float)width - 1, glm::max(0.f, aabb.min.x)); + aabb.min.y = glm::min((float)height - 1, glm::max(0.f, aabb.min.y)); + aabb.max.x = glm::max(0.f, glm::min((float)width - 1, aabb.max.x)); + aabb.max.y = glm::max(0.f, glm::min((float)height - 1, aabb.max.y)); + + for (int col = aabb.min.x; col <= aabb.max.x; ++col) { + for (int row = aabb.min.y; row <= aabb.max.y; ++row) { + glm::vec2 point = glm::vec2(col, row); + int fragmentIndex = glm::min(width*height - 1, glm::max(0, col + (row * width))); + + glm::vec3 bary = calculateBarycentricCoordinate(tri, point); + + if (isBarycentricCoordInBounds(bary)) { + bool isSet; + do { + isSet = (atomicCAS(&mutex[fragmentIndex], 0, 1) == 0); + if (isSet) { + Fragment &fragment = fragmentBuffer[fragmentIndex]; + + // Only set this fragments attributes if closest depth + float fragmentDepth = getZAtCoordinate(bary, tri); + + if (fragmentDepth < depth[fragmentIndex]) { + depth[fragmentIndex] = fragmentDepth; + + // Perspective correct z + glm::vec3 eyeTri[3] = { glm::vec3(primitive.v[0].eyePos), glm::vec3(primitive.v[1].eyePos), glm::vec3(primitive.v[2].eyePos) }; + float perspZ = perspectiveCorrectZ (eyeTri, bary); + + // Calculate normal + glm::vec3 normals[3] = { primitive.v[0].eyeNor, primitive.v[1].eyeNor, primitive.v[2].eyeNor }; + fragment.eyeNor = glm::normalize( perspectiveCorrectInterpolation (eyeTri, perspZ, normals, bary)); + + if (primitive.v[0].dev_diffuseTex) { + glm::vec3 uv[3] = { glm::vec3(primitive.v[0].texcoord0, 0), glm::vec3(primitive.v[1].texcoord0, 0), glm::vec3(primitive.v[2].texcoord0, 0) }; + + glm::vec2 final_uv = bary[0] * primitive.v[0].texcoord0 + bary[1] * primitive.v[1].texcoord0 + bary[2] * primitive.v[2].texcoord0; + float u = final_uv.x * primitive.v[0].texWidth; + float v = final_uv.y * primitive.v[0].texHeight; + + int uInt = glm::floor(u); + int vInt = glm::floor(v); + + TextureData* texture = primitive.v[0].dev_diffuseTex; + + float u_fract = u - glm::floor(u); + float v_fract = v - glm::floor(v); + + int col_00_offset = (uInt + (vInt * primitive.v[0].texWidth)) * 3; + glm::vec3 col_00 = glm::vec3(texture[col_00_offset], texture[col_00_offset + 1], texture[col_00_offset + 2]); + + int col_10_offset = (uInt + 1 + (vInt * primitive.v[0].texWidth)) * 3; + glm::vec3 col_10 = glm::vec3(texture[col_10_offset], texture[col_10_offset + 1], texture[col_10_offset + 2]); + + int col_01_offset = (uInt + ((vInt + 1) * primitive.v[0].texWidth)) * 3; + glm::vec3 col_01 = glm::vec3(texture[col_01_offset], texture[col_01_offset + 1], texture[col_01_offset + 2]); + + int 
col_11_offset = (uInt + 1 + ((vInt + 1) * primitive.v[0].texWidth)) * 3; + glm::vec3 col_11 = glm::vec3(texture[col_11_offset], texture[col_11_offset + 1], texture[col_11_offset + 2]); + + glm::vec3 col_interp1 = glm::mix(col_00, col_10, u_fract); + glm::vec3 col_interp2 = glm::mix(col_01, col_11, u_fract); + + fragment.color = glm::mix(col_interp1, col_interp2, v_fract) / 255.f; + } + else { + glm::vec3 colors[3] = { primitive.v[0].color, primitive.v[1].color, primitive.v[2].color }; + fragment.color = perspectiveCorrectInterpolation (eyeTri, perspZ, colors, bary); + } + } + } + if (isSet) { + mutex[fragmentIndex] = 0; + } + } while (!isSet); + } + } + } + } +} /** * Perform rasterization. */ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) { - int sideLength2d = 8; - dim3 blockSize2d(sideLength2d, sideLength2d); - dim3 blockCount2d((width - 1) / blockSize2d.x + 1, + int sideLength2d = 8; + dim3 blockSize2d(sideLength2d, sideLength2d); + dim3 blockCount2d((width - 1) / blockSize2d.x + 1, (height - 1) / blockSize2d.y + 1); + cudaMemcpy(dev_primitives_copy, dev_primitives, totalNumPrimitives * sizeof(Primitive), cudaMemcpyDeviceToDevice); + + // Execute your rasterization pipeline here // (See README for rasterization pipeline outline.) @@ -702,14 +887,14 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x); - _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height); + _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> > (p->numVertices, *p, MVP, MV, MV_normal, width, height); checkCUDAError("Vertex Processing"); cudaDeviceSynchronize(); _primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> > - (p->numIndices, - curPrimitiveBeginId, - dev_primitives, - *p); + (p->numIndices, + curPrimitiveBeginId, + dev_primitives_copy, + *p); checkCUDAError("Primitive Assembly"); curPrimitiveBeginId += p->numPrimitives; @@ -718,20 +903,27 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g checkCUDAError("Vertex Processing and Primitive Assembly"); } - + cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment)); - initDepth << > >(width, height, dev_depth); - - // TODO: rasterize + initDepth << > > (width, height, dev_depth); + // Cull all the primitives facing away + Primitive *end = thrust::partition(thrust::device, dev_primitives_copy, dev_primitives_copy + totalNumPrimitives, cullpredicate()); + cudaDeviceSynchronize(); - // Copy depthbuffer colors into framebuffer - render << > >(width, height, dev_fragmentBuffer, dev_framebuffer); + int primitivesLeft = end - dev_primitives_copy; + + // TODO: rasterize + rasterizeKernel << > > (primitivesLeft, width, height, dev_fragmentBuffer, dev_primitives_copy, dev_depth, mutex); + + + // Copy depthbuffer colors into framebuffer + render << > > (width, height, dev_fragmentBuffer, dev_framebuffer); checkCUDAError("fragment shader"); - // Copy framebuffer into OpenGL buffer for OpenGL previewing - sendImageToPBO<<>>(pbo, width, height, dev_framebuffer); - checkCUDAError("copy render result to pbo"); + // Copy framebuffer into OpenGL buffer for OpenGL previewing + sendImageToPBO << > > (pbo, originalWidth, originalHeight, dev_framebuffer, 
antialiasing, width); + checkCUDAError("copy render result to pbo"); } /** @@ -739,7 +931,7 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g */ void rasterizeFree() { - // deconstruct primitives attribute/indices device buffer + // deconstruct primitives attribute/indices device buffer auto it(mesh2PrimitivesMap.begin()); auto itEnd(mesh2PrimitivesMap.end()); @@ -753,24 +945,27 @@ void rasterizeFree() { cudaFree(p->dev_verticesOut); - + //TODO: release other attributes and materials } } //////////// - cudaFree(dev_primitives); - dev_primitives = NULL; + cudaFree(dev_primitives); + dev_primitives = NULL; cudaFree(dev_fragmentBuffer); dev_fragmentBuffer = NULL; - cudaFree(dev_framebuffer); - dev_framebuffer = NULL; + cudaFree(dev_framebuffer); + dev_framebuffer = NULL; cudaFree(dev_depth); dev_depth = NULL; - checkCUDAError("rasterize Free"); -} + cudaFree(mutex); + mutex = NULL; + + checkCUDAError("rasterize Free"); +} \ No newline at end of file diff --git a/src/rasterizeTools.h b/src/rasterizeTools.h index 46c701e..555a1ab 100644 --- a/src/rasterizeTools.h +++ b/src/rasterizeTools.h @@ -13,8 +13,8 @@ #include struct AABB { - glm::vec3 min; - glm::vec3 max; + glm::vec3 min; + glm::vec3 max; }; /** @@ -22,7 +22,7 @@ struct AABB { */ __host__ __device__ static glm::vec3 multiplyMV(glm::mat4 m, glm::vec4 v) { - return glm::vec3(m * v); + return glm::vec3(m * v); } // CHECKITOUT @@ -31,16 +31,16 @@ glm::vec3 multiplyMV(glm::mat4 m, glm::vec4 v) { */ __host__ __device__ static AABB getAABBForTriangle(const glm::vec3 tri[3]) { - AABB aabb; - aabb.min = glm::vec3( - min(min(tri[0].x, tri[1].x), tri[2].x), - min(min(tri[0].y, tri[1].y), tri[2].y), - min(min(tri[0].z, tri[1].z), tri[2].z)); - aabb.max = glm::vec3( - max(max(tri[0].x, tri[1].x), tri[2].x), - max(max(tri[0].y, tri[1].y), tri[2].y), - max(max(tri[0].z, tri[1].z), tri[2].z)); - return aabb; + AABB aabb; + aabb.min = glm::vec3( + min(min(tri[0].x, tri[1].x), tri[2].x), + min(min(tri[0].y, tri[1].y), tri[2].y), + min(min(tri[0].z, tri[1].z), tri[2].z)); + aabb.max = glm::vec3( + max(max(tri[0].x, tri[1].x), tri[2].x), + max(max(tri[0].y, tri[1].y), tri[2].y), + max(max(tri[0].z, tri[1].z), tri[2].z)); + return aabb; } // CHECKITOUT @@ -49,7 +49,7 @@ AABB getAABBForTriangle(const glm::vec3 tri[3]) { */ __host__ __device__ static float calculateSignedArea(const glm::vec3 tri[3]) { - return 0.5 * ((tri[2].x - tri[0].x) * (tri[1].y - tri[0].y) - (tri[1].x - tri[0].x) * (tri[2].y - tri[0].y)); + return 0.5 * ((tri[2].x - tri[0].x) * (tri[1].y - tri[0].y) - (tri[1].x - tri[0].x) * (tri[2].y - tri[0].y)); } // CHECKITOUT @@ -58,11 +58,11 @@ float calculateSignedArea(const glm::vec3 tri[3]) { */ __host__ __device__ static float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c, const glm::vec3 tri[3]) { - glm::vec3 baryTri[3]; - baryTri[0] = glm::vec3(a, 0); - baryTri[1] = glm::vec3(b, 0); - baryTri[2] = glm::vec3(c, 0); - return calculateSignedArea(baryTri) / calculateSignedArea(tri); + glm::vec3 baryTri[3]; + baryTri[0] = glm::vec3(a, 0); + baryTri[1] = glm::vec3(b, 0); + baryTri[2] = glm::vec3(c, 0); + return calculateSignedArea(baryTri) / calculateSignedArea(tri); } // CHECKITOUT @@ -71,10 +71,10 @@ float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c, */ __host__ __device__ static glm::vec3 calculateBarycentricCoordinate(const glm::vec3 tri[3], glm::vec2 point) { - float beta = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), point, 
glm::vec2(tri[2].x, tri[2].y), tri); - float gamma = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), glm::vec2(tri[1].x, tri[1].y), point, tri); - float alpha = 1.0 - beta - gamma; - return glm::vec3(alpha, beta, gamma); + float beta = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), point, glm::vec2(tri[2].x, tri[2].y), tri); + float gamma = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), glm::vec2(tri[1].x, tri[1].y), point, tri); + float alpha = 1.0 - beta - gamma; + return glm::vec3(alpha, beta, gamma); } // CHECKITOUT @@ -83,9 +83,9 @@ glm::vec3 calculateBarycentricCoordinate(const glm::vec3 tri[3], glm::vec2 point */ __host__ __device__ static bool isBarycentricCoordInBounds(const glm::vec3 barycentricCoord) { - return barycentricCoord.x >= 0.0 && barycentricCoord.x <= 1.0 && - barycentricCoord.y >= 0.0 && barycentricCoord.y <= 1.0 && - barycentricCoord.z >= 0.0 && barycentricCoord.z <= 1.0; + return barycentricCoord.x >= 0.0 && barycentricCoord.x <= 1.0 && + barycentricCoord.y >= 0.0 && barycentricCoord.y <= 1.0 && + barycentricCoord.z >= 0.0 && barycentricCoord.z <= 1.0; } // CHECKITOUT @@ -95,7 +95,25 @@ bool isBarycentricCoordInBounds(const glm::vec3 barycentricCoord) { */ __host__ __device__ static float getZAtCoordinate(const glm::vec3 barycentricCoord, const glm::vec3 tri[3]) { - return -(barycentricCoord.x * tri[0].z - + barycentricCoord.y * tri[1].z - + barycentricCoord.z * tri[2].z); + return (barycentricCoord.x * tri[0].z + + barycentricCoord.y * tri[1].z + + barycentricCoord.z * tri[2].z); } + +__host__ __device__ static +int convert2Dto1D(const int x, const int y, const int width) { + return x + (y * width); +} + +/** +* Handy-dandy hash function that provides seeds for random number generation. +*/ +__host__ __device__ inline unsigned int utilhash(unsigned int a) { + a = (a + 0x7ed55d16) + (a << 12); + a = (a ^ 0xc761c23c) ^ (a >> 19); + a = (a + 0x165667b1) + (a << 5); + a = (a + 0xd3a2646c) ^ (a << 9); + a = (a + 0xfd7046c5) + (a << 3); + a = (a ^ 0xb55a4f09) ^ (a >> 16); + return a; +} \ No newline at end of file