diff --git a/README.md b/README.md
index 41b91f0..337e251 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,58 @@
CUDA Rasterizer
===============
-[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md)
-
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Ishan Ranade
+* Tested on personal computer: Gigabyte Aero 14, Windows 10, i7-7700HQ, GTX 1060
+
+## Demo
+
+
+
+## Introduction
+
+I implemented a GPU based rasterization pipeline with a number of added on features. For a brief overview the rasterization pipeline consists of a number of steps including vertex transformation of a set of user defined vertices, primitive assembly to arrange these vertices into shapes, a fragment depth test in which the closest fragment is saved, fragment transformation to determine the final color of a pixel, and finally rendering to the screen. Specific extra features that I chose to implement were super sampled antialiasing, UV texture mapping with bilinear texture filtering and perspective correct texture coordinates, correct color interpolation between points on a primitive, and backface culling.
+
+## Features
+
+
+
+
+
+- UV texture mapping with bilinear texture filtering and perspective correct texture coordinates
+
+
+
+
+
+- Correct color interpolation between points on a primitive
+
+
+
+
+
+- Super sampled anti aliasing
+
+
+
+
+
+- No anti aliasing
+
+
+
+## Performance Analysis
+
+The rasterization step of the pipeline seemed to take the longest time, and this is because of the heavy computation in calculating which pixels overlap which triangles. I implemented backface culling to try to reduce the computation time, and the graphs can be seen below comparing runtimes with and without this optimization.
+
+Antialiasing also increased the runtime as the size of the PBO quadrupled in size so the computations became 4 times as expensive. This step was necessary though because without it you can easily see jagged edges along diagonal lines as can be seen in the above images.
+
+
+
+
-### (TODO: Your README)
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
### Credits
diff --git a/graphs.pptx b/graphs.pptx
new file mode 100644
index 0000000..4e108b6
Binary files /dev/null and b/graphs.pptx differ
diff --git a/renders/aa.JPG b/renders/aa.JPG
new file mode 100644
index 0000000..a60e8d4
Binary files /dev/null and b/renders/aa.JPG differ
diff --git a/renders/aa2.JPG b/renders/aa2.JPG
new file mode 100644
index 0000000..ed8aca1
Binary files /dev/null and b/renders/aa2.JPG differ
diff --git a/renders/cow.JPG b/renders/cow.JPG
new file mode 100644
index 0000000..7d0bab5
Binary files /dev/null and b/renders/cow.JPG differ
diff --git a/renders/demo.gif b/renders/demo.gif
new file mode 100644
index 0000000..b5c3010
Binary files /dev/null and b/renders/demo.gif differ
diff --git a/renders/graph1.JPG b/renders/graph1.JPG
new file mode 100644
index 0000000..bf067ec
Binary files /dev/null and b/renders/graph1.JPG differ
diff --git a/renders/graph2.JPG b/renders/graph2.JPG
new file mode 100644
index 0000000..2e81e22
Binary files /dev/null and b/renders/graph2.JPG differ
diff --git a/renders/noaa.JPG b/renders/noaa.JPG
new file mode 100644
index 0000000..8e9ab25
Binary files /dev/null and b/renders/noaa.JPG differ
diff --git a/renders/texture1.JPG b/renders/texture1.JPG
new file mode 100644
index 0000000..fabd6ba
Binary files /dev/null and b/renders/texture1.JPG differ
diff --git a/renders/truck.JPG b/renders/truck.JPG
new file mode 100644
index 0000000..6039beb
Binary files /dev/null and b/renders/truck.JPG differ
diff --git a/shaders/README.md b/shaders/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a57f69f..d9247c3 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -6,5 +6,5 @@ set(SOURCE_FILES
cuda_add_library(src
${SOURCE_FILES}
- OPTIONS -arch=sm_20
+ OPTIONS -arch=sm_60
)
diff --git a/src/rasterize.cu b/src/rasterize.cu
index 1262a09..cf71b33 100644
--- a/src/rasterize.cu
+++ b/src/rasterize.cu
@@ -17,6 +17,9 @@
#include "rasterize.h"
#include
#include
+#include
+#include
+#include
namespace {
@@ -28,7 +31,7 @@ namespace {
typedef unsigned char BufferByte;
- enum PrimitiveType{
+ enum PrimitiveType {
Point = 1,
Line = 2,
Triangle = 3
@@ -41,12 +44,12 @@ namespace {
// The attributes listed below might be useful,
// but always feel free to modify on your own
- glm::vec3 eyePos; // eye space position used for shading
- glm::vec3 eyeNor; // eye space normal used for shading, cuz normal will go wrong after perspective transformation
- // glm::vec3 col;
- glm::vec2 texcoord0;
- TextureData* dev_diffuseTex = NULL;
- // int texWidth, texHeight;
+ glm::vec3 eyePos; // eye space position used for shading
+ glm::vec3 eyeNor; // eye space normal used for shading, cuz normal will go wrong after perspective transformation
+ glm::vec3 color;
+ glm::vec2 texcoord0;
+ TextureData* dev_diffuseTex = NULL;
+ int texWidth, texHeight;
// ...
};
@@ -62,10 +65,8 @@ namespace {
// The attributes listed below might be useful,
// but always feel free to modify on your own
- // glm::vec3 eyePos; // eye space position used for shading
- // glm::vec3 eyeNor;
- // VertexAttributeTexcoord texcoord0;
- // TextureData* dev_diffuseTex;
+ glm::vec3 eyePos; // eye space position used for shading
+ glm::vec3 eyeNor;
// ...
};
@@ -100,77 +101,117 @@ namespace {
static std::map> mesh2PrimitivesMap;
-
-static int width = 0;
-static int height = 0;
-
static int totalNumPrimitives = 0;
static Primitive *dev_primitives = NULL;
+static Primitive *dev_primitives_copy = NULL;
static Fragment *dev_fragmentBuffer = NULL;
static glm::vec3 *dev_framebuffer = NULL;
-static int * dev_depth = NULL; // you might need this buffer when doing depth test
+static float * dev_depth = NULL; // you might need this buffer when doing depth test
+
+static int *mutex;
+
+static int width = 0;
+static int height = 0;
+static int originalWidth = 0;
+static int originalHeight = 0;
+static int antialiasing = 2;
/**
* Kernel that writes the image to the OpenGL PBO directly.
*/
-__global__
-void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
- int x = (blockIdx.x * blockDim.x) + threadIdx.x;
- int y = (blockIdx.y * blockDim.y) + threadIdx.y;
- int index = x + (y * w);
-
- if (x < w && y < h) {
- glm::vec3 color;
- color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0;
- color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0;
- color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0;
- // Each thread writes one pixel location in the texture (textel)
- pbo[index].w = 0;
- pbo[index].x = color.x;
- pbo[index].y = color.y;
- pbo[index].z = color.z;
- }
+__global__
+void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image, int antialiasing, int width) {
+ int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+ int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+ int index = x + (y * w);
+
+ if (x < w && y < h) {
+
+ glm::vec3 color;
+ for (int i = 0; i < antialiasing; ++i) {
+ for (int j = 0; j < antialiasing; ++j) {
+ int tempIndex = ((antialiasing * x) + i) + (((antialiasing * y) + j) * width);
+ color.x += glm::clamp(image[tempIndex].x, 0.0f, 1.0f) * 255.0;
+ color.y += glm::clamp(image[tempIndex].y, 0.0f, 1.0f) * 255.0;
+ color.z += glm::clamp(image[tempIndex].z, 0.0f, 1.0f) * 255.0;
+ }
+ }
+
+ color /= antialiasing * antialiasing;
+
+ // Each thread writes one pixel location in the texture (textel)
+ pbo[index].w = 0;
+ pbo[index].x = color.x;
+ pbo[index].y = color.y;
+ pbo[index].z = color.z;
+ }
}
-/**
+/**
* Writes fragment colors to the framebuffer
*/
__global__
void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
- int x = (blockIdx.x * blockDim.x) + threadIdx.x;
- int y = (blockIdx.y * blockDim.y) + threadIdx.y;
- int index = x + (y * w);
+ int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+ int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+ int index = x + (y * w);
+
+ const int numLights = 2;
+ glm::vec3 lights[numLights] = { glm::normalize(glm::vec3(-1,-1,-1)), glm::normalize(glm::vec3(1,1,1)) };
+
+ if (x < w && y < h) {
+ framebuffer[index] = glm::vec3(0, 0, 0);
- if (x < w && y < h) {
- framebuffer[index] = fragmentBuffer[index].color;
+ for (int i = 0; i < numLights; ++i) {
+ float lambert = glm::clamp(glm::dot(fragmentBuffer[index].eyeNor, lights[i]), 0.f, 1.f);
+
+ framebuffer[index] += fragmentBuffer[index].color * lambert;
+ }
// TODO: add your fragment shader code here
- }
+ }
}
+struct cullpredicate
+{
+ __device__
+ bool operator()(const Primitive &primitive)
+ {
+ glm::vec3 normal = glm::cross(primitive.v[1].eyePos - primitive.v[0].eyePos, primitive.v[2].eyePos - primitive.v[0].eyePos);
+
+ return glm::dot(normal, glm::vec3(0,0,1)) > -0.1;
+ }
+};
+
/**
* Called once at the beginning of the program to allocate memory.
*/
void rasterizeInit(int w, int h) {
- width = w;
- height = h;
+ originalWidth = w;
+ originalHeight = h;
+ width = antialiasing * originalWidth;
+ height = antialiasing * originalHeight;
+
cudaFree(dev_fragmentBuffer);
cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment));
cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
- cudaFree(dev_framebuffer);
- cudaMalloc(&dev_framebuffer, width * height * sizeof(glm::vec3));
- cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
-
+ cudaFree(dev_framebuffer);
+ cudaMalloc(&dev_framebuffer, width * height * sizeof(glm::vec3));
+ cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
+
cudaFree(dev_depth);
- cudaMalloc(&dev_depth, width * height * sizeof(int));
+ cudaMalloc(&dev_depth, width * height * sizeof(float));
+
+ cudaMalloc((void **)&mutex, width * height * sizeof(int));
+ cudaMemset(mutex, 0, width * height * sizeof(int));
checkCUDAError("rasterizeInit");
}
__global__
-void initDepth(int w, int h, int * depth)
+void initDepth(int w, int h, float * depth)
{
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
@@ -178,7 +219,7 @@ void initDepth(int w, int h, int * depth)
if (x < w && y < h)
{
int index = x + (y * w);
- depth[index] = INT_MAX;
+ depth[index] = 1.0;
}
}
@@ -187,9 +228,9 @@ void initDepth(int w, int h, int * depth)
* kern function with support for stride to sometimes replace cudaMemcpy
* One thread is responsible for copying one component
*/
-__global__
+__global__
void _deviceBufferCopy(int N, BufferByte* dev_dst, const BufferByte* dev_src, int n, int byteStride, int byteOffset, int componentTypeByteSize) {
-
+
// Attribute (vec3 position)
// component (3 * float)
// byte (4 * byte)
@@ -202,20 +243,20 @@ void _deviceBufferCopy(int N, BufferByte* dev_dst, const BufferByte* dev_src, in
int offset = i - count * n; // which component of the attribute
for (int j = 0; j < componentTypeByteSize; j++) {
-
- dev_dst[count * componentTypeByteSize * n
- + offset * componentTypeByteSize
+
+ dev_dst[count * componentTypeByteSize * n
+ + offset * componentTypeByteSize
+ j]
- =
+ =
- dev_src[byteOffset
- + count * (byteStride == 0 ? componentTypeByteSize * n : byteStride)
- + offset * componentTypeByteSize
+ dev_src[byteOffset
+ + count * (byteStride == 0 ? componentTypeByteSize * n : byteStride)
+ + offset * componentTypeByteSize
+ j];
}
}
-
+
}
@@ -235,7 +276,7 @@ void _nodeMatrixTransform(
}
glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) {
-
+
glm::mat4 curMatrix(1.0);
const std::vector &m = n.matrix;
@@ -247,7 +288,8 @@ glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) {
curMatrix[i][j] = (float)m.at(4 * i + j);
}
}
- } else {
+ }
+ else {
// no matrix, use rotation, scale, translation
if (n.translation.size() > 0) {
@@ -275,12 +317,12 @@ glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) {
return curMatrix;
}
-void traverseNode (
+void traverseNode(
std::map & n2m,
const tinygltf::Scene & scene,
const std::string & nodeString,
const glm::mat4 & parentMatrix
- )
+)
{
const tinygltf::Node & n = scene.nodes.at(nodeString);
glm::mat4 M = parentMatrix * getMatrixFromNodeMatrixVector(n);
@@ -537,7 +579,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
size_t s = image.image.size() * sizeof(TextureData);
cudaMalloc(&dev_diffuseTex, s);
cudaMemcpy(dev_diffuseTex, &image.image.at(0), s, cudaMemcpyHostToDevice);
-
+
diffuseTexWidth = image.width;
diffuseTexHeight = image.height;
@@ -554,7 +596,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
// ---------Node hierarchy transform--------
cudaDeviceSynchronize();
-
+
dim3 numBlocksNodeTransform((numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
_nodeMatrixTransform << > > (
numVertices,
@@ -584,7 +626,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
diffuseTexHeight,
dev_vertexOut //VertexOut
- });
+ });
totalNumPrimitives += numPrimitives;
@@ -595,21 +637,23 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
} // for each node
}
-
+
+ printf("Num primitives: %u\n", totalNumPrimitives);
// 3. Malloc for dev_primitives
{
cudaMalloc(&dev_primitives, totalNumPrimitives * sizeof(Primitive));
+ cudaMalloc(&dev_primitives_copy, totalNumPrimitives * sizeof(Primitive));
}
-
+
// Finally, cudaFree raw dev_bufferViews
{
std::map::const_iterator it(bufferViewDevPointers.begin());
std::map::const_iterator itEnd(bufferViewDevPointers.end());
-
- //bufferViewDevPointers
+
+ //bufferViewDevPointers
for (; it != itEnd; it++) {
cudaFree(it->second);
@@ -623,11 +667,11 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
-__global__
+__global__
void _vertexTransformAndAssembly(
- int numVertices,
- PrimitiveDevBufPointers primitive,
- glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal,
+ int numVertices,
+ PrimitiveDevBufPointers primitive,
+ glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal,
int width, int height) {
// vertex id
@@ -638,18 +682,45 @@ void _vertexTransformAndAssembly(
// Multiply the MVP matrix for each vertex position, this will transform everything into clipping space
// Then divide the pos by its w element to transform into NDC space
// Finally transform x and y to viewport space
+ glm::vec3 &position = primitive.dev_position[vid];
+ glm::vec3 &normal = primitive.dev_normal[vid];
+
+ VertexOut &vertex = primitive.dev_verticesOut[vid];
+
+ glm::vec4 projected = MVP * glm::vec4(position, 1);
+ projected /= projected.w;
+
+ vertex.pos = glm::vec4((projected.x + 1.f) * width * 0.5f, (1.f - projected.y) * height * 0.5f, (projected.z + 1.f) * 0.5f, 1.0);
+ vertex.eyePos = glm::vec3(MV * glm::vec4(position, 1));
+ vertex.eyeNor = glm::normalize(MV_normal * normal);
+
+ // Give the vertex a random color or texture color
+ if (primitive.dev_diffuseTex == NULL) {
+ vertex.dev_diffuseTex = NULL;
+ thrust::default_random_engine rng = thrust::default_random_engine(utilhash(vid + 11));
+ thrust::uniform_real_distribution u01(0, 1);
+ vertex.color = glm::vec3(1.0, u01(rng), u01(rng));
+ }
+ else {
+ vertex.dev_diffuseTex = primitive.dev_diffuseTex;
+ vertex.texcoord0 = primitive.dev_texcoord0[vid];
+ vertex.texWidth = primitive.diffuseTexWidth;
+ vertex.texHeight = primitive.diffuseTexHeight;
+ }
// TODO: Apply vertex assembly here
// Assemble all attribute arraies into the primitive array
-
+
}
}
+
+
static int curPrimitiveBeginId = 0;
-__global__
+__global__
void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_primitives, PrimitiveDevBufPointers primitive) {
// index id
@@ -660,30 +731,144 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_
// TODO: uncomment the following code for a start
// This is primitive assembly for triangles
- //int pid; // id for cur primitives vector
- //if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
- // pid = iid / (int)primitive.primitiveType;
- // dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
- // = primitive.dev_verticesOut[primitive.dev_indices[iid]];
- //}
+ int pid; // id for cur primitives vector
+ if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
+ pid = iid / (int)primitive.primitiveType;
+ dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
+ = primitive.dev_verticesOut[primitive.dev_indices[iid]];
+ }
// TODO: other primitive types (point, line)
}
-
+
+}
+
+__host__ __device__ static
+float perspectiveCorrectZ(const glm::vec3 vertices[3], const glm::vec3 &barycentric) {
+ float sum = (barycentric[0] / vertices[0][2]) + (barycentric[1] / vertices[1][2]) + (barycentric[2] / vertices[2][2]);
+ return 1.0 / sum;
}
+__host__ __device__ static
+glm::vec3 perspectiveCorrectInterpolation(const glm::vec3 vertices[3], const float &z, const glm::vec3 values[3], const glm::vec3 &barycentric) {
+ glm::vec3 sum = (barycentric[0] * values[0] / vertices[0][2])
+ + (barycentric[1] * values[1] / vertices[1][2])
+ + (barycentric[2] * values[2] / vertices[2][2]);
+ return sum * z;
+}
+
+/**
+* Rasterization
+*/
+/**
+ * Rasterization kernel.
+ */
+__global__
+void rasterizeKernel(int numPrimitives, int width, int height, Fragment *fragmentBuffer, Primitive* dev_primitives, float* depth, int* mutex) {
+ int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+ int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+ int idx = x + (y * width);
+
+ if (idx < numPrimitives) {
+ Primitive &primitive = dev_primitives[idx];
+ glm::vec3 tri[3] = { glm::vec3(primitive.v[0].pos), glm::vec3(primitive.v[1].pos), glm::vec3(primitive.v[2].pos) };
+
+ AABB aabb = getAABBForTriangle(tri);
+ aabb.min.x = glm::min((float)width - 1, glm::max(0.f, aabb.min.x));
+ aabb.min.y = glm::min((float)height - 1, glm::max(0.f, aabb.min.y));
+ aabb.max.x = glm::max(0.f, glm::min((float)width - 1, aabb.max.x));
+ aabb.max.y = glm::max(0.f, glm::min((float)height - 1, aabb.max.y));
+
+ for (int col = aabb.min.x; col <= aabb.max.x; ++col) {
+ for (int row = aabb.min.y; row <= aabb.max.y; ++row) {
+ glm::vec2 point = glm::vec2(col, row);
+ int fragmentIndex = glm::min(width*height - 1, glm::max(0, col + (row * width)));
+
+ glm::vec3 bary = calculateBarycentricCoordinate(tri, point);
+
+ if (isBarycentricCoordInBounds(bary)) {
+ bool isSet;
+ do {
+ isSet = (atomicCAS(&mutex[fragmentIndex], 0, 1) == 0);
+ if (isSet) {
+ Fragment &fragment = fragmentBuffer[fragmentIndex];
+
+ // Only set this fragments attributes if closest depth
+ float fragmentDepth = getZAtCoordinate(bary, tri);
+
+ if (fragmentDepth < depth[fragmentIndex]) {
+ depth[fragmentIndex] = fragmentDepth;
+
+ // Perspective correct z
+ glm::vec3 eyeTri[3] = { glm::vec3(primitive.v[0].eyePos), glm::vec3(primitive.v[1].eyePos), glm::vec3(primitive.v[2].eyePos) };
+ float perspZ = perspectiveCorrectZ (eyeTri, bary);
+
+ // Calculate normal
+ glm::vec3 normals[3] = { primitive.v[0].eyeNor, primitive.v[1].eyeNor, primitive.v[2].eyeNor };
+ fragment.eyeNor = glm::normalize( perspectiveCorrectInterpolation (eyeTri, perspZ, normals, bary));
+
+ if (primitive.v[0].dev_diffuseTex) {
+ glm::vec3 uv[3] = { glm::vec3(primitive.v[0].texcoord0, 0), glm::vec3(primitive.v[1].texcoord0, 0), glm::vec3(primitive.v[2].texcoord0, 0) };
+
+ glm::vec2 final_uv = bary[0] * primitive.v[0].texcoord0 + bary[1] * primitive.v[1].texcoord0 + bary[2] * primitive.v[2].texcoord0;
+ float u = final_uv.x * primitive.v[0].texWidth;
+ float v = final_uv.y * primitive.v[0].texHeight;
+
+ int uInt = glm::floor(u);
+ int vInt = glm::floor(v);
+
+ TextureData* texture = primitive.v[0].dev_diffuseTex;
+
+ float u_fract = u - glm::floor(u);
+ float v_fract = v - glm::floor(v);
+
+ int col_00_offset = (uInt + (vInt * primitive.v[0].texWidth)) * 3;
+ glm::vec3 col_00 = glm::vec3(texture[col_00_offset], texture[col_00_offset + 1], texture[col_00_offset + 2]);
+
+ int col_10_offset = (uInt + 1 + (vInt * primitive.v[0].texWidth)) * 3;
+ glm::vec3 col_10 = glm::vec3(texture[col_10_offset], texture[col_10_offset + 1], texture[col_10_offset + 2]);
+
+ int col_01_offset = (uInt + ((vInt + 1) * primitive.v[0].texWidth)) * 3;
+ glm::vec3 col_01 = glm::vec3(texture[col_01_offset], texture[col_01_offset + 1], texture[col_01_offset + 2]);
+
+ int col_11_offset = (uInt + 1 + ((vInt + 1) * primitive.v[0].texWidth)) * 3;
+ glm::vec3 col_11 = glm::vec3(texture[col_11_offset], texture[col_11_offset + 1], texture[col_11_offset + 2]);
+
+ glm::vec3 col_interp1 = glm::mix(col_00, col_10, u_fract);
+ glm::vec3 col_interp2 = glm::mix(col_01, col_11, u_fract);
+
+ fragment.color = glm::mix(col_interp1, col_interp2, v_fract) / 255.f;
+ }
+ else {
+ glm::vec3 colors[3] = { primitive.v[0].color, primitive.v[1].color, primitive.v[2].color };
+ fragment.color = perspectiveCorrectInterpolation (eyeTri, perspZ, colors, bary);
+ }
+ }
+ }
+ if (isSet) {
+ mutex[fragmentIndex] = 0;
+ }
+ } while (!isSet);
+ }
+ }
+ }
+ }
+}
/**
* Perform rasterization.
*/
void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) {
- int sideLength2d = 8;
- dim3 blockSize2d(sideLength2d, sideLength2d);
- dim3 blockCount2d((width - 1) / blockSize2d.x + 1,
+ int sideLength2d = 8;
+ dim3 blockSize2d(sideLength2d, sideLength2d);
+ dim3 blockCount2d((width - 1) / blockSize2d.x + 1,
(height - 1) / blockSize2d.y + 1);
+ cudaMemcpy(dev_primitives_copy, dev_primitives, totalNumPrimitives * sizeof(Primitive), cudaMemcpyDeviceToDevice);
+
+
// Execute your rasterization pipeline here
// (See README for rasterization pipeline outline.)
@@ -702,14 +887,14 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
- _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height);
+ _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> > (p->numVertices, *p, MVP, MV, MV_normal, width, height);
checkCUDAError("Vertex Processing");
cudaDeviceSynchronize();
_primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> >
- (p->numIndices,
- curPrimitiveBeginId,
- dev_primitives,
- *p);
+ (p->numIndices,
+ curPrimitiveBeginId,
+ dev_primitives_copy,
+ *p);
checkCUDAError("Primitive Assembly");
curPrimitiveBeginId += p->numPrimitives;
@@ -718,20 +903,27 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
checkCUDAError("Vertex Processing and Primitive Assembly");
}
-
+
cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
- initDepth << > >(width, height, dev_depth);
-
- // TODO: rasterize
+ initDepth << > > (width, height, dev_depth);
+ // Cull all the primitives facing away
+ Primitive *end = thrust::partition(thrust::device, dev_primitives_copy, dev_primitives_copy + totalNumPrimitives, cullpredicate());
+ cudaDeviceSynchronize();
- // Copy depthbuffer colors into framebuffer
- render << > >(width, height, dev_fragmentBuffer, dev_framebuffer);
+ int primitivesLeft = end - dev_primitives_copy;
+
+ // TODO: rasterize
+ rasterizeKernel << > > (primitivesLeft, width, height, dev_fragmentBuffer, dev_primitives_copy, dev_depth, mutex);
+
+
+ // Copy depthbuffer colors into framebuffer
+ render << > > (width, height, dev_fragmentBuffer, dev_framebuffer);
checkCUDAError("fragment shader");
- // Copy framebuffer into OpenGL buffer for OpenGL previewing
- sendImageToPBO<<>>(pbo, width, height, dev_framebuffer);
- checkCUDAError("copy render result to pbo");
+ // Copy framebuffer into OpenGL buffer for OpenGL previewing
+ sendImageToPBO << > > (pbo, originalWidth, originalHeight, dev_framebuffer, antialiasing, width);
+ checkCUDAError("copy render result to pbo");
}
/**
@@ -739,7 +931,7 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
*/
void rasterizeFree() {
- // deconstruct primitives attribute/indices device buffer
+ // deconstruct primitives attribute/indices device buffer
auto it(mesh2PrimitivesMap.begin());
auto itEnd(mesh2PrimitivesMap.end());
@@ -753,24 +945,27 @@ void rasterizeFree() {
cudaFree(p->dev_verticesOut);
-
+
//TODO: release other attributes and materials
}
}
////////////
- cudaFree(dev_primitives);
- dev_primitives = NULL;
+ cudaFree(dev_primitives);
+ dev_primitives = NULL;
cudaFree(dev_fragmentBuffer);
dev_fragmentBuffer = NULL;
- cudaFree(dev_framebuffer);
- dev_framebuffer = NULL;
+ cudaFree(dev_framebuffer);
+ dev_framebuffer = NULL;
cudaFree(dev_depth);
dev_depth = NULL;
- checkCUDAError("rasterize Free");
-}
+ cudaFree(mutex);
+ mutex = NULL;
+
+ checkCUDAError("rasterize Free");
+}
\ No newline at end of file
diff --git a/src/rasterizeTools.h b/src/rasterizeTools.h
index 46c701e..555a1ab 100644
--- a/src/rasterizeTools.h
+++ b/src/rasterizeTools.h
@@ -13,8 +13,8 @@
#include
struct AABB {
- glm::vec3 min;
- glm::vec3 max;
+ glm::vec3 min;
+ glm::vec3 max;
};
/**
@@ -22,7 +22,7 @@ struct AABB {
*/
__host__ __device__ static
glm::vec3 multiplyMV(glm::mat4 m, glm::vec4 v) {
- return glm::vec3(m * v);
+ return glm::vec3(m * v);
}
// CHECKITOUT
@@ -31,16 +31,16 @@ glm::vec3 multiplyMV(glm::mat4 m, glm::vec4 v) {
*/
__host__ __device__ static
AABB getAABBForTriangle(const glm::vec3 tri[3]) {
- AABB aabb;
- aabb.min = glm::vec3(
- min(min(tri[0].x, tri[1].x), tri[2].x),
- min(min(tri[0].y, tri[1].y), tri[2].y),
- min(min(tri[0].z, tri[1].z), tri[2].z));
- aabb.max = glm::vec3(
- max(max(tri[0].x, tri[1].x), tri[2].x),
- max(max(tri[0].y, tri[1].y), tri[2].y),
- max(max(tri[0].z, tri[1].z), tri[2].z));
- return aabb;
+ AABB aabb;
+ aabb.min = glm::vec3(
+ min(min(tri[0].x, tri[1].x), tri[2].x),
+ min(min(tri[0].y, tri[1].y), tri[2].y),
+ min(min(tri[0].z, tri[1].z), tri[2].z));
+ aabb.max = glm::vec3(
+ max(max(tri[0].x, tri[1].x), tri[2].x),
+ max(max(tri[0].y, tri[1].y), tri[2].y),
+ max(max(tri[0].z, tri[1].z), tri[2].z));
+ return aabb;
}
// CHECKITOUT
@@ -49,7 +49,7 @@ AABB getAABBForTriangle(const glm::vec3 tri[3]) {
*/
__host__ __device__ static
float calculateSignedArea(const glm::vec3 tri[3]) {
- return 0.5 * ((tri[2].x - tri[0].x) * (tri[1].y - tri[0].y) - (tri[1].x - tri[0].x) * (tri[2].y - tri[0].y));
+ return 0.5 * ((tri[2].x - tri[0].x) * (tri[1].y - tri[0].y) - (tri[1].x - tri[0].x) * (tri[2].y - tri[0].y));
}
// CHECKITOUT
@@ -58,11 +58,11 @@ float calculateSignedArea(const glm::vec3 tri[3]) {
*/
__host__ __device__ static
float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c, const glm::vec3 tri[3]) {
- glm::vec3 baryTri[3];
- baryTri[0] = glm::vec3(a, 0);
- baryTri[1] = glm::vec3(b, 0);
- baryTri[2] = glm::vec3(c, 0);
- return calculateSignedArea(baryTri) / calculateSignedArea(tri);
+ glm::vec3 baryTri[3];
+ baryTri[0] = glm::vec3(a, 0);
+ baryTri[1] = glm::vec3(b, 0);
+ baryTri[2] = glm::vec3(c, 0);
+ return calculateSignedArea(baryTri) / calculateSignedArea(tri);
}
// CHECKITOUT
@@ -71,10 +71,10 @@ float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c,
*/
__host__ __device__ static
glm::vec3 calculateBarycentricCoordinate(const glm::vec3 tri[3], glm::vec2 point) {
- float beta = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), point, glm::vec2(tri[2].x, tri[2].y), tri);
- float gamma = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), glm::vec2(tri[1].x, tri[1].y), point, tri);
- float alpha = 1.0 - beta - gamma;
- return glm::vec3(alpha, beta, gamma);
+ float beta = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), point, glm::vec2(tri[2].x, tri[2].y), tri);
+ float gamma = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), glm::vec2(tri[1].x, tri[1].y), point, tri);
+ float alpha = 1.0 - beta - gamma;
+ return glm::vec3(alpha, beta, gamma);
}
// CHECKITOUT
@@ -83,9 +83,9 @@ glm::vec3 calculateBarycentricCoordinate(const glm::vec3 tri[3], glm::vec2 point
*/
__host__ __device__ static
bool isBarycentricCoordInBounds(const glm::vec3 barycentricCoord) {
- return barycentricCoord.x >= 0.0 && barycentricCoord.x <= 1.0 &&
- barycentricCoord.y >= 0.0 && barycentricCoord.y <= 1.0 &&
- barycentricCoord.z >= 0.0 && barycentricCoord.z <= 1.0;
+ return barycentricCoord.x >= 0.0 && barycentricCoord.x <= 1.0 &&
+ barycentricCoord.y >= 0.0 && barycentricCoord.y <= 1.0 &&
+ barycentricCoord.z >= 0.0 && barycentricCoord.z <= 1.0;
}
// CHECKITOUT
@@ -95,7 +95,25 @@ bool isBarycentricCoordInBounds(const glm::vec3 barycentricCoord) {
*/
__host__ __device__ static
float getZAtCoordinate(const glm::vec3 barycentricCoord, const glm::vec3 tri[3]) {
- return -(barycentricCoord.x * tri[0].z
- + barycentricCoord.y * tri[1].z
- + barycentricCoord.z * tri[2].z);
+ return (barycentricCoord.x * tri[0].z
+ + barycentricCoord.y * tri[1].z
+ + barycentricCoord.z * tri[2].z);
}
+
+__host__ __device__ static
+int convert2Dto1D(const int x, const int y, const int width) {
+ return x + (y * width);
+}
+
+/**
+* Handy-dandy hash function that provides seeds for random number generation.
+*/
+__host__ __device__ inline unsigned int utilhash(unsigned int a) {
+ a = (a + 0x7ed55d16) + (a << 12);
+ a = (a ^ 0xc761c23c) ^ (a >> 19);
+ a = (a + 0x165667b1) + (a << 5);
+ a = (a + 0xd3a2646c) ^ (a << 9);
+ a = (a + 0xfd7046c5) + (a << 3);
+ a = (a ^ 0xb55a4f09) ^ (a >> 16);
+ return a;
+}
\ No newline at end of file