diff --git a/.gitignore b/.gitignore index 89942d9..6a06393 100644 --- a/.gitignore +++ b/.gitignore @@ -513,7 +513,7 @@ local.properties .externalToolBuilders/ # Locally stored "Eclipse launch configurations" -#*.launch +*.launch # CDT-specific #.cproject diff --git a/CMakeLists.txt b/CMakeLists.txt index d3d976c..df23a46 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.0) project(cis565_path_tracer) set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH}) +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") # Set up include and lib paths set(EXTERNAL "external") @@ -84,6 +85,7 @@ target_link_libraries(${CMAKE_PROJECT_NAME} src #stream_compaction # TODO: uncomment if using your stream compaction ${CORELIBS} + X11 ) add_custom_command( diff --git a/Project3-CUDA-Path-Tracer.launch b/Project3-CUDA-Path-Tracer.launch deleted file mode 100644 index 0222434..0000000 --- a/Project3-CUDA-Path-Tracer.launch +++ /dev/null @@ -1,22 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - diff --git a/README.md b/README.md index 110697c..e61f4fc 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,94 @@ -CUDA Path Tracer -================ +CUDA Path Tracer +================ -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3** +**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Edward Atter + * [LinkedIn](https://www.linkedin.com/in/atter/) + * Tested on: Linux Mint 18.3 Sylvia (4.13.0-41-generic), Ryzen 7 2700x @ 3.7 GHz (base clock) 16GB, GTX 1070 TI 8GB GDDR5 (Personal) + * CUDA 9 -### (TODO: Your README) +![](img/intro.png) -*DO NOT* leave the README to the last minute! It is a crucial part of the -project, and we will not be able to grade you without a good README. 
+## Overview +This project implements a path tracer with the following features: + - A shading kernel capable of diffuse, specular, and refractive surfaces with Fresnel effects + - Antialiasing + - Stream compaction + - Material sorting + - First bounce caching + - Built-in frame timer + +All features may be toggled by changing the defined constants in `src/pathtrace.cu`. + +## Performance + + +#### Methodology + +Frame timings were calculated by taking the average over 5 frames. In general, after the initial frame, very little variance was observed. The timing for each feature was performed with all other toggleable features disabled. + +#### Relative Performance Impact + +![](img/relative-performance.png) + +The graph above shows the relative performance impact of each feature, represented in seconds. A negative time indicates the feature had a positive impact on performance, while a positive number represents a decrease in performance. "None" is with all features disabled and is the baseline in this test, set to 0. + +Analysis of `nvidia-smi` suggests the application is GPU bound, as expected, pegging the GPU usage at 100%. + + +-----------------------------------------------------------------------------+ + | NVIDIA-SMI 396.26 Driver Version: 396.26 | + |-------------------------------+----------------------+----------------------+ + | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | + | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | + |===============================+======================+======================| + | 0 GeForce GTX 107... 
Off | 00000000:0A:00.0 On | N/A | + | 47% 65C P2 116W / 180W | 1055MiB / 8116MiB | 100% Default | + +-------------------------------+----------------------+----------------------+ + + +-----------------------------------------------------------------------------+ + | Processes: GPU Memory | + | GPU PID Type Process name Usage | + |=============================================================================| + | 0 1570 G /usr/lib/xorg/Xorg 400MiB | + | 0 3563 G cinnamon 140MiB | + | 0 4200 G ...-token=A7907B8E58127E7F09984994A2B09AB9 48MiB | + | 0 6463 C gimp-2.8 107MiB | + | 0 10622 G ...-token=A3470AA5428C5FF1C71554B5AC8CA77C 41MiB | + | 0 12950 C+G ...DA-Path-Tracer/build/cis565_path_tracer 209MiB | + | 0 23722 C /usr/lib/libreoffice/program/soffice.bin 105MiB | + +-----------------------------------------------------------------------------+ + + +## Features + +#### Anti-Aliasing + +![](img/circle-no-aa.png) ![](img/circle-with-aa.png) + +Anti-aliasing can even out rough edges. This is done by "jittering" the rays' x and y positions slightly at each iteration. The effect is most noticeable in high-contrast situations, as in the images above. The image on the left has anti-aliasing disabled, while the image on the right has AA enabled. The image on the right appears less pixelated. + +Anti-aliasing had a negligible impact on performance, .003 seconds. This is well within the margin of error. + +#### Sorting +In theory, sorting by material type should yield a large improvement. The GPU's scheduler can skip warps if each thread in the warp has returned. Sorting by material type should increase the likelihood of this happening. In addition, it should make memory accesses more coherent and reduce branch divergence since most warps will consist of a single material type. + +Unfortunately, as shown in the graph above, this theoretical gain was not realized. The overhead of sorting is much greater than the gains it provides. 
+ +#### Stream Compaction +Interestingly, stream compaction resulted in a slight decrease in performance. In theory, we should be able to eliminate many of the dead, black rays; but at least with the current implementation, the compaction overhead was too great to yield any appreciable results. + +I hypothesized that mostly black images would benefit the most from stream compaction, so I tried again on the `sphere.txt` scene. The results were the same: rendering was faster without stream compaction. + +#### First Bounce Caching +Caching the first bounce yielded the largest performance increase. Unfortunately, it has the effect of dampening the anti-aliasing effect (if both are enabled simultaneously), since the random jitter of the array is cached and reused for each iteration, only being updated during a new frame. + +## Gallery + +![](img/l1.png) +![](img/m1.png) ![](img/m2.png) + +![](img/g1.png) ![](img/g2.png) +![](img/g3.png) ![](img/g4.png) +![](img/g5.png) ![](img/g6.png) diff --git a/external/lib/linux/libglfw3.a b/external/lib/linux/libglfw3.a index da7ab6c..17b4438 100644 Binary files a/external/lib/linux/libglfw3.a and b/external/lib/linux/libglfw3.a differ diff --git a/img/circle-no-aa.png b/img/circle-no-aa.png new file mode 100644 index 0000000..ede49be Binary files /dev/null and b/img/circle-no-aa.png differ diff --git a/img/circle-with-aa.png b/img/circle-with-aa.png new file mode 100644 index 0000000..af2dd04 Binary files /dev/null and b/img/circle-with-aa.png differ diff --git a/img/g1.png b/img/g1.png new file mode 100644 index 0000000..f4dd91d Binary files /dev/null and b/img/g1.png differ diff --git a/img/g2.png b/img/g2.png new file mode 100644 index 0000000..5ca4e4e Binary files /dev/null and b/img/g2.png differ diff --git a/img/g3.png b/img/g3.png new file mode 100644 index 0000000..bef54d9 Binary files /dev/null and b/img/g3.png differ diff --git a/img/g4.png b/img/g4.png new file mode 100644 index 0000000..6722da5 Binary files /dev/null and 
b/img/g4.png differ diff --git a/img/g5.png b/img/g5.png new file mode 100644 index 0000000..1e5cef3 Binary files /dev/null and b/img/g5.png differ diff --git a/img/g6.png b/img/g6.png new file mode 100644 index 0000000..82db79f Binary files /dev/null and b/img/g6.png differ diff --git a/img/intro.png b/img/intro.png new file mode 100644 index 0000000..b411d50 Binary files /dev/null and b/img/intro.png differ diff --git a/img/l1.png b/img/l1.png new file mode 100644 index 0000000..304f023 Binary files /dev/null and b/img/l1.png differ diff --git a/img/m1.png b/img/m1.png new file mode 100644 index 0000000..e4c9120 Binary files /dev/null and b/img/m1.png differ diff --git a/img/m2.png b/img/m2.png new file mode 100644 index 0000000..230812d Binary files /dev/null and b/img/m2.png differ diff --git a/img/relative-performance.png b/img/relative-performance.png new file mode 100644 index 0000000..479a103 Binary files /dev/null and b/img/relative-performance.png differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a1cb3fb..84d8a59 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -19,5 +19,5 @@ set(SOURCE_FILES cuda_add_library(src ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_60 ) diff --git a/src/interactions.h b/src/interactions.h index 5ce3628..808a7ab 100644 --- a/src/interactions.h +++ b/src/interactions.h @@ -2,6 +2,8 @@ #include "intersections.h" +#define THRESH_INTERNAL_REFLECTION .01f + // CHECKITOUT /** * Computes a cosine-weighted random direction in a hemisphere. 
@@ -41,6 +43,41 @@ glm::vec3 calculateRandomDirectionInHemisphere( + sin(around) * over * perpendicularDirection2; } +__host__ __device__ +void reflect(PathSegment & pathSegment, + glm::vec3 intersect, + glm::vec3 normal, + const Material &material, + glm::vec3 direction) { + pathSegment.color *= material.color; + pathSegment.ray.direction = glm::reflect(direction, normal); + pathSegment.ray.origin = intersect + .0001f * pathSegment.ray.direction; +} + +__host__ __device__ +void refract(PathSegment & pathSegment, + glm::vec3 intersect, + glm::vec3 normal, + const Material &material, + glm::vec3 direction) { + float refractive_index = material.indexOfRefraction; + // Not sure why this fixes the bug... + if (glm::dot(direction, normal) < 0) { + refractive_index = 1.0f / refractive_index; + } else { + normal = -normal; + } + + if (glm::length(pathSegment.ray.direction) > THRESH_INTERNAL_REFLECTION) { + pathSegment.ray.direction = glm::refract(direction, normal, refractive_index); + } else { + pathSegment.ray.direction = glm::reflect(direction, normal); + } + + pathSegment.color *= material.color; + pathSegment.ray.origin = intersect + .001f * pathSegment.ray.direction; +} + /** * Scatter a ray with some probabilities according to the material properties. * For example, a diffuse surface scatters in a cosine-weighted hemisphere. @@ -71,9 +108,71 @@ void scatterRay( PathSegment & pathSegment, glm::vec3 intersect, glm::vec3 normal, - const Material &m, + const Material &material, thrust::default_random_engine &rng) { - // TODO: implement this. // A basic implementation of pure-diffuse shading will just call the // calculateRandomDirectionInHemisphere defined above. 
+ glm::vec3 direction = glm::normalize(pathSegment.ray.direction); + + if (material.hasReflective && material.hasRefractive) { + thrust::uniform_real_distribution u01(0,1); + + float cosAngle = glm::dot(direction, normal); + if (cosAngle > 1) { + cosAngle = 1; + } else if (cosAngle < -1) { + cosAngle = -1; + } + + float refractive_index_before = 1; + float refractive_index_after; + + if (cosAngle > 0) { + std::swap(refractive_index_before, refractive_index_after); + } else { + cosAngle *= -1; + pathSegment.ray.origin = intersect + .001f * direction; + } + + float refractive_index = refractive_index_before / refractive_index_after; + float sinAngleBefore = 1 - std::pow(cosAngle, 2); + if (sinAngleBefore > 0) { + sinAngleBefore = std::sqrt(sinAngleBefore); + } else { + sinAngleBefore = 0; + } + float sinAngleAfter = refractive_index * sinAngleBefore; + + if (sinAngleAfter > 1) { + pathSegment.color *= material.specular.color; + pathSegment.ray.direction = glm::reflect(pathSegment.ray.direction, normal); + } else { + //Shlick + //R = R_0 + (1 - R_0) * (1 - cos(theta))^5 + //R_0 = [(n_1 - n_2) / (n_1 + n_2)]^2 + + float r0 = (refractive_index_before - refractive_index_after) / (refractive_index_before + refractive_index_after); + r0 *= r0; + float cosAngleRaised = std::pow(1 - cosAngle, 5); + float R = r0 + (1 - r0) * cosAngleRaised; + + //Randomize + if (R < u01(rng)) { + refract(pathSegment, intersect, normal, material, direction); + } else { + reflect(pathSegment, intersect, normal, material, direction); + } + } + + } else if (material.hasReflective) { + reflect(pathSegment, intersect, normal, material, direction); + } else if (material.hasRefractive) { + refract(pathSegment, intersect, normal, material, direction); + } else { + //Diffuse + pathSegment.color *= material.color; + pathSegment.ray.direction = calculateRandomDirectionInHemisphere(normal, rng); + pathSegment.ray.origin = intersect + .001f * pathSegment.ray.direction; + } + } diff --git a/src/pathtrace.cu 
b/src/pathtrace.cu index c1ec122..1c7b2be 100644 --- a/src/pathtrace.cu +++ b/src/pathtrace.cu @@ -4,6 +4,7 @@ #include #include #include +#include #include "sceneStructs.h" #include "scene.h" @@ -16,6 +17,14 @@ #define ERRORCHECK 1 +//CONFIGURABLE OPTIONS +#define VERBOSE 0 +#define OPTION_ENABLE_CACHE 0 +#define OPTION_ENABLE_SORT 0 +#define OPTION_ENABLE_COMPACTION 0 +#define OPTION_ENABLE_ANTI_ALIAS 1 +#define OPTION_ENABLE_TIMER 1 + #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) void checkCUDAErrorFn(const char *msg, const char *file, int line) { @@ -75,6 +84,7 @@ static PathSegment * dev_paths = NULL; static ShadeableIntersection * dev_intersections = NULL; // TODO: static variables for device memory, any extra info you need, etc // ... +static ShadeableIntersection * dev_intersection_cache; void pathtraceInit(Scene *scene) { hst_scene = scene; @@ -96,6 +106,7 @@ void pathtraceInit(Scene *scene) { cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection)); // TODO: initialize any extra device memeory you need + cudaMalloc(&dev_intersection_cache, pixelcount * sizeof(ShadeableIntersection)); checkCUDAError("pathtraceInit"); } @@ -108,6 +119,7 @@ void pathtraceFree() { cudaFree(dev_intersections); // TODO: clean up any extra device memory you created + cudaFree(dev_intersection_cache); checkCUDAError("pathtraceFree"); } @@ -131,10 +143,22 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path segment.ray.origin = cam.position; segment.color = glm::vec3(1.0f, 1.0f, 1.0f); - // TODO: implement antialiasing by jittering the ray +#if OPTION_ENABLE_ANTI_ALIAS + // Implement antialiasing by jittering the ray + thrust::default_random_engine rng = + makeSeededRandomEngine(iter, index, pathSegments[index].remainingBounces); + thrust::uniform_real_distribution u01(0, 1); + + float jit_x = u01(rng); + float jit_y = 
u01(rng); +#else + float jit_x = 0; + float jit_y = 0; +#endif + segment.ray.direction = glm::normalize(cam.view - - cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f) - - cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f) + - cam.right * cam.pixelLength.x * (((float)x + jit_x) - (float)cam.resolution.x * 0.5f) + - cam.up * cam.pixelLength.y * (((float)y + jit_y) - (float)cam.resolution.y * 0.5f) ); segment.pixelIndex = index; @@ -265,6 +289,45 @@ __global__ void shadeFakeMaterial ( } } +__global__ void shadeMaterial (int iter + , int num_paths + , ShadeableIntersection * shadeableIntersections + , PathSegment * pathSegments + , Material * materials + ) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int remainingBounces = pathSegments[idx].remainingBounces; + if (idx > num_paths || remainingBounces <= 0){ + return; + } + + ShadeableIntersection si = shadeableIntersections[idx]; + //Per notes, intersection of -1 is non-existent + if (si.t > 0) { + thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, remainingBounces); + thrust::uniform_real_distribution u01(0, 1); + + Material material = materials[si.materialId]; + + //Check if we hit a light source + if (material.emittance) { + pathSegments[idx].color *= (material.color * material.emittance); + remainingBounces = 0; + } else { + glm::vec3 ri = getPointOnRay(pathSegments[idx].ray, si.t); + scatterRay(pathSegments[idx], ri, si.surfaceNormal, material, rng); + remainingBounces --; + } + } else { + //No intersection, color black + pathSegments[idx].color = glm::vec3(0.0f); + remainingBounces = 0; + } + + //Update copy of remainingBounces + pathSegments[idx].remainingBounces = remainingBounces; +} + // Add the current iteration's output to the overall image __global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths) { @@ -277,6 +340,23 @@ __global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterati } 
} +struct material_comparator { + __host__ __device__ + bool operator() (const ShadeableIntersection &a, const ShadeableIntersection &b) { + return (a.materialId < b.materialId); //Why does thrust comparator use boolean instead of standard NZP? + } +}; + +struct stream_comparator { + __host__ __device__ + bool operator() (const PathSegment &a) {//, const PathSegment &b) { + // Because this is only used for compacting, don't need two objects + // I'm not sure about this being equal to zero, but doesn't work without + // is this actually doing anything? A OB1 bug with remainingBounces somewhere else? + return a.remainingBounces >= 0; + } +}; + /** * Wrapper for the __global__ call that sets up the kernel calls and does a ton * of memory management @@ -295,6 +375,10 @@ void pathtrace(uchar4 *pbo, int frame, int iter) { // 1D block for path tracing const int blockSize1d = 128; + clock_t timer; + double iteration_time = 0; + double total_time = 0; + /////////////////////////////////////////////////////////////////////////// // Recap: @@ -338,12 +422,53 @@ void pathtrace(uchar4 *pbo, int frame, int iter) { bool iterationComplete = false; while (!iterationComplete) { +#if OPTION_ENABLE_TIMER + //Start the clock + timer = clock(); +#endif // clean shading chunks cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection)); // tracing dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d; + + //CACHE +#if OPTION_ENABLE_CACHE + if (depth == 0 && iter == 1) { + //Calculate, first run + computeIntersections <<>> ( + depth + , num_paths + , dev_paths + , dev_geoms + , hst_scene->geoms.size() + , dev_intersections + ); + checkCUDAError("trace one bounce"); + //Save cache + cudaMemcpy(dev_intersection_cache, dev_intersections, + pixelcount * sizeof(ShadeableIntersection), cudaMemcpyDeviceToDevice); + //printf("CACHED!\n"); + } else if (depth == 0 && iter != 0) { + //Re-use cache + cudaMemcpy(dev_intersections, dev_intersection_cache, + 
pixelcount * sizeof(ShadeableIntersection), cudaMemcpyDeviceToDevice); + //printf("USING CACHE\n"); + } else { + //Abandon the cache, depth != 1 + computeIntersections <<>> ( + depth + , num_paths + , dev_paths + , dev_geoms + , hst_scene->geoms.size() + , dev_intersections + ); + checkCUDAError("trace one bounce"); + } +#else + //NO-CACHE, Recalculate each time computeIntersections <<>> ( depth , num_paths @@ -353,10 +478,11 @@ void pathtrace(uchar4 *pbo, int frame, int iter) { , dev_intersections ); checkCUDAError("trace one bounce"); +#endif + //Common to caching and non-caching solutions cudaDeviceSynchronize(); depth++; - // TODO: // --- Shading Stage --- // Shade path segments based on intersections and generate new rays by @@ -366,14 +492,36 @@ void pathtrace(uchar4 *pbo, int frame, int iter) { // TODO: compare between directly shading the path segments and shading // path segments that have been reshuffled to be contiguous in memory. - shadeFakeMaterial<<>> ( +#if OPTION_ENABLE_SORT + //Sort + thrust::sort_by_key(thrust::device, dev_intersections, dev_intersections + num_paths, dev_paths, material_comparator()); +#endif + + shadeMaterial<<>> ( iter, num_paths, dev_intersections, dev_paths, dev_materials ); - iterationComplete = true; // TODO: should be based off stream compaction results. 
+ +#if OPTION_ENABLE_COMPACTION + //Compact + PathSegment * segment = thrust::partition(thrust::device, dev_paths, dev_paths + num_paths, stream_comparator()); + //We've likely removed some paths, recalculate + num_paths = segment - dev_paths; +#endif + + iterationComplete = traceDepth < depth || num_paths <= 0; + +#if OPTION_ENABLE_TIMER + timer = clock() - timer; + iteration_time = ((double)timer)/CLOCKS_PER_SEC; + total_time += iteration_time; +#if VERBOSE + printf("Iteration completed: %f \n", iteration_time); +#endif +#endif } // Assemble this iteration and apply it to the image @@ -390,4 +538,7 @@ void pathtrace(uchar4 *pbo, int frame, int iter) { pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost); checkCUDAError("pathtrace"); +#if OPTION_ENABLE_TIMER + printf("Frame completed: %f \n", total_time); +#endif }