diff --git a/CMakeLists.txt b/CMakeLists.txt
index 84a7191a..6ad0f6b9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -184,3 +184,8 @@ endif()
 if(EXISTS ${CMAKE_SOURCE_DIR}/samples/CMakeLists.txt)
   add_subdirectory(samples)
 endif()
+
+# add apps if available
+if(EXISTS ${CMAKE_SOURCE_DIR}/apps/CMakeLists.txt)
+  add_subdirectory(apps)
+endif()
diff --git a/LICENSE b/LICENSE
index 3f7f789a..3cc17a66 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,3 +1,4 @@
+Copyright (c) 2020, University of Erlangen-Nuremberg
 Copyright (c) 2014, Saarland University
 Copyright (c) 2012, University of Erlangen-Nuremberg
 Copyright (c) 2012, Siemens AG
diff --git a/README.md b/README.md
index 9dc7d367..d4e4db88 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,8 @@
+This is a fork of the [Hipacc](http://hipacc-lang.org) project, with primarily experimental features:
+* Asynchronous runtime APIs
+* Multi-stream implementation for multiresolution filters in CUDA
+
+
 # Hipacc

 A domain-specific language and compiler for image processing
diff --git a/_clang-format b/_clang-format
new file mode 100644
index 00000000..a81aaae7
--- /dev/null
+++ b/_clang-format
@@ -0,0 +1,121 @@
+---
+Language: Cpp
+# BasedOnStyle: LLVM
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Right
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: MultiLine
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterClass: false
+  AfterControlStatement: false
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: false
+  BeforeElse: false
+  IndentBraces: false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit: 82
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks: Preserve
+IncludeCategories:
+  - Regex: '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority: 2
+  - Regex: '^(<|"(gtest|gmock|isl|json)/)'
+    Priority: 3
+  - Regex: '.*'
+    Priority: 1
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+IndentPPDirectives: None
+IndentWidth: 2
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Right +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseTab: Never +... + diff --git a/apps/6_Multiresolution_Filters/Laplacian_Pyramid_Encoding/src/main.cpp b/apps/6_Multiresolution_Filters/Laplacian_Pyramid_Encoding/src/main.cpp new file mode 100644 index 00000000..b69dcaad --- /dev/null +++ b/apps/6_Multiresolution_Filters/Laplacian_Pyramid_Encoding/src/main.cpp @@ -0,0 +1,212 @@ +// +// Copyright (c) 2020, University of Erlangen-Nuremberg +// Copyright (c) 2012, University of Erlangen-Nuremberg +// Copyright (c) 2012, Siemens AG +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// + +#include "hipacc.hpp" +#include +#include + +#define SIZE_X 3 +#define SIZE_Y 3 +#define WIDTH 1024 +#define HEIGHT 1024 + +using namespace hipacc; +using namespace hipacc::math; + + +class Gaussian : public Kernel { + private: + Accessor &input; + Mask &mask; + + public: + Gaussian(IterationSpace &iter, Accessor &input, + Mask &mask) + : Kernel(iter), input(input), mask(mask) { + add_accessor(&input); + } + + void kernel() { + output() = convolve(mask, Reduce::SUM, [&] () { + return input(mask) * mask(); + }); + } + + void _operatePyramidReduce() {} +}; + +class DifferenceOfGaussian : public Kernel { + private: + Accessor &input1; + Accessor &input2; + + public: + DifferenceOfGaussian(IterationSpace &iter, Accessor &input1, + Accessor &input2) + : Kernel(iter), input1(input1), input2(input2) { + add_accessor(&input1); + add_accessor(&input2); + } + + void kernel() { + output() = input1() - input2(); + } + + void _operatePyramidFilter() {} +}; + +class Blend : public Kernel { + private: + Accessor &input1; + Accessor &input2; + + public: + Blend(IterationSpace &iter, Accessor &input1, + Accessor &input2) + : Kernel(iter), input1(input1), input2(input2) { + add_accessor(&input1); + add_accessor(&input2); + } + + void kernel() { + output() = (short)input1() + (short)input2() / 2; + } + + void _operatePyramidExpand() {} +}; + + +/************************************************************************* + * Main function * + *************************************************************************/ +int main(int argc, const char **argv) { + const int width = WIDTH; + const int height = HEIGHT; + const int size_x = SIZE_X; + const int size_y = SIZE_Y; + float timing = 0; + + // only filter kernel sizes 3x3, 5x5, and 7x7 implemented + if (size_x != size_y || !(size_x == 3 || size_x == 5 || size_x == 7)) { + std::cout << "Wrong filter kernel size. " + << "Currently supported values: 3x3, 5x5, and 7x7!" + << std::endl; + exit(EXIT_FAILURE); + } + + // convolution filter mask + const float coef[SIZE_Y][SIZE_X] = { +#if SIZE_X == 3 + { 0.057118f, 0.124758f, 0.057118f }, + { 0.124758f, 0.272496f, 0.124758f }, + { 0.057118f, 0.124758f, 0.057118f } +#endif +#if SIZE_X == 5 + { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f }, + { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f }, + { 0.026151f, 0.090339f, 0.136565f, 0.090339f, 0.026151f }, + { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f }, + { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f } +#endif +#if SIZE_X == 7 + { 0.000841f, 0.003010f, 0.006471f, 0.008351f, 0.006471f, 0.003010f, 0.000841f }, + { 0.003010f, 0.010778f, 0.023169f, 0.029902f, 0.023169f, 0.010778f, 0.003010f }, + { 0.006471f, 0.023169f, 0.049806f, 0.064280f, 0.049806f, 0.023169f, 0.006471f }, + { 0.008351f, 0.029902f, 0.064280f, 0.082959f, 0.064280f, 0.029902f, 0.008351f }, + { 0.006471f, 0.023169f, 0.049806f, 0.064280f, 0.049806f, 0.023169f, 0.006471f }, + { 0.003010f, 0.010778f, 0.023169f, 0.029902f, 0.023169f, 0.010778f, 0.003010f }, + { 0.000841f, 0.003010f, 0.006471f, 0.008351f, 0.006471f, 0.003010f, 0.000841f } +#endif + }; + + // host memory for random generated image of width x height pixels + char *input = new char[width * height]; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + input[y * width + x] = (char)((y * width + x) % 19); + } + } + + std::cout << "Calculating Hipacc Gaussian Laplacian pyramid ..." 
<< std::endl; + + //************************************************************************// + + // input and output image of width x height pixels + Image gaus(width, height); + Image lap(width, height); + Mask mask(coef); + + const int depth = 10; + Pyramid pgaus(gaus, depth); + Pyramid plap(lap, depth); + + gaus = input; + traverse(pgaus, plap, [&] () { + if (!pgaus.is_top_level()) { + // construct Gaussian pyramid + BoundaryCondition bound(pgaus(-1), mask, Boundary::CLAMP); + Accessor acc1(bound); + IterationSpace iter1(pgaus(0)); + Gaussian blur(iter1, acc1, mask); + blur.execute(); + + // construct Laplacian pyramid + Accessor acc3(pgaus(-1)); + Accessor acc4(pgaus(0), Interpolate::LF); + IterationSpace iter3(plap(-1)); + DifferenceOfGaussian DoG(iter3, acc3, acc4); + DoG.execute(); + } + + traverse(); + + // collapse pyramids + if (!pgaus.is_bottom_level()) { + // blend final output image from Laplacian pyramid + Accessor acc3(plap(1), Interpolate::LF); + Accessor acc4(plap(0)); + IterationSpace iter2(plap(0)); + Blend blend(iter2, acc3, acc4); + blend.execute(); + } + }); + + // get pointer to result data + char *output = lap.data(); + + //************************************************************************// + + // convert to uchar for visualization + for (int p = 0; p < width*height; ++p) { + output[p] = (char)(output[p] + 127); + } + + // free memory + delete[] input; + + return EXIT_SUCCESS; +} diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt new file mode 100644 index 00000000..6b2762a3 --- /dev/null +++ b/apps/CMakeLists.txt @@ -0,0 +1,19 @@ +file(GLOB SAMPLE_DIRS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/[0-9]*) +foreach(SAMPLE_DIR IN LISTS SAMPLE_DIRS) + # install samples + install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR} DESTINATION samples COMPONENT samples) + + # deploy build tool for samples + file(GLOB SAMPLES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR}/*) + foreach(SAMPLE IN LISTS SAMPLES) + if(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR}/${SAMPLE}) + if(UNIX) + install(FILES ${CMAKE_BINARY_DIR}/samples/buildtools/unix/Makefile DESTINATION samples/${SAMPLE_DIR}/${SAMPLE} COMPONENT samples) + elseif(WIN32) + file(GLOB NMAKE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/buildtools/nmake/*) + file(GLOB VS2015_FILES ${CMAKE_CURRENT_SOURCE_DIR}/buildtools/vs2015/*) + install(FILES ${NMAKE_FILES} ${VS2015_FILES} DESTINATION samples/${SAMPLE_DIR}/${SAMPLE} COMPONENT samples) + endif() + endif() + endforeach() +endforeach() diff --git a/apps/readme.txt b/apps/readme.txt new file mode 100644 index 00000000..5cf2dccd --- /dev/null +++ b/apps/readme.txt @@ -0,0 +1,7 @@ +Application folder for new samples + +How to enable stream-based code generation for apps: +1 follow the standard getting started guide at (https://hipacc-lang.org/install.html) +2 the apps should be copied automatically to the same directory as samples +3 enable the Hipacc flag `-use-stream n` in cuda.conf +4 `-use-stream 1` for single-stream and `-use-stream 2` for multi-stream diff --git a/compiler/hipacc.cpp b/compiler/hipacc.cpp index d278f1f4..aea76368 100644 --- a/compiler/hipacc.cpp +++ b/compiler/hipacc.cpp @@ -101,7 +101,8 @@ void printUsage() { << " -rs-package Specify Renderscript package name. 
(default: \"org.hipacc.rs\")\n" << " -o Write output to \n" << " --help Display available options\n" - << " --version Display version information\n"; + << " --version Display version information\n" + << " -use-stream Async kernel launch with streams\n"; } @@ -247,6 +248,20 @@ int main(int argc, char *argv[]) { compilerOptions.setTimeKernels(USER_ON); continue; } + if (StringRef(argv[i]) == "-use-stream") { + assert(i<(argc-1) && "Mandatory integer parameter for -use-stream switch missing."); + std::istringstream buffer(argv[i+1]); + int val; + buffer >> val; + if (buffer.fail()) { + llvm::errs() << "ERROR: Expected integer parameter for -use-stream switch.\n\n"; + printUsage(); + return EXIT_FAILURE; + } + compilerOptions.setStreamAsyncKernelLaunch(val); + ++i; + continue; + } if (StringRef(argv[i]) == "-use-textures") { assert(i<(argc-1) && "Mandatory texture memory specification for -use-textures switch missing."); if (StringRef(argv[i+1]) == "off") { @@ -421,6 +436,13 @@ int main(int argc, char *argv[]) { // kernels are timed internally by the runtime in case of exploration compilerOptions.setTimeKernels(OFF); } + // Enable CUDA streams + if (!compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch()) { + llvm::errs() << "ERROR: async (stream) support is only available for CUDA!\n\n"; + printUsage(); + return EXIT_FAILURE; + } + // print summary of compiler options compilerOptions.printSummary(targetDevice.getTargetDeviceName()); diff --git a/include/hipacc/Config/CompilerOptions.h b/include/hipacc/Config/CompilerOptions.h index aa9d8e70..27bd6c47 100644 --- a/include/hipacc/Config/CompilerOptions.h +++ b/include/hipacc/Config/CompilerOptions.h @@ -84,11 +84,13 @@ class CompilerOptions { CompilerOption local_memory; CompilerOption multiple_pixels; CompilerOption vectorize_kernels; + CompilerOption async_kernels; // user defined values for target code features int kernel_config_x, kernel_config_y; int reduce_config_num_warps, reduce_config_num_hists; int align_bytes; int pixels_per_thread; + int num_streams; Texture texture_type; std::string rs_package_name, rs_directory; @@ -126,12 +128,14 @@ class CompilerOptions { local_memory(AUTO), multiple_pixels(AUTO), vectorize_kernels(OFF), + async_kernels(OFF), kernel_config_x(128), kernel_config_y(1), reduce_config_num_warps(16), reduce_config_num_hists(16), align_bytes(0), pixels_per_thread(1), + num_streams(1), texture_type(Texture::None), rs_package_name("org.hipacc.rs"), rs_directory("/data/local/tmp") @@ -190,6 +194,9 @@ class CompilerOptions { bool vectorizeKernels(CompilerOption option=option_ou) { return vectorize_kernels & option; } + bool asyncKernelLaunch(CompilerOption option=option_ou) { + return async_kernels & option; + } bool multiplePixelsPerThread(CompilerOption option=option_ou) { return multiple_pixels & option; } @@ -207,6 +214,13 @@ class CompilerOptions { void setTimeKernels(CompilerOption o) { time_kernels = o; } void setLocalMemory(CompilerOption o) { local_memory = o; } void setVectorizeKernels(CompilerOption o) { vectorize_kernels = o; } + void setStreamAsyncKernelLaunch(int stream) { + assert((stream > 0) && "Number of streams must be larger than 0"); + num_streams = stream; + async_kernels = USER_ON; + } + + int getNumStreams() { return num_streams; } void setTextureMemory(Texture type) { texture_type = type; diff --git a/include/hipacc/DSL/ClassRepresentation.h b/include/hipacc/DSL/ClassRepresentation.h index c4952147..40321678 100644 --- a/include/hipacc/DSL/ClassRepresentation.h +++ 
b/include/hipacc/DSL/ClassRepresentation.h @@ -44,6 +44,7 @@ #include #include #include +#include namespace clang { namespace hipacc { @@ -85,6 +86,12 @@ enum class Interpolate : uint8_t { L3 }; +// basic pyramid operations +enum class PyramidOperation : uint8_t { + REDUCE = 0, + FILTER, + EXPAND +}; // common base class for images, masks and pyramids class HipaccSize { @@ -143,24 +150,149 @@ class HipaccMemory : public HipaccSize { class HipaccImage : public HipaccMemory { private: ASTContext &Ctx; + std::string streamStr; public: HipaccImage(ASTContext &Ctx, VarDecl *VD, QualType QT) : HipaccMemory(VD, VD->getNameAsString(), QT), - Ctx(Ctx) + Ctx(Ctx), streamStr("stream") {} unsigned getPixelSize() { return Ctx.getTypeSize(type)/8; } std::string getTextureType(); std::string getImageReadFunction(); + void setStream(std::string s) { streamStr = s; } + std::string getStream() { return streamStr; } }; class HipaccPyramid : public HipaccImage { + private: + unsigned depth; public: HipaccPyramid(ASTContext &Ctx, VarDecl *VD, QualType QT) : - HipaccImage(Ctx, VD, QT) + HipaccImage(Ctx, VD, QT), depth(0) {} + void setDepth(unsigned d) { depth = d; } + unsigned getDepth() { return depth; } +}; + + +// TODO, generalize this to an abstract class interface +class HipaccPyramidPipeline : public HipaccDevice { + private: + CompilerOptions &options; + + bool singleStream; + bool multiStream; + bool pipelineKernelLaunch; + unsigned depth; + + SmallVector baseImgs; + std::string global_level_str; + llvm::DenseMap KernelDeclMap; + llvm::DenseMap KernelMap; + + // resource usage in the pipeline + struct PyrResource { + unsigned reg = 0; + unsigned smem = 0; + unsigned thr = 0; + unsigned blk = 0; + }; + PyrResource UsedResourcePyr; + + public: + HipaccPyramidPipeline(CompilerOptions &options) : + HipaccDevice(options), + options(options), + pipelineKernelLaunch(false), + depth(0) + { + if (options.getNumStreams() == 1) { + singleStream = true; + multiStream = false; + } else if (options.getNumStreams() > 1) { + singleStream = false; + multiStream = true; + } + resetResourceUsage(); + } + + void resetResourceUsage() { + UsedResourcePyr.reg = 0; + UsedResourcePyr.smem = 0; + UsedResourcePyr.thr = 0; + UsedResourcePyr.blk = 0; + } + + bool hasResourceAvailable(PyrResource &r) { + bool reg_available = (max_total_registers - UsedResourcePyr.reg) > r.reg; + bool smem_available = (max_total_shared_memory - UsedResourcePyr.smem) > r.smem; + bool thr_available = (max_threads_per_multiprocessor - UsedResourcePyr.thr) > r.thr; + bool blk_available = (max_blocks_per_multiprocessor - UsedResourcePyr.blk) > r.blk; + return (reg_available && smem_available && thr_available && blk_available); + } + + bool useResource(PyrResource &r) { + if (hasResourceAvailable(r)) { + UsedResourcePyr.reg += r.reg; + UsedResourcePyr.smem += r.smem; + UsedResourcePyr.thr += r.thr; + UsedResourcePyr.blk += r.blk; + return true; + } else { + return false; + } + } + + std::string getPyramidOperationStr(PyramidOperation opr) { + switch(opr) + { + case PyramidOperation::REDUCE : return "REDUCE"; + case PyramidOperation::FILTER : return "FILTER"; + case PyramidOperation::EXPAND : return "EXPAND"; + default: return "PyramidOperation{" + std::to_string(int(opr)) + '}'; + } + } + + unsigned getNumWaves(HipaccKernel *K, unsigned w, unsigned h, bool print=0); + + void updateDepth(unsigned d) { + depth = (d > depth) ? 
d : depth; + } + unsigned getDepth() { return depth; } + void recordBaseImg(HipaccImage *img) { + baseImgs.push_back(img); + } + void setGlobalLevelStr(std::string level) { + if (global_level_str.empty()) { + global_level_str = level; + } + } + std::string getGlobalLevelStr() { + return global_level_str; + } + bool isSingleStream() { return singleStream; } + bool isMultiStream() { return multiStream; } + void setSingleStream(bool s) { singleStream = s; } + void setMultiStream(bool s) { multiStream = s; } + void recordKernel(ValueDecl *VD, HipaccKernel *K, PyramidOperation pyrOp) { + KernelDeclMap[VD] = pyrOp; + KernelMap[K] = pyrOp; + } + bool isRecordedKernel(ValueDecl *VD) { return KernelDeclMap.count(VD); } + PyramidOperation getPyramidOperation(ValueDecl *VD) { + assert(isRecordedKernel(VD) && "Kernel is not recorded in the pyramid pipeline."); + return KernelDeclMap[VD]; + } + void enablePipelineKernelLaunch() { + pipelineKernelLaunch = true; + } + bool isPipelineKernelLaunch() { + return pipelineKernelLaunch; + } + void printStreamPipelineInfo(); }; @@ -385,6 +517,7 @@ class HipaccKernelClass { std::string name; CXXMethodDecl *kernelFunction, *reduceFunction, *binningFunction; QualType pixelType, binType; + PyramidOperation pyrOp; KernelStatistics *kernelStatistics; // kernel member information SmallVector members; @@ -426,6 +559,9 @@ class HipaccKernelClass { QualType getPixelType() { return pixelType; } QualType getBinType() { return binType; } + void setPyramidOperation(PyramidOperation op) { pyrOp = op; } + PyramidOperation getPyramidOperation() { return pyrOp; } + KernelStatistics &getKernelStatistics(void) { return *kernelStatistics; } @@ -574,7 +710,9 @@ class HipaccKernel : public HipaccKernelFeatures { std::string name; std::string kernelName, reduceName, binningName; std::string fileName; - std::string reduceStr, binningStr, infoStr, numBinsStr; + std::string reduceStr, binningStr, infoStr, numBinsStr, streamStr; + bool waitEvent, recordEvent; + std::string waitEventStr, recordEventStr; unsigned infoStrCnt, binningStrCnt; HipaccIterationSpace *iterationSpace; std::map imgMap; @@ -614,7 +752,8 @@ class HipaccKernel : public HipaccKernelFeatures { reduceName(options.getTargetPrefix() + KC->getName() + name + "Reduce"), binningName(options.getTargetPrefix() + KC->getName() + name + "Binning"), fileName(options.getTargetPrefix() + KC->getName() + VD->getNameAsString()), - reduceStr(), binningStr(), infoStr(), numBinsStr(), + reduceStr(), binningStr(), infoStr(), numBinsStr(), streamStr("stream"), + waitEvent(false), recordEvent(false), waitEventStr("event"), recordEventStr("event"), infoStrCnt(0), binningStrCnt(0), iterationSpace(nullptr), imgMap(), @@ -751,6 +890,8 @@ class HipaccKernel : public HipaccKernelFeatures { // recreate parameter information createArgInfo(); } + unsigned getResourceUsageReg() { return num_reg; } + unsigned getResourceUsageSmem() { return num_smem; } void setDefaultConfig(); @@ -803,6 +944,16 @@ class HipaccKernel : public HipaccKernelFeatures { unsigned getPixelsPerThreadReduce() { return pixels_per_thread[GlobalOperator]; } + void setStream(std::string s) { streamStr = s; } + std::string getStream() { return streamStr; } + void setWaitEvent(bool e) { waitEvent = e; } + bool hasWaitEvent() { return waitEvent; } + void setRecordEvent(bool e) { recordEvent = e; } + bool hasRecordEvent() { return recordEvent; } + void setWaitEventStr(std::string s) { waitEventStr = s; } + std::string getWaitEventStr() { return waitEventStr; } + void 
setRecordEventStr(std::string s) { recordEventStr = s; } + std::string getRecordEventStr() { return recordEventStr; } }; } // namespace hipacc } // namespace clang diff --git a/include/hipacc/Device/TargetDescription.h b/include/hipacc/Device/TargetDescription.h index abfb701b..560fc3b7 100644 --- a/include/hipacc/Device/TargetDescription.h +++ b/include/hipacc/Device/TargetDescription.h @@ -78,6 +78,7 @@ class HipaccDeviceOptions { case Device::Maxwell_50: case Device::Maxwell_52: case Device::Maxwell_53: + case Device::Turing_75: alignment = 256; if (options.emitCUDA()) local_memory_threshold = 6; else local_memory_threshold = 11; @@ -206,6 +207,7 @@ class HipaccDevice : public HipaccDeviceOptions { // NVIDIA only device properties unsigned num_alus; unsigned num_sfus; + unsigned num_sms; public: explicit HipaccDevice(CompilerOptions &options) : @@ -214,7 +216,8 @@ class HipaccDevice : public HipaccDeviceOptions { max_threads_per_warp(32), max_blocks_per_multiprocessor(8), num_alus(0), - num_sfus(0) + num_sfus(0), + num_sms(1) { switch (target_device) { case Device::CPU: @@ -261,10 +264,16 @@ class HipaccDevice : public HipaccDeviceOptions { if (target_device==Device::Kepler_30) { max_register_per_thread = 63; } + if (target_device==Device::Kepler_30) { + num_sms = 8; + } else if (target_device==Device::Kepler_35) { + num_sms = 13; + } break; case Device::Maxwell_50: case Device::Maxwell_52: case Device::Maxwell_53: + case Device::Turing_75: max_blocks_per_multiprocessor = 32; max_threads_per_block = 1024; max_warps_per_multiprocessor = 64; @@ -274,6 +283,7 @@ class HipaccDevice : public HipaccDeviceOptions { max_register_per_thread = 255; num_alus = 128; num_sfus = 32; + num_sms = 46; if (target_device==Device::Maxwell_52) { max_total_shared_memory = 98304; } @@ -343,7 +353,7 @@ class HipaccDevice : public HipaccDeviceOptions { bool isNVIDIAGPU() { return target_device >= Device::Fermi_20 && - target_device <= Device::Maxwell_53; + target_device <= Device::Turing_75; } std::string getTargetDeviceName() { @@ -358,6 +368,7 @@ class HipaccDevice : public HipaccDeviceOptions { case Device::Maxwell_50: return "NVIDIA Maxwell (50)"; case Device::Maxwell_52: return "NVIDIA Maxwell (52)"; case Device::Maxwell_53: return "NVIDIA Maxwell (53)"; + case Device::Turing_75: return "NVIDIA Turing (75)"; case Device::Evergreen: return "AMD Evergreen"; case Device::NorthernIsland: return "AMD Northern Island"; //case Device::SouthernIsland: return "AMD Southern Island"; diff --git a/include/hipacc/Device/TargetDevices.h b/include/hipacc/Device/TargetDevices.h index 1a990ee6..04034720 100644 --- a/include/hipacc/Device/TargetDevices.h +++ b/include/hipacc/Device/TargetDevices.h @@ -49,6 +49,7 @@ enum class Device { Maxwell_50 = 50, Maxwell_52 = 52, Maxwell_53 = 53, + Turing_75 = 75, Evergreen = 58, NorthernIsland = 69, //SouthernIsland = 79 diff --git a/include/hipacc/Rewrite/CreateHostStrings.h b/include/hipacc/Rewrite/CreateHostStrings.h index 170fd575..2d18c3e0 100644 --- a/include/hipacc/Rewrite/CreateHostStrings.h +++ b/include/hipacc/Rewrite/CreateHostStrings.h @@ -88,7 +88,7 @@ class CreateHostStrings { MemoryTransferDirection direction, std::string &resultStr); void writeMemoryTransferDomainFromMask(HipaccMask *Domain, HipaccMask *Mask, std::string &resultStr); - void writeKernelCall(HipaccKernel *K, std::string &resultStr); + void writeKernelCall(HipaccKernel *K, std::string &resultStr, HipaccPyramidPipeline *pyrPipe=nullptr); void writeReduceCall(HipaccKernel *K, std::string &resultStr); void 
writeBinningCall(HipaccKernel *K, std::string &resultStr); std::string getInterpolationDefinition(HipaccKernel *K, HipaccAccessor *Acc, diff --git a/lib/DSL/ClassRepresentation.cpp b/lib/DSL/ClassRepresentation.cpp index 3442b654..fe16092a 100644 --- a/lib/DSL/ClassRepresentation.cpp +++ b/lib/DSL/ClassRepresentation.cpp @@ -651,5 +651,170 @@ void HipaccKernel::createHostArgInfo(ArrayRef hostArgs, std::string } } +unsigned HipaccPyramidPipeline::getNumWaves(HipaccKernel *K, unsigned w, unsigned h, bool print) { + unsigned sx = K->getNumThreadsX(); + unsigned sy = K->getNumThreadsY(); + unsigned r_reg = std::max(K->getResourceUsageReg(), (unsigned)1); + unsigned r_smem = std::max(K->getResourceUsageSmem(), (unsigned)1); + unsigned k_blk = std::ceil((float)w / sx) * std::ceil((float)h / sy); + + unsigned n_blk_reg = std::floor((float)max_total_registers / (r_reg * sx * sy)); + unsigned n_blk_smem = std::floor((float)max_total_shared_memory / r_smem); + unsigned n_blk_thr = std::floor((float)max_threads_per_multiprocessor / (sx*sy)); + unsigned n_blk = std::min({n_blk_reg, n_blk_smem, n_blk_thr, max_blocks_per_multiprocessor}); + + if (print) { + llvm::errs() << " [wave-info] r_reg:" << r_reg << " r_smem:" << r_smem << " k_blk:" << k_blk << "\n"; + llvm::errs() << " n_blk_reg:" << n_blk_reg << " n_blk_smem:" << n_blk_smem << " n_blk_thr:" + << n_blk_thr << " n_blk:" << n_blk << "\n"; + } + + unsigned n_wave = std::ceil((float)k_blk / (n_blk * num_sms)); + return n_wave; +} + +void HipaccPyramidPipeline::printStreamPipelineInfo() { + llvm::errs() << "\nInformation for Pyramid cuda stream scheduling\n"; + std::string deviceInfo_str, singleStream_str, multiStream_str; + deviceInfo_str += " GPU arch: " + getTargetDeviceName() + "\n"; + deviceInfo_str += " max_register_per_sm: " + std::to_string(max_total_registers) + "\n"; + deviceInfo_str += " max_shared_memory_per_sm: " + std::to_string(max_total_shared_memory) + "\n"; + deviceInfo_str += " max_threads_per_sm: " + std::to_string(max_threads_per_multiprocessor) + "\n"; + deviceInfo_str += " max_blocks_per_sm: " + std::to_string(max_blocks_per_multiprocessor) + "\n"; + deviceInfo_str += " num_sm: " + std::to_string(num_sms) + "\n"; + llvm::errs() << deviceInfo_str; + + HipaccKernel *K_r, *K_f, *K_e; // Unpack kernels + for (auto kp : KernelMap) { + if (kp.second == PyramidOperation::REDUCE) { K_r = kp.first; } + else if (kp.second == PyramidOperation::FILTER) { K_f = kp.first; } + else if (kp.second == PyramidOperation::EXPAND) { K_e = kp.first; } + } + assert(K_r && "no reduce kernel for multi stream pyramid execution"); + assert(K_f && "no filter kernel for multi stream pyramid execution"); + assert(K_e && "no expand kernel for multi stream pyramid execution"); + + unsigned imgSzX = baseImgs.front()->getSizeX(); // assume same-sized base images + unsigned imgSzY = baseImgs.front()->getSizeY(); + + // single stream analysis + singleStream_str += " -single-stream wave estimation\n"; + std::vector numWavesDS; + std::vector numWavesLP; + std::vector numWavesUS; + + for (size_t d=0; d 0) { + // Downsampling + numWavesDS.push_back(getNumWaves(K_r, imgSzX, imgSzY)); + } + if (d < depth - 1) { + // level processing + numWavesLP.push_back(getNumWaves(K_f, imgSzX, imgSzY)); + // Upsampling + numWavesUS.push_back(getNumWaves(K_e, imgSzX, imgSzY)); + } + imgSzX /= 2; imgSzY /= 2; + } + + assert((numWavesDS.size() == depth-1) && "number of waves in reduce does not match depth\n"); + assert((numWavesLP.size() == depth-1) && "number of waves in filter does 
not match depth\n"); + assert((numWavesUS.size() == depth-1) && "number of waves in expand does not match depth\n"); + + unsigned numWavesDSSum = std::accumulate(numWavesDS.begin(), numWavesDS.end(), 0); + unsigned numWavesLPSum = std::accumulate(numWavesLP.begin(), numWavesLP.end(), 0); + unsigned numWavesUSSum = std::accumulate(numWavesUS.begin(), numWavesUS.end(), 0); + + singleStream_str += " Reduce kernel " + K_r->getName() + " takes " + std::to_string(numWavesDSSum) + " waves\n"; + singleStream_str += " Filter kernel " + K_f->getName() + " takes " + std::to_string(numWavesLPSum) + " waves\n"; + singleStream_str += " Expand kernel " + K_e->getName() + " takes " + std::to_string(numWavesUSSum) + " waves\n"; + llvm::errs() << singleStream_str; + + // multi stream info + imgSzX = baseImgs.front()->getSizeX(); // assume same-sized base images + imgSzY = baseImgs.front()->getSizeY(); + multiStream_str += " -multi-stream wave estimation\n"; + + unsigned n_wave_seq_r = 0; + unsigned n_wave_seq_f = 0; + unsigned n_wave_expand = 0; + unsigned n_p = 0; + std::vector S_p; + + // downsampling + for (size_t d=0; dgetNumThreadsX(); + unsigned sy = K_f->getNumThreadsY(); + unsigned r_reg = std::max(K_f->getResourceUsageReg(), (unsigned)1); + unsigned r_smem = std::max(K_f->getResourceUsageSmem(), (unsigned)1); + unsigned k_blk = std::ceil((float)imgSzX / sx) * std::ceil((float)imgSzY / sy); + unsigned n_blk_reg = std::floor((float)max_total_registers / (r_reg * sx * sy)); + unsigned n_blk_smem = std::floor((float)max_total_shared_memory / r_smem); + unsigned n_blk_thr = std::floor((float)max_threads_per_multiprocessor / (sx*sy)); + unsigned n_blk = std::min({n_blk_reg, n_blk_smem, n_blk_thr, max_blocks_per_multiprocessor}); + + unsigned n_wave = std::ceil((float)k_blk / (n_blk * num_sms)); + unsigned n_blk_tail = k_blk % (n_blk * num_sms); + unsigned n_blk_tail_per_sm = std::ceil((float)n_blk_tail / num_sms); + + if (n_wave > 1) { // more than one wave, fine-grained level execution + resetResourceUsage(); + n_wave_seq_f += n_wave; + n_wave_seq_r += getNumWaves(K_r, imgSzX / 2, imgSzY / 2); + } else if (n_blk_tail == 0) { // exactly one wave, fine-grained level execution + resetResourceUsage(); + n_wave_seq_f += n_wave; + n_wave_seq_r += getNumWaves(K_r, imgSzX / 2, imgSzY / 2); + } else if (n_blk_tail > 0) { // less than one wave, coarse-grained level execution + PyrResource res; + res.reg = n_blk_tail_per_sm * (r_reg * sx * sy); + res.smem = n_blk_tail_per_sm * r_smem; + res.thr = n_blk_tail_per_sm * (sx * sy); + res.blk = n_blk_tail_per_sm; + + if (hasResourceAvailable(res)) { // resource is sufficient, same wave + n_p++; + } else { // resource not sufficient, new wave + S_p.push_back(n_p); + n_p = 0; + resetResourceUsage(); + } + useResource(res); + } + imgSzX /= 2; + imgSzY /= 2; + } + + // upsampling + imgSzX = baseImgs.front()->getSizeX(); // assume same-sized base images + imgSzY = baseImgs.front()->getSizeY(); + for (size_t d=0; dgetName() + " takes " + std::to_string(n_wave_seq_r + 1) + " waves\n"; + multiStream_str += " Filter kernel " + K_f->getName() + " takes " + std::to_string(n_wave_seq_f) + " waves\n"; + multiStream_str += " Coarse-grained level execution:\n"; + if (S_p.size()) { + multiStream_str += + " takes " + std::to_string(S_p.size() + 1) + " waves and "; + for (auto np : S_p) { + multiStream_str += std::to_string(np) + " "; + } + multiStream_str += std::to_string(n_p) + " parallel streams\n"; + } else { + multiStream_str += + " takes 1 wave and " + std::to_string(n_p) + " 
parallel streams\n"; + } + multiStream_str += " Expand execution: takes " + std::to_string(n_wave_expand) + " waves\n"; + llvm::errs() << multiStream_str; +} + + + // vim: set ts=2 sw=2 sts=2 et ai: diff --git a/lib/Rewrite/CreateHostStrings.cpp b/lib/Rewrite/CreateHostStrings.cpp index 9a618d06..d7f90cbd 100644 --- a/lib/Rewrite/CreateHostStrings.cpp +++ b/lib/Rewrite/CreateHostStrings.cpp @@ -260,11 +260,19 @@ void CreateHostStrings::writeMemoryTransfer(HipaccImage *Img, std::string mem, case HOST_TO_DEVICE: resultStr += "hipaccWriteMemory("; resultStr += Img->getName(); - resultStr += ", " + mem + ");"; + resultStr += ", " + mem; + if (options.asyncKernelLaunch()) { + resultStr += ", " + Img->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_HOST: resultStr += "hipaccReadMemory<" + Img->getTypeStr() + ">("; - resultStr += Img->getName() + ");"; + resultStr += Img->getName(); + if (options.asyncKernelLaunch()) { + resultStr += ", " + Img->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_DEVICE: resultStr += "hipaccCopyMemory("; @@ -284,11 +292,19 @@ void CreateHostStrings::writeMemoryTransfer(HipaccPyramid *Pyr, std::string idx, case HOST_TO_DEVICE: resultStr += "hipaccWriteMemory("; resultStr += Pyr->getName() + "(" + idx + ")"; - resultStr += ", " + mem + ");"; + resultStr += ", " + mem; + if (options.asyncKernelLaunch()) { + resultStr += ", " + Pyr->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_HOST: resultStr += "hipaccReadMemory<" + Pyr->getTypeStr() + ">("; - resultStr += Pyr->getName() + "(" + idx + "));"; + resultStr += Pyr->getName() + "(" + idx + ")"; + if (options.asyncKernelLaunch()) { + resultStr += ", " + Pyr->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_DEVICE: resultStr += "hipaccCopyMemory("; @@ -387,7 +403,7 @@ void CreateHostStrings::writeMemoryTransferDomainFromMask( } -void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) { +void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr, HipaccPyramidPipeline *pyrPipe) { auto argTypeNames = K->getArgTypeNames(); auto deviceArgNames = K->getDeviceArgNames(); auto hostArgNames = K->getHostArgNames(); @@ -728,6 +744,13 @@ void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) } resultStr += "\n" + indent; + // insert cuda stream and wait event + if (options.emitCUDA() && options.asyncKernelLaunch() && K->hasWaitEvent()) { + resultStr += "cudaStreamWaitEvent(" + K->getStream() + ", "; + resultStr += K->getWaitEventStr() + ", 0);"; + resultStr += "\n" + indent; + } + // launch kernel if (options.exploreConfig() || options.timeKernels()) { switch (options.getTargetLang()) { @@ -805,6 +828,34 @@ void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) resultStr += ", " + gridStr; resultStr += ", " + blockStr; resultStr += ", _args" + kernel_name + ".data()"; + + if (options.asyncKernelLaunch()) { + if (pyrPipe) { // pyramid app + if (pyrPipe->isSingleStream()) { + resultStr += ", " + K->getStream(); + } else if (pyrPipe->isMultiStream()) { + switch (pyrPipe->getPyramidOperation(K->getDecl())) { + case PyramidOperation::REDUCE: + resultStr += ", stream[" + pyrPipe->getGlobalLevelStr() + " - 1]"; + resultStr += ", events[" + pyrPipe->getGlobalLevelStr() + " - 1]"; // wait event + resultStr += ", events[" + pyrPipe->getGlobalLevelStr() + "]"; // record event + break; + case PyramidOperation::FILTER: + resultStr += ", stream[" + pyrPipe->getGlobalLevelStr() + " - 1]"; + break; + case 
PyramidOperation::EXPAND: + resultStr += ", stream[" + pyrPipe->getGlobalLevelStr() + "]"; + resultStr += ", events[" + std::to_string(pyrPipe->getDepth()) + " * 2";// wait event + resultStr += " - " + pyrPipe->getGlobalLevelStr() + " - 2]"; + resultStr += ", events[" + std::to_string(pyrPipe->getDepth()) + " * 2";// record event + resultStr += " - " + pyrPipe->getGlobalLevelStr() + " - 1]"; + break; + } + } + } else { // non-pyramid app + resultStr += ", " + K->getStream(); + } + } resultStr += ");"; break; case Language::Renderscript: @@ -825,6 +876,13 @@ void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) break; } } + + // insert cuda stream and record event + if (options.emitCUDA() && options.asyncKernelLaunch() && K->hasRecordEvent()) { + resultStr += "cudaEventRecord(" + K->getRecordEventStr() + ", "; + resultStr += K->getStream() + ");"; + resultStr += "\n" + indent; + } } diff --git a/lib/Rewrite/Rewrite.cpp b/lib/Rewrite/Rewrite.cpp index 0f295033..dd219600 100644 --- a/lib/Rewrite/Rewrite.cpp +++ b/lib/Rewrite/Rewrite.cpp @@ -100,6 +100,9 @@ class Rewrite : public ASTConsumer, public RecursiveASTVisitor { // store interpolation methods required for CUDA SmallVector InterpolationDefinitionsGlobal; + // pipeline global config for pyramid + HipaccPyramidPipeline *pyramidPipe; + // pointer to main function FunctionDecl *mainFD; FileID mainFileID; @@ -120,6 +123,7 @@ class Rewrite : public ASTConsumer, public RecursiveASTVisitor { builtins(CI.getASTContext()), stringCreator(CreateHostStrings(options, targetDevice)), compilerClasses(CompilerKnownClasses()), + pyramidPipe(nullptr), mainFD(nullptr), literalCount(0), skipTransfer(false) @@ -376,11 +380,52 @@ void Rewrite::HandleTranslationUnit(ASTContext &) { CompoundStmt *CS = dyn_cast(mainFD->getBody()); assert(CS->size() && "CompoundStmt has no statements."); - std::string initStr; + std::string initStr, cleanupStr; // get initialization string for run-time stringCreator.writeInitialization(initStr); + // print pyramid pipeline info for cuda + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch() && + pyramidPipe->isPipelineKernelLaunch()) { + pyramidPipe->printStreamPipelineInfo(); + } + + // write CUDA streams + if (compilerOptions.asyncKernelLaunch()) { + if (pyramidPipe->isSingleStream()) { + initStr += "cudaStream_t stream;"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "cudaStreamCreate(&stream);"; + initStr += "\n" + stringCreator.getIndent(); + } else if (pyramidPipe->isMultiStream()) { + initStr += "cudaStream_t stream["; + initStr += std::to_string(pyramidPipe->getDepth()) + " - 1];"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "for (int i = 0; i < "; + initStr += std::to_string(pyramidPipe->getDepth()) + " - 1; i++) {"; + initStr += "\n" + stringCreator.getIndent(); + initStr += " cudaStreamCreate(&stream[i]);"; + initStr += "\n" + stringCreator.getIndent() + "}"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "cudaEvent_t events["; + initStr += std::to_string(pyramidPipe->getDepth()) + " * 2];"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "for (int i = 0; i < "; + initStr += std::to_string(pyramidPipe->getDepth()) + " * 2; i++) {"; + initStr += "\n" + stringCreator.getIndent(); + initStr += " cudaEventCreate(&events[i]);"; + initStr += "\n" + stringCreator.getIndent() + "}"; + initStr += "\n" + stringCreator.getIndent(); + } else { + // + } + } + // load OpenCL kernel files and compile the OpenCL kernels if 
(!compilerOptions.exploreConfig()) { for (auto map : KernelDeclMap) @@ -411,6 +456,31 @@ void Rewrite::HandleTranslationUnit(ASTContext &) { // insert initialization before first statement TextRewriter.InsertTextBefore(CS->body_front()->getBeginLoc(), initStr); + // insert cleanup before last statement + if (compilerOptions.asyncKernelLaunch()) { + if (pyramidPipe->isSingleStream()) { + cleanupStr = "cudaStreamDestroy(stream);"; + cleanupStr += "\n" + stringCreator.getIndent(); + } else if (pyramidPipe->isMultiStream()) { + cleanupStr += "for (int i = 0; i < "; + cleanupStr += std::to_string(pyramidPipe->getDepth()) + " - 1; i++) {"; + cleanupStr += "\n" + stringCreator.getIndent(); + cleanupStr += " cudaStreamDestroy(stream[i]);"; + cleanupStr += "\n" + stringCreator.getIndent() + "}"; + cleanupStr += "\n" + stringCreator.getIndent(); + + cleanupStr += "for (int i = 0; i < "; + cleanupStr += std::to_string(pyramidPipe->getDepth()) + " * 2; i++) {"; + cleanupStr += "\n" + stringCreator.getIndent(); + cleanupStr += " cudaEventDestroy(events[i]);"; + cleanupStr += "\n" + stringCreator.getIndent() + "}"; + cleanupStr += "\n" + stringCreator.getIndent(); + } else { + // + } + TextRewriter.InsertTextBefore(CS->body_back()->getBeginLoc(), cleanupStr); + } + // get buffer of main file id. If we haven't changed it, then we are done. if (auto RewriteBuf = TextRewriter.getRewriteBufferFor(mainFileID)) { *Out << std::string(RewriteBuf->begin(), RewriteBuf->end()); @@ -627,6 +697,20 @@ bool Rewrite::VisitCXXRecordDecl(CXXRecordDecl *D) { KC->setBinningFunction(method); continue; } + + // pyramid functions + if (method->getNameAsString() == "_operatePyramidReduce") { + KC->setPyramidOperation(PyramidOperation::REDUCE); + continue; + } + if (method->getNameAsString() == "_operatePyramidFilter") { + KC->setPyramidOperation(PyramidOperation::FILTER); + continue; + } + if (method->getNameAsString() == "_operatePyramidExpand") { + KC->setPyramidOperation(PyramidOperation::EXPAND); + continue; + } } } @@ -683,7 +767,25 @@ bool Rewrite::VisitDeclStmt(DeclStmt *D) { std::string width_str = convertToString(CCE->getArg(0)); std::string height_str = convertToString(CCE->getArg(1)); - if (compilerOptions.emitC99()) { + if (compilerOptions.asyncKernelLaunch() && compilerOptions.emitCUDA()) { + // extract depth level from pyramid + unsigned IDConstant = Diags.getCustomDiagID(DiagnosticsEngine::Error, + "Constant expression for %0 argument of Image %1 required (Pyramid stream CUDA only)."); + if (!CCE->getArg(0)->isEvaluatable(Context)) { + Diags.Report(CCE->getArg(0)->getExprLoc(), IDConstant) << "width" + << Img->getName(); + } + if (!CCE->getArg(1)->isEvaluatable(Context)) { + Diags.Report(CCE->getArg(1)->getExprLoc(), IDConstant) << "height" + << Img->getName(); + } + + int64_t img_stride = CCE->getArg(0)->EvaluateKnownConstInt(Context).getSExtValue(); + int64_t img_height = CCE->getArg(1)->EvaluateKnownConstInt(Context).getSExtValue(); + + Img->setSizeX(img_stride); + Img->setSizeY(img_height); + } else if (compilerOptions.emitC99()) { // check if the parameter can be resolved to a constant unsigned IDConstant = Diags.getCustomDiagID(DiagnosticsEngine::Error, "Constant expression for %0 argument of Image %1 required (C/C++ only)."); @@ -746,6 +848,36 @@ bool Rewrite::VisitDeclStmt(DeclStmt *D) { std::string image_str = convertToString(CCE->getArg(0)); std::string depth_str = convertToString(CCE->getArg(1)); + if (compilerOptions.asyncKernelLaunch() && compilerOptions.emitCUDA()) { + // extract depth level from 
pyramid + unsigned IDConstant = Diags.getCustomDiagID(DiagnosticsEngine::Error, + "Constant expression for %0 argument of Pyramid %1 required (CUDA only)."); + if (!CCE->getArg(1)->isEvaluatable(Context)) { + Diags.Report(CCE->getArg(1)->getExprLoc(), IDConstant) << "depth" + << Pyr->getName(); + } + int64_t pyr_depth = CCE->getArg(1)->EvaluateKnownConstInt(Context).getSExtValue(); + Pyr->setDepth(pyr_depth); + + pyramidPipe->updateDepth(pyr_depth); + std::string pyr_name = VD->getName(); + pyramidPipe->setGlobalLevelStr(pyr_name + ".level()"); + + // TODO, generalize this to async h2d + auto arg = CCE->getArg(0)->IgnoreParenCasts(); + HipaccImage *Img = nullptr; + if (auto DRE = dyn_cast(arg)) { + // check if the argument specifies the image + if (ImgDeclMap.count(DRE->getDecl())) { + Img = ImgDeclMap[DRE->getDecl()]; + if (pyramidPipe->isMultiStream()) { + Img->setStream("stream[0]"); + } + pyramidPipe->recordBaseImg(Img); + } + } + } + // create memory allocation string std::string newStr; stringCreator.writePyramidAllocation(VD->getName(), @@ -1244,6 +1376,20 @@ bool Rewrite::VisitDeclStmt(DeclStmt *D) { HipaccKernel *K = new HipaccKernel(Context, VD, KC, compilerOptions); KernelDeclMap[VD] = K; + if (compilerOptions.asyncKernelLaunch() && compilerOptions.emitCUDA()) { + switch (KC->getPyramidOperation()) { + case PyramidOperation::REDUCE: + pyramidPipe->recordKernel(VD, K, PyramidOperation::REDUCE); + break; + case PyramidOperation::FILTER: + pyramidPipe->recordKernel(VD, K, PyramidOperation::FILTER); + break; + case PyramidOperation::EXPAND: + pyramidPipe->recordKernel(VD, K, PyramidOperation::EXPAND); + break; + } + } + // remove kernel declaration TextRewriter.RemoveText(D->getSourceRange()); @@ -1342,6 +1488,10 @@ bool Rewrite::VisitFunctionDecl(FunctionDecl *D) { assert(D->getBody() && "main function has no body."); assert(isa(D->getBody()) && "CompoundStmt for main body expected."); mainFD = D; + // TODO, triggle pyramid config + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch()) { + pyramidPipe = new HipaccPyramidPipeline(compilerOptions); + } } return true; @@ -1624,7 +1774,12 @@ bool Rewrite::VisitCXXMemberCallExpr(CXXMemberCallExpr *E) { // TODO: handle the case when only reduce function is specified // // create kernel call string - stringCreator.writeKernelCall(K, newStr); + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch() && + pyramidPipe->isPipelineKernelLaunch()) { + stringCreator.writeKernelCall(K, newStr, pyramidPipe); + } else { + stringCreator.writeKernelCall(K, newStr); + } // rewrite kernel invocation replaceText(E->getBeginLoc(), E->getBeginLoc(), ';', newStr); @@ -1741,6 +1896,10 @@ bool Rewrite::VisitCallExpr (CallExpr *E) { SourceRange range(E->getBeginLoc(), E->getBeginLoc().getLocWithOffset(std::string("traverse").length()-1)); TextRewriter.ReplaceText(range, "hipaccTraverse"); + + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch()) { + pyramidPipe->enablePipelineKernelLaunch(); + } } } } diff --git a/runtime/hipacc_cu.hpp.cmake b/runtime/hipacc_cu.hpp.cmake index 4d0f579b..f083ca1a 100644 --- a/runtime/hipacc_cu.hpp.cmake +++ b/runtime/hipacc_cu.hpp.cmake @@ -181,6 +181,7 @@ void hipaccInitCUDA(); void hipaccCopyMemory(const HipaccImage &src, HipaccImage &dst); void hipaccCopyMemoryRegion(const HipaccAccessor &src, const HipaccAccessor &dst); void hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, void **args, bool print_timing=true); +void 
hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, void **args, cudaStream_t stream, cudaEvent_t waitEvent=nullptr, cudaEvent_t recordEvent=nullptr); void hipaccLaunchKernelBenchmark(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, std::vector args, bool print_timing=true); void hipaccLaunchKernelExploration(std::string filename, std::string kernel, std::vector args, std::vector smems, std::vector consts, std::vector texs, @@ -207,8 +208,12 @@ HipaccImage hipaccCreatePyramidImage(const HipaccImage &base, size_t width, size template void hipaccWriteMemory(HipaccImage &img, T *host_mem); template +void hipaccWriteMemory(HipaccImage &img, T *host_mem, cudaStream_t stream); +template T *hipaccReadMemory(const HipaccImage &img); template +T *hipaccReadMemory(const HipaccImage &img, cudaStream_t stream); +template void hipaccBindTexture(hipaccMemoryType mem_type, const textureReference *tex, const HipaccImage &img); template void hipaccBindSurface(hipaccMemoryType mem_type, const surfaceReference *surf, const HipaccImage &img); diff --git a/runtime/hipacc_cu.tpp b/runtime/hipacc_cu.tpp index 36ce05e9..88d43fbc 100644 --- a/runtime/hipacc_cu.tpp +++ b/runtime/hipacc_cu.tpp @@ -122,6 +122,33 @@ void hipaccWriteMemory(HipaccImage &img, T *host_mem) { } +// Write to memory async +template +void hipaccWriteMemory(HipaccImage &img, T *host_mem, cudaStream_t stream) { + if (host_mem == NULL) return; + + size_t width = img->width; + size_t height = img->height; + size_t stride = img->stride; + + if ((char *)host_mem != img->host) + std::copy(host_mem, host_mem + width*height, (T*)img->host); + + if (img->mem_type >= Array2D) { + cudaError_t err = cudaMemcpy2DToArrayAsync((cudaArray *)img->mem, 0, 0, host_mem, stride*sizeof(T), width*sizeof(T), height, cudaMemcpyHostToDevice, stream); + checkErr(err, "cudaMemcpy2DToArray()"); + } else { + if (stride > width) { + cudaError_t err = cudaMemcpy2DAsync(img->mem, stride*sizeof(T), host_mem, width*sizeof(T), width*sizeof(T), height, cudaMemcpyHostToDevice, stream); + checkErr(err, "cudaMemcpy2D()"); + } else { + cudaError_t err = cudaMemcpyAsync(img->mem, host_mem, sizeof(T)*width*height, cudaMemcpyHostToDevice, stream); + checkErr(err, "cudaMemcpy()"); + } + } +} + + // Read from memory template T *hipaccReadMemory(const HipaccImage &img) { @@ -146,6 +173,30 @@ T *hipaccReadMemory(const HipaccImage &img) { } +// Read from memory async +template +T *hipaccReadMemory(const HipaccImage &img, cudaStream_t stream) { + size_t width = img->width; + size_t height = img->height; + size_t stride = img->stride; + + if (img->mem_type >= Array2D) { + cudaError_t err = cudaMemcpy2DFromArrayAsync((T*)img->host, stride*sizeof(T), (cudaArray *)img->mem, 0, 0, width*sizeof(T), height, cudaMemcpyDeviceToHost, stream); + checkErr(err, "cudaMemcpy2DFromArray()"); + } else { + if (stride > width) { + cudaError_t err = cudaMemcpy2DAsync((T*)img->host, width*sizeof(T), img->mem, stride*sizeof(T), width*sizeof(T), height, cudaMemcpyDeviceToHost, stream); + checkErr(err, "cudaMemcpy2D()"); + } else { + cudaError_t err = cudaMemcpyAsync((T*)img->host, img->mem, sizeof(T)*width*height, cudaMemcpyDeviceToHost, stream); + checkErr(err, "cudaMemcpy()"); + } + } + + return (T*)img->host; +} + + // Bind memory to texture template void hipaccBindTexture(hipaccMemoryType mem_type, const textureReference *tex, const HipaccImage &img) { diff --git a/runtime/hipacc_cu_standalone.hpp.cmake b/runtime/hipacc_cu_standalone.hpp.cmake index 
d2c54166..68ef83b7 100644 --- a/runtime/hipacc_cu_standalone.hpp.cmake +++ b/runtime/hipacc_cu_standalone.hpp.cmake @@ -234,6 +234,19 @@ void hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid, } +// Launch kernel async +void hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, void **args, cudaStream_t stream, cudaEvent_t waitEvent, cudaEvent_t recordEvent) { + if (waitEvent) { + cudaStreamWaitEvent(stream, waitEvent, 0); + } + cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, stream); + checkErr(err, "cudaLaunchKernel(" + kernel_name + ")"); + if (recordEvent) { + cudaEventRecord(recordEvent, stream); + } +} + + // Benchmark timing for a kernel call void hipaccLaunchKernelBenchmark(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, std::vector args, bool print_timing) { std::vector times;
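
How the pieces above fit together: the new `-use-stream <n>` option (compiler/hipacc.cpp) turns on asynchronous kernel launches, with `-use-stream 1` selecting a single CUDA stream and `-use-stream 2` selecting the multi-stream pyramid pipeline. For the multi-stream case, Rewrite.cpp and CreateHostStrings.cpp emit stream/event setup and per-kernel launch arguments along the lines sketched below. This is an illustrative outline only, not verbatim compiler output; DEPTH stands for the pyramid depth constant and the kernel/argument names are placeholders.

#include <cuda_runtime.h>

// Illustrative shape of the host code generated for a multi-stream pyramid.
// Kernel launches are shown as comments because grid/block/argument setup is
// generated per kernel and omitted here.
void multi_stream_setup_sketch() {
  const int DEPTH = 10;                       // pyramid depth from the DSL program

  cudaStream_t stream[DEPTH - 1];             // one stream per pyramid level
  for (int i = 0; i < DEPTH - 1; i++) cudaStreamCreate(&stream[i]);

  cudaEvent_t events[DEPTH * 2];              // events ordering REDUCE/EXPAND work
  for (int i = 0; i < DEPTH * 2; i++) cudaEventCreate(&events[i]);

  // Per kernel launch, with lvl = <pyramid>.level():
  //   REDUCE : hipaccLaunchKernel(..., stream[lvl - 1], events[lvl - 1], events[lvl]);
  //   FILTER : hipaccLaunchKernel(..., stream[lvl - 1]);
  //   EXPAND : hipaccLaunchKernel(..., stream[lvl],
  //                               events[DEPTH * 2 - lvl - 2],
  //                               events[DEPTH * 2 - lvl - 1]);

  for (int i = 0; i < DEPTH - 1; i++) cudaStreamDestroy(stream[i]);
  for (int i = 0; i < DEPTH * 2; i++) cudaEventDestroy(events[i]);
}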
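
The runtime additions in hipacc_cu.hpp.cmake, hipacc_cu.tpp, and hipacc_cu_standalone.hpp.cmake can also be exercised directly. A minimal sketch, assuming an already-created HipaccImage `img`, a compiled kernel function pointer `kernel`, a prepared argument array `args`, and the generated hipacc_cu.hpp on the include path (none of these are shown in this diff; "my_kernel" is only a label used for error reporting):

#include "hipacc_cu.hpp"   // declares the async overloads added above

// Sketch of the asynchronous transfer/launch path on one user-managed stream.
void async_roundtrip_sketch(HipaccImage &img, unsigned char *host_in,
                            const void *kernel, void **args,
                            dim3 grid, dim3 block) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  hipaccWriteMemory(img, host_in, stream);                          // async host-to-device copy
  hipaccLaunchKernel(kernel, "my_kernel", grid, block, args,        // async launch, no events
                     stream);
  unsigned char *host_out = hipaccReadMemory<unsigned char>(img, stream);  // async device-to-host copy

  cudaStreamSynchronize(stream);                                    // results valid after this
  cudaStreamDestroy(stream);
  (void)host_out;
}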
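
The wave counts printed by HipaccPyramidPipeline::printStreamPipelineInfo follow the occupancy-style calculation in getNumWaves: resident blocks per SM are limited by registers, shared memory, threads, and the per-SM block cap, and the wave count is the total block count divided by what fits on the whole device at once. A small worked example with hypothetical kernel and device numbers (they do not correspond to any specific GPU in TargetDescription.h):

#include <algorithm>
#include <cmath>
#include <cstdio>

// Hypothetical numbers, purely to illustrate the formula in getNumWaves().
int main() {
  unsigned sx = 32, sy = 4;               // block size (threads)
  unsigned r_reg = 32, r_smem = 4096;     // registers per thread, shared mem per block
  unsigned w = 1024, h = 1024;            // image size at the current pyramid level

  unsigned max_regs = 65536, max_smem = 65536;   // per-SM limits (hypothetical)
  unsigned max_thr = 2048, max_blk = 32, num_sms = 46;

  unsigned k_blk  = (unsigned)(std::ceil(w / (float)sx) * std::ceil(h / (float)sy));
  unsigned n_blk  = std::min({max_regs / (r_reg * sx * sy), max_smem / r_smem,
                              max_thr / (sx * sy), max_blk});
  unsigned n_wave = (unsigned)std::ceil(k_blk / (float)(n_blk * num_sms));

  std::printf("blocks=%u, resident blocks per SM=%u, waves=%u\n", k_blk, n_blk, n_wave);
  // blocks=8192, resident blocks per SM=min(16,16,16,32)=16, waves=ceil(8192/(16*46))=12
  return 0;
}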