diff --git a/CMakeLists.txt b/CMakeLists.txt
index 84a7191a..fe124651 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,7 @@ find_program(clang NAMES clang PATHS ${LLVM_TOOLS_BINARY_DIR})
 find_package(CUDA)
 find_package(NVML)
 find_package(OpenCL)
+find_package(FFTW)
 
 if(CUDA_FOUND AND CUDA_VERSION VERSION_LESS "7.0")
   message(WARNING "At least CUDA version 7.0 required, but found CUDA version ${CUDA_VERSION}.")
@@ -86,6 +87,7 @@ message(STATUS "CUDA support: ${CUDA_FOUND}")
 message(STATUS "OpenCL support: ${OpenCL_FOUND}")
 message(STATUS "Polly support: ${USE_POLLY}")
 message(STATUS "JIT estimates: ${USE_JIT_ESTIMATE}")
+message(STATUS "FFTW support: ${FFTW_FOUND}")
 message(STATUS "===")
@@ -184,3 +186,8 @@ endif()
 if(EXISTS ${CMAKE_SOURCE_DIR}/samples/CMakeLists.txt)
   add_subdirectory(samples)
 endif()
+
+# add apps if available
+if(EXISTS ${CMAKE_SOURCE_DIR}/apps/CMakeLists.txt)
+  add_subdirectory(apps)
+endif()
diff --git a/LICENSE b/LICENSE
index 3f7f789a..3cc17a66 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,3 +1,4 @@
+Copyright (c) 2020, University of Erlangen-Nuremberg
 Copyright (c) 2014, Saarland University
 Copyright (c) 2012, University of Erlangen-Nuremberg
 Copyright (c) 2012, Siemens AG
diff --git a/README.md b/README.md
index 9dc7d367..d4e4db88 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,8 @@
+This is a fork of the [Hipacc](http://hipacc-lang.org) project, with primarily experimental features:
+* Asynchronous runtime APIs
+* Multi-stream implementation for multiresolution filters in CUDA
+
+
 # Hipacc
 A domain-specific language and compiler for image processing
diff --git a/_clang-format b/_clang-format
new file mode 100644
index 00000000..a81aaae7
--- /dev/null
+++ b/_clang-format
@@ -0,0 +1,121 @@
+---
+Language: Cpp
+# BasedOnStyle: LLVM
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Right
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: MultiLine
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterClass: false
+  AfterControlStatement: false
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: false
+  BeforeElse: false
+  IndentBraces: false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit: 82
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks: Preserve
+IncludeCategories:
+  - Regex: '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority: 2
+  - Regex: '^(<|"(gtest|gmock|isl|json)/)'
+    Priority: 3
+  - Regex: '.*'
+    Priority: 1
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+IndentPPDirectives: None
+IndentWidth: 2
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Right
+ReflowComments: true
+SortIncludes: true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth: 8
+UseTab: Never
+...
diff --git a/apps/6_Multiresolution_Filters/Laplacian_Pyramid_Encoding/src/main.cpp b/apps/6_Multiresolution_Filters/Laplacian_Pyramid_Encoding/src/main.cpp
new file mode 100644
index 00000000..b69dcaad
--- /dev/null
+++ b/apps/6_Multiresolution_Filters/Laplacian_Pyramid_Encoding/src/main.cpp
@@ -0,0 +1,212 @@
+//
+// Copyright (c) 2020, University of Erlangen-Nuremberg
+// Copyright (c) 2012, University of Erlangen-Nuremberg
+// Copyright (c) 2012, Siemens AG
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#include "hipacc.hpp"
+#include <iostream>
+#include <cstdlib>
+
+#define SIZE_X 3
+#define SIZE_Y 3
+#define WIDTH 1024
+#define HEIGHT 1024
+
+using namespace hipacc;
+using namespace hipacc::math;
+
+
+class Gaussian : public Kernel<char> {
+  private:
+    Accessor<char> &input;
+    Mask<float> &mask;
+
+  public:
+    Gaussian(IterationSpace<char> &iter, Accessor<char> &input,
+             Mask<float> &mask)
+          : Kernel(iter), input(input), mask(mask) {
+      add_accessor(&input);
+    }
+
+    void kernel() {
+      output() = convolve(mask, Reduce::SUM, [&] () {
+          return input(mask) * mask();
+        });
+    }
+
+    void _operatePyramidReduce() {}
+};
+
+class DifferenceOfGaussian : public Kernel<char> {
+  private:
+    Accessor<char> &input1;
+    Accessor<char> &input2;
+
+  public:
+    DifferenceOfGaussian(IterationSpace<char> &iter, Accessor<char> &input1,
+                         Accessor<char> &input2)
+          : Kernel(iter), input1(input1), input2(input2) {
+      add_accessor(&input1);
+      add_accessor(&input2);
+    }
+
+    void kernel() {
+      output() = input1() - input2();
+    }
+
+    void _operatePyramidFilter() {}
+};
+
+class Blend : public Kernel<char> {
+  private:
+    Accessor<char> &input1;
+    Accessor<char> &input2;
+
+  public:
+    Blend(IterationSpace<char> &iter, Accessor<char> &input1,
+          Accessor<char> &input2)
+          : Kernel(iter), input1(input1), input2(input2) {
+      add_accessor(&input1);
+      add_accessor(&input2);
+    }
+
+    void kernel() {
+      output() = (short)input1() + (short)input2() / 2;
+    }
+
+    void _operatePyramidExpand() {}
+};
+
+
+/*************************************************************************
+ * Main function                                                         *
+ *************************************************************************/
+int main(int argc, const char **argv) {
+    const int width = WIDTH;
+    const int height = HEIGHT;
+    const int size_x = SIZE_X;
+    const int size_y = SIZE_Y;
+    float timing = 0;
+
+    // only filter kernel sizes 3x3, 5x5, and 7x7 implemented
+    if (size_x != size_y || !(size_x == 3 || size_x == 5 || size_x == 7)) {
+        std::cout << "Wrong filter kernel size. "
+                  << "Currently supported values: 3x3, 5x5, and 7x7!"
+                  << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    // convolution filter mask
+    const float coef[SIZE_Y][SIZE_X] = {
+#if SIZE_X == 3
+        { 0.057118f, 0.124758f, 0.057118f },
+        { 0.124758f, 0.272496f, 0.124758f },
+        { 0.057118f, 0.124758f, 0.057118f }
+#endif
+#if SIZE_X == 5
+        { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f },
+        { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f },
+        { 0.026151f, 0.090339f, 0.136565f, 0.090339f, 0.026151f },
+        { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f },
+        { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f }
+#endif
+#if SIZE_X == 7
+        { 0.000841f, 0.003010f, 0.006471f, 0.008351f, 0.006471f, 0.003010f, 0.000841f },
+        { 0.003010f, 0.010778f, 0.023169f, 0.029902f, 0.023169f, 0.010778f, 0.003010f },
+        { 0.006471f, 0.023169f, 0.049806f, 0.064280f, 0.049806f, 0.023169f, 0.006471f },
+        { 0.008351f, 0.029902f, 0.064280f, 0.082959f, 0.064280f, 0.029902f, 0.008351f },
+        { 0.006471f, 0.023169f, 0.049806f, 0.064280f, 0.049806f, 0.023169f, 0.006471f },
+        { 0.003010f, 0.010778f, 0.023169f, 0.029902f, 0.023169f, 0.010778f, 0.003010f },
+        { 0.000841f, 0.003010f, 0.006471f, 0.008351f, 0.006471f, 0.003010f, 0.000841f }
+#endif
+    };
+
+    // host memory for random generated image of width x height pixels
+    char *input = new char[width * height];
+    for (int y = 0; y < height; ++y) {
+        for (int x = 0; x < width; ++x) {
+            input[y * width + x] = (char)((y * width + x) % 19);
+        }
+    }
+
+    std::cout << "Calculating Hipacc Gaussian Laplacian pyramid ..."
+              << std::endl;
+
+    //************************************************************************//
+
+    // input and output image of width x height pixels
+    Image<char> gaus(width, height);
+    Image<char> lap(width, height);
+    Mask<float> mask(coef);
+
+    const int depth = 10;
+    Pyramid<char> pgaus(gaus, depth);
+    Pyramid<char> plap(lap, depth);
+
+    gaus = input;
+    traverse(pgaus, plap, [&] () {
+        if (!pgaus.is_top_level()) {
+            // construct Gaussian pyramid
+            BoundaryCondition<char> bound(pgaus(-1), mask, Boundary::CLAMP);
+            Accessor<char> acc1(bound);
+            IterationSpace<char> iter1(pgaus(0));
+            Gaussian blur(iter1, acc1, mask);
+            blur.execute();
+
+            // construct Laplacian pyramid
+            Accessor<char> acc3(pgaus(-1));
+            Accessor<char> acc4(pgaus(0), Interpolate::LF);
+            IterationSpace<char> iter3(plap(-1));
+            DifferenceOfGaussian DoG(iter3, acc3, acc4);
+            DoG.execute();
+        }
+
+        traverse();
+
+        // collapse pyramids
+        if (!pgaus.is_bottom_level()) {
+            // blend final output image from Laplacian pyramid
+            Accessor<char> acc3(plap(1), Interpolate::LF);
+            Accessor<char> acc4(plap(0));
+            IterationSpace<char> iter2(plap(0));
+            Blend blend(iter2, acc3, acc4);
+            blend.execute();
+        }
+    });
+
+    // get pointer to result data
+    char *output = lap.data();
+
+    //************************************************************************//
+
+    // convert to uchar for visualization
+    for (int p = 0; p < width*height; ++p) {
+        output[p] = (char)(output[p] + 127);
+    }
+
+    // free memory
+    delete[] input;
+
+    return EXIT_SUCCESS;
+}
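Note on the app above: the nested traverse() call recurses one pyramid level deeper, so the code before it runs on the way down (reduce and Laplacian construction) and the code after it on the way back up (collapse). The kernels rely on the Laplacian-pyramid identity L[i] = G[i] - expand(G[i+1]), which makes the collapse G[i] = L[i] + expand(G[i+1]) exact by construction. A minimal standalone C++ sketch of that identity (1-D, toy filters, independent of Hipacc; helper names are illustrative only):

    // Two-level 1-D pyramid round trip: reconstruction is exact regardless of
    // which reduce/expand filters are used, because the Laplacian level stores
    // exactly the information the expand step loses.
    #include <cassert>
    #include <vector>

    static std::vector<float> reduce2(const std::vector<float> &g) {
      std::vector<float> r(g.size() / 2);
      for (size_t i = 0; i < r.size(); ++i)
        r[i] = 0.5f * (g[2 * i] + g[2 * i + 1]); // toy 2-tap "Gaussian"
      return r;
    }

    static std::vector<float> expand2(const std::vector<float> &g, size_t n) {
      std::vector<float> e(n);
      for (size_t i = 0; i < n; ++i)
        e[i] = g[i / 2]; // nearest-neighbor upsample
      return e;
    }

    int main() {
      std::vector<float> g0 = {1, 2, 3, 4, 5, 6, 7, 8};
      auto g1 = reduce2(g0);            // Gaussian level 1
      auto up = expand2(g1, g0.size());
      std::vector<float> l0(g0.size()); // Laplacian level 0
      for (size_t i = 0; i < g0.size(); ++i) l0[i] = g0[i] - up[i];
      // collapse: L[0] + expand(G[1]) reproduces G[0]
      for (size_t i = 0; i < g0.size(); ++i) assert(l0[i] + up[i] == g0[i]);
    }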
diff --git a/apps/7_FFT/DFT_DCT/src/main.cpp b/apps/7_FFT/DFT_DCT/src/main.cpp
new file mode 100644
index 00000000..e03d7161
--- /dev/null
+++ b/apps/7_FFT/DFT_DCT/src/main.cpp
@@ -0,0 +1,139 @@
+//
+// Copyright (c) 2012, University of Erlangen-Nuremberg
+// Copyright (c) 2012, Siemens AG
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#include "hipacc.hpp"
+
+#include <iostream>
+#include <cstdlib>
+
+/*#define WIDTH 4032
+#define HEIGHT 3024
+#define IMAGE "../../common/img/fuerte_ship.jpg"*/
+#define WIDTH 600
+#define HEIGHT 450
+#define IMAGE "../../common/img/halftone-face.jpg"
+
+#define TYPE float
+#define TYPE2 float
+
+using namespace hipacc;
+using namespace hipacc::math;
+
+/*************************************************************************
+ * Main function                                                         *
+ *************************************************************************/
+int main(int argc, const char **argv) {
+  const int width = WIDTH;
+  const int height = HEIGHT;
+
+  // host memory for image of width x height pixels
+  TYPE *input = load_data<TYPE>(width, height, 1, IMAGE);
+
+  // std::cout << "Calculating Hipacc Gaussian filter ..." << std::endl;
+
+  //************************************************************************//
+
+  // input and output image of width x height pixels
+  Image<TYPE> in(width, height, input);
+  Image<TYPE> out_dct(width, height);
+  Image<TYPE> out_fft(width, height);
+  Image<TYPE> out_mag_dct(width, height);
+  Image<TYPE> out_mag_fft(width, height);
+
+  // write fft of in to fftResult
+  float *fftResult = (float *)fft(in);
+  // create magnitude from fft
+  fftToMagnitude(fftResult, out_mag_fft);
+  ifft(fftResult, out_fft);
+
+  float *dctResult = (float *)dct(in);
+  // create magnitude from dct
+  dctToMagnitude(dctResult, out_mag_dct);
+  idct(dctResult, out_dct);
+
+  // get pointer to result data
+  TYPE *output_dct = out_dct.data();
+  TYPE *output_fft = out_fft.data();
+  TYPE *output_mag_dct = out_mag_dct.data();
+  TYPE *output_mag_fft = out_mag_fft.data();
+
+  save_data(width, height, 1, input, "input.jpg");
+  save_data(width, height, 1, output_dct, "output_dct.jpg");
+  save_data(width, height, 1, output_fft, "output_fft.jpg");
+  save_data(width, height, 1, output_mag_dct, "output_mag_dct.jpg");
+  save_data(width, height, 1, output_mag_fft, "output_mag_fft.jpg");
+
+
+  // manual transforms
+
+  TYPE2 test[4][4] = {{1,2,3,4},{5,6,7,8},{9,10,11,12},{13,14,15,16}};
+  TYPE2 fft[4][(4 / 2 + 1) * 2];
+  TYPE2 dct[4][4];
+
+  TYPE2 *test_d;
+  TYPE2 *fft_d;
+  TYPE2 *dct_d;
+
+  /*cudaMalloc((void**)&test_d, sizeof(TYPE2) * 4*4);
+  cudaMalloc((void**)&fft_d, sizeof(TYPE2) * 4*(4 / 2 + 1) * 2);
+  cudaMalloc((void**)&dct_d, sizeof(TYPE2) * 4*4);
+  cudaMemcpy(test_d, test, sizeof(TYPE2) * 4*4, cudaMemcpyHostToDevice);
+  cudaMemcpy(fft_d, fft, sizeof(TYPE2) * 4*(4 / 2 + 1) * 2,
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(dct_d, dct, sizeof(TYPE2) * 4*4, cudaMemcpyHostToDevice);
+
+  fft_transform_device(test_d, 4, 4, fft_d);
+
+  cudaMemcpy(fft, fft_d, sizeof(TYPE2) * 4*(4 / 2 + 1) * 2,
+             cudaMemcpyDeviceToHost);
+
+  for (int y = 0; y < 4; y++) {
+    for (int x = 0; x < (4 / 2 + 1) * 2; x++) {
+      std::cout << fft[y][x] << "\t";
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+
+  dct_transform_device((TYPE2*)test_d, 4, 4, (TYPE2*)dct_d);
+  dct_transform_device((TYPE2*)dct_d, 4, 4, (TYPE2*)test_d, false);
+
+  //cudaMemcpy(dct, dct_d, sizeof(TYPE2) * 4*4, cudaMemcpyDeviceToHost);
+  cudaMemcpy(dct, test_d, sizeof(TYPE2) * 4*4, cudaMemcpyDeviceToHost);*/
+
+  // note: dct is only filled by the device code that is commented out above
+  for (int y = 0; y < 4; y++) {
+    for (int x = 0; x < 4; x++) {
+      std::cout << dct[y][x] << "\t";
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+
+  // free memory
+  delete[] input;
+
+  return EXIT_SUCCESS;
+}
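Note on the manual transform above: the row size (4 / 2 + 1) * 2 comes from real-to-complex FFT storage. A real W x H input has a Hermitian-symmetric spectrum, so only W/2 + 1 complex bins per row need to be kept, i.e. 2 * (W/2 + 1) floats. A tiny self-contained sketch of that arithmetic:

    // Why a W x H real-to-complex FFT needs H x (W/2 + 1) complex values:
    // for W = 4, 4/2 + 1 = 3 complex bins = 6 floats per row, matching the
    // (4 / 2 + 1) * 2 column count used in the app above.
    #include <cstdio>

    int main() {
      const int W = 4, H = 4;
      const int complex_per_row = W / 2 + 1;           // X[k] = conj(X[W-k])
      const int floats_per_row  = 2 * complex_per_row; // interleaved (re, im)
      std::printf("%dx%d real image -> %dx%d complex bins (%d floats/row)\n",
                  W, H, H, complex_per_row, floats_per_row);
    }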
diff --git a/apps/7_FFT/Gaussian_Blur/src/main.cpp b/apps/7_FFT/Gaussian_Blur/src/main.cpp
new file mode 100644
index 00000000..d7e42520
--- /dev/null
+++ b/apps/7_FFT/Gaussian_Blur/src/main.cpp
@@ -0,0 +1,226 @@
+//
+// Copyright (c) 2012, University of Erlangen-Nuremberg
+// Copyright (c) 2012, Siemens AG
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#include "hipacc.hpp"
+
+#include <algorithm>
+#include <iostream>
+
+#define SIZE_X 7
+#define SIZE_Y 7
+
+#define WIDTH 4032
+#define HEIGHT 3024
+#define IMAGE "../../common/img/fuerte_ship.jpg"
+/*
+#define WIDTH 700
+#define HEIGHT 445
+#define IMAGE "../../common/img/halftone-printing-process.jpg"
+*/
+#define TYPE uchar
+
+using namespace hipacc;
+using namespace hipacc::math;
+
+// Gaussian blur filter in Hipacc
+class GaussianBlur : public Kernel<TYPE> {
+private:
+  Accessor<TYPE> &input;
+  Mask<float> &mask;
+
+public:
+  GaussianBlur(IterationSpace<TYPE> &iter, Accessor<TYPE> &input,
+               Mask<float> &mask)
+      : Kernel(iter), input(input), mask(mask) {
+    add_accessor(&input);
+  }
+
+  void kernel() {
+    output() = (TYPE)(convolve(mask, Reduce::SUM,
+                               [&]() -> float { return mask() * input(mask); }) +
+                      0.0f);
+  }
+};
+
+// forward declaration of reference implementation
+template <typename T>
+void gaussian_filter(T *in, T *out, float *filter, int size_x, int size_y,
+                     int width, int height);
+
+// alternative compare to analyse precision and errors
+template <typename T>
+void compare(const T *cmp1, const T *cmp2, const unsigned int width,
+             const unsigned int height, const unsigned int border_x = 0,
+             const unsigned int border_y = 0) {
+  int count = 0;
+  double max = 0.0;
+  double sum = 0.0;
+  for (int i = border_y; i < height - border_y; i++) {
+    for (int j = border_x; j < width - border_x; j++) {
+      double diff = (double)std::abs(cmp1[i * width + j] - cmp2[i * width + j]);
+      if (diff > 0.0) {
+        count++;
+        sum += diff;
+        max = std::max(diff, max);
+      }
+    }
+  }
+  double avg = sum / ((width - border_x * 2) * (height - border_y * 2));
+  std::cout << "different pixels:" << count << " avg error:" << avg
+            << " max error:" << max << std::endl;
+}
+
+/*************************************************************************
+ * Main function                                                         *
+ *************************************************************************/
+int main(int argc, const char **argv) {
+  const int width = WIDTH;
+  const int height = HEIGHT;
+  const int size_x = SIZE_X;
+  const int size_y = SIZE_Y;
+  const int offset_x = size_x >> 1;
+  const int offset_y = size_y >> 1;
+  float timing = 0;
+
+  // only filter kernel sizes 3x3, 5x5, and 7x7 implemented
+  if (size_x != size_y || !(size_x == 3 || size_x == 5 || size_x == 7)) {
+    std::cerr << "Wrong filter kernel size. "
+              << "Currently supported values: 3x3, 5x5, and 7x7!" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  // convolution filter mask
+  const float coef[SIZE_Y][SIZE_X] = {
+#if SIZE_X == 3
+      {0.057118f, 0.124758f, 0.057118f},
+      {0.124758f, 0.272496f, 0.124758f},
+      {0.057118f, 0.124758f, 0.057118f}
+#endif
+#if SIZE_X == 5
+      {0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f},
+      {0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f},
+      {0.026151f, 0.090339f, 0.136565f, 0.090339f, 0.026151f},
+      {0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f},
+      {0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f}
+#endif
+#if SIZE_X == 7
+      {0.000841f, 0.003010f, 0.006471f, 0.008351f, 0.006471f, 0.003010f, 0.000841f},
+      {0.003010f, 0.010778f, 0.023169f, 0.029902f, 0.023169f, 0.010778f, 0.003010f},
+      {0.006471f, 0.023169f, 0.049806f, 0.064280f, 0.049806f, 0.023169f, 0.006471f},
+      {0.008351f, 0.029902f, 0.064280f, 0.082959f, 0.064280f, 0.029902f, 0.008351f},
+      {0.006471f, 0.023169f, 0.049806f, 0.064280f, 0.049806f, 0.023169f, 0.006471f},
+      {0.003010f, 0.010778f, 0.023169f, 0.029902f, 0.023169f, 0.010778f, 0.003010f},
+      {0.000841f, 0.003010f, 0.006471f, 0.008351f, 0.006471f, 0.003010f, 0.000841f}
+#endif
+  };
+
+  // host memory for image of width x height pixels
+  TYPE *input = load_data<TYPE>(width, height, 1, IMAGE);
+  TYPE *ref_out = new TYPE[width * height];
+
+  std::cout << "Calculating Hipacc Gaussian filter ..." << std::endl;
+
+  //************************************************************************//
+
+  // input and output image of width x height pixels
+  Image<TYPE> in(width, height, input);
+  Image<TYPE> out(width, height);
+
+  // define Mask for Gaussian filter
+  Mask<float> mask(coef);
+
+  BoundaryCondition<TYPE> bound(in, mask, Boundary::CONSTANT, 0);
+  Accessor<TYPE> acc(bound);
+
+  IterationSpace<TYPE> iter(out);
+  GaussianBlur filter(iter, acc, mask);
+
+  Image<TYPE> outFFT(width, height);
+  IterationSpace<TYPE> iterFFT(outFFT);
+  GaussianBlur filterFFT(iterFFT, acc, mask);
+
+  filter.execute();
+  timing = hipacc_last_kernel_timing();
+
+  filterFFT.convolveFFT();
+
+  // get pointer to result data
+  TYPE *output = out.data();
+  TYPE *outputFFT = outFFT.data();
+
+  //************************************************************************//
+
+  std::cout << "Hipacc: " << timing << " ms, " << (width * height / timing) / 1000
+            << " Mpixel/s" << std::endl;
+
+  std::cout << "Calculating reference ..."
+            << std::endl;
+  double start = time_ms();
+  gaussian_filter(input, ref_out, (float *)coef, size_x, size_y, width, height);
+  double end = time_ms();
+  std::cout << "Reference: " << end - start << " ms, "
+            << (width * height / (end - start)) / 1000 << " Mpixel/s"
+            << std::endl;
+
+  // compare hipacc to reference
+  compare(output, ref_out, width, height, offset_x, offset_y);
+  // compare hipacc to fft convolution
+  compare(outputFFT, output, width, height /*, offset_x, offset_y*/);
+
+  save_data(width, height, 1, input, "input.jpg");
+  save_data(width, height, 1, output, "output.jpg");
+  save_data(width, height, 1, outputFFT, "outputFFT.jpg");
+  show_data(width, height, 1, output, "output.jpg");
+
+  // free memory
+  delete[] input;
+  delete[] ref_out;
+
+  return EXIT_SUCCESS;
+}
+
+// Gaussian blur filter reference
+template <typename T>
+void gaussian_filter(T *in, T *out, float *filter, int size_x, int size_y,
+                     int width, int height) {
+  int anchor_x = size_x >> 1;
+  int anchor_y = size_y >> 1;
+  int upper_x = width - anchor_x;
+  int upper_y = height - anchor_y;
+
+  for (int y = anchor_y; y < upper_y; ++y) {
+    for (int x = anchor_x; x < upper_x; ++x) {
+      float sum = 0.0f;
+
+      for (int yf = -anchor_y; yf <= anchor_y; ++yf) {
+        for (int xf = -anchor_x; xf <= anchor_x; ++xf) {
+          sum += filter[(yf + anchor_y) * size_x + xf + anchor_x] *
+                 in[(y + yf) * width + x + xf];
+        }
+      }
+      out[y * width + x] = (T)(sum);
+    }
+  }
+}
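Note on convolveFFT() above: FFT-based convolution exploits the convolution theorem, out = IFFT(FFT(image) * FFT(kernel)). A minimal FFTW sketch of the idea follows; this is not the code Hipacc generates, it assumes the filter mask has already been zero-padded to the image size in 'ker', and it implies circular boundary handling (one reason the compare() against the spatial result is expected to differ near the borders):

    #include <fftw3.h>

    void fft_convolve(float *img, float *ker, float *out, int h, int w) {
      int wc = w / 2 + 1; // r2c output width (Hermitian symmetry)
      fftwf_complex *IMG = fftwf_alloc_complex(h * wc);
      fftwf_complex *KER = fftwf_alloc_complex(h * wc);
      fftwf_plan p1 = fftwf_plan_dft_r2c_2d(h, w, img, IMG, FFTW_ESTIMATE);
      fftwf_plan p2 = fftwf_plan_dft_r2c_2d(h, w, ker, KER, FFTW_ESTIMATE);
      fftwf_execute(p1);
      fftwf_execute(p2);
      for (int i = 0; i < h * wc; ++i) { // pointwise complex multiply
        float re = IMG[i][0] * KER[i][0] - IMG[i][1] * KER[i][1];
        float im = IMG[i][0] * KER[i][1] + IMG[i][1] * KER[i][0];
        IMG[i][0] = re / (w * h); // fold in FFTW's 1/(w*h) normalization
        IMG[i][1] = im / (w * h);
      }
      fftwf_plan p3 = fftwf_plan_dft_c2r_2d(h, w, IMG, out, FFTW_ESTIMATE);
      fftwf_execute(p3);
      fftwf_destroy_plan(p1); fftwf_destroy_plan(p2); fftwf_destroy_plan(p3);
      fftwf_free(IMG); fftwf_free(KER);
    }

The two FFTs of the image and kernel plus one inverse FFT cost O(WH log WH) independent of the mask size, which is why the Makefile's -use-fft fast mode can pay off for large masks.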
diff --git a/apps/7_FFT/Halftone/src/main.cpp b/apps/7_FFT/Halftone/src/main.cpp
new file mode 100644
index 00000000..91c43edd
--- /dev/null
+++ b/apps/7_FFT/Halftone/src/main.cpp
@@ -0,0 +1,252 @@
+//
+// Copyright (c) 2012, University of Erlangen-Nuremberg
+// Copyright (c) 2012, Siemens AG
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#include "hipacc.hpp"
+
+#include <iostream>
+#include <cstdlib>
+
+#define SIZE_X 7
+#define SIZE_Y 7
+/*
+#define WIDTH 700
+#define HEIGHT 445
+#define IMAGE "../../common/img/halftone-printing-process.jpg"
+*/
+#define WIDTH 600
+#define HEIGHT 450
+#define IMAGE "../../common/img/halftone-face.jpg"
+
+#define TYPE uchar
+
+using namespace hipacc;
+using namespace hipacc::math;
+
+// Kernel description in Hipacc
+class Dilate : public Kernel<TYPE> {
+private:
+  Accessor<TYPE> &in;
+  Domain &dom;
+
+public:
+  Dilate(IterationSpace<TYPE> &iter, Accessor<TYPE> &in, Domain &dom)
+      : Kernel(iter), in(in), dom(dom) {
+    add_accessor(&in);
+  }
+
+  void kernel() {
+    output() = reduce(dom, Reduce::MIN, [&]() -> TYPE { return in(dom); });
+  }
+};
+
+class ImproveFilter : public Kernel<TYPE> {
+private:
+  Accessor<TYPE> &in;
+  Domain &dom;
+  int size_x, size_y;
+  Mask<float> &mask;
+
+public:
+  ImproveFilter(IterationSpace<TYPE> &iter, Accessor<TYPE> &in, Domain &dom,
+                int size_x, int size_y, Mask<float> &mask)
+      : Kernel(iter), in(in), dom(dom), size_x(size_x), size_y(size_y),
+        mask(mask) {
+    add_accessor(&in);
+  }
+
+  void kernel() {
+    float w_avg =
+        convolve(mask, Reduce::SUM, [&]() -> float { return mask() * in(mask); });
+    float avg = reduce(dom, Reduce::SUM, [&]() -> float { return in(dom); }) /
+                (TYPE)(size_x * size_y);
+    float max = reduce(dom, Reduce::MAX, [&]() -> TYPE { return in(dom); });
+    output() = ((w_avg - avg) > 1) || (max > (avg / 2 + 40)) ? 0 : 255;
+  }
+};
+
+// Gaussian blur filter in Hipacc
+class GaussianBlur : public Kernel<TYPE> {
+private:
+  Accessor<TYPE> &input;
+  Mask<float> &mask;
+
+public:
+  GaussianBlur(IterationSpace<TYPE> &iter, Accessor<TYPE> &input,
+               Mask<float> &mask)
+      : Kernel(iter), input(input), mask(mask) {
+    add_accessor(&input);
+  }
+
+  void kernel() {
+    output() = (TYPE)(convolve(mask, Reduce::SUM,
+                               [&]() -> TYPE { return mask() * input(mask); }) +
+                      0.0f);
+  }
+};
+
+/*************************************************************************
+ * Main function                                                         *
+ *************************************************************************/
+int main(int argc, const char **argv) {
+  const int width = WIDTH;
+  const int height = HEIGHT;
+  const int size_x = SIZE_X;
+  const int size_y = SIZE_Y;
+
+  // only filter kernel sizes 3x3, 5x5, and 7x7 implemented
+  if (size_x != size_y || !(size_x == 3 || size_x == 5 || size_x == 7)) {
+    std::cerr << "Wrong filter kernel size. "
+              << "Currently supported values: 3x3, 5x5, and 7x7!"
+              << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  // convolution filter mask
+  const float coef3[3][3] = {{0.057118f, 0.124758f, 0.057118f},
+                             {0.124758f, 0.272496f, 0.124758f},
+                             {0.057118f, 0.124758f, 0.057118f}};
+
+  const float coef5[5][5] = {
+      {0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f},
+      {0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f},
+      {0.026151f, 0.090339f, 0.136565f, 0.090339f, 0.026151f},
+      {0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f},
+      {0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f}};
+
+  const float coef7[7][7] = {{0.000841f, 0.003010f, 0.006471f, 0.008351f,
+                              0.006471f, 0.003010f, 0.000841f},
+                             {0.003010f, 0.010778f, 0.023169f, 0.029902f,
+                              0.023169f, 0.010778f, 0.003010f},
+                             {0.006471f, 0.023169f, 0.049806f, 0.064280f,
+                              0.049806f, 0.023169f, 0.006471f},
+                             {0.008351f, 0.029902f, 0.064280f, 0.082959f,
+                              0.064280f, 0.029902f, 0.008351f},
+                             {0.006471f, 0.023169f, 0.049806f, 0.064280f,
+                              0.049806f, 0.023169f, 0.006471f},
+                             {0.003010f, 0.010778f, 0.023169f, 0.029902f,
+                              0.023169f, 0.010778f, 0.003010f},
+                             {0.000841f, 0.003010f, 0.006471f, 0.008351f,
+                              0.006471f, 0.003010f, 0.000841f}};
+
+  // host memory for image of width x height pixels
+  TYPE *input = load_data<TYPE>(width, height, 1, IMAGE);
+
+  // std::cout << "Calculating Hipacc Gaussian filter ..." << std::endl;
+
+  //************************************************************************//
+
+  // input and output image of width x height pixels
+  Image<TYPE> in(width, height, input);
+  Image<TYPE> out(width, height);
+
+  // blur masks
+  Mask<float> mask3(coef3);
+  Mask<float> mask5(coef5);
+  Mask<float> mask7(coef7);
+
+  // images
+  Image<TYPE> mag(width, height);
+  Image<TYPE> mag_blur(width, height);
+  Image<TYPE> mag_out(width, height);
+  Image<TYPE> mag_out2(width, height);
+  Image<TYPE> mag_out3(width, height);
+
+  // kernel domains
+  Domain domImp(SIZE_X, SIZE_Y);
+  Domain domDil(5, 5);
+
+  // declare all acessors and iterationSpaces for images
+  BoundaryCondition<TYPE> bound_mag(mag, mask3, Boundary::CLAMP);
+  Accessor<TYPE> acc_mag(bound_mag);
+  IterationSpace<TYPE> iter_mag(mag);
+
+  BoundaryCondition<TYPE> bound_mag_out(mag_out, domDil, Boundary::CLAMP);
+  Accessor<TYPE> acc_mag_out(bound_mag_out);
+  IterationSpace<TYPE> iter_mag_out(mag_out);
+
+  BoundaryCondition<TYPE> bound_mag_out2(mag_out2, mask5, Boundary::CLAMP);
+  Accessor<TYPE> acc_mag_out2(bound_mag_out2);
+  IterationSpace<TYPE> iter_mag_out2(mag_out2);
+
+  BoundaryCondition<TYPE> bound_mag_out3(mag_out3, domImp, Boundary::CLAMP);
+  Accessor<TYPE> acc_mag_out3(bound_mag_out3);
+  IterationSpace<TYPE> iter_mag_out3(mag_out3);
+
+  BoundaryCondition<TYPE> bound_mag_blur(mag_blur, domImp, Boundary::CLAMP);
+  Accessor<TYPE> acc_mag_blur(bound_mag_blur);
+  IterationSpace<TYPE> iter_mag_blur(mag_blur);
+
+  // write fft of in to fftResult
+  float *fftResult = (float *)fft(in);
+
+  // create magnitude from fft
+  fftToMagnitude(fftResult, mag);
+
+  // blur magnitude
+  Domain domBlur(7, 7);
+  GaussianBlur blur(iter_mag_blur, acc_mag, mask3);
+  blur.execute();
+
+  // apply ImproveFilter
+  ImproveFilter improve(iter_mag_out, acc_mag_blur, domImp, SIZE_X, SIZE_Y,
+                        mask7);
+  improve.execute();
+  Dilate dilate(iter_mag_out2, acc_mag_out, domDil);
+  dilate.execute();
+
+  // blur magnitude mask to prevent ringing effect
+  GaussianBlur blur_mask(iter_mag_out3, acc_mag_out2, mask5);
+  blur_mask.execute();
+
+  // apply mask to fftResult
+  fftResetMask(mag_out3, min(width, height) / 20, true, 10);
+  fftApplyPassFilter(mag_out3, min(width, height) * 0.25, true, 100);
+  fftScaleMagnitude(fftResult, mag_out3);
+
+  // visualize resulting magnitude
+  Image<TYPE> back(width, height);
+  fftToMagnitude(fftResult, back);
+
+  // get result image from inverse fft
+  ifft(fftResult, out);
+
+  // get pointer to result data
+  TYPE *output = out.data();
+  TYPE *Mag = mag.data();
+  TYPE *Mag_out = mag_out3.data();
+  TYPE *Back = back.data();
+
+  save_data(width, height, 1, input, "input.jpg");
+  save_data(width, height, 1, output, "output.jpg");
+  save_data(width, height, 1, Mag, "Mag.png");
+  save_data(width, height, 1, Mag_out, "Mag_mask.png");
+  save_data(width, height, 1, Back, "Mag_new.png");
+  show_data(width, height, 1, output, "output.jpg");
+
+  // free memory
+  delete[] input;
+
+  return EXIT_SUCCESS;
+}
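Note on the halftone app above: the detected halftone dots are removed by attenuating the corresponding spots in the spectrum, combined with a pass filter. The core of a hard low-pass on a centered (fftshift-ed) spectrum is easy to state standalone; the sketch below shows only that core idea (the Hipacc helpers such as fftApplyPassFilter additionally support a smoothing window, which this sketch omits):

    #include <cmath>

    // Keep frequencies within 'radius' of the spectrum center, zero the rest.
    void low_pass(float *spec, int width, int height, int radius) {
      const float cx = width / 2.0f, cy = height / 2.0f;
      for (int y = 0; y < height; ++y)
        for (int x = 0; x < width; ++x) {
          float d = std::hypot(x - cx, y - cy);
          if (d > radius)
            spec[y * width + x] = 0.0f; // suppress high frequencies
        }
    }

A hard cutoff like this causes ringing in the spatial domain, which is why the app blurs the magnitude mask (blur_mask) and passes a window parameter to the pass filter.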
diff --git a/apps/7_FFT/Image_Manipulation/src/main.cpp b/apps/7_FFT/Image_Manipulation/src/main.cpp
new file mode 100644
index 00000000..aeaa181b
--- /dev/null
+++ b/apps/7_FFT/Image_Manipulation/src/main.cpp
@@ -0,0 +1,285 @@
+//
+// Copyright (c) 2012, University of Erlangen-Nuremberg
+// Copyright (c) 2012, Siemens AG
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#include "hipacc.hpp"
+
+#include <fstream>
+#include <png.h> // for png image size
+#include <iostream>
+#include <string>
+#include <vector>
+
+#define WIDTH 4032
+#define HEIGHT 3024
+#define IMAGE "../../common/img/fuerte_ship.png"
+
+
+using namespace hipacc;
+using namespace hipacc::math;
+
+
+// filter in Hipacc
+/*class LBP : public Kernel<uchar> {
+  private:
+    Accessor<uchar> &input;
+    Mask<int> &mask;
+
+  public:
+    LBP(IterationSpace<uchar> &iter, Accessor<uchar> &input,
+        Mask<int> &mask)
+          : Kernel(iter), input(input), mask(mask) {
+      add_accessor(&input);
+    }
+
+    void kernel() {
+      int result = 0;
+      for (int y = -1; y <= 1; ++y) {
+        for (int x = -1; x <= 1; ++x) {
+          result += mask(x, y) * (input(x, y) > input(0, 0) ? 1 : 0);
+        }
+      }
+      output() = (uchar) result;
+      output() = (uchar)(convolve(mask, Reduce::SUM, [&] () -> float {
+          return mask() * (input(mask) > input(0, 0) ? 1 : 0);
+        }));
+    }
+};*/
+
+class LBP : public Kernel<uchar> {
+  private:
+    Accessor<uchar> &input;
+    Mask<int> &mask;
+
+  public:
+    LBP(IterationSpace<uchar> &iter, Accessor<uchar> &input,
+        Mask<int> &mask)
+          : Kernel(iter), input(input), mask(mask) {
+      add_accessor(&input);
+    }
+
+    void kernel() {
+      output() = (uchar)(convolve(mask, Reduce::SUM, [&] () -> float {
+          return mask() * (input(mask) > input(0, 0) ? 1 : 0);
+        }));
+    }
+};
+
+/*
+class LTP : public Kernel<uchar4> {
+  private:
+    Accessor<uchar> &input;
+    Mask<int> &mask;
+
+  public:
+    LTP(IterationSpace<uchar4> &iter, Accessor<uchar> &input,
+        Mask<int> &mask)
+          : Kernel(iter), input(input), mask(mask) {
+      add_accessor(&input);
+    }
+
+    void kernel() {
+      int ltpu = 0;
+      int ltpl = 0;
+      const int th = 10;
+      for (int y = -1; y <= 1; ++y) {
+        for (int x = -1; x <= 1; ++x) {
+          if (input(x, y) >= input(0, 0) + th) {
+            ltpu += mask(x, y);
+          } else if (input(x, y) <= input(0, 0) - th) {
+            ltpl += mask(x, y);
+          }
+        }
+      }
+      uchar4 result = (uchar4){(uchar)ltpu, (uchar)ltpl, 0, 0};
+      output() = result;
+    }
+};
+*/
+
+// Kernel description in Hipacc
+class ColorConversionY : public Kernel<uchar> {
+  private:
+    Accessor<uchar4> &in;
+
+  public:
+    ColorConversionY(IterationSpace<uchar> &iter, Accessor<uchar4> &acc)
+          : Kernel(iter), in(acc) {
+      add_accessor(&in);
+    }
+
+    void kernel() {
+      uchar4 pixel = in();
+      output() = .299f*pixel.x + .587f*pixel.y + .114f*pixel.z;
+    }
+};
+
+// Kernel description in Hipacc
+class ColorConversionCb : public Kernel<uchar> {
+  private:
+    Accessor<uchar4> &in;
+
+  public:
+    ColorConversionCb(IterationSpace<uchar> &iter, Accessor<uchar4> &acc)
+          : Kernel(iter), in(acc) {
+      add_accessor(&in);
+    }
+
+    void kernel() {
+      uchar4 pixel = in();
+      output() = -.168736f*pixel.x + -.331264f*pixel.y + .5f*pixel.z + 128;
+    }
+};
+
+// Kernel description in Hipacc
+class ColorConversionCr : public Kernel<uchar> {
+  private:
+    Accessor<uchar4> &in;
+
+  public:
+    ColorConversionCr(IterationSpace<uchar> &iter, Accessor<uchar4> &acc)
+          : Kernel(iter), in(acc) {
+      add_accessor(&in);
+    }
+
+    void kernel() {
+      uchar4 pixel = in();
+      output() = .5f*pixel.x + .418688f*pixel.y + -.081312f*pixel.z + 128;
+    }
+};
+
+
+/*************************************************************************
+ * Main function                                                         *
+ *************************************************************************/
+int main(int argc, const char **argv) {
+  std::string imagePath = IMAGE;
+  // get width and height for png format
+  const int width = WIDTH;
+  const int height = HEIGHT;
+
+  float timing = 0;
+
+  // convolution filter mask
+  const int weights[3][3] = {
+      { 32, 64, 128 },
+      { 16,  0,   1 },
+      {  8,  4,   2 }
+  };
+
+  std::cout << "Calculating color Conversion filter ..."
+            << std::endl;
+
+  //************************************************************************//
+
+
+  // ColorConversion
+
+  // host memory for image of width x height pixels
+  uchar4 *input = (uchar4*)load_data<uchar>(width, height, 4, IMAGE);
+  float *inputFloat = (float*)load_data<float>(width, height, 1, IMAGE);
+
+
+
+
+  // input and output image of width x height pixels
+  Image<uchar4> in(width, height, input);
+  Image<uchar> outY(width, height);
+  Image<uchar> outCb(width, height);
+  Image<uchar> outCr(width, height);
+
+  Accessor<uchar4> acc(in);
+
+  IterationSpace<uchar> iterY(outY);
+  ColorConversionY filterY(iterY, acc);
+
+  IterationSpace<uchar> iterCb(outCb);
+  ColorConversionCb filterCb(iterCb, acc);
+
+  IterationSpace<uchar> iterCr(outCr);
+  ColorConversionCr filterCr(iterCr, acc);
+
+  filterY.execute();
+  filterCb.execute();
+  filterCr.execute();
+
+  uchar *outputY = outY.data();
+  uchar *outputCb = outCb.data();
+  uchar *outputCr = outCr.data();
+
+  save_data(width, height, 1, outputY, "outputY.png");
+  save_data(width, height, 1, outputCb, "outputCb.png");
+  save_data(width, height, 1, outputCr, "outputCr.png");
+
+
+  // input and output image of width x height pixels
+  Image<uchar> inLP(width, height, outputCr);
+  Image<uchar> outLBP(width, height);
+  //Image<uchar4> outLTP(width, height);
+
+  // define Mask for filter
+  Mask<int> mask(weights);
+
+  BoundaryCondition<uchar> bound(inLP, mask, Boundary::CLAMP);
+  Accessor<uchar> acc2(bound);
+
+  IterationSpace<uchar> iterLBP(outLBP);
+  LBP filterLBP(iterLBP, acc2, mask);
+
+  std::cout << "Calculating LBP ..." << std::endl;
+  filterLBP.execute();
+
+  //IterationSpace<uchar4> iterLTP(outLTP);
+  //LTP filterLTP(iterLTP, acc2, mask);
+
+  //std::cout << "Calculating LTP ..." << std::endl;
+  //filterLTP.execute();
+
+  timing = hipacc_last_kernel_timing();
+
+  Image<uchar> outDCT(width, height);
+
+  float *dctResult = (float *)dct(outLBP);
+  dctToMagnitude(dctResult, outDCT);
+
+
+  // get pointer to result data
+  uchar *output = outDCT.data();
+
+  //************************************************************************//
+
+  std::cout << "Hipacc: " << timing << " ms, "
+            << (width*height/timing)/1000 << " Mpixel/s" << std::endl;
+
+  save_data(width, height, 1, output, "outputDCT.png");
+  //save_data(width, height, 4, outputLTP, "outputLTP.png");
+
+
+
+  // free memory
+  delete[] input;
+
+  return EXIT_SUCCESS;
+}
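Note on the LBP kernel above: the weights matrix assigns each of the 8 neighbors a distinct power of two (32 64 128 / 16 - 1 / 8 4 2), so the "neighbor brighter than center" comparisons pack into a single byte. A self-contained worked example on one 3x3 patch (values are made up for illustration):

    #include <cstdio>

    int main() {
      const int w[3][3]     = {{32, 64, 128}, {16, 0, 1}, {8, 4, 2}};
      const int patch[3][3] = {{10, 90, 20}, {80, 50, 60}, {30, 40, 70}};
      int code = 0;
      for (int y = 0; y < 3; ++y)
        for (int x = 0; x < 3; ++x)
          code += w[y][x] * (patch[y][x] > patch[1][1] ? 1 : 0);
      std::printf("LBP code: %d\n", code); // 64 + 16 + 1 + 2 = 83
    }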
diff --git a/apps/7_FFT/Makefile b/apps/7_FFT/Makefile
new file mode 100644
index 00000000..0353920e
--- /dev/null
+++ b/apps/7_FFT/Makefile
@@ -0,0 +1,167 @@
+# Search binaries
+CXX        := $(shell which c++)
+NVCC       := $(shell which nvcc)
+ADB        := $(shell which adb)
+NDK_BUILD  := $(shell which ndk-build)
+PKG_CONFIG := $(shell which pkg-config)
+
+################################################################################
+
+# Search Hipacc (prefer relative path over package path over PATH env var)
+HIPACC := $(shell if [ -f $(CURDIR)/../../../bin/hipacc ]; then \
+                      echo $(CURDIR)/../../../bin/hipacc; \
+                  elif [ -f /bin/hipacc ]; then \
+                      echo /bin/hipacc; \
+                  else \
+                      echo $$(which hipacc); \
+                  fi)
+ifndef HIPACC
+  $(error "Error: Could not find binary 'hipacc' in PATH")
+endif
+
+################################################################################
+
+# Set paths for build tools
+HIPACC_PATH := $(shell dirname $(HIPACC))/..
+ifdef NVCC
+CUDA_PATH := $(shell dirname $(NVCC))/..
+endif
+
+################################################################################
+
+# Set compiler options
+OS := $(shell uname)
+CXX_FLAGS = -std=c++11 -O2
+CXX_INCLUDE = -I ../../common \
+              -I $(HIPACC_PATH)/include
+
+# standalone runtime disabled by default
+#CXX_LIB_DIR = -L $(HIPACC_PATH)/lib
+#CXX_LINK = -lhipaccRuntime
+
+CXX_LINK = -lfftw3_omp -lfftw3 -lfftw3f_omp -lfftw3f -lm
+CXX_LINK_EXCL = -fopenmp
+
+OCL_INCLUDE = $(CXX_INCLUDE)
+OCL_LIB_DIR = $(CXX_LIB_DIR)
+ifeq ($(OS),Darwin)
+OCL_LINK = $(CXX_LINK) -framework OpenCL
+else
+OCL_LINK = $(CXX_LINK) -lOpenCL
+endif
+
+ifdef NVCC
+NVCC_FLAGS = $(CXX_FLAGS) -x cu
+NVCC_INCLUDE = $(CXX_INCLUDE)
+NVCC_LIB_DIR = $(CXX_LIB_DIR)
+ifeq ($(OS),Darwin)
+NVCC_LINK = -Xlinker $(CXX_LINK) -Xlinker -lnvrtc -Xlinker -framework,CUDA
+else
+NVCC_LINK = $(CXX_LINK) -lcuda -lcudart -lcufft -lnvrtc -lnvidia-ml
+OCL_INCLUDE += -I $(CUDA_PATH)/include
+OCL_LIB_DIR += -L $(CUDA_PATH)/lib64
+endif
+endif
+
+HIPACC_FLAGS = -std=c++11 -nostdinc++
+ifeq ($(OS),Darwin)
+HIPACC_FLAGS += -isystem `xcrun --sdk macosx --show-sdk-path`/usr/include
+endif
+DSL_INCLUDE = -I ../../common \
+              -I $(HIPACC_PATH)/include/dsl
+HIPACC_INCLUDE = $(DSL_INCLUDE) \
+                 -I $(HIPACC_PATH)/include/c++/v1 \
+                 -I $(HIPACC_PATH)/include/clang
+HIPACC_OPTS = -emit-$(1) \
+              $(shell if [ -f $(1).conf ]; then \
+                          cat $(1).conf | xargs; \
+                      elif [ -f ../../common/config/$(1).conf ]; then \
+                          cat ../../common/config/$(1).conf | xargs; \
+                      fi)
+
+ifdef PKG_CONFIG
+# Check for OpenCV
+OPENCV=$(shell $(PKG_CONFIG) --list-all | grep opencv | head -n1 | cut -d" " -f1)
+ifneq ($(OPENCV),)
+HIPACC_FLAGS += $(shell $(PKG_CONFIG) --cflags-only-other $(OPENCV)) -DUSE_OPENCV
+HIPACC_INCLUDE += $(shell $(PKG_CONFIG) --cflags-only-I $(OPENCV))
+CXX_FLAGS += $(shell $(PKG_CONFIG) --cflags-only-other $(OPENCV)) -DUSE_OPENCV
+CXX_INCLUDE += $(shell $(PKG_CONFIG) --cflags-only-I $(OPENCV))
+CXX_LIB_DIR += $(shell $(PKG_CONFIG) --libs-only-L $(OPENCV))
+CXX_LINK += $(shell $(PKG_CONFIG) --libs-only-l $(OPENCV)) -lpthread
+endif
+endif
+
+################################################################################
+
+# Turn on GNU Make feature for using automatic variables in dependencies
+.SECONDEXPANSION:
+.PHONY: cpu cuda opencl-acc opencl-cpu opencl-gpu renderscript filterscript
+.PRECIOUS: main_%.cc
+
+################################################################################
+
+# Target rules
+all: cpu
+
+# Run CPU/CUDA/OpenCL binary
+dsl cpu cuda opencl-acc opencl-cpu opencl-gpu: main_$$@
+	./main_$@
+
+# Run Renderscript/Filterscript binary on Android device
+renderscript filterscript: main_$$@
+ifdef ADB
+	$(ADB) shell mkdir -p /data/local/tmp
+	$(ADB) push main_$@ /data/local/tmp
+	$(ADB) shell /data/local/tmp/main_$@
+else
+	@echo "Abort: Could not find binary 'adb' in PATH"
+	@exit 1
+endif
+
+# Run Hipacc
+main_%.cc: src/main.cpp
+	$(HIPACC) $(HIPACC_FLAGS) -use-fft fast $(call HIPACC_OPTS,$*) $(HIPACC_INCLUDE) $(CURDIR)/$< -o $@
+
+# Build DSL
+main_dsl: src/main.cpp
+	$(CXX) $(CXX_FLAGS) $< $(DSL_INCLUDE) -o $@
+
+# Build CPU
+main_cpu: $$@.cc
+	$(CXX) $(CXX_FLAGS) $< $(CXX_INCLUDE) $(CXX_LIB_DIR) $(CXX_LINK_EXCL) $(CXX_LINK) -o $@
+
+# Build CUDA
+main_cuda: $$@.cc
+ifdef NVCC
+	$(NVCC) $(NVCC_FLAGS) $< $(NVCC_INCLUDE) $(NVCC_LIB_DIR) $(NVCC_LINK) -o $@
+else
+	@echo "Abort: Could not find binary 'nvcc' in PATH"
+	@exit 1
+endif
+
+# Build OpenCL-CPU or OpenCL-GPU
+main_opencl-%: $$@.cc
+	$(CXX) $(CXX_FLAGS) $< $(OCL_INCLUDE) $(OCL_LIB_DIR) $(OCL_LINK) -o $@
+
+# Build Renderscript or Filterscript
+main_%: $$@.cc
+	mkdir -p jni
+	cp ../../common/Android.mk jni/Android.mk
+	cp ../../common/Application.mk jni/Application.mk
+ifdef NDK_BUILD
+	export HIPACC_MAIN=$@.cc; \
+	export HIPACC_INCLUDE=$(HIPACC_PATH)/include; \
+	$(NDK_BUILD) -B
+	cp libs/armeabi-v7a/main_renderscript ./$@
+else
+	@echo "Abort: Could not find binary 'ndk-build' in PATH"
+	@exit 1
+endif
+
+clean:
+	rm -rf jni libs obj
+	rm -f *cc *.cu *.cubin *.rs *.fs *.jpg
+
+distclean: clean
+	rm -f main_* *.cl
diff --git a/apps/7_FFT/img/halftone-face.jpg b/apps/7_FFT/img/halftone-face.jpg
new file mode 100644
index 00000000..39969e26
Binary files /dev/null and b/apps/7_FFT/img/halftone-face.jpg differ
diff --git a/apps/7_FFT/img/halftone-printing-process.jpg b/apps/7_FFT/img/halftone-printing-process.jpg
new file mode 100644
index 00000000..9b15ea92
Binary files /dev/null and b/apps/7_FFT/img/halftone-printing-process.jpg differ
diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
new file mode 100644
index 00000000..6b2762a3
--- /dev/null
+++ b/apps/CMakeLists.txt
@@ -0,0 +1,19 @@
+file(GLOB SAMPLE_DIRS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/[0-9]*)
+foreach(SAMPLE_DIR IN LISTS SAMPLE_DIRS)
+  # install samples
+  install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR} DESTINATION samples COMPONENT samples)
+
+  # deploy build tool for samples
+  file(GLOB SAMPLES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR}/*)
+  foreach(SAMPLE IN LISTS SAMPLES)
+    if(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR}/${SAMPLE})
+      if(UNIX)
+        install(FILES ${CMAKE_BINARY_DIR}/samples/buildtools/unix/Makefile DESTINATION samples/${SAMPLE_DIR}/${SAMPLE} COMPONENT samples)
+      elseif(WIN32)
+        file(GLOB NMAKE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/buildtools/nmake/*)
+        file(GLOB VS2015_FILES ${CMAKE_CURRENT_SOURCE_DIR}/buildtools/vs2015/*)
+        install(FILES ${NMAKE_FILES} ${VS2015_FILES} DESTINATION samples/${SAMPLE_DIR}/${SAMPLE} COMPONENT samples)
+      endif()
+    endif()
+  endforeach()
+endforeach()
diff --git a/apps/readme.txt b/apps/readme.txt
new file mode 100644
index 00000000..5cf2dccd
--- /dev/null
+++ b/apps/readme.txt
@@ -0,0 +1,7 @@
+Application folder for new samples
+
+How to enable stream-based code generation for apps:
+1. Follow the standard getting started guide at https://hipacc-lang.org/install.html
+2. The apps should be copied automatically to the same directory as the samples
+3. Enable the Hipacc flag `-use-stream n` in cuda.conf
+4. Use `-use-stream 1` for single-stream and `-use-stream 2` for multi-stream
diff --git a/cmake/modules/FindFFTW.cmake b/cmake/modules/FindFFTW.cmake
new file mode 100644
index 00000000..dfdf8d07
--- /dev/null
+++ b/cmake/modules/FindFFTW.cmake
@@ -0,0 +1,20 @@
+# Find the FFTW includes and library
+#
+# Once done this will define
+#  FFTW_INCLUDE_DIRS - where to find FFTW include files
+#  FFTW_LIBRARIES    - where to find FFTW libs
+#  FFTW_FOUND        - True if FFTW is found
+
+find_path(FFTW_INCLUDE_DIR fftw3.h PATHS ${FFTW_INCLUDE_DIRS})
+find_library(FFTW_LIBRARY fftw3)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(FFTW DEFAULT_MSG FFTW_INCLUDE_DIR FFTW_LIBRARY)
+
+set(FFTW_INCLUDE_DIRS ${FFTW_INCLUDE_DIR})
+set(FFTW_LIBRARIES ${FFTW_LIBRARY})
+if(NOT FFTW_LIBRARIES)
+  set(FFTW_LIBRARIES "")
+endif()
+
+mark_as_advanced(FFTW_INCLUDE_DIR FFTW_LIBRARY)
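Note on `-use-stream 2` from the readme above: independent kernel launches (e.g. per pyramid level) are distributed over multiple CUDA streams so they can overlap on the device. The sketch below shows the plain CUDA-runtime host pattern this corresponds to; names and the use of async copies as stand-ins for the generated per-level kernels are illustrative, not the actual generated code:

    #include <cuda_runtime.h>

    // Process 'levels' independent buffers, alternating between two streams.
    void process_levels(float **dst, float **src, const size_t *bytes, int levels) {
      cudaStream_t streams[2];
      for (int s = 0; s < 2; ++s) cudaStreamCreate(&streams[s]);
      for (int l = 0; l < levels; ++l) {
        // work issued to different streams may execute concurrently
        cudaMemcpyAsync(dst[l], src[l], bytes[l], cudaMemcpyDeviceToDevice,
                        streams[l % 2]);
      }
      cudaDeviceSynchronize(); // join both streams before results are consumed
      for (int s = 0; s < 2; ++s) cudaStreamDestroy(streams[s]);
    }

With `-use-stream 1` everything goes through a single stream (launches are still asynchronous with respect to the host, but serialized on the device).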
diff --git a/compiler/hipacc.cpp b/compiler/hipacc.cpp
index d278f1f4..336f34a1 100644
--- a/compiler/hipacc.cpp
+++ b/compiler/hipacc.cpp
@@ -61,51 +61,81 @@ void printCopyright()
 
 void printUsage() {
-  llvm::errs() << "OVERVIEW: Hipacc - Heterogeneous Image Processing Acceleration framework\n\n"
-               << "USAGE: hipacc [options] <input>\n\n"
-               << "OPTIONS:\n\n"
-               << "  -emit-cpu               Emit C++ code\n"
-               << "  -emit-cuda              Emit CUDA code for GPU devices\n"
-               << "  -emit-opencl-acc        Emit OpenCL code for Accelerator devices\n"
-               << "  -emit-opencl-cpu        Emit OpenCL code for CPU devices\n"
-               << "  -emit-opencl-gpu        Emit OpenCL code for GPU devices\n"
-               << "  -emit-renderscript      Emit Renderscript code for Android\n"
-               << "  -emit-filterscript      Emit Filterscript code for Android\n"
-               << "  -emit-padding <n>       Emit CUDA/OpenCL/Renderscript image padding, using alignment of <n> bytes for GPU devices\n"
-               << "  -target <n>             Generate code for GPUs with code name <n>.\n"
-               << "                          Code names for CUDA/OpenCL on NVIDIA devices are:\n"
-               << "                              'Fermi-20' and 'Fermi-21' for Fermi architecture.\n"
-               << "                              'Kepler-30', 'Kepler-32', 'Kepler-35', and 'Kepler-37' for Kepler architecture.\n"
-               << "                              'Maxwell-50', 'Maxwell-52', and 'Maxwell-53' for Maxwell architecture.\n"
-               << "                          Code names for for OpenCL on AMD devices are:\n"
-               << "                              'Evergreen' for Evergreen architecture (Radeon HD5xxx).\n"
-               << "                              'NorthernIsland' for Northern Island architecture (Radeon HD6xxx).\n"
-               << "                          Code names for for OpenCL on ARM devices are:\n"
-               << "                              'Midgard' for Mali-T6xx' for Mali.\n"
-               << "                          Code names for for OpenCL on Intel Xeon Phi devices are:\n"
-               << "                              'KnightsCorner' for Knights Corner Many Integrated Cores architecture.\n"
-               << "  -explore-config         Emit code that explores all possible kernel configuration and print its performance\n"
-               << "  -use-config <nxm>       Emit code that uses a configuration of nxm threads, e.g. 128x1\n"
-               << "  -reduce-config <n:m>    Emit code that uses a multi-dimensional reduction configuration of\n"
-               << "                              n warps per block (affects block size and shared memory size)\n"
-               << "                              m partial histograms (affects number of blocks)\n"
-               << "  -time-kernels           Emit code that executes each kernel multiple times to get accurate timings\n"
-               << "  -use-textures <o>       Enable/disable usage of textures (cached) in CUDA/OpenCL to read/write image pixels - for GPU devices only\n"
-               << "                          Valid values for CUDA on NVIDIA devices: 'off', 'Linear1D', 'Linear2D', 'Array2D', and 'Ldg'\n"
-               << "                          Valid values for OpenCL: 'off' and 'Array2D'\n"
-               << "  -use-local <o>          Enable/disable usage of shared/local memory in CUDA/OpenCL to stage image pixels to scratchpad\n"
-               << "                          Valid values: 'on' and 'off'\n"
-               << "  -vectorize <o>          Enable/disable vectorization of generated CUDA/OpenCL code\n"
-               << "                          Valid values: 'on' and 'off'\n"
-               << "  -pixels-per-thread <n>  Specify how many pixels should be calculated per thread\n"
-               << "  -rs-package <string>    Specify Renderscript package name. (default: \"org.hipacc.rs\")\n"
-               << "  -o <file>               Write output to <file>\n"
-               << "  --help                  Display available options\n"
-               << "  --version               Display version information\n";
+  llvm::errs()
+      << "OVERVIEW: Hipacc - Heterogeneous Image Processing Acceleration "
+         "framework\n\n"
+      << "USAGE: hipacc [options] <input>\n\n"
+      << "OPTIONS:\n\n"
+      << "  -emit-cpu               Emit C++ code\n"
+      << "  -emit-cuda              Emit CUDA code for GPU devices\n"
+      << "  -emit-opencl-acc        Emit OpenCL code for Accelerator devices\n"
+      << "  -emit-opencl-cpu        Emit OpenCL code for CPU devices\n"
+      << "  -emit-opencl-gpu        Emit OpenCL code for GPU devices\n"
+      << "  -emit-renderscript      Emit Renderscript code for Android\n"
+      << "  -emit-filterscript      Emit Filterscript code for Android\n"
+      << "  -emit-padding <n>       Emit CUDA/OpenCL/Renderscript image padding, "
+         "using alignment of <n> bytes for GPU devices\n"
+      << "  -target <n>             Generate code for GPUs with code name <n>.\n"
+      << "                          Code names for CUDA/OpenCL on NVIDIA devices "
+         "are:\n"
+      << "                              'Fermi-20' and 'Fermi-21' for Fermi "
+         "architecture.\n"
+      << "                              'Kepler-30', 'Kepler-32', 'Kepler-35', and "
+         "'Kepler-37' for Kepler architecture.\n"
+      << "                              'Maxwell-50', 'Maxwell-52', and "
+         "'Maxwell-53' for Maxwell architecture.\n"
+      << "                          Code names for for OpenCL on AMD devices "
+         "are:\n"
+      << "                              'Evergreen' for Evergreen "
+         "architecture (Radeon HD5xxx).\n"
+      << "                              'NorthernIsland' for Northern Island "
+         "architecture (Radeon HD6xxx).\n"
+      << "                          Code names for for OpenCL on ARM devices "
+         "are:\n"
+      << "                              'Midgard' for Mali-T6xx' for Mali.\n"
+      << "                          Code names for for OpenCL on Intel Xeon Phi "
+         "devices are:\n"
+      << "                              'KnightsCorner' for Knights Corner Many "
+         "Integrated Cores architecture.\n"
+      << "  -explore-config         Emit code that explores all possible kernel "
+         "configuration and print its performance\n"
+      << "  -use-config <nxm>       Emit code that uses a configuration of nxm "
+         "threads, e.g. 128x1\n"
+      << "  -reduce-config <n:m>    Emit code that uses a multi-dimensional "
+         "reduction configuration of\n"
+      << "                              n warps per block (affects block size "
+         "and shared memory size)\n"
+      << "                              m partial histograms (affects number of "
+         "blocks)\n"
+      << "  -time-kernels           Emit code that executes each kernel multiple "
+         "times to get accurate timings\n"
+      << "  -use-textures <o>       Enable/disable usage of textures (cached) in "
+         "CUDA/OpenCL to read/write image pixels - for GPU devices only\n"
+      << "                          Valid values for CUDA on NVIDIA devices: "
+         "'off', 'Linear1D', 'Linear2D', 'Array2D', and 'Ldg'\n"
+      << "                          Valid values for OpenCL: 'off' and "
+         "'Array2D'\n"
+      << "  -use-local <o>          Enable/disable usage of shared/local memory "
+         "in CUDA/OpenCL to stage image pixels to scratchpad\n"
+      << "                          Valid values: 'on' and 'off'\n"
+      << "  -vectorize <o>          Enable/disable vectorization of generated "
+         "CUDA/OpenCL code\n"
+      << "                          Valid values: 'on' and 'off'\n"
+      << "  -pixels-per-thread <n>  Specify how many pixels should be calculated "
+         "per thread\n"
+      << "  -rs-package <string>    Specify Renderscript package name. (default: "
+         "\"org.hipacc.rs\")\n"
+      << "  -o <file>               Write output to <file>\n"
+      << "  --help                  Display available options\n"
+      << "  --version               Display version information\n"
+      << "  -use-stream <n>         Async kernel launch with streams\n"
+      << "  -use-fft <o>            Enable/disable usage of convolution with fft "
+         "for CPU and CUDA targets\n"
+      << "                          Valid values: 'on', 'fast' and 'off'\n";
 }
 
-void printVersion() {
+void printVersion(){
   llvm::errs() << "hipacc version " << HIPACC_VERSION
                << " (" << GIT_REPOSITORY " " << GIT_VERSION << ")\n";
 }
@@ -247,6 +277,20 @@ int main(int argc, char *argv[]) {
       compilerOptions.setTimeKernels(USER_ON);
       continue;
     }
+    if (StringRef(argv[i]) == "-use-stream") {
+      assert(i<(argc-1) && "Mandatory integer parameter for -use-stream switch missing.");
+      std::istringstream buffer(argv[i+1]);
+      int val;
+      buffer >> val;
+      if (buffer.fail()) {
+        llvm::errs() << "ERROR: Expected integer parameter for -use-stream switch.\n\n";
+        printUsage();
+        return EXIT_FAILURE;
+      }
+      compilerOptions.setStreamAsyncKernelLaunch(val);
+      ++i;
+      continue;
+    }
     if (StringRef(argv[i]) == "-use-textures") {
       assert(i<(argc-1) && "Mandatory texture memory specification for -use-textures switch missing.");
       if (StringRef(argv[i+1]) == "off") {
@@ -295,6 +339,22 @@ int main(int argc, char *argv[]) {
       ++i;
       continue;
     }
+    if (StringRef(argv[i]) == "-use-fft") {
+      assert(i<(argc-1) && "Mandatory specification for -use-fft switch missing.");
+      if (StringRef(argv[i+1]) == "off") {
+        compilerOptions.setUseFFT(OFF);
+      } else if (StringRef(argv[i+1]) == "on") {
+        compilerOptions.setUseFFT(ON);
+      } else if (StringRef(argv[i + 1]) == "fast") {
+        compilerOptions.setUseFFT(FAST);
+      } else {
+        llvm::errs() << "ERROR: Expected valid specification for -use-fft switch.\n\n";
+        printUsage();
+        return EXIT_FAILURE;
+      }
+      ++i;
+      continue;
+    }
     if (StringRef(argv[i]) == "-pixels-per-thread") {
       assert(i<(argc-1) && "Mandatory integer parameter for -pixels-per-thread switch missing.");
       std::istringstream buffer(argv[i+1]);
@@ -421,6 +481,13 @@ int main(int argc, char *argv[]) {
     // kernels are timed internally by the runtime in case of exploration
     compilerOptions.setTimeKernels(OFF);
   }
+  // Enable CUDA streams
+  if (!compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch()) {
+    llvm::errs() << "ERROR: async (stream) support is only available for CUDA!\n\n";
+    printUsage();
+    return EXIT_FAILURE;
+  }
+
   // print summary of compiler options
   compilerOptions.printSummary(targetDevice.getTargetDeviceName());
diff --git a/dsl/fft.hpp b/dsl/fft.hpp
new file mode 100644
index 00000000..68917085
--- /dev/null
+++ b/dsl/fft.hpp
@@ -0,0 +1,109 @@
+//
+// Copyright (c) 2012, University of Erlangen-Nuremberg
+// Copyright (c) 2012, Siemens AG
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#ifndef __FFT_HPP__
+#define __FFT_HPP__
+
+#include <type_traits>
+
+namespace hipacc {
+
+// Host-side FFT/DCT placeholders: the bodies are intentionally empty, the
+// Hipacc compiler replaces calls to them with FFTW/cuFFT runtime code.
+template <typename TPrecision>
+void fftTransform(TPrecision *in, int width, int height, TPrecision *out,
+                  bool forward = true, bool scale = false) {}
+template <typename TPrecision>
+void fftTransformDevice(TPrecision *in, int width, int height, TPrecision *out,
+                        bool forward = true, bool scale = false) {}
+template <typename TPrecision>
+void dctTransform(TPrecision *in, int width, int height, TPrecision *out,
+                  bool forward = true) {}
+template <typename TPrecision>
+void dctTransformDevice(TPrecision *in, int width, int height, TPrecision *out,
+                        bool forward = true) {}
+template <typename TPrecision, typename T>
+TPrecision *fft(Image<T> &in) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+  return nullptr;
+}
+template <typename TPrecision, typename T>
+void ifft(TPrecision *in, Image<T> &out) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+template <typename TPrecision, typename T>
+void fftToMagnitude(TPrecision *in, Image<T> &mag) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+template <typename TPrecision, typename T>
+void dctToMagnitude(TPrecision *in, Image<T> &mag) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+template <typename TPrecision, typename T>
+void fftScaleMagnitude(TPrecision *in, Image<T> &mag) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+template <typename TPrecision, typename T>
+void fftResetMask(Image<T> &mag, int radius, bool low, int window = 0) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+template <typename TPrecision, typename T>
+void fftApplyPassFilter(Image<T> &mag, int radius, bool low, int window = 0) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+template <typename TPrecision, typename T>
+void fftShift(Image<T> &mag) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+template <typename TPrecision, typename T>
+void ifftShift(Image<T> &mag) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+
+template <typename TPrecision, typename T>
+TPrecision *dct(Image<T> &in) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+  return nullptr;
+}
+template <typename TPrecision, typename T>
+void idct(TPrecision *in, Image<T> &out) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+
+} // namespace hipacc
+
+#endif // __FFT_HPP__
\ No newline at end of file
diff --git a/dsl/hipacc.hpp b/dsl/hipacc.hpp
index 02d5413f..ebbcc9d5 100644
--- a/dsl/hipacc.hpp
+++ b/dsl/hipacc.hpp
@@ -39,6 +39,7 @@
 #include "kernel.hpp"
 #include "mask.hpp"
 #include "pyramid.hpp"
+#include "fft.hpp"

 namespace hipacc {
 class HipaccEoP{};
diff --git a/dsl/kernel.hpp b/dsl/kernel.hpp
index 58449dd0..cda317eb 100644
--- a/dsl/kernel.hpp
+++ b/dsl/kernel.hpp
@@ -226,8 +226,11 @@ class Kernel {
     auto reduce(Domain &domain, Reduce mode, const Function &fun)
         -> decltype(fun());
     template <typename Function>
     void iterate(Domain &domain, const Function &fun);
+    void convolveFFT();
 };

+template <typename data_t>
+void Kernel<data_t>::convolveFFT() {}

 template <typename data_t>
 template <typename Function>
 auto Kernel<data_t>::convolve(Mask &mask, Reduce mode, const
Function& fun) -> decltype(fun()) { diff --git a/include/hipacc/Config/CompilerOptions.h b/include/hipacc/Config/CompilerOptions.h index aa9d8e70..c4364546 100644 --- a/include/hipacc/Config/CompilerOptions.h +++ b/include/hipacc/Config/CompilerOptions.h @@ -50,11 +50,12 @@ namespace hipacc { // compiler option possibilities enum CompilerOption { - AUTO = 0x1, - ON = 0x2, - OFF = 0x4, - USER_ON = 0x8, - USER_OFF = 0x10 + AUTO = 0x1, + ON = 0x2, + OFF = 0x4, + USER_ON = 0x8, + USER_OFF = 0x10, + FAST = 0x20 }; // target language specification @@ -84,11 +85,14 @@ class CompilerOptions { CompilerOption local_memory; CompilerOption multiple_pixels; CompilerOption vectorize_kernels; + CompilerOption use_fft; + CompilerOption async_kernels; // user defined values for target code features int kernel_config_x, kernel_config_y; int reduce_config_num_warps, reduce_config_num_hists; int align_bytes; int pixels_per_thread; + int num_streams; Texture texture_type; std::string rs_package_name, rs_directory; @@ -104,6 +108,9 @@ class CompilerOptions { case AUTO: llvm::errs() << "AUTO - determined by the framework"; break; + case FAST: + llvm::errs() << "ENABLED - FAST"; + break; case ON: llvm::errs() << "ENABLED"; break; @@ -126,12 +133,15 @@ class CompilerOptions { local_memory(AUTO), multiple_pixels(AUTO), vectorize_kernels(OFF), + use_fft(OFF), + async_kernels(OFF), kernel_config_x(128), kernel_config_y(1), reduce_config_num_warps(16), reduce_config_num_hists(16), align_bytes(0), pixels_per_thread(1), + num_streams(1), texture_type(Texture::None), rs_package_name("org.hipacc.rs"), rs_directory("/data/local/tmp") @@ -155,6 +165,7 @@ class CompilerOptions { static const auto option_ou = static_cast( ON|USER_ON); static const auto option_aou = static_cast(AUTO|ON|USER_ON); + static const auto option_of = static_cast(ON | FAST); bool exploreConfig(CompilerOption option=option_ou) { return explore_config & option; @@ -190,6 +201,9 @@ class CompilerOptions { bool vectorizeKernels(CompilerOption option=option_ou) { return vectorize_kernels & option; } + bool asyncKernelLaunch(CompilerOption option=option_ou) { + return async_kernels & option; + } bool multiplePixelsPerThread(CompilerOption option=option_ou) { return multiple_pixels & option; } @@ -208,6 +222,17 @@ class CompilerOptions { void setLocalMemory(CompilerOption o) { local_memory = o; } void setVectorizeKernels(CompilerOption o) { vectorize_kernels = o; } + void setUseFFT(CompilerOption o) { use_fft = o; } + bool getUseFFT(CompilerOption option = option_of) { return use_fft & option; } + + void setStreamAsyncKernelLaunch(int stream) { + assert((stream > 0) && "Number of streams must be larger than 0"); + num_streams = stream; + async_kernels = USER_ON; + } + + int getNumStreams() { return num_streams; } + void setTextureMemory(Texture type) { texture_type = type; if (type == Texture::None) texture_memory = USER_OFF; @@ -304,6 +329,8 @@ class CompilerOptions { getOptionAsString(multiple_pixels, pixels_per_thread); llvm::errs() << "\n Vectorization of kernels: "; getOptionAsString(vectorize_kernels); + llvm::errs() << "\n Usage of FFT: "; + getOptionAsString(use_fft); llvm::errs() << "\n\n"; } }; diff --git a/include/hipacc/DSL/ClassRepresentation.h b/include/hipacc/DSL/ClassRepresentation.h index c4952147..40321678 100644 --- a/include/hipacc/DSL/ClassRepresentation.h +++ b/include/hipacc/DSL/ClassRepresentation.h @@ -44,6 +44,7 @@ #include #include #include +#include namespace clang { namespace hipacc { @@ -85,6 +86,12 @@ enum class Interpolate : 
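// Since the CompilerOption values above are one-hot bits, the query helpers
// accept a mask of acceptable states; getUseFFT() defaults to
// option_of = ON | FAST. A minimal illustration (hypothetical usage, not part
// of the patch):
//
//   CompilerOptions opts;
//   opts.setUseFFT(FAST);
//   bool any_fft  = opts.getUseFFT();     // true: FAST is contained in ON | FAST
//   bool fast_fft = opts.getUseFFT(FAST); // true: the FAST bit is set
//   bool on_fft   = opts.getUseFFT(ON);   // false: the ON bit is not set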
uint8_t { L3 }; +// basic pyramid operations +enum class PyramidOperation : uint8_t { + REDUCE = 0, + FILTER, + EXPAND +}; // common base class for images, masks and pyramids class HipaccSize { @@ -143,24 +150,149 @@ class HipaccMemory : public HipaccSize { class HipaccImage : public HipaccMemory { private: ASTContext &Ctx; + std::string streamStr; public: HipaccImage(ASTContext &Ctx, VarDecl *VD, QualType QT) : HipaccMemory(VD, VD->getNameAsString(), QT), - Ctx(Ctx) + Ctx(Ctx), streamStr("stream") {} unsigned getPixelSize() { return Ctx.getTypeSize(type)/8; } std::string getTextureType(); std::string getImageReadFunction(); + void setStream(std::string s) { streamStr = s; } + std::string getStream() { return streamStr; } }; class HipaccPyramid : public HipaccImage { + private: + unsigned depth; public: HipaccPyramid(ASTContext &Ctx, VarDecl *VD, QualType QT) : - HipaccImage(Ctx, VD, QT) + HipaccImage(Ctx, VD, QT), depth(0) {} + void setDepth(unsigned d) { depth = d; } + unsigned getDepth() { return depth; } +}; + + +// TODO, generalize this to an abstract class interface +class HipaccPyramidPipeline : public HipaccDevice { + private: + CompilerOptions &options; + + bool singleStream; + bool multiStream; + bool pipelineKernelLaunch; + unsigned depth; + + SmallVector baseImgs; + std::string global_level_str; + llvm::DenseMap KernelDeclMap; + llvm::DenseMap KernelMap; + + // resource usage in the pipeline + struct PyrResource { + unsigned reg = 0; + unsigned smem = 0; + unsigned thr = 0; + unsigned blk = 0; + }; + PyrResource UsedResourcePyr; + + public: + HipaccPyramidPipeline(CompilerOptions &options) : + HipaccDevice(options), + options(options), + pipelineKernelLaunch(false), + depth(0) + { + if (options.getNumStreams() == 1) { + singleStream = true; + multiStream = false; + } else if (options.getNumStreams() > 1) { + singleStream = false; + multiStream = true; + } + resetResourceUsage(); + } + + void resetResourceUsage() { + UsedResourcePyr.reg = 0; + UsedResourcePyr.smem = 0; + UsedResourcePyr.thr = 0; + UsedResourcePyr.blk = 0; + } + + bool hasResourceAvailable(PyrResource &r) { + bool reg_available = (max_total_registers - UsedResourcePyr.reg) > r.reg; + bool smem_available = (max_total_shared_memory - UsedResourcePyr.smem) > r.smem; + bool thr_available = (max_threads_per_multiprocessor - UsedResourcePyr.thr) > r.thr; + bool blk_available = (max_blocks_per_multiprocessor - UsedResourcePyr.blk) > r.blk; + return (reg_available && smem_available && thr_available && blk_available); + } + + bool useResource(PyrResource &r) { + if (hasResourceAvailable(r)) { + UsedResourcePyr.reg += r.reg; + UsedResourcePyr.smem += r.smem; + UsedResourcePyr.thr += r.thr; + UsedResourcePyr.blk += r.blk; + return true; + } else { + return false; + } + } + + std::string getPyramidOperationStr(PyramidOperation opr) { + switch(opr) + { + case PyramidOperation::REDUCE : return "REDUCE"; + case PyramidOperation::FILTER : return "FILTER"; + case PyramidOperation::EXPAND : return "EXPAND"; + default: return "PyramidOperation{" + std::to_string(int(opr)) + '}'; + } + } + + unsigned getNumWaves(HipaccKernel *K, unsigned w, unsigned h, bool print=0); + + void updateDepth(unsigned d) { + depth = (d > depth) ? 
d : depth; + } + unsigned getDepth() { return depth; } + void recordBaseImg(HipaccImage *img) { + baseImgs.push_back(img); + } + void setGlobalLevelStr(std::string level) { + if (global_level_str.empty()) { + global_level_str = level; + } + } + std::string getGlobalLevelStr() { + return global_level_str; + } + bool isSingleStream() { return singleStream; } + bool isMultiStream() { return multiStream; } + void setSingleStream(bool s) { singleStream = s; } + void setMultiStream(bool s) { multiStream = s; } + void recordKernel(ValueDecl *VD, HipaccKernel *K, PyramidOperation pyrOp) { + KernelDeclMap[VD] = pyrOp; + KernelMap[K] = pyrOp; + } + bool isRecordedKernel(ValueDecl *VD) { return KernelDeclMap.count(VD); } + PyramidOperation getPyramidOperation(ValueDecl *VD) { + assert(isRecordedKernel(VD) && "Kernel is not recorded in the pyramid pipeline."); + return KernelDeclMap[VD]; + } + void enablePipelineKernelLaunch() { + pipelineKernelLaunch = true; + } + bool isPipelineKernelLaunch() { + return pipelineKernelLaunch; + } + void printStreamPipelineInfo(); }; @@ -385,6 +517,7 @@ class HipaccKernelClass { std::string name; CXXMethodDecl *kernelFunction, *reduceFunction, *binningFunction; QualType pixelType, binType; + PyramidOperation pyrOp; KernelStatistics *kernelStatistics; // kernel member information SmallVector members; @@ -426,6 +559,9 @@ class HipaccKernelClass { QualType getPixelType() { return pixelType; } QualType getBinType() { return binType; } + void setPyramidOperation(PyramidOperation op) { pyrOp = op; } + PyramidOperation getPyramidOperation() { return pyrOp; } + KernelStatistics &getKernelStatistics(void) { return *kernelStatistics; } @@ -574,7 +710,9 @@ class HipaccKernel : public HipaccKernelFeatures { std::string name; std::string kernelName, reduceName, binningName; std::string fileName; - std::string reduceStr, binningStr, infoStr, numBinsStr; + std::string reduceStr, binningStr, infoStr, numBinsStr, streamStr; + bool waitEvent, recordEvent; + std::string waitEventStr, recordEventStr; unsigned infoStrCnt, binningStrCnt; HipaccIterationSpace *iterationSpace; std::map imgMap; @@ -614,7 +752,8 @@ class HipaccKernel : public HipaccKernelFeatures { reduceName(options.getTargetPrefix() + KC->getName() + name + "Reduce"), binningName(options.getTargetPrefix() + KC->getName() + name + "Binning"), fileName(options.getTargetPrefix() + KC->getName() + VD->getNameAsString()), - reduceStr(), binningStr(), infoStr(), numBinsStr(), + reduceStr(), binningStr(), infoStr(), numBinsStr(), streamStr("stream"), + waitEvent(false), recordEvent(false), waitEventStr("event"), recordEventStr("event"), infoStrCnt(0), binningStrCnt(0), iterationSpace(nullptr), imgMap(), @@ -751,6 +890,8 @@ class HipaccKernel : public HipaccKernelFeatures { // recreate parameter information createArgInfo(); } + unsigned getResourceUsageReg() { return num_reg; } + unsigned getResourceUsageSmem() { return num_smem; } void setDefaultConfig(); @@ -803,6 +944,16 @@ class HipaccKernel : public HipaccKernelFeatures { unsigned getPixelsPerThreadReduce() { return pixels_per_thread[GlobalOperator]; } + void setStream(std::string s) { streamStr = s; } + std::string getStream() { return streamStr; } + void setWaitEvent(bool e) { waitEvent = e; } + bool hasWaitEvent() { return waitEvent; } + void setRecordEvent(bool e) { recordEvent = e; } + bool hasRecordEvent() { return recordEvent; } + void setWaitEventStr(std::string s) { waitEventStr = s; } + std::string getWaitEventStr() { return waitEventStr; } + void 
setRecordEventStr(std::string s) { recordEventStr = s; } + std::string getRecordEventStr() { return recordEventStr; } }; } // namespace hipacc } // namespace clang diff --git a/include/hipacc/Device/TargetDescription.h b/include/hipacc/Device/TargetDescription.h index abfb701b..560fc3b7 100644 --- a/include/hipacc/Device/TargetDescription.h +++ b/include/hipacc/Device/TargetDescription.h @@ -78,6 +78,7 @@ class HipaccDeviceOptions { case Device::Maxwell_50: case Device::Maxwell_52: case Device::Maxwell_53: + case Device::Turing_75: alignment = 256; if (options.emitCUDA()) local_memory_threshold = 6; else local_memory_threshold = 11; @@ -206,6 +207,7 @@ class HipaccDevice : public HipaccDeviceOptions { // NVIDIA only device properties unsigned num_alus; unsigned num_sfus; + unsigned num_sms; public: explicit HipaccDevice(CompilerOptions &options) : @@ -214,7 +216,8 @@ class HipaccDevice : public HipaccDeviceOptions { max_threads_per_warp(32), max_blocks_per_multiprocessor(8), num_alus(0), - num_sfus(0) + num_sfus(0), + num_sms(1) { switch (target_device) { case Device::CPU: @@ -261,10 +264,16 @@ class HipaccDevice : public HipaccDeviceOptions { if (target_device==Device::Kepler_30) { max_register_per_thread = 63; } + if (target_device==Device::Kepler_30) { + num_sms = 8; + } else if (target_device==Device::Kepler_35) { + num_sms = 13; + } break; case Device::Maxwell_50: case Device::Maxwell_52: case Device::Maxwell_53: + case Device::Turing_75: max_blocks_per_multiprocessor = 32; max_threads_per_block = 1024; max_warps_per_multiprocessor = 64; @@ -274,6 +283,7 @@ class HipaccDevice : public HipaccDeviceOptions { max_register_per_thread = 255; num_alus = 128; num_sfus = 32; + num_sms = 46; if (target_device==Device::Maxwell_52) { max_total_shared_memory = 98304; } @@ -343,7 +353,7 @@ class HipaccDevice : public HipaccDeviceOptions { bool isNVIDIAGPU() { return target_device >= Device::Fermi_20 && - target_device <= Device::Maxwell_53; + target_device <= Device::Turing_75; } std::string getTargetDeviceName() { @@ -358,6 +368,7 @@ class HipaccDevice : public HipaccDeviceOptions { case Device::Maxwell_50: return "NVIDIA Maxwell (50)"; case Device::Maxwell_52: return "NVIDIA Maxwell (52)"; case Device::Maxwell_53: return "NVIDIA Maxwell (53)"; + case Device::Turing_75: return "NVIDIA Turing (75)"; case Device::Evergreen: return "AMD Evergreen"; case Device::NorthernIsland: return "AMD Northern Island"; //case Device::SouthernIsland: return "AMD Southern Island"; diff --git a/include/hipacc/Device/TargetDevices.h b/include/hipacc/Device/TargetDevices.h index 1a990ee6..04034720 100644 --- a/include/hipacc/Device/TargetDevices.h +++ b/include/hipacc/Device/TargetDevices.h @@ -49,6 +49,7 @@ enum class Device { Maxwell_50 = 50, Maxwell_52 = 52, Maxwell_53 = 53, + Turing_75 = 75, Evergreen = 58, NorthernIsland = 69, //SouthernIsland = 79 diff --git a/include/hipacc/Rewrite/CreateHostStrings.h b/include/hipacc/Rewrite/CreateHostStrings.h index 170fd575..05505e9c 100644 --- a/include/hipacc/Rewrite/CreateHostStrings.h +++ b/include/hipacc/Rewrite/CreateHostStrings.h @@ -88,7 +88,9 @@ class CreateHostStrings { MemoryTransferDirection direction, std::string &resultStr); void writeMemoryTransferDomainFromMask(HipaccMask *Domain, HipaccMask *Mask, std::string &resultStr); - void writeKernelCall(HipaccKernel *K, std::string &resultStr); + void writeFFTConvolutionCall(HipaccKernel *K, std::string &resultStr, + ASTContext &Context, bool fast = true); + void writeKernelCall(HipaccKernel *K, std::string 
&resultStr, HipaccPyramidPipeline *pyrPipe=nullptr); void writeReduceCall(HipaccKernel *K, std::string &resultStr); void writeBinningCall(HipaccKernel *K, std::string &resultStr); std::string getInterpolationDefinition(HipaccKernel *K, HipaccAccessor *Acc, diff --git a/lib/DSL/ClassRepresentation.cpp b/lib/DSL/ClassRepresentation.cpp index 3442b654..fe16092a 100644 --- a/lib/DSL/ClassRepresentation.cpp +++ b/lib/DSL/ClassRepresentation.cpp @@ -651,5 +651,170 @@ void HipaccKernel::createHostArgInfo(ArrayRef hostArgs, std::string } } +unsigned HipaccPyramidPipeline::getNumWaves(HipaccKernel *K, unsigned w, unsigned h, bool print) { + unsigned sx = K->getNumThreadsX(); + unsigned sy = K->getNumThreadsY(); + unsigned r_reg = std::max(K->getResourceUsageReg(), (unsigned)1); + unsigned r_smem = std::max(K->getResourceUsageSmem(), (unsigned)1); + unsigned k_blk = std::ceil((float)w / sx) * std::ceil((float)h / sy); + + unsigned n_blk_reg = std::floor((float)max_total_registers / (r_reg * sx * sy)); + unsigned n_blk_smem = std::floor((float)max_total_shared_memory / r_smem); + unsigned n_blk_thr = std::floor((float)max_threads_per_multiprocessor / (sx*sy)); + unsigned n_blk = std::min({n_blk_reg, n_blk_smem, n_blk_thr, max_blocks_per_multiprocessor}); + + if (print) { + llvm::errs() << " [wave-info] r_reg:" << r_reg << " r_smem:" << r_smem << " k_blk:" << k_blk << "\n"; + llvm::errs() << " n_blk_reg:" << n_blk_reg << " n_blk_smem:" << n_blk_smem << " n_blk_thr:" + << n_blk_thr << " n_blk:" << n_blk << "\n"; + } + + unsigned n_wave = std::ceil((float)k_blk / (n_blk * num_sms)); + return n_wave; +} + +void HipaccPyramidPipeline::printStreamPipelineInfo() { + llvm::errs() << "\nInformation for Pyramid cuda stream scheduling\n"; + std::string deviceInfo_str, singleStream_str, multiStream_str; + deviceInfo_str += " GPU arch: " + getTargetDeviceName() + "\n"; + deviceInfo_str += " max_register_per_sm: " + std::to_string(max_total_registers) + "\n"; + deviceInfo_str += " max_shared_memory_per_sm: " + std::to_string(max_total_shared_memory) + "\n"; + deviceInfo_str += " max_threads_per_sm: " + std::to_string(max_threads_per_multiprocessor) + "\n"; + deviceInfo_str += " max_blocks_per_sm: " + std::to_string(max_blocks_per_multiprocessor) + "\n"; + deviceInfo_str += " num_sm: " + std::to_string(num_sms) + "\n"; + llvm::errs() << deviceInfo_str; + + HipaccKernel *K_r, *K_f, *K_e; // Unpack kernels + for (auto kp : KernelMap) { + if (kp.second == PyramidOperation::REDUCE) { K_r = kp.first; } + else if (kp.second == PyramidOperation::FILTER) { K_f = kp.first; } + else if (kp.second == PyramidOperation::EXPAND) { K_e = kp.first; } + } + assert(K_r && "no reduce kernel for multi stream pyramid execution"); + assert(K_f && "no filter kernel for multi stream pyramid execution"); + assert(K_e && "no expand kernel for multi stream pyramid execution"); + + unsigned imgSzX = baseImgs.front()->getSizeX(); // assume same-sized base images + unsigned imgSzY = baseImgs.front()->getSizeY(); + + // single stream analysis + singleStream_str += " -single-stream wave estimation\n"; + std::vector numWavesDS; + std::vector numWavesLP; + std::vector numWavesUS; + + for (size_t d=0; d 0) { + // Downsampling + numWavesDS.push_back(getNumWaves(K_r, imgSzX, imgSzY)); + } + if (d < depth - 1) { + // level processing + numWavesLP.push_back(getNumWaves(K_f, imgSzX, imgSzY)); + // Upsampling + numWavesUS.push_back(getNumWaves(K_e, imgSzX, imgSzY)); + } + imgSzX /= 2; imgSzY /= 2; + } + + assert((numWavesDS.size() == depth-1) && "number 
of waves in reduce does not match depth\n"); + assert((numWavesLP.size() == depth-1) && "number of waves in filter does not match depth\n"); + assert((numWavesUS.size() == depth-1) && "number of waves in expand does not match depth\n"); + + unsigned numWavesDSSum = std::accumulate(numWavesDS.begin(), numWavesDS.end(), 0); + unsigned numWavesLPSum = std::accumulate(numWavesLP.begin(), numWavesLP.end(), 0); + unsigned numWavesUSSum = std::accumulate(numWavesUS.begin(), numWavesUS.end(), 0); + + singleStream_str += " Reduce kernel " + K_r->getName() + " takes " + std::to_string(numWavesDSSum) + " waves\n"; + singleStream_str += " Filter kernel " + K_f->getName() + " takes " + std::to_string(numWavesLPSum) + " waves\n"; + singleStream_str += " Expand kernel " + K_e->getName() + " takes " + std::to_string(numWavesUSSum) + " waves\n"; + llvm::errs() << singleStream_str; + + // multi stream info + imgSzX = baseImgs.front()->getSizeX(); // assume same-sized base images + imgSzY = baseImgs.front()->getSizeY(); + multiStream_str += " -multi-stream wave estimation\n"; + + unsigned n_wave_seq_r = 0; + unsigned n_wave_seq_f = 0; + unsigned n_wave_expand = 0; + unsigned n_p = 0; + std::vector S_p; + + // downsampling + for (size_t d=0; dgetNumThreadsX(); + unsigned sy = K_f->getNumThreadsY(); + unsigned r_reg = std::max(K_f->getResourceUsageReg(), (unsigned)1); + unsigned r_smem = std::max(K_f->getResourceUsageSmem(), (unsigned)1); + unsigned k_blk = std::ceil((float)imgSzX / sx) * std::ceil((float)imgSzY / sy); + unsigned n_blk_reg = std::floor((float)max_total_registers / (r_reg * sx * sy)); + unsigned n_blk_smem = std::floor((float)max_total_shared_memory / r_smem); + unsigned n_blk_thr = std::floor((float)max_threads_per_multiprocessor / (sx*sy)); + unsigned n_blk = std::min({n_blk_reg, n_blk_smem, n_blk_thr, max_blocks_per_multiprocessor}); + + unsigned n_wave = std::ceil((float)k_blk / (n_blk * num_sms)); + unsigned n_blk_tail = k_blk % (n_blk * num_sms); + unsigned n_blk_tail_per_sm = std::ceil((float)n_blk_tail / num_sms); + + if (n_wave > 1) { // more than one wave, fine-grained level execution + resetResourceUsage(); + n_wave_seq_f += n_wave; + n_wave_seq_r += getNumWaves(K_r, imgSzX / 2, imgSzY / 2); + } else if (n_blk_tail == 0) { // exactly one wave, fine-grained level execution + resetResourceUsage(); + n_wave_seq_f += n_wave; + n_wave_seq_r += getNumWaves(K_r, imgSzX / 2, imgSzY / 2); + } else if (n_blk_tail > 0) { // less than one wave, coarse-grained level execution + PyrResource res; + res.reg = n_blk_tail_per_sm * (r_reg * sx * sy); + res.smem = n_blk_tail_per_sm * r_smem; + res.thr = n_blk_tail_per_sm * (sx * sy); + res.blk = n_blk_tail_per_sm; + + if (hasResourceAvailable(res)) { // resource is sufficient, same wave + n_p++; + } else { // resource not sufficient, new wave + S_p.push_back(n_p); + n_p = 0; + resetResourceUsage(); + } + useResource(res); + } + imgSzX /= 2; + imgSzY /= 2; + } + + // upsampling + imgSzX = baseImgs.front()->getSizeX(); // assume same-sized base images + imgSzY = baseImgs.front()->getSizeY(); + for (size_t d=0; dgetName() + " takes " + std::to_string(n_wave_seq_r + 1) + " waves\n"; + multiStream_str += " Filter kernel " + K_f->getName() + " takes " + std::to_string(n_wave_seq_f) + " waves\n"; + multiStream_str += " Coarse-grained level execution:\n"; + if (S_p.size()) { + multiStream_str += + " takes " + std::to_string(S_p.size() + 1) + " waves and "; + for (auto np : S_p) { + multiStream_str += std::to_string(np) + " "; + } + multiStream_str += 
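// Worked example for the wave estimate above (hypothetical numbers): a
// 2048x2048 level with 128x1 blocks (sx=128, sy=1), r_reg=32 registers per
// thread, negligible shared memory, and a device with 65536 registers,
// 2048 resident threads, and 32 resident blocks per SM across num_sms=46:
//   k_blk     = ceil(2048/128) * ceil(2048/1)  = 16 * 2048 = 32768
//   n_blk_reg = floor(65536 / (32 * 128 * 1))  = 16
//   n_blk_thr = floor(2048 / 128)              = 16
//   n_blk     = min(16, n_blk_smem, 16, 32)    = 16
//   n_wave    = ceil(32768 / (16 * 46))        = 45
// Halving the level extents divides k_blk by four, so deeper pyramid levels
// finish in ever fewer waves, which is what the multi-stream schedule exploits.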
std::to_string(n_p) + " parallel streams\n"; + } else { + multiStream_str += + " takes 1 wave and " + std::to_string(n_p) + " parallel streams\n"; + } + multiStream_str += " Expand execution: takes " + std::to_string(n_wave_expand) + " waves\n"; + llvm::errs() << multiStream_str; +} + + + // vim: set ts=2 sw=2 sts=2 et ai: diff --git a/lib/Rewrite/CreateHostStrings.cpp b/lib/Rewrite/CreateHostStrings.cpp index 9a618d06..7d2e777c 100644 --- a/lib/Rewrite/CreateHostStrings.cpp +++ b/lib/Rewrite/CreateHostStrings.cpp @@ -41,9 +41,15 @@ using namespace hipacc; void CreateHostStrings::writeHeaders(std::string &resultStr) { switch (options.getTargetLang()) { case Language::C99: - resultStr += "#include \"hipacc_cpu_standalone.hpp\"\n\n"; break; + resultStr += "#include \"hipacc_cpu_standalone.hpp\"\n\n"; + if (options.getUseFFT()) + resultStr += "#include \"hipacc_cpu_fftw.hpp\"\n\n"; + break; case Language::CUDA: - resultStr += "#include \"hipacc_cu_standalone.hpp\"\n\n"; break; + resultStr += "#include \"hipacc_cu_standalone.hpp\"\n\n"; + if (options.getUseFFT()) + resultStr += "#include \"hipacc_cu_cufft.hpp\"\n\n"; + break; case Language::OpenCLACC: case Language::OpenCLCPU: case Language::OpenCLGPU: @@ -260,11 +266,19 @@ void CreateHostStrings::writeMemoryTransfer(HipaccImage *Img, std::string mem, case HOST_TO_DEVICE: resultStr += "hipaccWriteMemory("; resultStr += Img->getName(); - resultStr += ", " + mem + ");"; + resultStr += ", " + mem; + if (options.asyncKernelLaunch()) { + resultStr += ", " + Img->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_HOST: resultStr += "hipaccReadMemory<" + Img->getTypeStr() + ">("; - resultStr += Img->getName() + ");"; + resultStr += Img->getName(); + if (options.asyncKernelLaunch()) { + resultStr += ", " + Img->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_DEVICE: resultStr += "hipaccCopyMemory("; @@ -284,11 +298,19 @@ void CreateHostStrings::writeMemoryTransfer(HipaccPyramid *Pyr, std::string idx, case HOST_TO_DEVICE: resultStr += "hipaccWriteMemory("; resultStr += Pyr->getName() + "(" + idx + ")"; - resultStr += ", " + mem + ");"; + resultStr += ", " + mem; + if (options.asyncKernelLaunch()) { + resultStr += ", " + Pyr->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_HOST: resultStr += "hipaccReadMemory<" + Pyr->getTypeStr() + ">("; - resultStr += Pyr->getName() + "(" + idx + "));"; + resultStr += Pyr->getName() + "(" + idx + ")"; + if (options.asyncKernelLaunch()) { + resultStr += ", " + Pyr->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_DEVICE: resultStr += "hipaccCopyMemory("; @@ -386,8 +408,91 @@ void CreateHostStrings::writeMemoryTransferDomainFromMask( } } +void CreateHostStrings::writeFFTConvolutionCall(HipaccKernel *K, + std::string &resultStr, + ASTContext &Context, bool fast) { + if (options.getUseFFT() && (options.getTargetLang() == Language::C99 || + options.getTargetLang() == Language::CUDA)) { + std::string precision = "double"; + if (fast) { + precision = "float"; + } + switch (options.getTargetLang()) { + case Language::C99: + resultStr += "fftwConvolution<" + precision + ">("; + break; + case Language::CUDA: + resultStr += "cufftConvolution_device<" + precision + ">("; + break; + default: + assert(false && "Convolution with FFT is only supported for C99 and CUDA"); + return; + } + // get Iterationspace + std::string iterName = K->getIterationSpace()->getName(); + // get Accessor + HipaccAccessor *Acc; + std::string accName; + for (auto img : K->getKernelClass()->getImgFields()) { + 
Acc = K->getImgFromMapping(img); + accName = Acc->getName(); + if (accName != iterName) + break; + } + // get Mask + HipaccMask *Mask; + std::string mskName; + for (auto msk : K->getKernelClass()->getMaskFields()) { + Mask = K->getMaskFromMapping(msk); + mskName = Mask->getName(); + if (Mask->getHostMemName().empty()) { + assert(false && "FFT Convolution without mask not supported"); + } + } + // put function arguments + resultStr += "(" + Acc->getImage()->getTypeStr() + " *)(" + Acc->getName() + + ".img->mem)"; + // resultStr += ", " + K->getIterationSpace()->getImage()->getSizeXStr() + ", + // " + K->getIterationSpace()->getImage()->getSizeYStr(); + resultStr += ", " + K->getIterationSpace()->getName() + ".width, " + + K->getIterationSpace()->getName() + ".height"; + resultStr += ", " + Mask->getHostMemName() + ", " + Mask->getSizeXStr() + + ", " + Mask->getSizeYStr(); + resultStr += ", (" + K->getIterationSpace()->getImage()->getTypeStr() + + " *)(" + K->getIterationSpace()->getName() + ".img->mem)"; + resultStr += ", " + std::to_string(device.alignment); + std::string boundary_linear; + if (Acc->getBoundaryMode() == Boundary::REPEAT) { + boundary_linear = "false"; // circular convolution + } else if (Acc->getBoundaryMode() == Boundary::CONSTANT) { + boundary_linear = "true"; // linear convolution + } else { + assert(false && "Only REPEAT and CONSTANT boundary modes supported"); + } + resultStr += ", " + boundary_linear; + if (Acc->getBoundaryMode() == Boundary::CONSTANT) { + std::string constVal = "0"; + clang::Expr::EvalResult Result; + if (Acc->getConstExpr()->EvaluateAsRValue(Result, Context)) { + constVal = Result.Val.getAsString(Context, K->getKernelClass()->getPixelType()); + } + resultStr += ", " + constVal; // boundary constant + } + + resultStr += ");\n"; + } else { + if (options.getUseFFT()) { + assert(false && "Convolution with FFT is only supported for C99 and CUDA"); + return; + } else { + assert(false && "Convolution with FFT can not be used without -use-fft switch"); + return; + } + } +} -void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) { +void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr, + HipaccPyramidPipeline *pyrPipe) { auto argTypeNames = K->getArgTypeNames(); auto deviceArgNames = K->getDeviceArgNames(); auto hostArgNames = K->getHostArgNames(); @@ -728,6 +833,13 @@ void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) } resultStr += "\n" + indent; + // insert cuda stream and wait event + if (options.emitCUDA() && options.asyncKernelLaunch() && K->hasWaitEvent()) { + resultStr += "cudaStreamWaitEvent(" + K->getStream() + ", "; + resultStr += K->getWaitEventStr() + ", 0);"; + resultStr += "\n" + indent; + } + // launch kernel if (options.exploreConfig() || options.timeKernels()) { switch (options.getTargetLang()) { @@ -805,6 +917,34 @@ void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) resultStr += ", " + gridStr; resultStr += ", " + blockStr; resultStr += ", _args" + kernel_name + ".data()"; + + if (options.asyncKernelLaunch()) { + if (pyrPipe) { // pyramid app + if (pyrPipe->isSingleStream()) { + resultStr += ", " + K->getStream(); + } else if (pyrPipe->isMultiStream()) { + switch (pyrPipe->getPyramidOperation(K->getDecl())) { + case PyramidOperation::REDUCE: + resultStr += ", stream[" + pyrPipe->getGlobalLevelStr() + " - 1]"; + resultStr += ", events[" + pyrPipe->getGlobalLevelStr() + " - 1]"; // wait event + resultStr += ", events[" + 
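// For reference, the host call assembled by writeFFTConvolutionCall above has
// the following shape for a CUDA target, -use-fft fast, a 5x5 mask, and
// Boundary::CONSTANT (the identifiers acc, is, and mask0 are placeholders):
//
//   cufftConvolution_device<float>((float *)(acc.img->mem), is.width,
//       is.height, mask0, 5, 5, (float *)(is.img->mem), 256, true, 0);
//
// where 256 is the device alignment, true selects linear (zero-padded rather
// than circular) convolution, and the trailing 0 is the evaluated boundary
// constant that is appended only for Boundary::CONSTANT.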
pyrPipe->getGlobalLevelStr() + "]"; // record event + break; + case PyramidOperation::FILTER: + resultStr += ", stream[" + pyrPipe->getGlobalLevelStr() + " - 1]"; + break; + case PyramidOperation::EXPAND: + resultStr += ", stream[" + pyrPipe->getGlobalLevelStr() + "]"; + resultStr += ", events[" + std::to_string(pyrPipe->getDepth()) + " * 2";// wait event + resultStr += " - " + pyrPipe->getGlobalLevelStr() + " - 2]"; + resultStr += ", events[" + std::to_string(pyrPipe->getDepth()) + " * 2";// record event + resultStr += " - " + pyrPipe->getGlobalLevelStr() + " - 1]"; + break; + } + } + } else { // non-pyramid app + resultStr += ", " + K->getStream(); + } + } resultStr += ");"; break; case Language::Renderscript: @@ -825,8 +965,14 @@ void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) break; } } -} + // insert cuda stream and record event + if (options.emitCUDA() && options.asyncKernelLaunch() && K->hasRecordEvent()) { + resultStr += "cudaEventRecord(" + K->getRecordEventStr() + ", "; + resultStr += K->getStream() + ");"; + resultStr += "\n" + indent; + } +} void CreateHostStrings::writeReduceCall(HipaccKernel *K, std::string &resultStr) { std::string typeStr(K->getIterationSpace()->getImage()->getTypeStr()); diff --git a/lib/Rewrite/Rewrite.cpp b/lib/Rewrite/Rewrite.cpp index 0f295033..266f10ec 100644 --- a/lib/Rewrite/Rewrite.cpp +++ b/lib/Rewrite/Rewrite.cpp @@ -100,6 +100,9 @@ class Rewrite : public ASTConsumer, public RecursiveASTVisitor { // store interpolation methods required for CUDA SmallVector InterpolationDefinitionsGlobal; + // pipeline global config for pyramid + HipaccPyramidPipeline *pyramidPipe; + // pointer to main function FunctionDecl *mainFD; FileID mainFileID; @@ -120,6 +123,7 @@ class Rewrite : public ASTConsumer, public RecursiveASTVisitor { builtins(CI.getASTContext()), stringCreator(CreateHostStrings(options, targetDevice)), compilerClasses(CompilerKnownClasses()), + pyramidPipe(nullptr), mainFD(nullptr), literalCount(0), skipTransfer(false) @@ -376,11 +380,52 @@ void Rewrite::HandleTranslationUnit(ASTContext &) { CompoundStmt *CS = dyn_cast(mainFD->getBody()); assert(CS->size() && "CompoundStmt has no statements."); - std::string initStr; + std::string initStr, cleanupStr; // get initialization string for run-time stringCreator.writeInitialization(initStr); + // print pyramid pipeline info for cuda + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch() && + pyramidPipe->isPipelineKernelLaunch()) { + pyramidPipe->printStreamPipelineInfo(); + } + + // write CUDA streams + if (compilerOptions.asyncKernelLaunch()) { + if (pyramidPipe->isSingleStream()) { + initStr += "cudaStream_t stream;"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "cudaStreamCreate(&stream);"; + initStr += "\n" + stringCreator.getIndent(); + } else if (pyramidPipe->isMultiStream()) { + initStr += "cudaStream_t stream["; + initStr += std::to_string(pyramidPipe->getDepth()) + " - 1];"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "for (int i = 0; i < "; + initStr += std::to_string(pyramidPipe->getDepth()) + " - 1; i++) {"; + initStr += "\n" + stringCreator.getIndent(); + initStr += " cudaStreamCreate(&stream[i]);"; + initStr += "\n" + stringCreator.getIndent() + "}"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "cudaEvent_t events["; + initStr += std::to_string(pyramidPipe->getDepth()) + " * 2];"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "for (int i = 0; i < "; + initStr += 
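// Hypothetical shape of the generated host code for one REDUCE launch at
// pyramid level l under multi-stream execution (the comment stands in for the
// kernel-launch string emitted above):
//
//   cudaStreamWaitEvent(stream[l - 1], events[l - 1], 0); // wait on producer
//   /* kernel launch on stream[l - 1] */
//   cudaEventRecord(events[l], stream[l - 1]);            // signal consumer
//
// FILTER launches reuse stream[l - 1] without events, while EXPAND launches
// run on stream[l] bracketed by the mirrored pair events[2*depth - l - 2]
// (wait) and events[2*depth - l - 1] (record).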
std::to_string(pyramidPipe->getDepth()) + " * 2; i++) {"; + initStr += "\n" + stringCreator.getIndent(); + initStr += " cudaEventCreate(&events[i]);"; + initStr += "\n" + stringCreator.getIndent() + "}"; + initStr += "\n" + stringCreator.getIndent(); + } else { + // + } + } + // load OpenCL kernel files and compile the OpenCL kernels if (!compilerOptions.exploreConfig()) { for (auto map : KernelDeclMap) @@ -411,6 +456,31 @@ void Rewrite::HandleTranslationUnit(ASTContext &) { // insert initialization before first statement TextRewriter.InsertTextBefore(CS->body_front()->getBeginLoc(), initStr); + // insert cleanup before last statement + if (compilerOptions.asyncKernelLaunch()) { + if (pyramidPipe->isSingleStream()) { + cleanupStr = "cudaStreamDestroy(stream);"; + cleanupStr += "\n" + stringCreator.getIndent(); + } else if (pyramidPipe->isMultiStream()) { + cleanupStr += "for (int i = 0; i < "; + cleanupStr += std::to_string(pyramidPipe->getDepth()) + " - 1; i++) {"; + cleanupStr += "\n" + stringCreator.getIndent(); + cleanupStr += " cudaStreamDestroy(stream[i]);"; + cleanupStr += "\n" + stringCreator.getIndent() + "}"; + cleanupStr += "\n" + stringCreator.getIndent(); + + cleanupStr += "for (int i = 0; i < "; + cleanupStr += std::to_string(pyramidPipe->getDepth()) + " * 2; i++) {"; + cleanupStr += "\n" + stringCreator.getIndent(); + cleanupStr += " cudaEventDestroy(events[i]);"; + cleanupStr += "\n" + stringCreator.getIndent() + "}"; + cleanupStr += "\n" + stringCreator.getIndent(); + } else { + // + } + TextRewriter.InsertTextBefore(CS->body_back()->getBeginLoc(), cleanupStr); + } + // get buffer of main file id. If we haven't changed it, then we are done. if (auto RewriteBuf = TextRewriter.getRewriteBufferFor(mainFileID)) { *Out << std::string(RewriteBuf->begin(), RewriteBuf->end()); @@ -627,6 +697,20 @@ bool Rewrite::VisitCXXRecordDecl(CXXRecordDecl *D) { KC->setBinningFunction(method); continue; } + + // pyramid functions + if (method->getNameAsString() == "_operatePyramidReduce") { + KC->setPyramidOperation(PyramidOperation::REDUCE); + continue; + } + if (method->getNameAsString() == "_operatePyramidFilter") { + KC->setPyramidOperation(PyramidOperation::FILTER); + continue; + } + if (method->getNameAsString() == "_operatePyramidExpand") { + KC->setPyramidOperation(PyramidOperation::EXPAND); + continue; + } } } @@ -683,7 +767,25 @@ bool Rewrite::VisitDeclStmt(DeclStmt *D) { std::string width_str = convertToString(CCE->getArg(0)); std::string height_str = convertToString(CCE->getArg(1)); - if (compilerOptions.emitC99()) { + if (compilerOptions.asyncKernelLaunch() && compilerOptions.emitCUDA()) { + // extract depth level from pyramid + unsigned IDConstant = Diags.getCustomDiagID(DiagnosticsEngine::Error, + "Constant expression for %0 argument of Image %1 required (Pyramid stream CUDA only)."); + if (!CCE->getArg(0)->isEvaluatable(Context)) { + Diags.Report(CCE->getArg(0)->getExprLoc(), IDConstant) << "width" + << Img->getName(); + } + if (!CCE->getArg(1)->isEvaluatable(Context)) { + Diags.Report(CCE->getArg(1)->getExprLoc(), IDConstant) << "height" + << Img->getName(); + } + + int64_t img_stride = CCE->getArg(0)->EvaluateKnownConstInt(Context).getSExtValue(); + int64_t img_height = CCE->getArg(1)->EvaluateKnownConstInt(Context).getSExtValue(); + + Img->setSizeX(img_stride); + Img->setSizeY(img_height); + } else if (compilerOptions.emitC99()) { // check if the parameter can be resolved to a constant unsigned IDConstant = Diags.getCustomDiagID(DiagnosticsEngine::Error, "Constant expression 
for %0 argument of Image %1 required (C/C++ only)."); @@ -746,6 +848,36 @@ bool Rewrite::VisitDeclStmt(DeclStmt *D) { std::string image_str = convertToString(CCE->getArg(0)); std::string depth_str = convertToString(CCE->getArg(1)); + if (compilerOptions.asyncKernelLaunch() && compilerOptions.emitCUDA()) { + // extract depth level from pyramid + unsigned IDConstant = Diags.getCustomDiagID(DiagnosticsEngine::Error, + "Constant expression for %0 argument of Pyramid %1 required (CUDA only)."); + if (!CCE->getArg(1)->isEvaluatable(Context)) { + Diags.Report(CCE->getArg(1)->getExprLoc(), IDConstant) << "depth" + << Pyr->getName(); + } + int64_t pyr_depth = CCE->getArg(1)->EvaluateKnownConstInt(Context).getSExtValue(); + Pyr->setDepth(pyr_depth); + + pyramidPipe->updateDepth(pyr_depth); + std::string pyr_name = VD->getName(); + pyramidPipe->setGlobalLevelStr(pyr_name + ".level()"); + + // TODO, generalize this to async h2d + auto arg = CCE->getArg(0)->IgnoreParenCasts(); + HipaccImage *Img = nullptr; + if (auto DRE = dyn_cast(arg)) { + // check if the argument specifies the image + if (ImgDeclMap.count(DRE->getDecl())) { + Img = ImgDeclMap[DRE->getDecl()]; + if (pyramidPipe->isMultiStream()) { + Img->setStream("stream[0]"); + } + pyramidPipe->recordBaseImg(Img); + } + } + } + // create memory allocation string std::string newStr; stringCreator.writePyramidAllocation(VD->getName(), @@ -1244,6 +1376,20 @@ bool Rewrite::VisitDeclStmt(DeclStmt *D) { HipaccKernel *K = new HipaccKernel(Context, VD, KC, compilerOptions); KernelDeclMap[VD] = K; + if (compilerOptions.asyncKernelLaunch() && compilerOptions.emitCUDA()) { + switch (KC->getPyramidOperation()) { + case PyramidOperation::REDUCE: + pyramidPipe->recordKernel(VD, K, PyramidOperation::REDUCE); + break; + case PyramidOperation::FILTER: + pyramidPipe->recordKernel(VD, K, PyramidOperation::FILTER); + break; + case PyramidOperation::EXPAND: + pyramidPipe->recordKernel(VD, K, PyramidOperation::EXPAND); + break; + } + } + // remove kernel declaration TextRewriter.RemoveText(D->getSourceRange()); @@ -1342,6 +1488,10 @@ bool Rewrite::VisitFunctionDecl(FunctionDecl *D) { assert(D->getBody() && "main function has no body."); assert(isa(D->getBody()) && "CompoundStmt for main body expected."); mainFD = D; + // TODO, triggle pyramid config + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch()) { + pyramidPipe = new HipaccPyramidPipeline(compilerOptions); + } } return true; @@ -1624,7 +1774,35 @@ bool Rewrite::VisitCXXMemberCallExpr(CXXMemberCallExpr *E) { // TODO: handle the case when only reduce function is specified // // create kernel call string - stringCreator.writeKernelCall(K, newStr); + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch() && + pyramidPipe->isPipelineKernelLaunch()) { + stringCreator.writeKernelCall(K, newStr, pyramidPipe); + } else { + stringCreator.writeKernelCall(K, newStr); + } + + // rewrite kernel invocation + replaceText(E->getBeginLoc(), E->getBeginLoc(), ';', newStr); + } + } else if (!KernelDeclMap.empty() && + E->getDirectCallee()->getNameAsString() == "convolveFFT") { + // get the user Kernel class + if (KernelDeclMap.count(DRE->getDecl())) { + HipaccKernel *K = KernelDeclMap[DRE->getDecl()]; + VarDecl *VD = K->getDecl(); + std::string newStr; + + // this was checked before, when the user class was parsed + CXXConstructExpr *CCE = dyn_cast(VD->getInit()); + assert(CCE->getNumArgs() == K->getKernelClass()->getMembers().size() && + "number of arguments doesn't match!"); + + // set host 
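// Hypothetical DSL fragment that this pyramid handling targets (names follow
// dsl/pyramid.hpp; the exact signatures are assumptions). With -emit-cuda and
// -use-stream, the diagnostics above require constant extents and depth:
//
//   Image<float> in(2048, 2048);   // width/height must be constant expressions
//   Pyramid<float> pyr(in, 5);     // depth must be a constant expression
//   traverse(pyr, [&] { /* per-level kernel executions */ });
//
// traverse(...) is rewritten to hipaccTraverse(...), and kernels whose classes
// define _operatePyramidReduce/_operatePyramidFilter/_operatePyramidExpand are
// recorded as REDUCE/FILTER/EXPAND for the stream schedule.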
argument names and retrieve literals stored to temporaries + K->setHostArgNames(llvm::makeArrayRef(CCE->getArgs(), + CCE->getNumArgs()), newStr, literalCount); + + bool fast = compilerOptions.getUseFFT(FAST); + stringCreator.writeFFTConvolutionCall(K, newStr, Context, fast); // rewrite kernel invocation replaceText(E->getBeginLoc(), E->getBeginLoc(), ';', newStr); @@ -1741,6 +1919,10 @@ bool Rewrite::VisitCallExpr (CallExpr *E) { SourceRange range(E->getBeginLoc(), E->getBeginLoc().getLocWithOffset(std::string("traverse").length()-1)); TextRewriter.ReplaceText(range, "hipaccTraverse"); + + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch()) { + pyramidPipe->enablePipelineKernelLaunch(); + } } } } diff --git a/runtime/cuda_complex.hpp b/runtime/cuda_complex.hpp new file mode 100644 index 00000000..633c7736 --- /dev/null +++ b/runtime/cuda_complex.hpp @@ -0,0 +1,1335 @@ +// An implementation of C++ std::complex for use on CUDA devices. +// Written by John C. Travers (2012) +// +// Missing: +// - long double support (not supported on CUDA) +// - some integral pow functions (due to lack of C++11 support on CUDA) +// +// Heavily derived from the LLVM libcpp project (svn revision 147853). +// Based on libcxx/include/complex. +// The git history contains the complete change history from the original. +// The modifications are licensed as per the original LLVM license below. +// +// -*- C++ -*- +//===--------------------------- complex ----------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef CUDA_COMPLEX_HPP +#define CUDA_COMPLEX_HPP + +#ifdef __CUDACC__ +#define CUDA_CALLABLE_MEMBER __host__ __device__ +#else +#define CUDA_CALLABLE_MEMBER +#endif + +/* + complex synopsis + +template +class complex +{ +public: + typedef T value_type; + + complex(const T& re = T(), const T& im = T()); + complex(const complex&); + template complex(const complex&); + + T real() const; + T imag() const; + + void real(T); + void imag(T); + + complex& operator= (const T&); + complex& operator+=(const T&); + complex& operator-=(const T&); + complex& operator*=(const T&); + complex& operator/=(const T&); + + complex& operator=(const complex&); + template complex& operator= (const complex&); + template complex& operator+=(const complex&); + template complex& operator-=(const complex&); + template complex& operator*=(const complex&); + template complex& operator/=(const complex&); +}; + +template<> +class complex +{ +public: + typedef float value_type; + + constexpr complex(float re = 0.0f, float im = 0.0f); + explicit constexpr complex(const complex&); + + constexpr float real() const; + void real(float); + constexpr float imag() const; + void imag(float); + + complex& operator= (float); + complex& operator+=(float); + complex& operator-=(float); + complex& operator*=(float); + complex& operator/=(float); + + complex& operator=(const complex&); + template complex& operator= (const complex&); + template complex& operator+=(const complex&); + template complex& operator-=(const complex&); + template complex& operator*=(const complex&); + template complex& operator/=(const complex&); +}; + +template<> +class complex +{ +public: + typedef double value_type; + + constexpr complex(double re = 0.0, double im = 0.0); + constexpr complex(const complex&); + + 
constexpr double real() const; + void real(double); + constexpr double imag() const; + void imag(double); + + complex& operator= (double); + complex& operator+=(double); + complex& operator-=(double); + complex& operator*=(double); + complex& operator/=(double); + complex& operator=(const complex&); + + template complex& operator= (const complex&); + template complex& operator+=(const complex&); + template complex& operator-=(const complex&); + template complex& operator*=(const complex&); + template complex& operator/=(const complex&); +}; + +// 26.3.6 operators: +template complex operator+(const complex&, const complex&); +template complex operator+(const complex&, const T&); +template complex operator+(const T&, const complex&); +template complex operator-(const complex&, const complex&); +template complex operator-(const complex&, const T&); +template complex operator-(const T&, const complex&); +template complex operator*(const complex&, const complex&); +template complex operator*(const complex&, const T&); +template complex operator*(const T&, const complex&); +template complex operator/(const complex&, const complex&); +template complex operator/(const complex&, const T&); +template complex operator/(const T&, const complex&); +template complex operator+(const complex&); +template complex operator-(const complex&); +template bool operator==(const complex&, const complex&); +template bool operator==(const complex&, const T&); +template bool operator==(const T&, const complex&); +template bool operator!=(const complex&, const complex&); +template bool operator!=(const complex&, const T&); +template bool operator!=(const T&, const complex&); + +template + basic_istream& + operator>>(basic_istream&, complex&); +template + basic_ostream& + operator<<(basic_ostream&, const complex&); + +// 26.3.7 values: + +template T real(const complex&); + double real(double); +template double real(T); + float real(float); + +template T imag(const complex&); + double imag(double); +template double imag(T); + float imag(float); + +template T abs(const complex&); + +template T arg(const complex&); + double arg(double); +template double arg(T); + float arg(float); + +template T norm(const complex&); + double norm(double); +template double norm(T); + float norm(float); + +template complex conj(const complex&); + complex conj(double); +template complex conj(T); + complex conj(float); + +template complex proj(const complex&); + complex proj(double); +template complex proj(T); + complex proj(float); + +template complex polar(const T&, const T& = 0); + +// 26.3.8 transcendentals: +template complex acos(const complex&); +template complex asin(const complex&); +template complex atan(const complex&); +template complex acosh(const complex&); +template complex asinh(const complex&); +template complex atanh(const complex&); +template complex cos (const complex&); +template complex cosh (const complex&); +template complex exp (const complex&); +template complex log (const complex&); +template complex log10(const complex&); + +template complex pow(const complex&, const T&); +template complex pow(const complex&, const complex&); +template complex pow(const T&, const complex&); + +template complex sin (const complex&); +template complex sinh (const complex&); +template complex sqrt (const complex&); +template complex tan (const complex&); +template complex tanh (const complex&); + +template + basic_istream& + operator>>(basic_istream& is, complex& x); + +template + basic_ostream& + operator<<(basic_ostream& o, const 
complex& x); + +*/ + +#include +#include + +template class complex; + +template complex<_Tp> operator*(const complex<_Tp>& __z, const complex<_Tp>& __w); +template complex<_Tp> operator/(const complex<_Tp>& __x, const complex<_Tp>& __y); + +template +class complex +{ +public: + typedef _Tp value_type; +private: + value_type __re_; + value_type __im_; +public: + CUDA_CALLABLE_MEMBER + complex(const value_type& __re = value_type(), const value_type& __im = value_type()) + : __re_(__re), __im_(__im) {} + template CUDA_CALLABLE_MEMBER + complex(const complex<_Xp>& __c) + : __re_(__c.real()), __im_(__c.imag()) {} + + CUDA_CALLABLE_MEMBER value_type real() const {return __re_;} + CUDA_CALLABLE_MEMBER value_type imag() const {return __im_;} + + CUDA_CALLABLE_MEMBER void real(value_type __re) {__re_ = __re;} + CUDA_CALLABLE_MEMBER void imag(value_type __im) {__im_ = __im;} + + CUDA_CALLABLE_MEMBER complex& operator= (const value_type& __re) + {__re_ = __re; __im_ = value_type(); return *this;} + CUDA_CALLABLE_MEMBER complex& operator+=(const value_type& __re) {__re_ += __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator-=(const value_type& __re) {__re_ -= __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator*=(const value_type& __re) {__re_ *= __re; __im_ *= __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator/=(const value_type& __re) {__re_ /= __re; __im_ /= __re; return *this;} + + template CUDA_CALLABLE_MEMBER complex& operator= (const complex<_Xp>& __c) + { + __re_ = __c.real(); + __im_ = __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator+=(const complex<_Xp>& __c) + { + __re_ += __c.real(); + __im_ += __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator-=(const complex<_Xp>& __c) + { + __re_ -= __c.real(); + __im_ -= __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator*=(const complex<_Xp>& __c) + { + *this = *this * __c; + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator/=(const complex<_Xp>& __c) + { + *this = *this / __c; + return *this; + } +}; + +template<> class complex; + +template<> +class complex +{ + float __re_; + float __im_; +public: + typedef float value_type; + + /*constexpr*/ CUDA_CALLABLE_MEMBER complex(float __re = 0.0f, float __im = 0.0f) + : __re_(__re), __im_(__im) {} + explicit /*constexpr*/ complex(const complex& __c); + + /*constexpr*/ CUDA_CALLABLE_MEMBER float real() const {return __re_;} + /*constexpr*/ CUDA_CALLABLE_MEMBER float imag() const {return __im_;} + + CUDA_CALLABLE_MEMBER void real(value_type __re) {__re_ = __re;} + CUDA_CALLABLE_MEMBER void imag(value_type __im) {__im_ = __im;} + + CUDA_CALLABLE_MEMBER complex& operator= (float __re) + {__re_ = __re; __im_ = value_type(); return *this;} + CUDA_CALLABLE_MEMBER complex& operator+=(float __re) {__re_ += __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator-=(float __re) {__re_ -= __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator*=(float __re) {__re_ *= __re; __im_ *= __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator/=(float __re) {__re_ /= __re; __im_ /= __re; return *this;} + + template CUDA_CALLABLE_MEMBER complex& operator= (const complex<_Xp>& __c) + { + __re_ = __c.real(); + __im_ = __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator+=(const complex<_Xp>& __c) + { + __re_ += __c.real(); + __im_ += __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator-=(const complex<_Xp>& __c) + { 
+ __re_ -= __c.real(); + __im_ -= __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator*=(const complex<_Xp>& __c) + { + *this = *this * __c; + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator/=(const complex<_Xp>& __c) + { + *this = *this / __c; + return *this; + } +}; + +template<> +class complex +{ + double __re_; + double __im_; +public: + typedef double value_type; + + /*constexpr*/ CUDA_CALLABLE_MEMBER complex(double __re = 0.0, double __im = 0.0) + : __re_(__re), __im_(__im) {} + /*constexpr*/ complex(const complex& __c); + + /*constexpr*/ CUDA_CALLABLE_MEMBER double real() const {return __re_;} + /*constexpr*/ CUDA_CALLABLE_MEMBER double imag() const {return __im_;} + + CUDA_CALLABLE_MEMBER void real(value_type __re) {__re_ = __re;} + CUDA_CALLABLE_MEMBER void imag(value_type __im) {__im_ = __im;} + + CUDA_CALLABLE_MEMBER complex& operator= (double __re) + {__re_ = __re; __im_ = value_type(); return *this;} + CUDA_CALLABLE_MEMBER complex& operator+=(double __re) {__re_ += __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator-=(double __re) {__re_ -= __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator*=(double __re) {__re_ *= __re; __im_ *= __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator/=(double __re) {__re_ /= __re; __im_ /= __re; return *this;} + + template CUDA_CALLABLE_MEMBER complex& operator= (const complex<_Xp>& __c) + { + __re_ = __c.real(); + __im_ = __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator+=(const complex<_Xp>& __c) + { + __re_ += __c.real(); + __im_ += __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator-=(const complex<_Xp>& __c) + { + __re_ -= __c.real(); + __im_ -= __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator*=(const complex<_Xp>& __c) + { + *this = *this * __c; + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator/=(const complex<_Xp>& __c) + { + *this = *this / __c; + return *this; + } +}; + +//constexpr +inline CUDA_CALLABLE_MEMBER +complex::complex(const complex& __c) + : __re_(__c.real()), __im_(__c.imag()) {} + +//constexpr +inline CUDA_CALLABLE_MEMBER +complex::complex(const complex& __c) + : __re_(__c.real()), __im_(__c.imag()) {} + +// 26.3.6 operators: + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator+(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__x); + __t += __y; + return __t; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator+(const complex<_Tp>& __x, const _Tp& __y) +{ + complex<_Tp> __t(__x); + __t += __y; + return __t; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator+(const _Tp& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__y); + __t += __x; + return __t; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator-(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__x); + __t -= __y; + return __t; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator-(const complex<_Tp>& __x, const _Tp& __y) +{ + complex<_Tp> __t(__x); + __t -= __y; + return __t; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator-(const _Tp& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(-__y); + __t += __x; + return __t; +} + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) +{ + _Tp __a = __z.real(); + _Tp __b = __z.imag(); + _Tp __c = __w.real(); + _Tp __d = __w.imag(); + _Tp 
__ac = __a * __c; + _Tp __bd = __b * __d; + _Tp __ad = __a * __d; + _Tp __bc = __b * __c; + _Tp __x = __ac - __bd; + _Tp __y = __ad + __bc; + if (isnan(__x) && isnan(__y)) + { + bool __recalc = false; + if (isinf(__a) || isinf(__b)) + { + __a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a); + __b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b); + if (isnan(__c)) + __c = copysign(_Tp(0), __c); + if (isnan(__d)) + __d = copysign(_Tp(0), __d); + __recalc = true; + } + if (isinf(__c) || isinf(__d)) + { + __c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c); + __d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d); + if (isnan(__a)) + __a = copysign(_Tp(0), __a); + if (isnan(__b)) + __b = copysign(_Tp(0), __b); + __recalc = true; + } + if (!__recalc && (isinf(__ac) || isinf(__bd) || + isinf(__ad) || isinf(__bc))) + { + if (isnan(__a)) + __a = copysign(_Tp(0), __a); + if (isnan(__b)) + __b = copysign(_Tp(0), __b); + if (isnan(__c)) + __c = copysign(_Tp(0), __c); + if (isnan(__d)) + __d = copysign(_Tp(0), __d); + __recalc = true; + } + if (__recalc) + { + __x = _Tp(INFINITY) * (__a * __c - __b * __d); + __y = _Tp(INFINITY) * (__a * __d + __b * __c); + } + } + return complex<_Tp>(__x, __y); +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator*(const complex<_Tp>& __x, const _Tp& __y) +{ + complex<_Tp> __t(__x); + __t *= __y; + return __t; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator*(const _Tp& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__y); + __t *= __x; + return __t; +} + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) +{ + int __ilogbw = 0; + _Tp __a = __z.real(); + _Tp __b = __z.imag(); + _Tp __c = __w.real(); + _Tp __d = __w.imag(); + _Tp __logbw = logb(fmax(fabs(__c), fabs(__d))); + if (isfinite(__logbw)) + { + __ilogbw = static_cast(__logbw); + __c = scalbn(__c, -__ilogbw); + __d = scalbn(__d, -__ilogbw); + } + _Tp __denom = __c * __c + __d * __d; + _Tp __x = scalbn((__a * __c + __b * __d) / __denom, -__ilogbw); + _Tp __y = scalbn((__b * __c - __a * __d) / __denom, -__ilogbw); + if (isnan(__x) && isnan(__y)) + { + if ((__denom == _Tp(0)) && (!isnan(__a) || !isnan(__b))) + { + __x = copysign(_Tp(INFINITY), __c) * __a; + __y = copysign(_Tp(INFINITY), __c) * __b; + } + else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) + { + __a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a); + __b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b); + __x = _Tp(INFINITY) * (__a * __c + __b * __d); + __y = _Tp(INFINITY) * (__b * __c - __a * __d); + } + else if (isinf(__logbw) && __logbw > _Tp(0) && isfinite(__a) && isfinite(__b)) + { + __c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c); + __d = copysign(isinf(__d) ? 
_Tp(1) : _Tp(0), __d); + __x = _Tp(0) * (__a * __c + __b * __d); + __y = _Tp(0) * (__b * __c - __a * __d); + } + } + return complex<_Tp>(__x, __y); +} + +template<> +CUDA_CALLABLE_MEMBER +complex +operator/(const complex& __z, const complex& __w) +{ + int __ilogbw = 0; + float __a = __z.real(); + float __b = __z.imag(); + float __c = __w.real(); + float __d = __w.imag(); + float __logbw = logbf(fmaxf(fabsf(__c), fabsf(__d))); + if (isfinite(__logbw)) + { + __ilogbw = static_cast(__logbw); + __c = scalbnf(__c, -__ilogbw); + __d = scalbnf(__d, -__ilogbw); + } + float __denom = __c * __c + __d * __d; + float __x = scalbnf((__a * __c + __b * __d) / __denom, -__ilogbw); + float __y = scalbnf((__b * __c - __a * __d) / __denom, -__ilogbw); + if (isnan(__x) && isnan(__y)) + { + if ((__denom == float(0)) && (!isnan(__a) || !isnan(__b))) + { +#pragma warning(suppress: 4756) // Ignore INFINITY related warning + __x = copysignf(INFINITY, __c) * __a; +#pragma warning(suppress: 4756) // Ignore INFINITY related warning + __y = copysignf(INFINITY, __c) * __b; + } + else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) + { + __a = copysignf(isinf(__a) ? float(1) : float(0), __a); + __b = copysignf(isinf(__b) ? float(1) : float(0), __b); +#pragma warning(suppress: 4756) // Ignore INFINITY related warning + __x = INFINITY * (__a * __c + __b * __d); +#pragma warning(suppress: 4756) // Ignore INFINITY related warning + __y = INFINITY * (__b * __c - __a * __d); + } + else if (isinf(__logbw) && __logbw > float(0) && isfinite(__a) && isfinite(__b)) + { + __c = copysignf(isinf(__c) ? float(1) : float(0), __c); + __d = copysignf(isinf(__d) ? float(1) : float(0), __d); + __x = float(0) * (__a * __c + __b * __d); + __y = float(0) * (__b * __c - __a * __d); + } + } + return complex(__x, __y); +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator/(const complex<_Tp>& __x, const _Tp& __y) +{ + return complex<_Tp>(__x.real() / __y, __x.imag() / __y); +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator/(const _Tp& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__x); + __t /= __y; + return __t; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator+(const complex<_Tp>& __x) +{ + return __x; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator-(const complex<_Tp>& __x) +{ + return complex<_Tp>(-__x.real(), -__x.imag()); +} + +template +inline CUDA_CALLABLE_MEMBER +bool +operator==(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + return __x.real() == __y.real() && __x.imag() == __y.imag(); +} + +template +inline CUDA_CALLABLE_MEMBER +bool +operator==(const complex<_Tp>& __x, const _Tp& __y) +{ + return __x.real() == __y && __x.imag() == 0; +} + +template +inline CUDA_CALLABLE_MEMBER +bool +operator==(const _Tp& __x, const complex<_Tp>& __y) +{ + return __x == __y.real() && 0 == __y.imag(); +} + +template +inline CUDA_CALLABLE_MEMBER +bool +operator!=(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + return !(__x == __y); +} + +template +inline CUDA_CALLABLE_MEMBER +bool +operator!=(const complex<_Tp>& __x, const _Tp& __y) +{ + return !(__x == __y); +} + +template +inline CUDA_CALLABLE_MEMBER +bool +operator!=(const _Tp& __x, const complex<_Tp>& __y) +{ + return !(__x == __y); +} + +// 26.3.7 values: + +// real + +template +inline CUDA_CALLABLE_MEMBER +_Tp +real(const complex<_Tp>& __c) +{ + return __c.real(); +} + +inline CUDA_CALLABLE_MEMBER +double +real(double __re) +{ + return __re; +} + +inline CUDA_CALLABLE_MEMBER +float 
+real(float __re) +{ + return __re; +} + +// imag + +template +inline CUDA_CALLABLE_MEMBER +_Tp +imag(const complex<_Tp>& __c) +{ + return __c.imag(); +} + +inline CUDA_CALLABLE_MEMBER +double +imag(double __re) +{ + return 0; +} + +inline CUDA_CALLABLE_MEMBER +float +imag(float __re) +{ + return 0; +} + +// abs + +template +inline CUDA_CALLABLE_MEMBER +_Tp +abs(const complex<_Tp>& __c) +{ + return hypot(__c.real(), __c.imag()); +} + +// arg + +template +inline CUDA_CALLABLE_MEMBER +_Tp +arg(const complex<_Tp>& __c) +{ + return atan2(__c.imag(), __c.real()); +} + +inline CUDA_CALLABLE_MEMBER +double +arg(double __re) +{ + return atan2(0., __re); +} + +inline CUDA_CALLABLE_MEMBER +float +arg(float __re) +{ + return atan2f(0.F, __re); +} + +// norm + +template +inline CUDA_CALLABLE_MEMBER +_Tp +norm(const complex<_Tp>& __c) +{ + if (isinf(__c.real())) + return fabs(__c.real()); + if (isinf(__c.imag())) + return fabs(__c.imag()); + return __c.real() * __c.real() + __c.imag() * __c.imag(); +} + +inline CUDA_CALLABLE_MEMBER +double +norm(double __re) +{ + return __re * __re; +} + +inline CUDA_CALLABLE_MEMBER +float +norm(float __re) +{ + return __re * __re; +} + +// conj + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +conj(const complex<_Tp>& __c) +{ + return complex<_Tp>(__c.real(), -__c.imag()); +} + +inline CUDA_CALLABLE_MEMBER +complex +conj(double __re) +{ + return complex(__re); +} + +inline CUDA_CALLABLE_MEMBER +complex +conj(float __re) +{ + return complex(__re); +} + +// proj + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +proj(const complex<_Tp>& __c) +{ + complex<_Tp> __r = __c; + if (isinf(__c.real()) || isinf(__c.imag())) + __r = complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag())); + return __r; +} + +inline CUDA_CALLABLE_MEMBER +complex +proj(double __re) +{ + if (isinf(__re)) + __re = fabs(__re); + return complex(__re); +} + +inline CUDA_CALLABLE_MEMBER +complex +proj(float __re) +{ + if (isinf(__re)) + __re = fabs(__re); + return complex(__re); +} + +// polar + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +polar(const _Tp& __rho, const _Tp& __theta = _Tp(0)) +{ + if (isnan(__rho) || signbit(__rho)) + return complex<_Tp>(_Tp(NAN), _Tp(NAN)); + if (isnan(__theta)) + { + if (isinf(__rho)) + return complex<_Tp>(__rho, __theta); + return complex<_Tp>(__theta, __theta); + } + if (isinf(__theta)) + { + if (isinf(__rho)) + return complex<_Tp>(__rho, _Tp(NAN)); + return complex<_Tp>(_Tp(NAN), _Tp(NAN)); + } + _Tp __x = __rho * cos(__theta); + if (isnan(__x)) + __x = 0; + _Tp __y = __rho * sin(__theta); + if (isnan(__y)) + __y = 0; + return complex<_Tp>(__x, __y); +} + +// log + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +log(const complex<_Tp>& __x) +{ + return complex<_Tp>(log(abs(__x)), arg(__x)); +} + +// log10 + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +log10(const complex<_Tp>& __x) +{ + return log(__x) / log(_Tp(10)); +} + +// sqrt + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +sqrt(const complex<_Tp>& __x) +{ + if (isinf(__x.imag())) + return complex<_Tp>(_Tp(INFINITY), __x.imag()); + if (isinf(__x.real())) + { + if (__x.real() > _Tp(0)) + return complex<_Tp>(__x.real(), isnan(__x.imag()) ? __x.imag() : copysign(_Tp(0), __x.imag())); + return complex<_Tp>(isnan(__x.imag()) ? 
__x.imag() : _Tp(0), copysign(__x.real(), __x.imag())); + } + return polar(sqrt(abs(__x)), arg(__x) / _Tp(2)); +} + +// exp + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +exp(const complex<_Tp>& __x) +{ + _Tp __i = __x.imag(); + if (isinf(__x.real())) + { + if (__x.real() < _Tp(0)) + { + if (!isfinite(__i)) + __i = _Tp(1); + } + else if (__i == 0 || !isfinite(__i)) + { + if (isinf(__i)) + __i = _Tp(NAN); + return complex<_Tp>(__x.real(), __i); + } + } + else if (isnan(__x.real()) && __x.imag() == 0) + return __x; + _Tp __e = exp(__x.real()); + return complex<_Tp>(__e * cos(__i), __e * sin(__i)); +} + +// pow + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +pow(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + return exp(__y * log(__x)); +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +pow(const complex<_Tp>& __x, const _Tp& __y) +{ + return pow(__x, complex<_Tp>(__y)); +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +pow(const _Tp& __x, const complex<_Tp>& __y) +{ + return pow(complex<_Tp>(__x), __y); +} + +// asinh + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +asinh(const complex<_Tp>& __x) +{ + const _Tp __pi(atan2(+0., -0.)); + if (isinf(__x.real())) + { + if (isnan(__x.imag())) + return __x; + if (isinf(__x.imag())) + return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag())); + return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag())); + } + if (isnan(__x.real())) + { + if (isinf(__x.imag())) + return complex<_Tp>(__x.imag(), __x.real()); + if (__x.imag() == 0) + return __x; + return complex<_Tp>(__x.real(), __x.real()); + } + if (isinf(__x.imag())) + return complex<_Tp>(copysign(__x.imag(), __x.real()), copysign(__pi/_Tp(2), __x.imag())); + complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) + _Tp(1))); + return complex<_Tp>(copysign(__z.real(), __x.real()), copysign(__z.imag(), __x.imag())); +} + +// acosh + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +acosh(const complex<_Tp>& __x) +{ + const _Tp __pi(atan2(+0., -0.)); + if (isinf(__x.real())) + { + if (isnan(__x.imag())) + return complex<_Tp>(fabs(__x.real()), __x.imag()); + if (isinf(__x.imag())) + if (__x.real() > 0) + return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag())); + else + return complex<_Tp>(-__x.real(), copysign(__pi * _Tp(0.75), __x.imag())); + if (__x.real() < 0) + return complex<_Tp>(-__x.real(), copysign(__pi, __x.imag())); + return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag())); + } + if (isnan(__x.real())) + { + if (isinf(__x.imag())) + return complex<_Tp>(fabs(__x.imag()), __x.real()); + return complex<_Tp>(__x.real(), __x.real()); + } + if (isinf(__x.imag())) + return complex<_Tp>(fabs(__x.imag()), copysign(__pi/_Tp(2), __x.imag())); + complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1))); + return complex<_Tp>(copysign(__z.real(), _Tp(0)), copysign(__z.imag(), __x.imag())); +} + +// atanh + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +atanh(const complex<_Tp>& __x) +{ + const _Tp __pi(atan2(+0., -0.)); + if (isinf(__x.imag())) + { + return complex<_Tp>(copysign(_Tp(0), __x.real()), copysign(__pi/_Tp(2), __x.imag())); + } + if (isnan(__x.imag())) + { + if (isinf(__x.real()) || __x.real() == 0) + return complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag()); + return complex<_Tp>(__x.imag(), __x.imag()); + } + if (isnan(__x.real())) + { + return complex<_Tp>(__x.real(), __x.real()); + } + if (isinf(__x.real())) + { + return complex<_Tp>(copysign(_Tp(0), __x.real()), copysign(__pi/_Tp(2), __x.imag())); + } + if 
(fabs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) + { + return complex<_Tp>(copysign(_Tp(INFINITY), __x.real()), copysign(_Tp(0), __x.imag())); + } + complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2); + return complex<_Tp>(copysign(__z.real(), __x.real()), copysign(__z.imag(), __x.imag())); +} + +// sinh + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +sinh(const complex<_Tp>& __x) +{ + if (isinf(__x.real()) && !isfinite(__x.imag())) + return complex<_Tp>(__x.real(), _Tp(NAN)); + if (__x.real() == 0 && !isfinite(__x.imag())) + return complex<_Tp>(__x.real(), _Tp(NAN)); + if (__x.imag() == 0 && !isfinite(__x.real())) + return __x; + return complex<_Tp>(sinh(__x.real()) * cos(__x.imag()), cosh(__x.real()) * sin(__x.imag())); +} + +// cosh + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +cosh(const complex<_Tp>& __x) +{ + if (isinf(__x.real()) && !isfinite(__x.imag())) + return complex<_Tp>(fabs(__x.real()), _Tp(NAN)); + if (__x.real() == 0 && !isfinite(__x.imag())) + return complex<_Tp>(_Tp(NAN), __x.real()); + if (__x.real() == 0 && __x.imag() == 0) + return complex<_Tp>(_Tp(1), __x.imag()); + if (__x.imag() == 0 && !isfinite(__x.real())) + return complex<_Tp>(fabs(__x.real()), __x.imag()); + return complex<_Tp>(cosh(__x.real()) * cos(__x.imag()), sinh(__x.real()) * sin(__x.imag())); +} + +// tanh + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +tanh(const complex<_Tp>& __x) +{ + if (isinf(__x.real())) + { + if (!isfinite(__x.imag())) + return complex<_Tp>(_Tp(1), _Tp(0)); + return complex<_Tp>(_Tp(1), copysign(_Tp(0), sin(_Tp(2) * __x.imag()))); + } + if (isnan(__x.real()) && __x.imag() == 0) + return __x; + _Tp __2r(_Tp(2) * __x.real()); + _Tp __2i(_Tp(2) * __x.imag()); + _Tp __d(cosh(__2r) + cos(__2i)); + return complex<_Tp>(sinh(__2r)/__d, sin(__2i)/__d); +} + +// asin + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +asin(const complex<_Tp>& __x) +{ + complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real())); + return complex<_Tp>(__z.imag(), -__z.real()); +} + +// acos + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +acos(const complex<_Tp>& __x) +{ + const _Tp __pi(atan2(+0., -0.)); + if (isinf(__x.real())) + { + if (isnan(__x.imag())) + return complex<_Tp>(__x.imag(), __x.real()); + if (isinf(__x.imag())) + { + if (__x.real() < _Tp(0)) + return complex<_Tp>(_Tp(0.75) * __pi, -__x.imag()); + return complex<_Tp>(_Tp(0.25) * __pi, -__x.imag()); + } + if (__x.real() < _Tp(0)) + return complex<_Tp>(__pi, signbit(__x.imag()) ? -__x.real() : __x.real()); + return complex<_Tp>(_Tp(0), signbit(__x.imag()) ? 
__x.real() : -__x.real()); + } + if (isnan(__x.real())) + { + if (isinf(__x.imag())) + return complex<_Tp>(__x.real(), -__x.imag()); + return complex<_Tp>(__x.real(), __x.real()); + } + if (isinf(__x.imag())) + return complex<_Tp>(__pi/_Tp(2), -__x.imag()); + if (__x.real() == 0) + return complex<_Tp>(__pi/_Tp(2), -__x.imag()); + complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1))); + if (signbit(__x.imag())) + return complex<_Tp>(fabs(__z.imag()), fabs(__z.real())); + return complex<_Tp>(fabs(__z.imag()), -fabs(__z.real())); +} + +// atan + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +atan(const complex<_Tp>& __x) +{ + complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real())); + return complex<_Tp>(__z.imag(), -__z.real()); +} + +// sin + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +sin(const complex<_Tp>& __x) +{ + complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real())); + return complex<_Tp>(__z.imag(), -__z.real()); +} + +// cos + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +cos(const complex<_Tp>& __x) +{ + return cosh(complex<_Tp>(-__x.imag(), __x.real())); +} + +// tan + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +tan(const complex<_Tp>& __x) +{ + complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real())); + return complex<_Tp>(__z.imag(), -__z.real()); +} + +template +std::basic_istream<_CharT, _Traits>& +operator>>(std::basic_istream<_CharT, _Traits>& __is, complex<_Tp>& __x) +{ + if (__is.good()) + { + ws(__is); + if (__is.peek() == _CharT('(')) + { + __is.get(); + _Tp __r; + __is >> __r; + if (!__is.fail()) + { + ws(__is); + _CharT __c = __is.peek(); + if (__c == _CharT(',')) + { + __is.get(); + _Tp __i; + __is >> __i; + if (!__is.fail()) + { + ws(__is); + __c = __is.peek(); + if (__c == _CharT(')')) + { + __is.get(); + __x = complex<_Tp>(__r, __i); + } + else + __is.setstate(std::ios_base::failbit); + } + else + __is.setstate(std::ios_base::failbit); + } + else if (__c == _CharT(')')) + { + __is.get(); + __x = complex<_Tp>(__r, _Tp(0)); + } + else + __is.setstate(std::ios_base::failbit); + } + else + __is.setstate(std::ios_base::failbit); + } + else + { + _Tp __r; + __is >> __r; + if (!__is.fail()) + __x = complex<_Tp>(__r, _Tp(0)); + else + __is.setstate(std::ios_base::failbit); + } + } + else + __is.setstate(std::ios_base::failbit); + return __is; +} + +template +std::basic_ostream<_CharT, _Traits>& +operator<<(std::basic_ostream<_CharT, _Traits>& __os, const complex<_Tp>& __x) +{ + std::basic_ostringstream<_CharT, _Traits> __s; + __s.flags(__os.flags()); + __s.imbue(__os.getloc()); + __s.precision(__os.precision()); + __s << '(' << __x.real() << ',' << __x.imag() << ')'; + return __os << __s.str(); +} + +//} // close namespace cuda_complex + +#endif // CUDA_COMPLEX_HPP diff --git a/runtime/hipacc_cpu_fftw.hpp b/runtime/hipacc_cpu_fftw.hpp new file mode 100644 index 00000000..b048ffe0 --- /dev/null +++ b/runtime/hipacc_cpu_fftw.hpp @@ -0,0 +1,604 @@ +#include +#include + +#include "hipacc_fft_helper.hpp" + +#include +#include +#include + +#include +#include +#include + +template struct FFT {}; +// fftw wrapper functinos for single precision +template <> struct FFT { + template static void *fft_malloc(TArgs... args) { + return fftwf_malloc(args...); + } + template + static fftwf_plan fft_plan_dft_r2c_2d(TArgs... args) { + return fftwf_plan_dft_r2c_2d(args...); + } + template + static fftwf_plan fft_plan_dft_c2r_2d(TArgs... args) { + return fftwf_plan_dft_c2r_2d(args...); + } + template static fftwf_plan fft_plan_r2r_2d(TArgs... 
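+  // [Editor's note] FFT<float> here and FFT<double> below forward to the
+  // fftwf_* and fftw_* entry points respectively, so the routines in this
+  // file can stay precision-agnostic via FFT<TPrecision>::... calls.
+  // A minimal sketch of the pattern (hypothetical sizes and buffers):
+  //
+  //   float *buf = (float *)FFT<float>::fft_malloc(sizeof(float) * n);
+  //   fftwf_plan p = FFT<float>::fft_plan_dft_r2c_2d(h, w, buf, spec,
+  //                                                  FFTW_ESTIMATE);
+  //   FFT<float>::fft_execute(p);
+  //   FFT<float>::fft_destroy_plan(p);
+  //   FFT<float>::fft_free(buf);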
args) {
+    return fftwf_plan_r2r_2d(args...);
+  }
+  template <typename... TArgs> static void fft_execute(TArgs... args) {
+    fftwf_execute(args...);
+  }
+  template <typename... TArgs> static void fft_destroy_plan(TArgs... args) {
+    fftwf_destroy_plan(args...);
+  }
+  template <typename... TArgs> static void fft_free(TArgs... args) {
+    fftwf_free(args...);
+  }
+  template <typename... TArgs> static void fft_flops(TArgs... args) {
+    fftwf_flops(args...);
+  }
+  template <typename... TArgs> static void fft_init_threads(TArgs... args) {
+    fftwf_init_threads(args...);
+  }
+  template <typename... TArgs> static void fft_set_timelimit(TArgs... args) {
+    fftwf_set_timelimit(args...);
+  }
+  template <typename... TArgs> static void fft_plan_with_nthreads(TArgs... args) {
+    fftwf_plan_with_nthreads(args...);
+  }
+  template <typename... TArgs> static void fft_cleanup_threads(TArgs... args) {
+    fftwf_cleanup_threads(args...);
+  }
+};
+// fftw wrapper functions for double precision
+template <> struct FFT<double> {
+  template <typename... TArgs> static void *fft_malloc(TArgs... args) {
+    return fftw_malloc(args...);
+  }
+  template <typename... TArgs>
+  static fftw_plan fft_plan_dft_r2c_2d(TArgs... args) {
+    return fftw_plan_dft_r2c_2d(args...);
+  }
+  template <typename... TArgs>
+  static fftw_plan fft_plan_dft_c2r_2d(TArgs... args) {
+    return fftw_plan_dft_c2r_2d(args...);
+  }
+  template <typename... TArgs> static fftw_plan fft_plan_r2r_2d(TArgs... args) {
+    return fftw_plan_r2r_2d(args...);
+  }
+  template <typename... TArgs> static void fft_execute(TArgs... args) {
+    fftw_execute(args...);
+  }
+  template <typename... TArgs> static void fft_destroy_plan(TArgs... args) {
+    fftw_destroy_plan(args...);
+  }
+  template <typename... TArgs> static void fft_free(TArgs... args) {
+    fftw_free(args...);
+  }
+  template <typename... TArgs> static void fft_flops(TArgs... args) {
+    fftw_flops(args...);
+  }
+  template <typename... TArgs> static void fft_init_threads(TArgs... args) {
+    fftw_init_threads(args...);
+  }
+  template <typename... TArgs> static void fft_set_timelimit(TArgs... args) {
+    fftw_set_timelimit(args...);
+  }
+  template <typename... TArgs> static void fft_plan_with_nthreads(TArgs... args) {
+    fftw_plan_with_nthreads(args...);
+  }
+  template <typename... TArgs> static void fft_cleanup_threads(TArgs... args) {
+    fftw_cleanup_threads(args...);
+  }
+};
+
+template <typename TPrecision, typename V>
+void fftConvolve(TPrecision *in, int width, int height, V *k, int k_w, int k_h,
+                 TPrecision *out) {
+  typedef
+      typename std::conditional<std::is_same<TPrecision, float>::value,
+                                fftwf_complex, fftw_complex>::type fft_complex;
+  typedef typename std::conditional<std::is_same<TPrecision, float>::value,
+                                    fftwf_plan, fftw_plan>::type fft_plan;
+  bool floatPrecision = false;
+  if (std::is_same<TPrecision, float>::value) {
+    floatPrecision = true;
+  }
+
+  FFT<TPrecision>::fft_init_threads();
+  FFT<TPrecision>::fft_plan_with_nthreads(2);
+  const unsigned int PLANNER_FLAG = FFTW_MEASURE; // FFTW_ESTIMATE or FFTW_MEASURE
+  FFT<TPrecision>::fft_set_timelimit(0.001);
+
+  int image_width = width;
+  int image_height = height;
+  int matsize = image_width * image_height;
+  int intermediateSize = image_height * (image_width / 2 + 1);
+
+  // setup buffers
+  TPrecision *kernel =
+      (TPrecision *)FFT<TPrecision>::fft_malloc(sizeof(TPrecision) * matsize);
+  fft_complex *image_fft = (fft_complex *)FFT<TPrecision>::fft_malloc(
+      sizeof(fft_complex) * intermediateSize);
+  fft_complex *kernel_fft = (fft_complex *)FFT<TPrecision>::fft_malloc(
+      sizeof(fft_complex) * intermediateSize);
+
+  memset(kernel, 0, sizeof(TPrecision) * matsize);
+
+  // prepare kernel
+  putKernel(k, kernel, k_w, k_h, image_width, image_height);
+
+  // create plans
+  fft_plan plan_image = FFT<TPrecision>::fft_plan_dft_r2c_2d(
+      image_height, image_width, in, image_fft, PLANNER_FLAG);
+  fft_plan plan_kernel = FFT<TPrecision>::fft_plan_dft_r2c_2d(
+      image_height, image_width, kernel, kernel_fft, PLANNER_FLAG);
+  fft_plan plan_inverse = FFT<TPrecision>::fft_plan_dft_c2r_2d(
+      image_height, image_width, image_fft, out, PLANNER_FLAG);
+
+  // start
+  auto start_time = std::chrono::system_clock::now();
+
+  //#pragma omp parallel sections
+  {
+    //#pragma omp section
+    { FFT<TPrecision>::fft_execute(plan_image); }
+    //#pragma omp section
+    { FFT<TPrecision>::fft_execute(plan_kernel); }
+  }
+
+// Pointwise Multiplication in Frequency Domain
+#pragma omp parallel for
+  for (int ind = 0; ind < intermediateSize; ++ind) {
+    reinterpret_cast<std::complex<TPrecision> *>(image_fft)[ind] *=
+        reinterpret_cast<std::complex<TPrecision> *>(kernel_fft)[ind];
+  }
+
+  FFT<TPrecision>::fft_execute(plan_inverse);
+
+// scale
+#pragma omp parallel for
+  for (int ind = 0; ind < matsize; ++ind) {
+    out[ind] /= (image_height * image_width);
+  }
+
+  // stop
+  auto end_time = std::chrono::system_clock::now();
+  auto elapsed_time = std::chrono::duration_cast<std::chrono::microseconds>(
+      end_time - start_time);
+  std::cout << "h: " << height << " w: " << width << std::endl;
+  std::cout << (float)elapsed_time.count() / 1000.0 << " ms" << std::endl;
+
+  // cleanup
+  FFT<TPrecision>::fft_destroy_plan(plan_image);
+  FFT<TPrecision>::fft_destroy_plan(plan_kernel);
+  FFT<TPrecision>::fft_destroy_plan(plan_inverse);
+  FFT<TPrecision>::fft_free(kernel);
+  FFT<TPrecision>::fft_free(image_fft);
+  FFT<TPrecision>::fft_free(kernel_fft);
+  FFT<TPrecision>::fft_cleanup_threads();
+}
+
+bool estimateConvolutionExecutionTime(int width, int height, int padWidth,
+                                      int padHeight, int k_w, int k_h,
+                                      bool linear) {
+  // assumed 4 GFLOPS per core
+  const double FLOPMS = 4000000; // FLOP per ms
+  const double THREADS = 4;
+
+  double N1 = padWidth;
+  double N2 = padHeight;
+
+  double singleFFTcost = 7.0 * N1 * N2 * log2(N1 * N2);
+
+  // double FFTConvolutionCost1 = (singleFFTcost1 * 3 + (N1*N2*(3+1))) / THREADS;
+  // double FFTConvolutionCost2 = (singleFFTcost2 * 3 + (N1*N2*(3+1))) / THREADS;
+
+  double FFTConvolutionCost = (singleFFTcost * 3 + (N1 * N2 * (3 + 1))) / THREADS;
+  std::cout << "FFTConvolutionCost: " << FFTConvolutionCost / FLOPMS << std::endl;
+
+  double flopPerPixelInHipaccKernel; // depends on border condition
+  if (linear) {
+    flopPerPixelInHipaccKernel = 4;
+  } else {
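+    // [Editor's note] Worked example of this cost model (illustrative only):
+    // for a 1024x1024 image and a 5x5 kernel,
+    //   singleFFTcost      = 7 * 2^20 * 20           ~= 1.47e8 FLOP
+    //   FFTConvolutionCost = (3*1.47e8 + 4*2^20) / 4 ~= 1.11e8 FLOP (~27.8 ms)
+    //   simpleHipaccCost   = 2^20 * 25 * (2 + 12)    ~= 3.67e8 FLOP (~91.8 ms)
+    // so the comparison below picks the FFT path. Note that the comparison
+    // uses simpleHipaccCost rather than the parallelHipaccCost derived below.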
+    flopPerPixelInHipaccKernel = 12;
+  }
+  double simpleHipaccCost =
+      (height * width) * (k_w * k_h) * (2 + flopPerPixelInHipaccKernel);
+  double parallelHipaccCost = simpleHipaccCost / THREADS;
+  std::cout << "simpleHipaccCost: " << simpleHipaccCost / FLOPMS << std::endl;
+
+  if (FFTConvolutionCost < simpleHipaccCost) {
+    std::cout << "FFT Convolution will be faster" << std::endl;
+    return true;
+  } else {
+    std::cout << "Hipacc Kernel will be faster" << std::endl;
+    return false;
+  }
+}
+
+template <typename TPrecision, typename T, typename U, typename V,
+          typename B = int, size_t rows, size_t cols>
+void fftwConvolution(T *in, int width, int height, const V (&kernel)[rows][cols],
+                     int k_w, int k_h, U *out, int alignment, bool linear,
+                     B boundaryConstant = 0) {
+  assert(rows == k_h && cols == k_w);
+
+  int width_in = alignedWidth(width, alignment);
+  int width_out = alignedWidth(width, alignment);
+
+  int padWidth = width;
+  int padHeight = height;
+  if (linear) {
+    // required padding for linear convolution
+    padWidth = width + k_w / 2;
+    padHeight = height + k_h / 2;
+    // additional padding for performance
+    padWidth = upperPowerOfTwo(padWidth); // width has more influence
+    padHeight = nextDiv(padHeight, 8);
+  }
+  int padSize = padWidth * padHeight;
+
+  /*
+  estimateConvolutionExecutionTime(width, height, padWidth, padHeight, k_w, k_h,
+                                   linear);
+  */
+
+  // prepare input buffer
+  TPrecision *input = new TPrecision[padSize];
+  if (linear) {
+    std::fill_n(input, padSize, (TPrecision)boundaryConstant);
+  }
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      input[y * padWidth + x] = (TPrecision)in[y * width_in + x];
+    }
+  }
+  // prepare output buffer
+  TPrecision *output = new TPrecision[padSize];
+  fftConvolve(input, padWidth, padHeight, (V *)(&kernel[0][0]), k_w, k_h, output);
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      out[y * width_out + x] = (U)output[y * padWidth + x];
+    }
+  }
+
+  delete[] input; // new[]-allocated buffers must be released with delete[]
+  delete[] output;
+}
+
+template <typename TPrecision>
+void fftTransform(TPrecision *in, int width, int height, TPrecision *out,
+                  bool forward = true, bool scale = false) {
+  typedef
+      typename std::conditional<std::is_same<TPrecision, float>::value,
+                                fftwf_complex, fftw_complex>::type fft_complex;
+  typedef typename std::conditional<std::is_same<TPrecision, float>::value,
+                                    fftwf_plan, fftw_plan>::type fft_plan;
+  bool floatPrecision = false;
+  if (std::is_same<TPrecision, float>::value) {
+    floatPrecision = true;
+  }
+
+  FFT<TPrecision>::fft_init_threads();
+  FFT<TPrecision>::fft_plan_with_nthreads(2);
+  const unsigned int PLANNER_FLAG = FFTW_MEASURE; // FFTW_ESTIMATE or FFTW_MEASURE
+  FFT<TPrecision>::fft_set_timelimit(0.001);
+
+  // create plans
+  fft_plan plan_image;
+  if (forward) {
+    fft_complex *fft = reinterpret_cast<fft_complex *>(out);
+    plan_image = FFT<TPrecision>::fft_plan_dft_r2c_2d(height, width, in, fft,
+                                                      PLANNER_FLAG);
+  } else {
+    fft_complex *fft = reinterpret_cast<fft_complex *>(in);
+    plan_image = FFT<TPrecision>::fft_plan_dft_c2r_2d(height, width, fft, out,
+                                                      PLANNER_FLAG);
+  }
+
+  FFT<TPrecision>::fft_execute(plan_image);
+
+  if (!forward && scale) {
+    // scale
+#pragma omp parallel for
+    for (int ind = 0; ind < width * height; ++ind) {
+      out[ind] /= (width * height);
+    }
+  }
+
+  // cleanup
+  FFT<TPrecision>::fft_destroy_plan(plan_image);
+  FFT<TPrecision>::fft_cleanup_threads();
+}
+
+template <typename TPrecision>
+void fftTransformDevice(TPrecision *in, int width, int height, TPrecision *out,
+                        bool forward = true, bool scale = false) {
+  // static_assert(false, "fftTransformDevice not available for cpu!");
+  std::cerr << "fftTransformDevice is not available for cpu! Using fftTransform "
+               "instead."
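+               // [Editor's note] On this CPU backend, device buffers are
+               // presumably ordinary host memory, so delegating to the host
+               // fftTransform below is a safe fallback.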
+            << std::endl;
+  fftTransform(in, width, height, out, forward, scale);
+}
+
+template <typename TPrecision>
+void dctTransformDevice(TPrecision *in, int width, int height, TPrecision *out,
+                        bool forward = true) {
+  // static_assert(false, "dctTransformDevice not available for cpu!");
+  std::cerr << "dctTransformDevice is not available for cpu! Using dctTransform "
+               "instead."
+            << std::endl;
+  dctTransform(in, width, height, out, forward);
+}
+
+template <typename TPrecision>
+void dctTransform(TPrecision *in, int width, int height, TPrecision *out,
+                  bool forward = true) {
+  typedef
+      typename std::conditional<std::is_same<TPrecision, float>::value,
+                                fftwf_complex, fftw_complex>::type fft_complex;
+  typedef typename std::conditional<std::is_same<TPrecision, float>::value,
+                                    fftwf_plan, fftw_plan>::type fft_plan;
+  bool floatPrecision = false;
+  if (std::is_same<TPrecision, float>::value) {
+    floatPrecision = true;
+  }
+
+  FFT<TPrecision>::fft_init_threads();
+  FFT<TPrecision>::fft_plan_with_nthreads(2);
+  const unsigned int PLANNER_FLAG = FFTW_MEASURE; // FFTW_ESTIMATE or FFTW_MEASURE
+  FFT<TPrecision>::fft_set_timelimit(0.001);
+
+  // setup buffers
+  // create plans
+  fft_plan plan_image;
+  if (forward) {
+    TPrecision *fft = reinterpret_cast<TPrecision *>(out);
+    plan_image = FFT<TPrecision>::fft_plan_r2r_2d(
+        height, width, in, fft, FFTW_REDFT10, FFTW_REDFT10, PLANNER_FLAG);
+  } else {
+    TPrecision *fft = reinterpret_cast<TPrecision *>(in);
+    plan_image = FFT<TPrecision>::fft_plan_r2r_2d(
+        height, width, fft, out, FFTW_REDFT01, FFTW_REDFT01, PLANNER_FLAG);
+  }
+
+  FFT<TPrecision>::fft_execute(plan_image);
+
+  if (!forward) {
+    // scale
+#pragma omp parallel for
+    for (int ind = 0; ind < width * height; ++ind) {
+      out[ind] /= (2 * width * 2 * height);
+    }
+  }
+
+  // cleanup
+  FFT<TPrecision>::fft_destroy_plan(plan_image);
+  FFT<TPrecision>::fft_cleanup_threads();
+}
+
+// forward FFT
+template <typename TPrecision, typename T> TPrecision *fft(HipaccImage &in) {
+  int width = in->width;
+  int height = in->height;
+  int width_in = alignedWidth(width, in->alignment);
+
+  // prepare input buffer
+  TPrecision *input = new TPrecision[width * height];
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      input[y * width + x] = (TPrecision)((T *)(in->mem))[y * width_in + x];
+    }
+  }
+  // prepare output buffer
+  TPrecision *output = new TPrecision[2 * (width / 2 + 1) * height];
+
+  fftTransform(input, width, height, output);
+
+  delete[] input;
+
+  return output;
+}
+
+// inverse FFT
+template <typename TPrecision, typename T>
+void ifft(TPrecision *in, HipaccImage &out, bool scaled = true) {
+  int width = out->width;
+  int height = out->height;
+  int width_out = alignedWidth(width, out->alignment);
+
+  // prepare output buffer
+  TPrecision *output = new TPrecision[width * height];
+
+  fftTransform(in, width, height, output, false, scaled);
+
+  // truncate values outside of range 0-255
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      T val = output[y * width + x];
+      if (output[y * width + x] < 0) {
+        val = 0;
+      } else if (output[y * width + x] > 255) {
+        val = 255;
+      }
+      // convert output
+      ((T *)(out->mem))[y * width_out + x] = (T)(val);
+    }
+  }
+
+  delete[] output;
+}
+
+// create magnitude from fft
+template <typename TPrecision, typename T>
+void fftToMagnitude(TPrecision *in, HipaccImage &mag) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_out = alignedWidth(width, mag->alignment);
+
+  T *tmp = new T[width * height];
+  calcMagnitude(reinterpret_cast<std::complex<TPrecision> *>(in), tmp, width,
+                height);
+
+  shiftFFT(tmp, width, height);
+
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      ((T *)(mag->mem))[y * width_out + x] = (T)tmp[y * width + x];
+    }
+  }
+
+  delete[] tmp;
+}
+
+// create magnitude from dct
+template <typename TPrecision, typename T>
+void dctToMagnitude(TPrecision *in, HipaccImage &mag) {
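+  // [Editor's note] The normalization below makes small DCT coefficients
+  // visible: |coef| is scaled by 1/max and compressed with a fourth root
+  // before mapping to 0..255. For example, a coefficient at 1/16 of the
+  // maximum maps to 255 * (1/16)^(1/4) = 255 * 0.5 ~= 128 rather than ~16.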
+ int width = mag->width; + int height = mag->height; + int width_out = alignedWidth(width, mag->alignment); + + T *tmp = new T[width * height]; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + tmp[y * width + x] = std::abs(in[y * width + x]); + } + } + + float max = 0.0; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + if (tmp[y * width + x] > max) + max = tmp[y * width + x]; + } + } + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + tmp[y * width + x] = + (T)(255.0 * pow(tmp[y * width + x] * (1.0 / max), 1.0 / 4.0)); + } + } + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + ((T *)(mag->mem))[y * width_out + x] = (T)tmp[y * width + x]; + } + } + + free(tmp); +} + +// apply mask mag to result of FFT in +// and ignore values for frequencies lower than r +template +void fftScaleMagnitude(TPrecision *in, HipaccImage &mag) { + int width = mag->width; + int height = mag->height; + int width_in = alignedWidth(width, mag->alignment); + + T *scale = new T[width_in * height]; + memcpy(scale, mag->mem, sizeof(T) * width_in * height); + + iShiftFFT(scale, width, height, mag->alignment); + + for (int y = 0; y < height; y++) { + for (int x = 0; x < (width / 2 + 1); x++) { + in[(y * (width / 2 + 1) + x) * 2] *= + (TPrecision)scale[y * width_in + x] / 255; + in[(y * (width / 2 + 1) + x) * 2 + 1] *= + (TPrecision)scale[y * width_in + x] / 255; + } + } + + free(scale); +} + +// forward DCT +template TPrecision *dct(HipaccImage &in) { + int width = in->width; + int height = in->height; + int width_in = alignedWidth(width, in->alignment); + + // prepare input buffer + TPrecision *input = new TPrecision[width * height]; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + input[y * width + x] = (TPrecision)((T *)(in->mem))[y * width_in + x]; + } + } + // prepare output buffer + TPrecision *output = new TPrecision[width * height]; + + dctTransform(input, width, height, output); + + free(input); + + return output; +} + +// inverse DCT +template void idct(TPrecision *in, HipaccImage &out) { + int width = out->width; + int height = out->height; + int width_out = alignedWidth(width, out->alignment); + + // prepare output buffer + TPrecision *output = new TPrecision[width * height]; + + dctTransform(in, width, height, output, false); + + // truncate values outside of range 0-255 + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + T val = output[y * width + x]; + if (output[y * width + x] < 0) { + val = 0; + } else if (output[y * width + x] > 255) { + val = 255; + } + // convert output + ((T *)(out->mem))[y * width_out + x] = (T)(val); + } + } + + free(output); +} + +// function wrappers for images + +template void fftShift(HipaccImage &mag) { + int width = mag->width; + int height = mag->height; + int width_in = alignedWidth(width, mag->alignment); + + shiftFFT((T *)(mag->mem), width, height, width_in); +} + +template void ifftShift(HipaccImage &mag) { + int width = mag->width; + int height = mag->height; + int width_in = alignedWidth(width, mag->alignment); + + iShiftFFT((T *)(mag->mem), width, height, width_in); +} + +template +void fftResetMask(HipaccImage &mag, int radius, bool low, int window = 0) { + int width = mag->width; + int height = mag->height; + int width_in = alignedWidth(width, mag->alignment); + + magResetFreq((T *)(mag->mem), width, height, width_in, radius, window, low); +} + +template +void fftApplyPassFilter(HipaccImage &mag, int radius, bool low, 
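+                        // [Editor's note] Sketch of how these spectrum
+                        // helpers compose (hypothetical images img/magImg;
+                        // uchar and the template argument order follow the
+                        // reconstructions above):
+                        //   float *freq = fft<float, uchar>(img);
+                        //   fftToMagnitude<float, uchar>(freq, magImg);
+                        //   fftApplyPassFilter<uchar>(magImg, 32, true);
+                        //   fftScaleMagnitude<float, uchar>(freq, magImg);
+                        //   ifft<float, uchar>(freq, img);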
int window = 0) { + int width = mag->width; + int height = mag->height; + int width_in = alignedWidth(width, mag->alignment); + + magPassFilter((T *)(mag->mem), width, height, width_in, radius, window, low); +} diff --git a/runtime/hipacc_cu.hpp.cmake b/runtime/hipacc_cu.hpp.cmake index 4d0f579b..f083ca1a 100644 --- a/runtime/hipacc_cu.hpp.cmake +++ b/runtime/hipacc_cu.hpp.cmake @@ -181,6 +181,7 @@ void hipaccInitCUDA(); void hipaccCopyMemory(const HipaccImage &src, HipaccImage &dst); void hipaccCopyMemoryRegion(const HipaccAccessor &src, const HipaccAccessor &dst); void hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, void **args, bool print_timing=true); +void hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, void **args, cudaStream_t stream, cudaEvent_t waitEvent=nullptr, cudaEvent_t recordEvent=nullptr); void hipaccLaunchKernelBenchmark(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, std::vector args, bool print_timing=true); void hipaccLaunchKernelExploration(std::string filename, std::string kernel, std::vector args, std::vector smems, std::vector consts, std::vector texs, @@ -207,8 +208,12 @@ HipaccImage hipaccCreatePyramidImage(const HipaccImage &base, size_t width, size template void hipaccWriteMemory(HipaccImage &img, T *host_mem); template +void hipaccWriteMemory(HipaccImage &img, T *host_mem, cudaStream_t stream); +template T *hipaccReadMemory(const HipaccImage &img); template +T *hipaccReadMemory(const HipaccImage &img, cudaStream_t stream); +template void hipaccBindTexture(hipaccMemoryType mem_type, const textureReference *tex, const HipaccImage &img); template void hipaccBindSurface(hipaccMemoryType mem_type, const surfaceReference *surf, const HipaccImage &img); diff --git a/runtime/hipacc_cu.tpp b/runtime/hipacc_cu.tpp index 36ce05e9..88d43fbc 100644 --- a/runtime/hipacc_cu.tpp +++ b/runtime/hipacc_cu.tpp @@ -122,6 +122,33 @@ void hipaccWriteMemory(HipaccImage &img, T *host_mem) { } +// Write to memory async +template +void hipaccWriteMemory(HipaccImage &img, T *host_mem, cudaStream_t stream) { + if (host_mem == NULL) return; + + size_t width = img->width; + size_t height = img->height; + size_t stride = img->stride; + + if ((char *)host_mem != img->host) + std::copy(host_mem, host_mem + width*height, (T*)img->host); + + if (img->mem_type >= Array2D) { + cudaError_t err = cudaMemcpy2DToArrayAsync((cudaArray *)img->mem, 0, 0, host_mem, stride*sizeof(T), width*sizeof(T), height, cudaMemcpyHostToDevice, stream); + checkErr(err, "cudaMemcpy2DToArray()"); + } else { + if (stride > width) { + cudaError_t err = cudaMemcpy2DAsync(img->mem, stride*sizeof(T), host_mem, width*sizeof(T), width*sizeof(T), height, cudaMemcpyHostToDevice, stream); + checkErr(err, "cudaMemcpy2D()"); + } else { + cudaError_t err = cudaMemcpyAsync(img->mem, host_mem, sizeof(T)*width*height, cudaMemcpyHostToDevice, stream); + checkErr(err, "cudaMemcpy()"); + } + } +} + + // Read from memory template T *hipaccReadMemory(const HipaccImage &img) { @@ -146,6 +173,30 @@ T *hipaccReadMemory(const HipaccImage &img) { } +// Read from memory async +template +T *hipaccReadMemory(const HipaccImage &img, cudaStream_t stream) { + size_t width = img->width; + size_t height = img->height; + size_t stride = img->stride; + + if (img->mem_type >= Array2D) { + cudaError_t err = cudaMemcpy2DFromArrayAsync((T*)img->host, stride*sizeof(T), (cudaArray *)img->mem, 0, 0, width*sizeof(T), height, cudaMemcpyDeviceToHost, stream); + 
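+        // [Editor's note] This copy is only enqueued on `stream`; img->host
+        // may be read only after the stream has been synchronized. A minimal
+        // usage sketch of the asynchronous API added by this patch (kernel
+        // handle, launch geometry and images are hypothetical):
+        //
+        //   cudaStream_t s;
+        //   cudaStreamCreate(&s);
+        //   hipaccWriteMemory(in_img, host_in, s);            // enqueue H2D
+        //   hipaccLaunchKernel((const void *)&cu_kernel, "cu_kernel",
+        //                      grid, block, args, s);
+        //   float *res = hipaccReadMemory<float>(out_img, s); // enqueue D2H
+        //   cudaStreamSynchronize(s); // res is safe to read from here on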
checkErr(err, "cudaMemcpy2DFromArray()"); + } else { + if (stride > width) { + cudaError_t err = cudaMemcpy2DAsync((T*)img->host, width*sizeof(T), img->mem, stride*sizeof(T), width*sizeof(T), height, cudaMemcpyDeviceToHost, stream); + checkErr(err, "cudaMemcpy2D()"); + } else { + cudaError_t err = cudaMemcpyAsync((T*)img->host, img->mem, sizeof(T)*width*height, cudaMemcpyDeviceToHost, stream); + checkErr(err, "cudaMemcpy()"); + } + } + + return (T*)img->host; +} + + // Bind memory to texture template void hipaccBindTexture(hipaccMemoryType mem_type, const textureReference *tex, const HipaccImage &img) { diff --git a/runtime/hipacc_cu_cufft.hpp b/runtime/hipacc_cu_cufft.hpp new file mode 100644 index 00000000..8b0ddf91 --- /dev/null +++ b/runtime/hipacc_cu_cufft.hpp @@ -0,0 +1,1501 @@ +#include +#include +#include + +#include "cuda_complex.hpp" +#include "hipacc_base_standalone.hpp" +#include "hipacc_fft_helper.hpp" + +#include +#include + +#define gpuErrchk(ans) \ + { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, + bool abort = true) { + if (code != cudaSuccess) { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, + line); + if (abort) + exit(code); + } +} + +void Handle_Error(cudaError_t err, const char *file, int line) { + if (err != cudaSuccess) { + std::string cu = std::string(cudaGetErrorString(err)) + " in " + + std::string(file) + " at line: " + std::to_string(line) + + "\n"; + std::cout << cu; + } +} +#define HANDLE_ERROR(err) (Handle_Error(err, __FILE__, __LINE__)) + +// Complex data type +struct d2 { + double x; + double y; +}; +typedef d2 DoubleComplex; +// typedef float2 Complex; +struct f2 { + float x; + float y; +}; +typedef f2 Complex; + +// Mul +__device__ cufftComplex C_Mul(cufftComplex a, cufftComplex b) { + cufftComplex temp; + temp.x = a.x * b.x - a.y * b.y; + temp.y = a.y * b.x + b.y * a.x; + return temp; +} +// Mul double +__device__ cufftDoubleComplex C_Mul(cufftDoubleComplex a, cufftDoubleComplex b) { + cufftDoubleComplex temp; + temp.x = a.x * b.x - a.y * b.y; + temp.y = a.y * b.x + b.y * a.x; + return temp; +} +// Scale +template __device__ C C_Scale(C a, float s) { + C c; + c.x = s * a.x; + c.y = s * a.y; + return c; +} +// Mul and scale +__global__ void MulScalePw(cufftDoubleComplex *image, cufftDoubleComplex *filter, + int Nx, int Ny, double scale) { + int tix = threadIdx.x + blockIdx.x * blockDim.x; + int tiy = threadIdx.y + blockIdx.y * blockDim.y; + int tid = tix + Nx * tiy; + + if (tid >= Nx * Ny) + return; + + if ((tix < Nx) && (tiy < Ny)) { + image[tid] = C_Scale(C_Mul(image[tid], filter[tid]), scale); + } +} +// Mul and scale float +__global__ void MulScalePw(cufftComplex *image, cufftComplex *filter, int Nx, + int Ny, float scale) { + int tix = threadIdx.x + blockIdx.x * blockDim.x; + int tiy = threadIdx.y + blockIdx.y * blockDim.y; + int tid = tix + Nx * tiy; + + if (tid >= Nx * Ny) + return; + + if ((tix < Nx) && (tiy < Ny)) { + image[tid] = C_Scale(C_Mul(image[tid], filter[tid]), scale); + } +} + +// scale +template +__global__ void ScalePw(C *image, int Nx, int Ny, double scale) { + int tix = threadIdx.x + blockIdx.x * blockDim.x; + int tiy = threadIdx.y + blockIdx.y * blockDim.y; + int tid = tix + Nx * tiy; + + if (tid >= Nx * Ny) + return; + + if ((tix < Nx) && (tiy < Ny)) { + image[tid] = C_Scale(image[tid], scale); + } +} + +// scale element-wise +template +__global__ void ScaleE(T *image, int Nx, int Ny, TScale scale) { + int tix = threadIdx.x + blockIdx.x * 
blockDim.x; + int tiy = threadIdx.y + blockIdx.y * blockDim.y; + int tid = tix + Nx * tiy; + + if (tid >= Nx * Ny) + return; + + if ((tix < Nx) && (tiy < Ny)) { + image[tid] = image[tid] * scale; + } +} + +// Makhoul reorder +template +__global__ void Reorder(T *image, U *out, int width, int height, int width_in, + int blockX, int blockY) { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + int tid = x + width_in * y; + + if (tid >= width_in * height) + return; + + if ((x < width) && (y < height)) { + int dstX; + if (x % 2 == 0) { + dstX = x / 2; + } else { + dstX = (width - (x / 2) - 1); + } + int dstY; + if (y % 2 == 0) { + dstY = y / 2; + } else { + dstY = (width - (y / 2) - 1); + } + out[dstY * width + dstX] = (U)image[x + width_in * y]; + } +} + +// Makhoul inverse reorder +template +__global__ void iReorder(T *in, U *image, int width, int height, int width_out, + int blockX, int blockY) { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + int tid = x + width_out * y; + + if (tid >= width_out * height) + return; + + if ((x < width) && (y < height)) { + int dstX; + if (x % 2 == 0) { + dstX = x / 2; + } else { + dstX = (width - (x / 2) - 1); + } + int dstY; + if (y % 2 == 0) { + dstY = y / 2; + } else { + dstY = (width - (y / 2) - 1); + } + image[x + width_out * y] = (T)in[dstY * width + dstX]; + } +} + +// Makhoul factor multiplication +template +__global__ void Factoring(complex *in, T *out, int width, int height, + int blockX, int blockY, int out_width) { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + int tid = x + width * y; + + if (tid >= width * height) + return; + + const complex i(0.0, 1.0); + complex factor = exp(-i * complex(M_PI * y / (2 * height))); + complex factor2 = exp(-i * complex(M_PI * x / (2 * width))); + complex factor3 = exp(-i * complex(M_PI * (-x) / (2 * width))); + complex factor4 = exp(-i * complex(M_PI * (-y) / (2 * height))); + + if ((x < width / 2 + 1) && (y < height)) { + if (x == 0 && y == 0) { + out[y * width + x] = + 4.0 * (factor * (factor2 * in[y * out_width + x])).real(); + } else if (x == 0) { + out[y * width + x] = + 2.0 * (factor2 * (factor * in[y * out_width + x] + + factor4 * in[(height - y) * out_width + x])) + .real(); + } else if (y == 0) { + out[y * width + x] = + 2.0 * (factor * (factor2 * in[y * out_width + x] + + factor3 * conj(in[y * out_width + x]))) + .real(); + } else { + out[y * width + x] = + 2.0 * (factor * (factor2 * in[y * out_width + x] + + factor3 * conj(in[(height - y) * out_width + x]))) + .real(); + } + } else if ((x < width) && (y < height)) { + if (x == 0 && y == 0) { + out[y * width + x] = + 4.0 * (factor * (factor2 * conj(in[y * out_width + width - x]))).real(); + } else if (x == 0) { + out[y * width + x] = + 2.0 * + (factor2 * (factor * conj(in[(height - y) * out_width + width - x]) + + factor4 * conj(in[y * out_width + width - x]))) + .real(); + } else if (y == 0) { + out[y * width + x] = + 2.0 * (factor * (factor2 * conj(in[y * out_width + width - x]) + + factor3 * in[y * out_width + width - x])) + .real(); + } else { + out[y * width + x] = + 2.0 * + (factor * (factor2 * conj(in[(height - y) * out_width + width - x]) + + factor3 * in[y * out_width + width - x])) + .real(); + } + } +} + +// Makhoul inverse factor multiplication +template +__global__ void iFactoring(T *in, complex *out, int width, int height, + int blockX, int blockY) { + int x = threadIdx.x + blockIdx.x * 
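+  // [Editor's note] Reorder/Factoring and iReorder/iFactoring implement the
+  // FFT-based DCT evaluation after Makhoul: the signal is permuted so that
+  // even indices come first and reversed odd indices last, a real FFT is
+  // taken, and each bin is rotated by exp(-i*pi*k/(2N)); the factor/factor2
+  // terms above are the separable row and column twiddles of the 2-D case.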
blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + int tid = x + width * y; + + if (tid >= width * height) + return; + + int in_width = width / 2 + 1; + + const complex i(0.0, 1.0); + + if ((x < in_width) && (y < height)) { + complex factor3 = exp(-i * complex(M_PI * (-x) / (2 * width))); + complex factor4 = exp(-i * complex(M_PI * (-y) / (2 * height))); + if (x == 0 && y == 0) { + out[y * in_width + x] = complex(0.25, 0.0) * factor4 * factor3 * + complex(in[y * width + x]); + } else if (y == 0) { + out[y * in_width + x] = complex(0.25, 0.0) * factor4 * factor3 * + (complex(in[y * width + x]) - + i * complex(in[y * width + width - x])); + } else if (x == 0) { + out[y * in_width + x] = complex(0.25, 0.0) * factor4 * factor3 * + (complex(in[y * width + x]) - + i * complex(in[(height - y) * width + x])); + } else { + out[y * in_width + x] = + complex(0.25, 0.0) * factor4 * factor3 * + ((complex(in[y * width + x]) - + complex(in[(height - y) * width + width - x])) - + i * (complex(in[(height - y) * width + x]) + + complex(in[y * width + width - x]))); + } + } +} + +template +__global__ void CopyAligned(T *in, U *out, int width, int height, int width_in, + int width_out) { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + int tid = x + width * y; + + if (tid >= width_in * height) + return; + + if ((x < width) && (y < height)) { + out[y * width_out + x] = (U)in[x + width_in * y]; + } +} + +template +__global__ void FillMem(T *mem, U val, int width, int height) { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + int tid = x + width * y; + + if (tid >= width * height) + return; + + if ((x < width) && (y < height)) { + mem[y * width + x] = (T)val; + } +} + +template +__global__ void PutKernel(T *kernel, U *dest, int kw, int kh, int nx, int ny) { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + int tid = x + kw * y; + + if (tid >= kw * kh) + return; + + if ((x < kw) && (y < kh)) { + int kx = x - kw / 2; + int ky = y - kh / 2; + + dest[nx * ((ny + ky) % ny) + ((nx + kx) % nx)] = + (U)kernel[kw * (kh / 2 + ky) + (kw / 2 + kx)]; + } +} + +template +void fftConvolve(TPrecision *in, int width, int height, V *k, int k_w, int k_h, + TPrecision *out) { + typedef typename std::conditional::value, + cufftComplex, cufftDoubleComplex>::type + cufft_Complex; + typedef typename std::conditional::value, + cufftReal, cufftDoubleReal>::type cufft_Real; + bool floatPrecision = false; + if (std::is_same::value) { + floatPrecision = true; + } + + int image_width = width; + int image_height = height; + + int matsize = image_width * image_height; + int intermediateWidth = image_width / 2 + 1; + int intermediateSize = image_height * intermediateWidth; + + // setup buffers + TPrecision *kernel = (TPrecision *)malloc(sizeof(TPrecision) * matsize); + + memset(kernel, 0, sizeof(TPrecision) * matsize); + + // prepare kernel + putKernel(k, kernel, k_w, k_h, image_width, image_height); + + // Allocate device memory + TPrecision *d_in, *d_kernel, *d_out; + cufft_Complex *d_image_fft, *d_kernel_fft; + cudaMalloc(reinterpret_cast(&d_in), sizeof(TPrecision) * matsize); + cudaMalloc(reinterpret_cast(&d_kernel), sizeof(TPrecision) * matsize); + cudaMalloc(reinterpret_cast(&d_image_fft), + sizeof(cufft_Complex) * intermediateSize); + cudaMalloc(reinterpret_cast(&d_kernel_fft), + sizeof(cufft_Complex) * intermediateSize); + cudaMalloc(reinterpret_cast(&d_out), 
sizeof(TPrecision) * matsize); + // Copy host memory to device + cudaMemcpy(d_in, in, sizeof(TPrecision) * matsize, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel, sizeof(TPrecision) * matsize, + cudaMemcpyHostToDevice); + + cudaDeviceSynchronize(); // only for time measure + // start + auto start_time = std::chrono::system_clock::now(); + + // create plans + cufftHandle plan_forward_many, plan_reverse_many; + int n[2] = {image_height, image_width}; + if (floatPrecision) { + cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_R2C, 1); + cufftPlanMany(&plan_reverse_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_C2R, 1); + } else { + cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_D2Z, 1); + cufftPlanMany(&plan_reverse_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_Z2D, 1); + } + + // Transform signal and kernel + if (floatPrecision) { + cufftExecR2C(plan_forward_many, reinterpret_cast(d_in), + reinterpret_cast(d_image_fft)); + cufftExecR2C(plan_forward_many, reinterpret_cast(d_kernel), + reinterpret_cast(d_kernel_fft)); + } else { + cufftExecD2Z(plan_forward_many, reinterpret_cast(d_in), + reinterpret_cast(d_image_fft)); + cufftExecD2Z(plan_forward_many, reinterpret_cast(d_kernel), + reinterpret_cast(d_kernel_fft)); + } + + // define dimensions + dim3 threadsPerBlock(32, 32); // 1024 threads + dim3 numBlocks(image_width / threadsPerBlock.x + 1, + image_height / threadsPerBlock.y + 1); + + // Pointwise Multiplication in Frequency Domain + MulScalePw<<>>( + reinterpret_cast(d_image_fft), + reinterpret_cast(d_kernel_fft), intermediateWidth, + image_height, 1.0 / (image_height * image_width)); + HANDLE_ERROR(cudaGetLastError()); + + if (floatPrecision) { + cufftExecC2R(plan_reverse_many, reinterpret_cast(d_image_fft), + reinterpret_cast(d_out)); + } else { + cufftExecZ2D(plan_reverse_many, + reinterpret_cast(d_image_fft), + reinterpret_cast(d_out)); + } + + cudaDeviceSynchronize(); // only for time measure + // stop + auto end_time = std::chrono::system_clock::now(); + auto elapsed_time = std::chrono::duration_cast( + end_time - start_time); + std::cout << (float)elapsed_time.count() / 1000.0 << " ms" << std::endl; + + // Copy device memory to host + cudaMemcpy(out, d_out, sizeof(TPrecision) * matsize, cudaMemcpyDeviceToHost); + + // cleanup + cufftDestroy(plan_forward_many); + cufftDestroy(plan_reverse_many); + free(kernel); + cudaFree(d_in); + cudaFree(d_kernel); + cudaFree(d_image_fft); + cudaFree(d_kernel_fft); + cudaFree(d_out); +} + +template +void fftConvolve_device(TPrecision *in, int width, int height, TPrecision *k, + TPrecision *out) { + typedef typename std::conditional::value, + cufftComplex, cufftDoubleComplex>::type + cufft_Complex; + typedef typename std::conditional::value, + cufftReal, cufftDoubleReal>::type cufft_Real; + bool floatPrecision = false; + if (std::is_same::value) { + floatPrecision = true; + } + + int image_width = width; + int image_height = height; + + std::cout << width << " x " << height << std::endl; + + int intermediateWidth = image_width / 2 + 1; + int intermediateSize = image_height * intermediateWidth; + + // Allocate device memory + cufft_Complex *d_image_fft, *d_kernel_fft; + cudaMalloc(reinterpret_cast(&d_image_fft), + sizeof(cufft_Complex) * intermediateSize); + cudaMalloc(reinterpret_cast(&d_kernel_fft), + sizeof(cufft_Complex) * intermediateSize); + + // create plans + cufftHandle plan_forward_many, plan_reverse_many; + int n[2] = {image_height, image_width}; + if 
(floatPrecision) { + cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_R2C, 1); + cufftPlanMany(&plan_reverse_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_C2R, 1); + } else { + cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_D2Z, 1); + cufftPlanMany(&plan_reverse_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_Z2D, 1); + } + + cudaDeviceSynchronize(); // only for time measure + // start + auto start_time = std::chrono::system_clock::now(); + + // Transform signal and kernel + if (floatPrecision) { + cufftExecR2C(plan_forward_many, reinterpret_cast(in), + reinterpret_cast(d_image_fft)); + cufftExecR2C(plan_forward_many, reinterpret_cast(k), + reinterpret_cast(d_kernel_fft)); + } else { + cufftExecD2Z(plan_forward_many, reinterpret_cast(in), + reinterpret_cast(d_image_fft)); + cufftExecD2Z(plan_forward_many, reinterpret_cast(k), + reinterpret_cast(d_kernel_fft)); + } + + // define dimensions + dim3 threadsPerBlock(32, 32); // 1024 threads + dim3 numBlocks(image_width / threadsPerBlock.x + 1, + image_height / threadsPerBlock.y + 1); + + // Pointwise Multiplication in Frequency Domain + MulScalePw<<>>( + reinterpret_cast(d_image_fft), + reinterpret_cast(d_kernel_fft), intermediateWidth, + image_height, 1.0 / (image_height * image_width)); + HANDLE_ERROR(cudaGetLastError()); + + if (floatPrecision) { + cufftExecC2R(plan_reverse_many, reinterpret_cast(d_image_fft), + reinterpret_cast(out)); + } else { + cufftExecZ2D(plan_reverse_many, + reinterpret_cast(d_image_fft), + reinterpret_cast(out)); + } + + cudaDeviceSynchronize(); // only for time measure + // stop + auto end_time = std::chrono::system_clock::now(); + auto elapsed_time = std::chrono::duration_cast( + end_time - start_time); + std::cout << (float)elapsed_time.count() / 1000.0 << " ms" << std::endl; + + // cleanup + cufftDestroy(plan_forward_many); + cufftDestroy(plan_reverse_many); + cudaFree(d_image_fft); + cudaFree(d_kernel_fft); +} + +bool estimateConvolutionExecutionTime(int width, int height, int padWidth, + int padHeight, int k_w, int k_h, + bool linear) { + // assumed: + // 2496 CUDA cores + // 3.52 TFLOPS single, 1.17 TFLOPS double + const double FLOPMS = 3500000000; // FLOP per MS + + double N1 = padWidth; + double N2 = padHeight; + + double singleFFTcost = 8.0 * N1 * N2 * log2(N1 * N2); + + double FFTConvolutionCost = (singleFFTcost * 3 + (N1 * N2 * (3 + 1))) * 80; + std::cout << "FFTConvolutionCost: " << FFTConvolutionCost / FLOPMS << std::endl; + + double simpleHipaccCost = (height * width) * (k_w * k_h) * 2 * 22; + std::cout << "simpleHipaccCost: " << simpleHipaccCost / FLOPMS << std::endl; + + if (FFTConvolutionCost < simpleHipaccCost) { + std::cout << "FFT Convolution will be faster" << std::endl; + return true; + } else { + std::cout << "Hipacc Kernel will be faster" << std::endl; + return false; + } +} + +template +void cufftConvolution(T *in, int width, int height, const V (&kernel)[rows][cols], + int k_w, int k_h, U *out, int alignment, bool linear, + B boundaryConstant = 0) { + assert(rows == k_h && cols == k_w); + + int width_in = alignedWidth(width, alignment); + int width_out = alignedWidth(width, alignment); + + int padWidth = width; + int padHeight = height; + if (linear) { + padWidth = width + k_w / 2; + padHeight = height + k_h / 2; + } + int padSize = padWidth * padHeight; + + /* + estimateConvolutionExecutionTime(width, height, padWidth, padHeight, k_w, k_h, + linear); + */ + + // prepare input buffer + TPrecision *input = new 
TPrecision[padSize]; + if (linear) { + std::fill_n(input, padSize, (TPrecision)boundaryConstant); + } + T *input_d = new T[width_in * height]; + // convert input + gpuErrchk(cudaMemcpy(input_d, in, sizeof(T) * width_in * height, + cudaMemcpyDeviceToHost)); + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + input[y * padWidth + x] = (TPrecision)input_d[y * width_in + x]; + } + } + // prepare output buffer + TPrecision *output = new TPrecision[padSize]; + fftConvolve(input, padWidth, padHeight, (V *)(&kernel[0][0]), k_w, k_h, output); + // convert output + U *out_d = new U[width_out * height]; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + out_d[y * width_out + x] = (U)output[y * padWidth + x]; + } + } + cudaMemcpy(out, out_d, sizeof(U) * width_out * height, cudaMemcpyHostToDevice); + + free(input); + free(input_d); + free(output); + free(out_d); +} + +template +void cufftConvolution_device(T *in, int width, int height, + const V (&kernel)[rows][cols], int k_w, int k_h, + U *out, int alignment, bool linear, + B boundaryConstant = 0) { + assert(rows == k_h && cols == k_w); + + int width_in = alignedWidth(width, alignment); + int width_out = alignedWidth(width, alignment); + + int padWidth = width; + int padHeight = height; + if (linear) { + padWidth = width + k_w / 2; + padHeight = height + k_h / 2; + } + int padSize = padWidth * padHeight; + + /* + estimateConvolutionExecutionTime(width, height, padWidth, padHeight, k_w, k_h, + linear); + */ + + V *kernel_in; + cudaMalloc(reinterpret_cast(&kernel_in), sizeof(V) * k_w * k_h); + gpuErrchk(cudaMemcpy(kernel_in, (V *)(&kernel[0][0]), sizeof(V) * k_w * k_h, + cudaMemcpyHostToDevice)); + + TPrecision *d_in, *d_kernel, *d_out; + cudaMalloc(reinterpret_cast(&d_in), sizeof(TPrecision) * padSize); + cudaMalloc(reinterpret_cast(&d_kernel), sizeof(TPrecision) * padSize); + cudaMalloc(reinterpret_cast(&d_out), sizeof(TPrecision) * padSize); + + dim3 threadsPerBlock(32, 32); // 1024 threads + dim3 numBlocks(width / threadsPerBlock.x + 1, height / threadsPerBlock.y + 1); + dim3 threadsPerBlock2(32, 32); // 1024 threads + dim3 numBlocks2(padWidth / threadsPerBlock.x + 1, + padHeight / threadsPerBlock.y + 1); + + // prepare image buffer + if (linear) { + FillMem<<>>(d_in, (TPrecision)boundaryConstant, + padWidth, padHeight); + HANDLE_ERROR(cudaGetLastError()); + } + CopyAligned<<>>(in, d_in, width, height, width_in, + padWidth); + HANDLE_ERROR(cudaGetLastError()); + + // prepare kernel buffer + cudaMemset(d_kernel, 0, sizeof(TPrecision) * padSize); + PutKernel<<>>(kernel_in, d_kernel, k_w, k_h, + padWidth, padHeight); + + // prepare output buffer + // TPrecision *output = new TPrecision[padSize]; + fftConvolve_device(d_in, padWidth, padHeight, d_kernel, d_out); + + // convert output + + CopyAligned<<>>(d_out, out, width, height, padWidth, + width_out); + HANDLE_ERROR(cudaGetLastError()); + + // clean up + cudaFree(d_in); + cudaFree(d_kernel); + cudaFree(d_out); +} + +template +void fftTransform(TPrecision *in, int width, int height, TPrecision *out, + bool forward = true, bool scale = false) { + typedef typename std::conditional::value, + cufftComplex, cufftDoubleComplex>::type + cufft_Complex; + typedef typename std::conditional::value, + cufftReal, cufftDoubleReal>::type cufft_Real; + bool floatPrecision = false; + if (std::is_same::value) { + floatPrecision = true; + } + + int image_width = width; + int image_height = height; + + int matsize = image_width * image_height; + int intermediateWidth = 
+
+template <class TPrecision>
+void fftTransform(TPrecision *in, int width, int height, TPrecision *out,
+                  bool forward = true, bool scale = false) {
+  typedef typename std::conditional<std::is_same<TPrecision, float>::value,
+                                    cufftComplex, cufftDoubleComplex>::type
+      cufft_Complex;
+  typedef typename std::conditional<std::is_same<TPrecision, float>::value,
+                                    cufftReal, cufftDoubleReal>::type cufft_Real;
+  bool floatPrecision = false;
+  if (std::is_same<TPrecision, float>::value) {
+    floatPrecision = true;
+  }
+
+  int image_width = width;
+  int image_height = height;
+
+  int matsize = image_width * image_height;
+  int intermediateWidth = image_width / 2 + 1;
+  int intermediateSize = image_height * intermediateWidth;
+
+  // Allocate device memory
+  cufft_Real *d_in;
+  cufft_Complex *d_image_fft;
+  gpuErrchk(cudaMalloc(reinterpret_cast<void **>(&d_in),
+                       sizeof(cufft_Real) * matsize));
+  gpuErrchk(cudaMalloc(reinterpret_cast<void **>(&d_image_fft),
+                       sizeof(cufft_Complex) * intermediateSize));
+  // Copy host memory to device
+  if (forward) {
+    gpuErrchk(cudaMemcpy(d_in, in, sizeof(cufft_Real) * matsize,
+                         cudaMemcpyHostToDevice));
+  } else {
+    gpuErrchk(cudaMemcpy(d_image_fft, in,
+                         sizeof(cufft_Complex) * intermediateSize,
+                         cudaMemcpyHostToDevice));
+  }
+  // create plans
+  cufftHandle plan_forward_many;
+  int n[2] = {image_height, image_width};
+  if (floatPrecision) {
+    if (forward) {
+      cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1,
+                    CUFFT_R2C, 1);
+    } else {
+      cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1,
+                    CUFFT_C2R, 1);
+    }
+  } else {
+    if (forward) {
+      cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1,
+                    CUFFT_D2Z, 1);
+    } else {
+      cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1,
+                    CUFFT_Z2D, 1);
+    }
+  }
+
+  // define dimensions
+  dim3 threadsPerBlock(32, 32); // 1024 threads
+  dim3 numBlocks(image_width / threadsPerBlock.x + 1,
+                 image_height / threadsPerBlock.y + 1);
+
+  // Transform signal
+  if (forward) {
+    if (floatPrecision) {
+      cufftExecR2C(plan_forward_many, reinterpret_cast<cufftReal *>(d_in),
+                   reinterpret_cast<cufftComplex *>(d_image_fft));
+    } else {
+      cufftExecD2Z(plan_forward_many, reinterpret_cast<cufftDoubleReal *>(d_in),
+                   reinterpret_cast<cufftDoubleComplex *>(d_image_fft));
+    }
+  } else {
+    if (floatPrecision) {
+      cufftExecC2R(plan_forward_many,
+                   reinterpret_cast<cufftComplex *>(d_image_fft),
+                   reinterpret_cast<cufftReal *>(d_in));
+    } else {
+      cufftExecZ2D(plan_forward_many,
+                   reinterpret_cast<cufftDoubleComplex *>(d_image_fft),
+                   reinterpret_cast<cufftDoubleReal *>(d_in));
+    }
+  }
+
+  if (!forward && scale) {
+    // Pointwise Scale
+    ScaleE<<<numBlocks, threadsPerBlock>>>(
+        reinterpret_cast<cufft_Real *>(d_in), image_width, image_height,
+        1.0 / (image_height * image_width));
+    HANDLE_ERROR(cudaGetLastError());
+  }
+
+  // Copy device memory to host
+  if (forward) {
+    gpuErrchk(cudaMemcpy(out, d_image_fft,
+                         sizeof(cufft_Complex) * intermediateSize,
+                         cudaMemcpyDeviceToHost));
+  } else {
+    gpuErrchk(cudaMemcpy(out, d_in, sizeof(cufft_Real) * matsize,
+                         cudaMemcpyDeviceToHost));
+  }
+
+  // cleanup
+  cufftDestroy(plan_forward_many);
+  cudaFree(d_in);
+  cudaFree(d_image_fft);
+}
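The intermediate buffer exploits the Hermitian symmetry of a real-to-complex transform: an HxW real image yields only Hx(W/2+1) distinct complex bins. A small sizing sketch with assumed dimensions:

    // 640x480 float image
    float *data = new float[640 * 480];               // spatial domain
    float *spec = new float[2 * (640 / 2 + 1) * 480]; // interleaved re/im
    fftTransform(data, 640, 480, spec, true);         // forward R2C
    // 480 * 321 complex bins carry the full information of 480*640 reals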
+
+template <class TPrecision>
+void fftTransformDevice(TPrecision *in, int width, int height, TPrecision *out,
+                        bool forward = true, bool scale = false,
+                        int alignment = 0) {
+  if (alignment == 0)
+    alignment = width;
+  typedef typename std::conditional<std::is_same<TPrecision, float>::value,
+                                    cufftComplex, cufftDoubleComplex>::type
+      cufft_Complex;
+  typedef typename std::conditional<std::is_same<TPrecision, float>::value,
+                                    cufftReal, cufftDoubleReal>::type cufft_Real;
+  bool floatPrecision = false;
+  if (std::is_same<TPrecision, float>::value) {
+    floatPrecision = true;
+  }
+
+  int image_width = width;
+  int image_height = height;
+
+  // int matsize = image_width * image_height;
+
+  // create plans
+  cufftHandle plan_forward_many;
+  int n[2] = {image_height, image_width};
+  int inembed[2] = {height, alignment};
+  int onembed[2] = {height, width / 2 + 1};
+  int idist = alignment * height;
+  int odist = (width / 2 + 1) * height;
+
+  if (floatPrecision) {
+    if (forward) {
+      cufftPlanMany(&plan_forward_many, 2, n, inembed, 1, idist, onembed, 1,
+                    odist, CUFFT_R2C, 1);
+    } else {
+      cufftPlanMany(&plan_forward_many, 2, n, onembed, 1, odist, inembed, 1,
+                    idist, CUFFT_C2R, 1);
+    }
+  } else {
+    if (forward) {
+      cufftPlanMany(&plan_forward_many, 2, n, inembed, 1, idist, onembed, 1,
+                    odist, CUFFT_D2Z, 1);
+    } else {
+      cufftPlanMany(&plan_forward_many, 2, n, onembed, 1, odist, inembed, 1,
+                    idist, CUFFT_Z2D, 1);
+    }
+  }
+
+  // define dimensions
+  dim3 threadsPerBlock(32, 32); // 1024 threads
+  dim3 numBlocks(alignment / threadsPerBlock.x + 1,
+                 image_height / threadsPerBlock.y + 1);
+
+  // Transform signal
+  if (forward) {
+    if (floatPrecision) {
+      cufftExecR2C(plan_forward_many, reinterpret_cast<cufftReal *>(in),
+                   reinterpret_cast<cufftComplex *>(out));
+    } else {
+      cufftExecD2Z(plan_forward_many, reinterpret_cast<cufftDoubleReal *>(in),
+                   reinterpret_cast<cufftDoubleComplex *>(out));
+    }
+  } else {
+    if (floatPrecision) {
+      cufftExecC2R(plan_forward_many, reinterpret_cast<cufftComplex *>(in),
+                   reinterpret_cast<cufftReal *>(out));
+    } else {
+      cufftExecZ2D(plan_forward_many,
+                   reinterpret_cast<cufftDoubleComplex *>(in),
+                   reinterpret_cast<cufftDoubleReal *>(out));
+    }
+  }
+
+  if (!forward && scale) {
+    // Pointwise Scale
+    ScaleE<<<numBlocks, threadsPerBlock>>>(reinterpret_cast<cufft_Real *>(out),
+                                           alignment, image_height,
+                                           1.0 / (image_height * image_width));
+    HANDLE_ERROR(cudaGetLastError());
+  }
+
+  // cleanup
+  cufftDestroy(plan_forward_many);
+}
+
+// old fft part without device functions
+/*
+// forward FFT
+template <class T, class TPrecision> TPrecision *fft(HipaccImage &in) {
+  int width = in->width;
+  int height = in->height;
+  int width_in = alignedWidth<T>(width, in->alignment);
+
+  // prepare input buffer
+  TPrecision *input = new TPrecision[width * height];
+  T *input_d = new T[width_in * height];
+  // convert input
+  gpuErrchk(cudaMemcpy(input_d, in->mem, sizeof(T) * width_in * height,
+                       cudaMemcpyDeviceToHost));
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      input[y * width + x] = (TPrecision)input_d[y * width_in + x];
+    }
+  }
+
+  // prepare output buffer
+  TPrecision *output = new TPrecision[2 * (width / 2 + 1) * height];
+  fftTransform(input, width, height, output, true);
+
+  free(input);
+  free(input_d);
+
+  return output;
+}
+
+// inverse FFT
+template <class T, class TPrecision>
+void ifft(TPrecision *in, HipaccImage &out) {
+  int width = out->width;
+  int height = out->height;
+  int width_out = alignedWidth<T>(width, out->alignment);
+
+  // prepare output buffer
+  TPrecision *output = new TPrecision[width * height];
+
+  fftTransform(in, width, height, output, false, true);
+
+  // truncate values outside of range 0-255
+  T *out_d = new T[width_out * height];
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      T val = output[y * width + x];
+      if (output[y * width + x] < 0) {
+        val = 0;
+      } else if (output[y * width + x] > 255) {
+        val = 255;
+      }
+      // convert output
+      out_d[y * width_out + x] = (T)(val);
+    }
+  }
+
+  gpuErrchk(cudaMemcpy(out->mem, out_d, sizeof(T) * width_out * height,
+                       cudaMemcpyHostToDevice));
+
+  free(output);
+  free(out_d);
+}
+*/
+
+// new fft part
+
+// forward FFT
+template <class T, class TPrecision> TPrecision *fft(HipaccImage &in) {
+  int width = in->width;
+  int height = in->height;
+  int width_in = alignedWidth<T>(width, in->alignment);
+
+  // prepare output buffer
+  TPrecision *output;
+  cudaMalloc((void **)&output, sizeof(TPrecision) * 2 * (width / 2 + 1) * height);
+  TPrecision *input;
+  if (std::is_same<T, TPrecision>::value) {
+    input = (TPrecision *)(in->mem);
+  } else {
+    gpuErrchk(
+        cudaMalloc((void **)&input, sizeof(TPrecision) * width_in * height));
+    // define dimensions
+    dim3 threadsPerBlock(32, 32); // 1024 threads
+    dim3 numBlocks(width / threadsPerBlock.x + 1, height / threadsPerBlock.y + 1);
+
+    // convert the input image to TPrecision
+    CopyAligned<<<numBlocks, threadsPerBlock>>>((T *)(in->mem), input, width,
+                                                height, width_in, width_in);
+    HANDLE_ERROR(cudaGetLastError());
+  }
+  fftTransformDevice(input, width, height, output, true, false, width_in);
+  HANDLE_ERROR(cudaGetLastError());
+  if (!std::is_same<T, TPrecision>::value) {
+    cudaFree(input); // release the conversion buffer again
+  }
+  return output;
+}
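A hypothetical round trip through the device-resident wrappers; the pixel type uchar and the image img are assumptions, and the template argument order follows the reconstruction above:

    float *freq = fft<uchar, float>(img); // device buffer: 2*(w/2+1)*h floats
    // ... operate on freq in the frequency domain ...
    ifft<uchar>(freq, img);               // inverse FFT, scaled by 1/(w*h)
    cudaFree(freq);                       // freq is owned by the caller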
+
+// inverse FFT
+template <class T, class TPrecision>
+void ifft(TPrecision *in, HipaccImage &out) {
+  int width = out->width;
+  int height = out->height;
+  int width_out = alignedWidth<T>(width, out->alignment);
+
+  // prepare output buffer
+  TPrecision *output;
+  if (std::is_same<T, TPrecision>::value) {
+    output = (TPrecision *)(out->mem);
+  } else {
+    gpuErrchk(
+        cudaMalloc((void **)&output, sizeof(TPrecision) * width_out * height));
+  }
+  fftTransformDevice(in, width, height, output, false, true, width_out);
+
+  if (!std::is_same<T, TPrecision>::value) {
+    // define dimensions
+    dim3 threadsPerBlock(32, 32); // 1024 threads
+    dim3 numBlocks(width / threadsPerBlock.x + 1, height / threadsPerBlock.y + 1);
+
+    // convert the result back to the image's pixel type
+    CopyAligned<<<numBlocks, threadsPerBlock>>>(output, (T *)(out->mem), width,
+                                                height, width_out, width_out);
+    HANDLE_ERROR(cudaGetLastError());
+    cudaFree(output); // release the conversion buffer again
+  }
+}
+
+// create magnitude from fft
+template <class T, class TPrecision>
+void fftToMagnitude(TPrecision *in, HipaccImage &mag) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_out = alignedWidth<T>(width, mag->alignment);
+
+  T *tmp = new T[width * height];
+  std::complex<TPrecision> *input =
+      new std::complex<TPrecision>[(width / 2 + 1) * height];
+  gpuErrchk(cudaMemcpy(
+      input, in, sizeof(std::complex<TPrecision>) * (width / 2 + 1) * height,
+      cudaMemcpyDeviceToHost));
+  calcMagnitude(reinterpret_cast<std::complex<TPrecision> *>(input), tmp, width,
+                height);
+
+  shiftFFT(tmp, width, height);
+
+  T *out = new T[width_out * height];
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      out[y * width_out + x] = (T)tmp[y * width + x];
+    }
+  }
+
+  gpuErrchk(cudaMemcpy(mag->mem, out, sizeof(T) * width_out * height,
+                       cudaMemcpyHostToDevice));
+  delete[] tmp;
+  delete[] out;
+  delete[] input;
+}
+
+// create magnitude from dct
+template <class T, class TPrecision>
+void dctToMagnitude(TPrecision *in, HipaccImage &mag) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_out = alignedWidth<T>(width, mag->alignment);
+
+  T *tmp = new T[width * height];
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      tmp[y * width + x] = std::abs(in[y * width + x]);
+    }
+  }
+
+  float max = 0.0;
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      if (tmp[y * width + x] > max)
+        max = tmp[y * width + x];
+    }
+  }
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      tmp[y * width + x] =
+          (T)(255.0 * pow(tmp[y * width + x] * (1.0 / max), 1.0 / 4.0));
+    }
+  }
+
+  T *out = new T[width_out * height];
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      out[y * width_out + x] = (T)tmp[y * width + x];
+    }
+  }
+
+  gpuErrchk(cudaMemcpy(mag->mem, out, sizeof(T) * width_out * height,
+                       cudaMemcpyHostToDevice));
+  delete[] tmp;
+  delete[] out;
+}
+
+// apply mask mag to result of FFT in
+template <class T, class TPrecision>
+void fftScaleMagnitude(TPrecision *in, HipaccImage &mag) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_in = alignedWidth<T>(width, mag->alignment);
+  T *scale = new T[width_in * height];
+  std::complex<TPrecision> *input =
+      new std::complex<TPrecision>[(width / 2 + 1) * height];
+  gpuErrchk(cudaMemcpy(
+      input, in, sizeof(std::complex<TPrecision>) * (width / 2 + 1) * height,
+      cudaMemcpyDeviceToHost));
+  gpuErrchk(cudaMemcpy(scale, mag->mem, sizeof(T) * width_in * height,
+                       cudaMemcpyDeviceToHost));
+
+  iShiftFFT(scale, width, height, mag->alignment);
+
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < (width / 2 + 1); x++) {
+      input[(y * (width / 2 + 1) + x)] *=
+          (TPrecision)scale[y * width_in + x] / 255;
+    }
+  }
+
+  gpuErrchk(cudaMemcpy(
+      in, input, sizeof(std::complex<TPrecision>) * (width / 2 + 1) * height,
+      cudaMemcpyHostToDevice));
+
+  delete[] scale;
+  delete[] input;
+}
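Taken together, these helpers support a simple frequency-domain filtering pipeline. A sketch with assumed images img (uchar source) and mag (uchar magnitude visualization):

    float *freq = fft<uchar, float>(img);     // forward FFT
    fftToMagnitude<uchar>(freq, mag);         // shifted magnitude into mag
    fftApplyPassFilter<uchar>(mag, 40, true); // low pass: keep radius < 40
    fftScaleMagnitude<uchar>(freq, mag);      // reuse mag as per-bin mask
    ifft<uchar>(freq, img);                   // back to the spatial domain
    cudaFree(freq);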
+
+template <class TPrecision>
+void dct_forward_device(TPrecision *in, int width, int height, TPrecision *out,
+                        int width_in = 0) {
+  if (width_in == 0) {
+    width_in = width;
+  }
+  // define dimensions
+  dim3 threadsPerBlock(32, 32); // 1024 threads
+  dim3 numBlocks(width / threadsPerBlock.x + 1, height / threadsPerBlock.y + 1);
+
+  TPrecision *reorder;
+  gpuErrchk(cudaMalloc((void **)&reorder, sizeof(TPrecision) * width * height));
+
+  Reorder<<<numBlocks, threadsPerBlock>>>(in, reorder, width, height, width_in,
+                                          width, height);
+  HANDLE_ERROR(cudaGetLastError());
+
+  // prepare output buffer
+  int out_width = width / 2 + 1;
+  std::complex<TPrecision> *output;
+  gpuErrchk(cudaMalloc((void **)&output,
+                       sizeof(std::complex<TPrecision>) * out_width * height));
+
+  fftTransformDevice(reorder, width, height,
+                     reinterpret_cast<TPrecision *>(output), true, false);
+
+  Factoring<<<numBlocks, threadsPerBlock>>>(
+      reinterpret_cast<std::complex<TPrecision> *>(output), out, width, height,
+      width, height, out_width);
+  cudaDeviceSynchronize();
+  HANDLE_ERROR(cudaGetLastError());
+
+  cudaFree(reorder);
+  cudaFree(output);
+}
+
+template <class T, class TPrecision>
+void dct_forward(T *in, int width, int height, TPrecision *out,
+                 int width_in = 0) {
+  if (width_in == 0) {
+    width_in = width;
+  }
+
+  TPrecision *input = new TPrecision[width * height];
+
+  for (int y = 0; y < height / 2; y++) {
+    for (int x = 0; x < width / 2; x++) {
+      input[y * width + x] = (TPrecision)in[(2 * y) * width_in + (2 * x)];
+      input[y * width + (x + width / 2)] =
+          (TPrecision)in[(2 * y) * width_in + (2 * (width / 2 - x - 1) + 1)];
+      input[(y + height / 2) * width + x] =
+          (TPrecision)in[(2 * (height / 2 - y - 1) + 1) * width_in + (2 * x)];
+      input[(y + height / 2) * width + (x + width / 2)] =
+          (TPrecision)in[(2 * (height / 2 - y - 1) + 1) * width_in +
+                         (2 * (width / 2 - x - 1) + 1)];
+    }
+  }
+
+  // prepare output buffer
+  int out_width = width / 2 + 1;
+  std::complex<TPrecision> *output =
+      new std::complex<TPrecision>[out_width * height];
+
+  fftTransform(input, width, height, reinterpret_cast<TPrecision *>(output),
+               true);
+
+  const std::complex<TPrecision> i(0.0, 1.0);
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width / 2 + 1; x++) {
+      std::complex<TPrecision> factor =
+          std::exp(-i * std::complex<TPrecision>(M_PI * y / (2 * height)));
+      std::complex<TPrecision> factor2 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * x / (2 * width)));
+      std::complex<TPrecision> factor3 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * (-x) / (2 * width)));
+      std::complex<TPrecision> factor4 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * (-y) / (2 * height)));
+      if (x == 0 && y == 0) {
+        out[y * width + x] =
+            4.0 * (factor * (factor2 * output[y * out_width + x])).real();
+      } else if (x == 0) {
+        out[y * width + x] =
+            2.0 * (factor2 * (factor * output[y * out_width + x] +
+                              factor4 * output[(height - y) * out_width + x]))
+                      .real();
+      } else if (y == 0) {
+        out[y * width + x] =
+            2.0 * (factor * (factor2 * output[y * out_width + x] +
+                             factor3 * std::conj(output[y * out_width + x])))
+                      .real();
+      } else {
+        out[y * width + x] =
+            2.0 *
+            (factor * (factor2 * output[y * out_width + x] +
+                       factor3 * std::conj(output[(height - y) * out_width + x])))
+                .real();
+      }
+    }
+  }
+
+  for (int y = 0; y < height; y++) {
+    for (int x = width / 2 + 1; x < width; x++) {
+      std::complex<TPrecision> factor =
+          std::exp(-i * std::complex<TPrecision>(M_PI * y / (2 * height)));
+      std::complex<TPrecision> factor2 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * x / (2 * width)));
+      std::complex<TPrecision> factor3 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * (-x) / (2 * width)));
+      std::complex<TPrecision> factor4 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * (-y) / (2 * height)));
+      if (x == 0 && y == 0) {
+        out[y * width + x] =
+            4.0 *
+            (factor * (factor2 * std::conj(output[y * out_width + width - x])))
+                .real();
+      } else if (x == 0) {
+        out[y * width + x] =
+            2.0 *
+            (factor2 *
+             (factor * std::conj(output[(height - y) * out_width + width - x]) +
+              factor4 * std::conj(output[y * out_width + width - x])))
+                .real();
+      } else if (y == 0) {
+        out[y * width + x] =
+            2.0 *
+            (factor * (factor2 * std::conj(output[y * out_width + width - x]) +
+                       factor3 * output[y * out_width + width - x]))
+                .real();
+      } else {
+        out[y * width + x] =
+            2.0 *
+            (factor *
+             (factor2 * std::conj(output[(height - y) * out_width + width - x]) +
+              factor3 * output[y * out_width + width - x]))
+                .real();
+      }
+    }
+  }
+
+  delete[] input;
+  delete[] output;
+}
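The forward path above is Makhoul's even/odd reordering scheme for computing a DCT-II with a single FFT of the same length. In one dimension, with the reordering used here,

    v_n = x_{2n},  v_{N-1-n} = x_{2n+1},    0 <= n < N/2
    X_k = 2 Re( e^{-i*pi*k/(2N)} * V_k ),   V = FFT(v)

The factor/factor2 twiddles apply this separably in y and x, and the conjugate branches reconstruct the bins that the R2C transform does not store explicitly.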
+
+template <class TPrecision, class T>
+void dct_inverse_device(TPrecision *in, int width, int height, T *out,
+                        int width_out = 0) {
+  if (width_out == 0) {
+    width_out = width;
+  }
+
+  int in_width = width / 2 + 1;
+
+  // define dimensions
+  dim3 threadsPerBlock(32, 32); // 1024 threads
+  dim3 numBlocks(width_out / threadsPerBlock.x + 1,
+                 height / threadsPerBlock.y + 1);
+
+  // prepare buffers
+  std::complex<TPrecision> *input;
+  gpuErrchk(cudaMalloc((void **)&input,
+                       sizeof(std::complex<TPrecision>) * in_width * height));
+  TPrecision *output;
+  gpuErrchk(cudaMalloc((void **)&output, sizeof(TPrecision) * width * height));
+
+  iFactoring<<<numBlocks, threadsPerBlock>>>(
+      in, reinterpret_cast<std::complex<TPrecision> *>(input), width, height,
+      width, height);
+  cudaDeviceSynchronize();
+  HANDLE_ERROR(cudaGetLastError());
+
+  fftTransformDevice(reinterpret_cast<TPrecision *>(input), width, height,
+                     output, false, true);
+
+  iReorder<<<numBlocks, threadsPerBlock>>>(output, out, width, height, width_out,
+                                           width, height);
+  cudaDeviceSynchronize();
+  HANDLE_ERROR(cudaGetLastError());
+
+  // release the intermediate buffers
+  cudaFree(input);
+  cudaFree(output);
+}
+
+template <class TPrecision, class T>
+void dct_inverse(TPrecision *in, int width, int height, T *out,
+                 int width_out = 0) {
+  if (width_out == 0) {
+    width_out = width;
+  }
+
+  int in_width = width / 2 + 1;
+
+  std::complex<TPrecision> *input =
+      new std::complex<TPrecision>[in_width * height];
+
+  const std::complex<TPrecision> i(0.0, 1.0);
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < in_width; x++) {
+      std::complex<TPrecision> factor3 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * (-x) / (2 * width)));
+      std::complex<TPrecision> factor4 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * (-y) / (2 * height)));
+      if (x == 0 && y == 0) {
+        input[y * in_width + x] = std::complex<TPrecision>(0.25, 0.0) * factor4 *
+                                  factor3 *
+                                  std::complex<TPrecision>(in[y * width + x]);
+      } else if (y == 0) {
+        input[y * in_width + x] =
+            std::complex<TPrecision>(0.25, 0.0) * factor4 * factor3 *
+            (std::complex<TPrecision>(in[y * width + x]) -
+             i * std::complex<TPrecision>(in[y * width + width - x]));
+      } else if (x == 0) {
+        input[y * in_width + x] =
+            std::complex<TPrecision>(0.25, 0.0) * factor4 * factor3 *
+            (std::complex<TPrecision>(in[y * width + x]) -
+             i * std::complex<TPrecision>(in[(height - y) * width + x]));
+      } else {
+        input[y * in_width + x] =
+            std::complex<TPrecision>(0.25, 0.0) * factor4 * factor3 *
+            ((std::complex<TPrecision>(in[y * width + x]) -
+              std::complex<TPrecision>(in[(height - y) * width + width - x])) -
+             i * (std::complex<TPrecision>(in[(height - y) * width + x]) +
+                  std::complex<TPrecision>(in[y * width + width - x])));
+      }
+    }
+  }
+
+  // prepare output buffer
+  TPrecision *output = new TPrecision[width * height];
+
+  fftTransform(reinterpret_cast<TPrecision *>(input), width, height, output,
+               false, true);
+
+  for (int y = 0; y < height / 2; y++) {
+    for (int x = 0; x < width / 2; x++) {
+      out[(2 * y) * width_out + (2 * x)] = (T)output[y * width + x];
+      out[(2 * y) * width_out + (2 * (width / 2 - x - 1) + 1)] =
+          (T)output[y * width + (x + width / 2)];
+      out[(2 * (height / 2 - y - 1) + 1) * width_out + (2 * x)] =
+          (T)output[(y + height / 2) * width + x];
+      out[(2 * (height / 2 - y - 1) + 1) * width_out +
+          (2 * (width / 2 - x - 1) + 1)] =
+          (T)output[(y + height / 2) * width + (x + width / 2)];
+    }
+  }
+
+  delete[] input;
+  delete[] output;
+}
+
+template <class TPrecision>
+void dctTransform(TPrecision *in, int width, int height, TPrecision *out,
+                  bool forward = true) {
+  if (forward) {
+    dct_forward(in, width, height, out);
+  } else {
+    dct_inverse(in, width, height, out);
+  }
+}
+
+template <class TPrecision>
+void dctTransformDevice(TPrecision *in, int width, int height, TPrecision *out,
+                        bool forward = true) {
+  if (forward) {
+    dct_forward_device(in, width, height, out);
+  } else {
+    dct_inverse_device(in, width, height, out);
+  }
+}
+
+// forward FCT
+template <class T, class TPrecision> TPrecision *dct(HipaccImage &in) {
+  int width = in->width;
+  int height = in->height;
+  int width_in = alignedWidth<T>(width, in->alignment);
+
+  // prepare input buffer
+  T *input_d = new T[width_in * height];
+  // convert input
+  gpuErrchk(cudaMemcpy(input_d, in->mem, sizeof(T) * width_in * height,
+                       cudaMemcpyDeviceToHost));
+
+  TPrecision *out = new TPrecision[width * height];
+  dct_forward(input_d, width, height, out, width_in);
+
+  delete[] input_d;
+
+  return out;
+}
+
+// inverse FCT
+template <class T, class TPrecision>
+void idct(TPrecision *in, HipaccImage &out) {
+  int width = out->width;
+  int height = out->height;
+  int width_out = alignedWidth<T>(width, out->alignment);
+
+  T *out_d = new T[width_out * height];
+
+  dct_inverse(in, width, height, out_d, width_out);
+
+  gpuErrchk(cudaMemcpy(out->mem, out_d, sizeof(T) * width_out * height,
+                       cudaMemcpyHostToDevice));
+
+  delete[] out_d;
+}
+
+/*
+template <class T, class TPrecision> TPrecision *dct(HipaccImage &in) {
+  int width = in->width;
+  int height = in->height;
+  int width_in = alignedWidth<T>(width, in->alignment);
+
+  // prepare output buffer
+  TPrecision *output;
+  gpuErrchk(cudaMalloc((void **)&output, sizeof(TPrecision) * width * height));
+  dct_forward_device((TPrecision *)(in->mem), width, height, output, width_in);
+
+  return output;
+}
+
+// inverse FCT
+template <class T, class TPrecision>
+void idct(TPrecision *in, HipaccImage &out) {
+  int width = out->width;
+  int height = out->height;
+  int width_out = alignedWidth<T>(width, out->alignment);
+
+  //dct_inverse_device(in, width, height, (TPrecision *)(out->mem), width_out);
+  dct_inverse_device(in, 1000, 100, (TPrecision *)(out->mem));
+}
+*/
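Hypothetical use of the host-side DCT wrappers (img and mag are assumed HipaccImages of uchar, and the template argument order follows the reconstruction above):

    float *coeff = dct<uchar, float>(img); // host buffer, w*h coefficients
    dctToMagnitude<uchar>(coeff, mag);     // visualize the coefficient spectrum
    idct<uchar>(coeff, img);               // inverse DCT back into img
    delete[] coeff;                        // dct() allocates with new[]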
+
+// function wrappers for images
+
+template <class T> void fftShift(HipaccImage &mag) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_in = alignedWidth<T>(width, mag->alignment);
+  T *input = new T[width_in * height];
+  gpuErrchk(cudaMemcpy(input, mag->mem, sizeof(T) * width_in * height,
+                       cudaMemcpyDeviceToHost));
+
+  shiftFFT(input, width, height, width_in);
+
+  gpuErrchk(cudaMemcpy(mag->mem, input, sizeof(T) * width_in * height,
+                       cudaMemcpyHostToDevice));
+  delete[] input;
+}
+
+template <class T> void ifftShift(HipaccImage &mag) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_in = alignedWidth<T>(width, mag->alignment);
+  T *input = new T[width_in * height];
+  gpuErrchk(cudaMemcpy(input, mag->mem, sizeof(T) * width_in * height,
+                       cudaMemcpyDeviceToHost));
+
+  iShiftFFT(input, width, height, width_in);
+
+  gpuErrchk(cudaMemcpy(mag->mem, input, sizeof(T) * width_in * height,
+                       cudaMemcpyHostToDevice));
+  delete[] input;
+}
+
+template <class T>
+void fftResetMask(HipaccImage &mag, int radius, bool low, int window = 0) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_in = alignedWidth<T>(width, mag->alignment);
+  T *input = new T[width_in * height];
+  gpuErrchk(cudaMemcpy(input, mag->mem, sizeof(T) * width_in * height,
+                       cudaMemcpyDeviceToHost));
+
+  magResetFreq(input, width, height, width_in, radius, window, low);
+
+  gpuErrchk(cudaMemcpy(mag->mem, input, sizeof(T) * width_in * height,
+                       cudaMemcpyHostToDevice));
+  delete[] input;
+}
+
+template <class T>
+void fftApplyPassFilter(HipaccImage &mag, int radius, bool low, int window = 0) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_in = alignedWidth<T>(width, mag->alignment);
+  T *input = new T[width_in * height];
+  gpuErrchk(cudaMemcpy(input, mag->mem, sizeof(T) * width_in * height,
+                       cudaMemcpyDeviceToHost));
+
+  magPassFilter(input, width, height, width_in, radius, window, low);
+
+  gpuErrchk(cudaMemcpy(mag->mem, input, sizeof(T) * width_in * height,
+                       cudaMemcpyHostToDevice));
+  delete[] input;
+}
diff --git a/runtime/hipacc_cu_standalone.hpp.cmake b/runtime/hipacc_cu_standalone.hpp.cmake
index d2c54166..68ef83b7 100644
--- a/runtime/hipacc_cu_standalone.hpp.cmake
+++ b/runtime/hipacc_cu_standalone.hpp.cmake
@@ -234,6 +234,19 @@ void hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid,
 }
 
+// Launch kernel async
+void hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, void **args, cudaStream_t stream, cudaEvent_t waitEvent, cudaEvent_t recordEvent) {
+  if (waitEvent) {
+    cudaStreamWaitEvent(stream, waitEvent, 0);
+  }
+  cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, stream);
+  checkErr(err, "cudaLaunchKernel(" + kernel_name + ")");
+  if (recordEvent) {
+    cudaEventRecord(recordEvent, stream);
+  }
+}
+
+
 // Benchmark timing for a kernel call
 void hipaccLaunchKernelBenchmark(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, std::vector<void *> args, bool print_timing) {
   std::vector<float> times;
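The asynchronous overload chains launches through events instead of blocking the host. A minimal usage sketch, with kernel_ptr, grid, block, and the argument buffers assumed to exist:

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaEvent_t done;
    cudaEventCreate(&done);
    void *args[] = {&d_in, &d_out, &width, &height};
    // launch without waiting on any event, record `done` afterwards
    hipaccLaunchKernel(kernel_ptr, "my_kernel", grid, block, args,
                       stream, nullptr, done);
    cudaEventSynchronize(done); // host blocks only when the result is needed
    cudaEventDestroy(done);
    cudaStreamDestroy(stream);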
diff --git a/runtime/hipacc_fft_helper.hpp b/runtime/hipacc_fft_helper.hpp
new file mode 100644
index 00000000..b1271114
--- /dev/null
+++ b/runtime/hipacc_fft_helper.hpp
@@ -0,0 +1,263 @@
+// This is a helper file for both hipacc_cpu_fftw.hpp and hipacc_cu_cufft.hpp
+
+#include <cmath>
+#include <complex>
+#include <cstdlib>
+#include <cstring>
+
+// https://stackoverflow.com/questions/8204645/implementing-gaussian-blur-how-to-calculate-convolution-matrix-kernel#8204880
+void calcGauss(float *kernel, int W) {
+  double sigma = 1;
+  double mean = W / 2;
+  double sum = 0.0; // For accumulating the kernel values
+  for (int x = 0; x < W; ++x) {
+    for (int y = 0; y < W; ++y) {
+      kernel[x * W + y] = exp(-0.5 * (pow((x - mean) / sigma, 2.0) +
+                                      pow((y - mean) / sigma, 2.0))) /
+                          (2 * M_PI * sigma * sigma);
+
+      // Accumulate the kernel values
+      sum += kernel[x * W + y];
+    }
+  }
+  // Normalize the kernel
+  for (int x = 0; x < W; ++x)
+    for (int y = 0; y < W; ++y)
+      kernel[x * W + y] /= sum;
+}
+// boxblur
+void calcBox(float *kernel, int W) {
+  double count = W * W;
+  for (int x = 0; x < count; ++x) {
+    kernel[x] = 1.0 / count;
+  }
+}
+
+template <class T, class U>
+void putKernel(T *kernel, U *dest, int kx, int ky, int nx, int ny) {
+  // centered
+  /*for (int y = -ky/2; y <= ky/2; y++) {
+    for (int x = -kx/2; x <= kx/2; x++) {
+      dest[nx * (ny/2 + y) + (nx/2 + x)] = kernel[kx * (ky/2+y) + (kx/2+x)];
+    }
+  }*/
+
+  // edges wrap-around
+  for (int y = -ky / 2; y <= ky / 2; y++) {
+    for (int x = -kx / 2; x <= kx / 2; x++) {
+      dest[nx * ((ny + y) % ny) + ((nx + x) % nx)] =
+          kernel[kx * (ky / 2 + y) + (kx / 2 + x)];
+    }
+  }
+  // dest[0] = 1.0;
+}
+
+template <class T, class U>
+void putKernelComplex(T *kernel, std::complex<U> *dest, int kx, int ky, int nx,
+                      int ny) {
+  // edges wrap-around
+  for (int y = -ky / 2; y <= ky / 2; y++) {
+    for (int x = -kx / 2; x <= kx / 2; x++) {
+      dest[nx * ((ny + y) % ny) + ((nx + x) % nx)].real(
+          kernel[kx * (ky / 2 + y) + (kx / 2 + x)]);
+      dest[nx * ((ny + y) % ny) + ((nx + x) % nx)].imag(0);
+    }
+  }
+}
+
+// only unshifted
+template <class T, class U>
+void calcMagnitude(std::complex<T> *in, U *out, int width, int height) {
+  // in is result of fft (r2c: only left half (width/2+1)*height)
+  int width_in = width / 2 + 1;
+  int N = width * height;
+  float *magnitude = (float *)malloc(sizeof(float) * width * height);
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width_in; x++) {
+      magnitude[y * width + x] = std::abs(in[y * width_in + x]);
+      if (x < width / 2)
+        magnitude[width * height - (y * width + x) - 1] =
+            std::abs(in[y * width_in + x + 1]); // point mirror
+    }
+  }
+  float max = 0.0;
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width_in; x++) {
+      if (magnitude[y * width + x] > max)
+        max = magnitude[y * width + x];
+    }
+  }
+  for (int i = 0; i < N; ++i) {
+    out[i] = (U)(255.0 * pow(magnitude[i] * (1.0 / max), 1.0 / 4.0));
+  }
+  free(magnitude);
+}
+
+// 2d access to 1d array
+inline int linearize(int w, int x, int y) { return y * w + x; }
+
+// get the next width that is aligned to `alignment` bytes for elements of type T
+template <class T> int alignedWidth(int width, int alignment) {
+  alignment /= sizeof(T);
+  int res = width;
+  int rest = res % alignment;
+  if (rest != 0) {
+    res += alignment - (rest);
+  }
+  return res;
+}
+
+// get the smallest int >= val that is divisible by div
+int nextDiv(int val, int div) {
+  int newVal = val;
+  int rest = newVal % div;
+  if (rest != 0) {
+    newVal += div - (rest);
+  }
+  return newVal;
+}
+
+// https://stackoverflow.com/questions/466204/rounding-up-to-next-power-of-2
+unsigned long upperPowerOfTwo(unsigned long v) {
+  v--;
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  v++;
+  return v;
+}
+
+// https://stackoverflow.com/questions/4424374/determining-if-a-number-is-prime
+bool isPrime(int number) {
+  if (number < 2)
+    return false;
+  if (number == 2)
+    return true;
+  if (number % 2 == 0)
+    return false;
+  for (int i = 3; (i * i) <= number; i += 2) {
+    if (number % i == 0)
+      return false;
+  }
+  return true;
+}
+
+// shift the zero-frequency component to the center of the image
+template <class T>
+void shiftFFT(T *image, int width, int height, int alignment = 0) {
+  int width_align = width;
+  if (alignment) {
+    width_align = alignedWidth<T>(width, alignment);
+  }
+  T *tmp = new T[width_align * height];
+
+  int y_inc = height / 2;
+  int x_inc = width / 2;
+  int cpy_len_l = sizeof(T) * (width / 2);
+  int cpy_len_r = sizeof(T) * (width / 2);
+  if (width % 2) {
+    cpy_len_l += sizeof(T); // the left half is one element wider for odd widths
+  }
+  int src_x_l = 0;
+  int src_x_r = width / 2;
+  if (width % 2) {
+    src_x_r += 1;
+  }
+
+  for (int y = 0; y < height; y++) {
+    memcpy(&tmp[linearize(width_align, (src_x_l + x_inc) % width,
+                          (y + y_inc) % height)],
+           &image[linearize(width_align, src_x_l, y)], cpy_len_l);
+    memcpy(&tmp[linearize(width_align, (src_x_r + x_inc) % width,
+                          (y + y_inc) % height)],
+           &image[linearize(width_align, src_x_r, y)], cpy_len_r);
+  }
+
+  memcpy(image, tmp, sizeof(T) * width_align * height);
+  delete[] tmp;
+}
+
+// inverse shift the zero-frequency
+template <class T>
+void iShiftFFT(T *image, int width, int height, int alignment = 0) {
+  int width_align = width;
+  if (alignment) {
+    width_align = alignedWidth<T>(width, alignment);
+  }
+  T *tmp = new T[width_align * height];
+
+  int y_inc = height / 2;
+  int x_inc = width / 2;
+  if (width % 2) {
+    y_inc += 1;
+    x_inc += 1;
+  }
+  int cpy_len_l = sizeof(T) * (width / 2);
+  int cpy_len_r = sizeof(T) * (width / 2);
+  if (width % 2) {
+    cpy_len_r += sizeof(T); // the right half is one element wider for odd widths
+  }
+  int src_x_l = 0;
+  int src_x_r = width / 2;
+
+  for (int y = 0; y < height; y++) {
+    memcpy(&tmp[linearize(width_align, (src_x_l + x_inc) % width,
+                          (y + y_inc) % height)],
+           &image[linearize(width_align, src_x_l, y)], cpy_len_l);
+    memcpy(&tmp[linearize(width_align, (src_x_r + x_inc) % width,
+                          (y + y_inc) % height)],
+           &image[linearize(width_align, src_x_r, y)], cpy_len_r);
+  }
+
+  memcpy(image, tmp, sizeof(T) * width_align * height);
+  delete[] tmp;
+}
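alignedWidth converts a byte alignment into a row pitch in elements, rounding the width up to the next multiple. Worked examples (values computed from the definition above):

    alignedWidth<float>(640, 256);  // -> 640  (64 floats per 256 B; 640 = 10*64)
    alignedWidth<float>(1000, 256); // -> 1024 (1000 % 64 == 40, round up by 24)
    alignedWidth<uchar>(1000, 256); // -> 1024 (256 elements per 256 B chunk)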
+
+// hann window of size N for cos weighted gradient
+float hannWindow(float N, float x) {
+  if (x > N)
+    return 1;
+  N *= 2;
+  return 0.5 * (1.0 - cos(2.0 * M_PI * x / N));
+}
+
+// reset mask values for low frequencies below r with gradient blur of size w
+template <class T>
+void magResetFreq(T *image, int width, int height, int alignment, float r,
+                  float w, bool low) {
+  int width_align = alignedWidth<T>(width, alignment);
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      int xd = x - width / 2;
+      int yd = y - height / 2;
+      float d = sqrt(xd * xd + yd * yd);
+      if (low && d < r) {
+        image[y * width_align + x] =
+            255 - (255 - image[y * width_align + x]) * (1 - hannWindow(w, r - d));
+      } else if (!low && d > r) {
+        image[y * width_align + x] =
+            255 - (255 - image[y * width_align + x]) * (1 - hannWindow(w, d - r));
+      }
+    }
+  }
+}
+
+// pass filter: keep frequencies below (low) or above (high) radius r,
+// with gradient blur of size w
+template <class T>
+void magPassFilter(T *image, int width, int height, int alignment, int r,
+                   int w, bool low) {
+  int width_align = alignedWidth<T>(width, alignment);
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      int xd = x - width / 2;
+      int yd = y - height / 2;
+      float d = sqrt(xd * xd + yd * yd);
+      if (low && d > r) {
+        image[y * width_align + x] *= (1 - hannWindow(w, d - r));
+      } else if (!low && d