diff --git a/CMakeLists.txt b/CMakeLists.txt
index 84a7191a..fe124651 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,7 @@ find_program(clang NAMES clang PATHS ${LLVM_TOOLS_BINARY_DIR})
 find_package(CUDA)
 find_package(NVML)
 find_package(OpenCL)
+find_package(FFTW)
 
 if(CUDA_FOUND AND CUDA_VERSION VERSION_LESS "7.0")
   message(WARNING "At least CUDA version 7.0 required, but found CUDA version ${CUDA_VERSION}.")
@@ -86,6 +87,7 @@ message(STATUS "CUDA support: ${CUDA_FOUND}")
 message(STATUS "OpenCL support: ${OpenCL_FOUND}")
 message(STATUS "Polly support: ${USE_POLLY}")
 message(STATUS "JIT estimates: ${USE_JIT_ESTIMATE}")
+message(STATUS "FFTW support: ${FFTW_FOUND}")
 message(STATUS "===")
@@ -184,3 +186,8 @@ endif()
 if(EXISTS ${CMAKE_SOURCE_DIR}/samples/CMakeLists.txt)
   add_subdirectory(samples)
 endif()
+
+# add apps if available
+if(EXISTS ${CMAKE_SOURCE_DIR}/apps/CMakeLists.txt)
+  add_subdirectory(apps)
+endif()
diff --git a/LICENSE b/LICENSE
index 3f7f789a..3cc17a66 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,3 +1,4 @@
+Copyright (c) 2020, University of Erlangen-Nuremberg
 Copyright (c) 2014, Saarland University
 Copyright (c) 2012, University of Erlangen-Nuremberg
 Copyright (c) 2012, Siemens AG
diff --git a/README.md b/README.md
index 9dc7d367..d4e4db88 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,8 @@
+This is a fork of the [Hipacc](http://hipacc-lang.org) project, with primarily experimental features:
+* Asynchronous runtime APIs
+* Multi-stream implementation for multiresolution filters in CUDA
+
+
 # Hipacc
 A domain-specific language and compiler for image processing
diff --git a/_clang-format b/_clang-format
new file mode 100644
index 00000000..a81aaae7
--- /dev/null
+++ b/_clang-format
@@ -0,0 +1,121 @@
+---
+Language: Cpp
+# BasedOnStyle: LLVM
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Right
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: MultiLine
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterClass: false
+  AfterControlStatement: false
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: false
+  BeforeElse: false
+  IndentBraces: false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit: 82
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks: Preserve
+IncludeCategories:
+  - Regex: '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority: 2
+  - Regex: '^(<|"(gtest|gmock|isl|json)/)'
+    Priority: 3
+  - Regex: '.*'
+    Priority: 1
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+IndentPPDirectives: None
+IndentWidth: 2
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Right
+ReflowComments: true
+SortIncludes: true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth: 8
+UseTab: Never
+...
diff --git a/apps/6_Multiresolution_Filters/Laplacian_Pyramid_Encoding/src/main.cpp b/apps/6_Multiresolution_Filters/Laplacian_Pyramid_Encoding/src/main.cpp
new file mode 100644
index 00000000..b69dcaad
--- /dev/null
+++ b/apps/6_Multiresolution_Filters/Laplacian_Pyramid_Encoding/src/main.cpp
@@ -0,0 +1,212 @@
+//
+// Copyright (c) 2020, University of Erlangen-Nuremberg
+// Copyright (c) 2012, University of Erlangen-Nuremberg
+// Copyright (c) 2012, Siemens AG
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#include "hipacc.hpp"
+#include <iostream>
+#include <cstdlib>
+
+#define SIZE_X 3
+#define SIZE_Y 3
+#define WIDTH 1024
+#define HEIGHT 1024
+
+using namespace hipacc;
+using namespace hipacc::math;
+
+
+class Gaussian : public Kernel<char> {
+  private:
+    Accessor<char> &input;
+    Mask<float> &mask;
+
+  public:
+    Gaussian(IterationSpace<char> &iter, Accessor<char> &input,
+             Mask<float> &mask)
+          : Kernel(iter), input(input), mask(mask) {
+      add_accessor(&input);
+    }
+
+    void kernel() {
+      output() = convolve(mask, Reduce::SUM, [&] () {
+          return input(mask) * mask();
+        });
+    }
+
+    void _operatePyramidReduce() {}
+};
+
+class DifferenceOfGaussian : public Kernel<char> {
+  private:
+    Accessor<char> &input1;
+    Accessor<char> &input2;
+
+  public:
+    DifferenceOfGaussian(IterationSpace<char> &iter, Accessor<char> &input1,
+                         Accessor<char> &input2)
+          : Kernel(iter), input1(input1), input2(input2) {
+      add_accessor(&input1);
+      add_accessor(&input2);
+    }
+
+    void kernel() {
+      output() = input1() - input2();
+    }
+
+    void _operatePyramidFilter() {}
+};
+
+class Blend : public Kernel<char> {
+  private:
+    Accessor<char> &input1;
+    Accessor<char> &input2;
+
+  public:
+    Blend(IterationSpace<char> &iter, Accessor<char> &input1,
+          Accessor<char> &input2)
+          : Kernel(iter), input1(input1), input2(input2) {
+      add_accessor(&input1);
+      add_accessor(&input2);
+    }
+
+    void kernel() {
+      output() = (short)input1() + (short)input2() / 2;
+    }
+
+    void _operatePyramidExpand() {}
+};
+
+
+/*************************************************************************
+ * Main function                                                         *
+ *************************************************************************/
+int main(int argc, const char **argv) {
+    const int width = WIDTH;
+    const int height = HEIGHT;
+    const int size_x = SIZE_X;
+    const int size_y = SIZE_Y;
+    float timing = 0;
+
+    // only filter kernel sizes 3x3, 5x5, and 7x7 implemented
+    if (size_x != size_y || !(size_x == 3 || size_x == 5 || size_x == 7)) {
+        std::cout << "Wrong filter kernel size. "
+                  << "Currently supported values: 3x3, 5x5, and 7x7!"
+                  << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    // convolution filter mask
+    const float coef[SIZE_Y][SIZE_X] = {
+#if SIZE_X == 3
+        { 0.057118f, 0.124758f, 0.057118f },
+        { 0.124758f, 0.272496f, 0.124758f },
+        { 0.057118f, 0.124758f, 0.057118f }
+#endif
+#if SIZE_X == 5
+        { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f },
+        { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f },
+        { 0.026151f, 0.090339f, 0.136565f, 0.090339f, 0.026151f },
+        { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f },
+        { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f }
+#endif
+#if SIZE_X == 7
+        { 0.000841f, 0.003010f, 0.006471f, 0.008351f, 0.006471f, 0.003010f, 0.000841f },
+        { 0.003010f, 0.010778f, 0.023169f, 0.029902f, 0.023169f, 0.010778f, 0.003010f },
+        { 0.006471f, 0.023169f, 0.049806f, 0.064280f, 0.049806f, 0.023169f, 0.006471f },
+        { 0.008351f, 0.029902f, 0.064280f, 0.082959f, 0.064280f, 0.029902f, 0.008351f },
+        { 0.006471f, 0.023169f, 0.049806f, 0.064280f, 0.049806f, 0.023169f, 0.006471f },
+        { 0.003010f, 0.010778f, 0.023169f, 0.029902f, 0.023169f, 0.010778f, 0.003010f },
+        { 0.000841f, 0.003010f, 0.006471f, 0.008351f, 0.006471f, 0.003010f, 0.000841f }
+#endif
+    };
+
+    // host memory for random generated image of width x height pixels
+    char *input = new char[width * height];
+    for (int y = 0; y < height; ++y) {
+        for (int x = 0; x < width; ++x) {
+            input[y * width + x] = (char)((y * width + x) % 19);
+        }
+    }
+
+    std::cout << "Calculating Hipacc Gaussian Laplacian pyramid ..."
+              << std::endl;
+
+    //************************************************************************//
+
+    // input and output image of width x height pixels
+    Image<char> gaus(width, height);
+    Image<char> lap(width, height);
+    Mask<float> mask(coef);
+
+    const int depth = 10;
+    Pyramid<char> pgaus(gaus, depth);
+    Pyramid<char> plap(lap, depth);
+
+    gaus = input;
+    traverse(pgaus, plap, [&] () {
+        if (!pgaus.is_top_level()) {
+            // construct Gaussian pyramid
+            BoundaryCondition<char> bound(pgaus(-1), mask, Boundary::CLAMP);
+            Accessor<char> acc1(bound);
+            IterationSpace<char> iter1(pgaus(0));
+            Gaussian blur(iter1, acc1, mask);
+            blur.execute();
+
+            // construct Laplacian pyramid
+            Accessor<char> acc3(pgaus(-1));
+            Accessor<char> acc4(pgaus(0), Interpolate::LF);
+            IterationSpace<char> iter3(plap(-1));
+            DifferenceOfGaussian DoG(iter3, acc3, acc4);
+            DoG.execute();
+        }
+
+        traverse();
+
+        // collapse pyramids
+        if (!pgaus.is_bottom_level()) {
+            // blend final output image from Laplacian pyramid
+            Accessor<char> acc3(plap(1), Interpolate::LF);
+            Accessor<char> acc4(plap(0));
+            IterationSpace<char> iter2(plap(0));
+            Blend blend(iter2, acc3, acc4);
+            blend.execute();
+        }
+    });
+
+    // get pointer to result data
+    char *output = lap.data();
+
+    //************************************************************************//
+
+    // convert to uchar for visualization
+    for (int p = 0; p < width*height; ++p) {
+        output[p] = (char)(output[p] + 127);
+    }
+
+    // free memory
+    delete[] input;
+
+    return EXIT_SUCCESS;
+}
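Note on the app above: the nested traverse() call recurses one pyramid level deeper, so the code before it runs on the way down (reduce and Laplacian construction) and the code after it on the way back up (collapse). The kernels rely on the Laplacian-pyramid identity L[i] = G[i] - expand(G[i+1]), which makes the collapse G[i] = L[i] + expand(G[i+1]) exact by construction. A minimal standalone C++ sketch of that identity (1-D, toy filters, independent of Hipacc; helper names are illustrative only):

    // Two-level 1-D pyramid round trip: reconstruction is exact regardless of
    // which reduce/expand filters are used, because the Laplacian level stores
    // exactly the information the expand step loses.
    #include <cassert>
    #include <vector>

    static std::vector<float> reduce2(const std::vector<float> &g) {
      std::vector<float> r(g.size() / 2);
      for (size_t i = 0; i < r.size(); ++i)
        r[i] = 0.5f * (g[2 * i] + g[2 * i + 1]); // toy 2-tap "Gaussian"
      return r;
    }

    static std::vector<float> expand2(const std::vector<float> &g, size_t n) {
      std::vector<float> e(n);
      for (size_t i = 0; i < n; ++i)
        e[i] = g[i / 2]; // nearest-neighbor upsample
      return e;
    }

    int main() {
      std::vector<float> g0 = {1, 2, 3, 4, 5, 6, 7, 8};
      auto g1 = reduce2(g0);            // Gaussian level 1
      auto up = expand2(g1, g0.size());
      std::vector<float> l0(g0.size()); // Laplacian level 0
      for (size_t i = 0; i < g0.size(); ++i) l0[i] = g0[i] - up[i];
      // collapse: L[0] + expand(G[1]) reproduces G[0]
      for (size_t i = 0; i < g0.size(); ++i) assert(l0[i] + up[i] == g0[i]);
    }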
diff --git a/apps/7_FFT/DFT_DCT/src/main.cpp b/apps/7_FFT/DFT_DCT/src/main.cpp
new file mode 100644
index 00000000..e03d7161
--- /dev/null
+++ b/apps/7_FFT/DFT_DCT/src/main.cpp
@@ -0,0 +1,139 @@
+//
+// Copyright (c) 2012, University of Erlangen-Nuremberg
+// Copyright (c) 2012, Siemens AG
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#include "hipacc.hpp"
+
+#include <iostream>
+#include <cstdlib>
+
+/*#define WIDTH 4032
+#define HEIGHT 3024
+#define IMAGE "../../common/img/fuerte_ship.jpg"*/
+#define WIDTH 600
+#define HEIGHT 450
+#define IMAGE "../../common/img/halftone-face.jpg"
+
+#define TYPE float
+#define TYPE2 float
+
+using namespace hipacc;
+using namespace hipacc::math;
+
+/*************************************************************************
+ * Main function                                                         *
+ *************************************************************************/
+int main(int argc, const char **argv) {
+  const int width = WIDTH;
+  const int height = HEIGHT;
+
+  // host memory for image of width x height pixels
+  TYPE *input = load_data<TYPE>(width, height, 1, IMAGE);
+
+  // std::cout << "Calculating Hipacc Gaussian filter ..." << std::endl;
+
+  //************************************************************************//
+
+  // input and output image of width x height pixels
+  Image<TYPE> in(width, height, input);
+  Image<TYPE> out_dct(width, height);
+  Image<TYPE> out_fft(width, height);
+  Image<TYPE> out_mag_dct(width, height);
+  Image<TYPE> out_mag_fft(width, height);
+
+  // write fft of in to fftResult
+  float *fftResult = (float *)fft(in);
+  // create magnitude from fft
+  fftToMagnitude(fftResult, out_mag_fft);
+  ifft(fftResult, out_fft);
+
+  float *dctResult = (float *)dct(in);
+  // create magnitude from dct
+  dctToMagnitude(dctResult, out_mag_dct);
+  idct(dctResult, out_dct);
+
+  // get pointer to result data
+  TYPE *output_dct = out_dct.data();
+  TYPE *output_fft = out_fft.data();
+  TYPE *output_mag_dct = out_mag_dct.data();
+  TYPE *output_mag_fft = out_mag_fft.data();
+
+  save_data(width, height, 1, input, "input.jpg");
+  save_data(width, height, 1, output_dct, "output_dct.jpg");
+  save_data(width, height, 1, output_fft, "output_fft.jpg");
+  save_data(width, height, 1, output_mag_dct, "output_mag_dct.jpg");
+  save_data(width, height, 1, output_mag_fft, "output_mag_fft.jpg");
+
+
+  // manual transforms
+
+  TYPE2 test[4][4] = {{1,2,3,4},{5,6,7,8},{9,10,11,12},{13,14,15,16}};
+  TYPE2 fft[4][(4 / 2 + 1) * 2];
+  TYPE2 dct[4][4];
+
+  TYPE2 *test_d;
+  TYPE2 *fft_d;
+  TYPE2 *dct_d;
+
+  /*cudaMalloc((void**)&test_d, sizeof(TYPE2) * 4*4);
+  cudaMalloc((void**)&fft_d, sizeof(TYPE2) * 4*(4 / 2 + 1) * 2);
+  cudaMalloc((void**)&dct_d, sizeof(TYPE2) * 4*4);
+  cudaMemcpy(test_d, test, sizeof(TYPE2) * 4*4, cudaMemcpyHostToDevice);
+  cudaMemcpy(fft_d, fft, sizeof(TYPE2) * 4*(4 / 2 + 1) * 2,
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(dct_d, dct, sizeof(TYPE2) * 4*4, cudaMemcpyHostToDevice);
+
+  fft_transform_device(test_d, 4, 4, fft_d);
+
+  cudaMemcpy(fft, fft_d, sizeof(TYPE2) * 4*(4 / 2 + 1) * 2,
+             cudaMemcpyDeviceToHost);
+
+  for (int y = 0; y < 4; y++) {
+    for (int x = 0; x < (4 / 2 + 1) * 2; x++) {
+      std::cout << fft[y][x] << "\t";
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+
+  dct_transform_device((TYPE2*)test_d, 4, 4, (TYPE2*)dct_d);
+  dct_transform_device((TYPE2*)dct_d, 4, 4, (TYPE2*)test_d, false);
+
+  //cudaMemcpy(dct, dct_d, sizeof(TYPE2) * 4*4, cudaMemcpyDeviceToHost);
+  cudaMemcpy(dct, test_d, sizeof(TYPE2) * 4*4, cudaMemcpyDeviceToHost);*/
+
+  // note: dct is only filled by the device code that is commented out above
+  for (int y = 0; y < 4; y++) {
+    for (int x = 0; x < 4; x++) {
+      std::cout << dct[y][x] << "\t";
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+
+  // free memory
+  delete[] input;
+
+  return EXIT_SUCCESS;
+}
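Note on the manual transform above: the row size (4 / 2 + 1) * 2 comes from real-to-complex FFT storage. A real W x H input has a Hermitian-symmetric spectrum, so only W/2 + 1 complex bins per row need to be kept, i.e. 2 * (W/2 + 1) floats. A tiny self-contained sketch of that arithmetic:

    // Why a W x H real-to-complex FFT needs H x (W/2 + 1) complex values:
    // for W = 4, 4/2 + 1 = 3 complex bins = 6 floats per row, matching the
    // (4 / 2 + 1) * 2 column count used in the app above.
    #include <cstdio>

    int main() {
      const int W = 4, H = 4;
      const int complex_per_row = W / 2 + 1;           // X[k] = conj(X[W-k])
      const int floats_per_row  = 2 * complex_per_row; // interleaved (re, im)
      std::printf("%dx%d real image -> %dx%d complex bins (%d floats/row)\n",
                  W, H, H, complex_per_row, floats_per_row);
    }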
diff --git a/apps/7_FFT/Gaussian_Blur/src/main.cpp b/apps/7_FFT/Gaussian_Blur/src/main.cpp
new file mode 100644
index 00000000..d7e42520
--- /dev/null
+++ b/apps/7_FFT/Gaussian_Blur/src/main.cpp
@@ -0,0 +1,226 @@
+//
+// Copyright (c) 2012, University of Erlangen-Nuremberg
+// Copyright (c) 2012, Siemens AG
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#include "hipacc.hpp"
+
+#include <algorithm>
+#include <iostream>
+
+#define SIZE_X 7
+#define SIZE_Y 7
+
+#define WIDTH 4032
+#define HEIGHT 3024
+#define IMAGE "../../common/img/fuerte_ship.jpg"
+/*
+#define WIDTH 700
+#define HEIGHT 445
+#define IMAGE "../../common/img/halftone-printing-process.jpg"
+*/
+#define TYPE uchar
+
+using namespace hipacc;
+using namespace hipacc::math;
+
+// Gaussian blur filter in Hipacc
+class GaussianBlur : public Kernel<TYPE> {
+private:
+  Accessor<TYPE> &input;
+  Mask<float> &mask;
+
+public:
+  GaussianBlur(IterationSpace<TYPE> &iter, Accessor<TYPE> &input,
+               Mask<float> &mask)
+      : Kernel(iter), input(input), mask(mask) {
+    add_accessor(&input);
+  }
+
+  void kernel() {
+    output() = (TYPE)(convolve(mask, Reduce::SUM,
+                               [&]() -> float { return mask() * input(mask); }) +
+                      0.0f);
+  }
+};
+
+// forward declaration of reference implementation
+template <typename T>
+void gaussian_filter(T *in, T *out, float *filter, int size_x, int size_y,
+                     int width, int height);
+
+// alternative compare to analyse precision and errors
+template <typename T>
+void compare(const T *cmp1, const T *cmp2, const unsigned int width,
+             const unsigned int height, const unsigned int border_x = 0,
+             const unsigned int border_y = 0) {
+  int count = 0;
+  double max = 0.0;
+  double sum = 0.0;
+  for (int i = border_y; i < height - border_y; i++) {
+    for (int j = border_x; j < width - border_x; j++) {
+      double diff = (double)std::abs(cmp1[i * width + j] - cmp2[i * width + j]);
+      if (diff > 0.0) {
+        count++;
+        sum += diff;
+        max = std::max(diff, max);
+      }
+    }
+  }
+  double avg = sum / ((width - border_x * 2) * (height - border_y * 2));
+  std::cout << "different pixels:" << count << " avg error:" << avg
+            << " max error:" << max << std::endl;
+}
+
+/*************************************************************************
+ * Main function                                                         *
+ *************************************************************************/
+int main(int argc, const char **argv) {
+  const int width = WIDTH;
+  const int height = HEIGHT;
+  const int size_x = SIZE_X;
+  const int size_y = SIZE_Y;
+  const int offset_x = size_x >> 1;
+  const int offset_y = size_y >> 1;
+  float timing = 0;
+
+  // only filter kernel sizes 3x3, 5x5, and 7x7 implemented
+  if (size_x != size_y || !(size_x == 3 || size_x == 5 || size_x == 7)) {
+    std::cerr << "Wrong filter kernel size. "
+              << "Currently supported values: 3x3, 5x5, and 7x7!" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  // convolution filter mask
+  const float coef[SIZE_Y][SIZE_X] = {
+#if SIZE_X == 3
+      {0.057118f, 0.124758f, 0.057118f},
+      {0.124758f, 0.272496f, 0.124758f},
+      {0.057118f, 0.124758f, 0.057118f}
+#endif
+#if SIZE_X == 5
+      {0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f},
+      {0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f},
+      {0.026151f, 0.090339f, 0.136565f, 0.090339f, 0.026151f},
+      {0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f},
+      {0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f}
+#endif
+#if SIZE_X == 7
+      {0.000841f, 0.003010f, 0.006471f, 0.008351f, 0.006471f, 0.003010f, 0.000841f},
+      {0.003010f, 0.010778f, 0.023169f, 0.029902f, 0.023169f, 0.010778f, 0.003010f},
+      {0.006471f, 0.023169f, 0.049806f, 0.064280f, 0.049806f, 0.023169f, 0.006471f},
+      {0.008351f, 0.029902f, 0.064280f, 0.082959f, 0.064280f, 0.029902f, 0.008351f},
+      {0.006471f, 0.023169f, 0.049806f, 0.064280f, 0.049806f, 0.023169f, 0.006471f},
+      {0.003010f, 0.010778f, 0.023169f, 0.029902f, 0.023169f, 0.010778f, 0.003010f},
+      {0.000841f, 0.003010f, 0.006471f, 0.008351f, 0.006471f, 0.003010f, 0.000841f}
+#endif
+  };
+
+  // host memory for image of width x height pixels
+  TYPE *input = load_data<TYPE>(width, height, 1, IMAGE);
+  TYPE *ref_out = new TYPE[width * height];
+
+  std::cout << "Calculating Hipacc Gaussian filter ..." << std::endl;
+
+  //************************************************************************//
+
+  // input and output image of width x height pixels
+  Image<TYPE> in(width, height, input);
+  Image<TYPE> out(width, height);
+
+  // define Mask for Gaussian filter
+  Mask<float> mask(coef);
+
+  BoundaryCondition<TYPE> bound(in, mask, Boundary::CONSTANT, 0);
+  Accessor<TYPE> acc(bound);
+
+  IterationSpace<TYPE> iter(out);
+  GaussianBlur filter(iter, acc, mask);
+
+  Image<TYPE> outFFT(width, height);
+  IterationSpace<TYPE> iterFFT(outFFT);
+  GaussianBlur filterFFT(iterFFT, acc, mask);
+
+  filter.execute();
+  timing = hipacc_last_kernel_timing();
+
+  filterFFT.convolveFFT();
+
+  // get pointer to result data
+  TYPE *output = out.data();
+  TYPE *outputFFT = outFFT.data();
+
+  //************************************************************************//
+
+  std::cout << "Hipacc: " << timing << " ms, " << (width * height / timing) / 1000
+            << " Mpixel/s" << std::endl;
+
+  std::cout << "Calculating reference ..."
+            << std::endl;
+  double start = time_ms();
+  gaussian_filter(input, ref_out, (float *)coef, size_x, size_y, width, height);
+  double end = time_ms();
+  std::cout << "Reference: " << end - start << " ms, "
+            << (width * height / (end - start)) / 1000 << " Mpixel/s"
+            << std::endl;
+
+  // compare hipacc to reference
+  compare(output, ref_out, width, height, offset_x, offset_y);
+  // compare hipacc to fft convolution
+  compare(outputFFT, output, width, height /*, offset_x, offset_y*/);
+
+  save_data(width, height, 1, input, "input.jpg");
+  save_data(width, height, 1, output, "output.jpg");
+  save_data(width, height, 1, outputFFT, "outputFFT.jpg");
+  show_data(width, height, 1, output, "output.jpg");
+
+  // free memory
+  delete[] input;
+  delete[] ref_out;
+
+  return EXIT_SUCCESS;
+}
+
+// Gaussian blur filter reference
+template <typename T>
+void gaussian_filter(T *in, T *out, float *filter, int size_x, int size_y,
+                     int width, int height) {
+  int anchor_x = size_x >> 1;
+  int anchor_y = size_y >> 1;
+  int upper_x = width - anchor_x;
+  int upper_y = height - anchor_y;
+
+  for (int y = anchor_y; y < upper_y; ++y) {
+    for (int x = anchor_x; x < upper_x; ++x) {
+      float sum = 0.0f;
+
+      for (int yf = -anchor_y; yf <= anchor_y; ++yf) {
+        for (int xf = -anchor_x; xf <= anchor_x; ++xf) {
+          sum += filter[(yf + anchor_y) * size_x + xf + anchor_x] *
+                 in[(y + yf) * width + x + xf];
+        }
+      }
+      out[y * width + x] = (T)(sum);
+    }
+  }
+}
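Note on convolveFFT() above: FFT-based convolution exploits the convolution theorem, out = IFFT(FFT(image) * FFT(kernel)). A minimal FFTW sketch of the idea follows; this is not the code Hipacc generates, it assumes the filter mask has already been zero-padded to the image size in 'ker', and it implies circular boundary handling (one reason the compare() against the spatial result is expected to differ near the borders):

    #include <fftw3.h>

    void fft_convolve(float *img, float *ker, float *out, int h, int w) {
      int wc = w / 2 + 1; // r2c output width (Hermitian symmetry)
      fftwf_complex *IMG = fftwf_alloc_complex(h * wc);
      fftwf_complex *KER = fftwf_alloc_complex(h * wc);
      fftwf_plan p1 = fftwf_plan_dft_r2c_2d(h, w, img, IMG, FFTW_ESTIMATE);
      fftwf_plan p2 = fftwf_plan_dft_r2c_2d(h, w, ker, KER, FFTW_ESTIMATE);
      fftwf_execute(p1);
      fftwf_execute(p2);
      for (int i = 0; i < h * wc; ++i) { // pointwise complex multiply
        float re = IMG[i][0] * KER[i][0] - IMG[i][1] * KER[i][1];
        float im = IMG[i][0] * KER[i][1] + IMG[i][1] * KER[i][0];
        IMG[i][0] = re / (w * h); // fold in FFTW's 1/(w*h) normalization
        IMG[i][1] = im / (w * h);
      }
      fftwf_plan p3 = fftwf_plan_dft_c2r_2d(h, w, IMG, out, FFTW_ESTIMATE);
      fftwf_execute(p3);
      fftwf_destroy_plan(p1); fftwf_destroy_plan(p2); fftwf_destroy_plan(p3);
      fftwf_free(IMG); fftwf_free(KER);
    }

The two FFTs of the image and kernel plus one inverse FFT cost O(WH log WH) independent of the mask size, which is why the Makefile's -use-fft fast mode can pay off for large masks.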
diff --git a/apps/7_FFT/Halftone/src/main.cpp b/apps/7_FFT/Halftone/src/main.cpp
new file mode 100644
index 00000000..91c43edd
--- /dev/null
+++ b/apps/7_FFT/Halftone/src/main.cpp
@@ -0,0 +1,252 @@
+//
+// Copyright (c) 2012, University of Erlangen-Nuremberg
+// Copyright (c) 2012, Siemens AG
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#include "hipacc.hpp"
+
+#include <iostream>
+#include <cstdlib>
+
+#define SIZE_X 7
+#define SIZE_Y 7
+/*
+#define WIDTH 700
+#define HEIGHT 445
+#define IMAGE "../../common/img/halftone-printing-process.jpg"
+*/
+#define WIDTH 600
+#define HEIGHT 450
+#define IMAGE "../../common/img/halftone-face.jpg"
+
+#define TYPE uchar
+
+using namespace hipacc;
+using namespace hipacc::math;
+
+// Kernel description in Hipacc
+class Dilate : public Kernel<TYPE> {
+private:
+  Accessor<TYPE> &in;
+  Domain &dom;
+
+public:
+  Dilate(IterationSpace<TYPE> &iter, Accessor<TYPE> &in, Domain &dom)
+      : Kernel(iter), in(in), dom(dom) {
+    add_accessor(&in);
+  }
+
+  void kernel() {
+    output() = reduce(dom, Reduce::MIN, [&]() -> TYPE { return in(dom); });
+  }
+};
+
+class ImproveFilter : public Kernel<TYPE> {
+private:
+  Accessor<TYPE> &in;
+  Domain &dom;
+  int size_x, size_y;
+  Mask<float> &mask;
+
+public:
+  ImproveFilter(IterationSpace<TYPE> &iter, Accessor<TYPE> &in, Domain &dom,
+                int size_x, int size_y, Mask<float> &mask)
+      : Kernel(iter), in(in), dom(dom), size_x(size_x), size_y(size_y),
+        mask(mask) {
+    add_accessor(&in);
+  }
+
+  void kernel() {
+    float w_avg =
+        convolve(mask, Reduce::SUM, [&]() -> float { return mask() * in(mask); });
+    float avg = reduce(dom, Reduce::SUM, [&]() -> float { return in(dom); }) /
+                (TYPE)(size_x * size_y);
+    float max = reduce(dom, Reduce::MAX, [&]() -> TYPE { return in(dom); });
+    output() = ((w_avg - avg) > 1) || (max > (avg / 2 + 40)) ? 0 : 255;
+  }
+};
+
+// Gaussian blur filter in Hipacc
+class GaussianBlur : public Kernel<TYPE> {
+private:
+  Accessor<TYPE> &input;
+  Mask<float> &mask;
+
+public:
+  GaussianBlur(IterationSpace<TYPE> &iter, Accessor<TYPE> &input,
+               Mask<float> &mask)
+      : Kernel(iter), input(input), mask(mask) {
+    add_accessor(&input);
+  }
+
+  void kernel() {
+    output() = (TYPE)(convolve(mask, Reduce::SUM,
+                               [&]() -> TYPE { return mask() * input(mask); }) +
+                      0.0f);
+  }
+};
+
+/*************************************************************************
+ * Main function                                                         *
+ *************************************************************************/
+int main(int argc, const char **argv) {
+  const int width = WIDTH;
+  const int height = HEIGHT;
+  const int size_x = SIZE_X;
+  const int size_y = SIZE_Y;
+
+  // only filter kernel sizes 3x3, 5x5, and 7x7 implemented
+  if (size_x != size_y || !(size_x == 3 || size_x == 5 || size_x == 7)) {
+    std::cerr << "Wrong filter kernel size. "
+              << "Currently supported values: 3x3, 5x5, and 7x7!"
+              << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  // convolution filter mask
+  const float coef3[3][3] = {{0.057118f, 0.124758f, 0.057118f},
+                             {0.124758f, 0.272496f, 0.124758f},
+                             {0.057118f, 0.124758f, 0.057118f}};
+
+  const float coef5[5][5] = {
+      {0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f},
+      {0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f},
+      {0.026151f, 0.090339f, 0.136565f, 0.090339f, 0.026151f},
+      {0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f},
+      {0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f}};
+
+  const float coef7[7][7] = {{0.000841f, 0.003010f, 0.006471f, 0.008351f,
+                              0.006471f, 0.003010f, 0.000841f},
+                             {0.003010f, 0.010778f, 0.023169f, 0.029902f,
+                              0.023169f, 0.010778f, 0.003010f},
+                             {0.006471f, 0.023169f, 0.049806f, 0.064280f,
+                              0.049806f, 0.023169f, 0.006471f},
+                             {0.008351f, 0.029902f, 0.064280f, 0.082959f,
+                              0.064280f, 0.029902f, 0.008351f},
+                             {0.006471f, 0.023169f, 0.049806f, 0.064280f,
+                              0.049806f, 0.023169f, 0.006471f},
+                             {0.003010f, 0.010778f, 0.023169f, 0.029902f,
+                              0.023169f, 0.010778f, 0.003010f},
+                             {0.000841f, 0.003010f, 0.006471f, 0.008351f,
+                              0.006471f, 0.003010f, 0.000841f}};
+
+  // host memory for image of width x height pixels
+  TYPE *input = load_data<TYPE>(width, height, 1, IMAGE);
+
+  // std::cout << "Calculating Hipacc Gaussian filter ..." << std::endl;
+
+  //************************************************************************//
+
+  // input and output image of width x height pixels
+  Image<TYPE> in(width, height, input);
+  Image<TYPE> out(width, height);
+
+  // blur masks
+  Mask<float> mask3(coef3);
+  Mask<float> mask5(coef5);
+  Mask<float> mask7(coef7);
+
+  // images
+  Image<TYPE> mag(width, height);
+  Image<TYPE> mag_blur(width, height);
+  Image<TYPE> mag_out(width, height);
+  Image<TYPE> mag_out2(width, height);
+  Image<TYPE> mag_out3(width, height);
+
+  // kernel domains
+  Domain domImp(SIZE_X, SIZE_Y);
+  Domain domDil(5, 5);
+
+  // declare all acessors and iterationSpaces for images
+  BoundaryCondition<TYPE> bound_mag(mag, mask3, Boundary::CLAMP);
+  Accessor<TYPE> acc_mag(bound_mag);
+  IterationSpace<TYPE> iter_mag(mag);
+
+  BoundaryCondition<TYPE> bound_mag_out(mag_out, domDil, Boundary::CLAMP);
+  Accessor<TYPE> acc_mag_out(bound_mag_out);
+  IterationSpace<TYPE> iter_mag_out(mag_out);
+
+  BoundaryCondition<TYPE> bound_mag_out2(mag_out2, mask5, Boundary::CLAMP);
+  Accessor<TYPE> acc_mag_out2(bound_mag_out2);
+  IterationSpace<TYPE> iter_mag_out2(mag_out2);
+
+  BoundaryCondition<TYPE> bound_mag_out3(mag_out3, domImp, Boundary::CLAMP);
+  Accessor<TYPE> acc_mag_out3(bound_mag_out3);
+  IterationSpace<TYPE> iter_mag_out3(mag_out3);
+
+  BoundaryCondition<TYPE> bound_mag_blur(mag_blur, domImp, Boundary::CLAMP);
+  Accessor<TYPE> acc_mag_blur(bound_mag_blur);
+  IterationSpace<TYPE> iter_mag_blur(mag_blur);
+
+  // write fft of in to fftResult
+  float *fftResult = (float *)fft(in);
+
+  // create magnitude from fft
+  fftToMagnitude(fftResult, mag);
+
+  // blur magnitude
+  Domain domBlur(7, 7);
+  GaussianBlur blur(iter_mag_blur, acc_mag, mask3);
+  blur.execute();
+
+  // apply ImproveFilter
+  ImproveFilter improve(iter_mag_out, acc_mag_blur, domImp, SIZE_X, SIZE_Y,
+                        mask7);
+  improve.execute();
+  Dilate dilate(iter_mag_out2, acc_mag_out, domDil);
+  dilate.execute();
+
+  // blur magnitude mask to prevent ringing effect
+  GaussianBlur blur_mask(iter_mag_out3, acc_mag_out2, mask5);
+  blur_mask.execute();
+
+  // apply mask to fftResult
+  fftResetMask(mag_out3, min(width, height) / 20, true, 10);
+  fftApplyPassFilter(mag_out3, min(width, height) * 0.25, true, 100);
+  fftScaleMagnitude(fftResult, mag_out3);
+
+  // visualize resulting magnitude
+  Image<TYPE> back(width, height);
+  fftToMagnitude(fftResult, back);
+
+  // get result image from inverse fft
+  ifft(fftResult, out);
+
+  // get pointer to result data
+  TYPE *output = out.data();
+  TYPE *Mag = mag.data();
+  TYPE *Mag_out = mag_out3.data();
+  TYPE *Back = back.data();
+
+  save_data(width, height, 1, input, "input.jpg");
+  save_data(width, height, 1, output, "output.jpg");
+  save_data(width, height, 1, Mag, "Mag.png");
+  save_data(width, height, 1, Mag_out, "Mag_mask.png");
+  save_data(width, height, 1, Back, "Mag_new.png");
+  show_data(width, height, 1, output, "output.jpg");
+
+  // free memory
+  delete[] input;
+
+  return EXIT_SUCCESS;
+}
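Note on the halftone app above: the detected halftone dots are removed by attenuating the corresponding spots in the spectrum, combined with a pass filter. The core of a hard low-pass on a centered (fftshift-ed) spectrum is easy to state standalone; the sketch below shows only that core idea (the Hipacc helpers such as fftApplyPassFilter additionally support a smoothing window, which this sketch omits):

    #include <cmath>

    // Keep frequencies within 'radius' of the spectrum center, zero the rest.
    void low_pass(float *spec, int width, int height, int radius) {
      const float cx = width / 2.0f, cy = height / 2.0f;
      for (int y = 0; y < height; ++y)
        for (int x = 0; x < width; ++x) {
          float d = std::hypot(x - cx, y - cy);
          if (d > radius)
            spec[y * width + x] = 0.0f; // suppress high frequencies
        }
    }

A hard cutoff like this causes ringing in the spatial domain, which is why the app blurs the magnitude mask (blur_mask) and passes a window parameter to the pass filter.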
diff --git a/apps/7_FFT/Image_Manipulation/src/main.cpp b/apps/7_FFT/Image_Manipulation/src/main.cpp
new file mode 100644
index 00000000..aeaa181b
--- /dev/null
+++ b/apps/7_FFT/Image_Manipulation/src/main.cpp
@@ -0,0 +1,285 @@
+//
+// Copyright (c) 2012, University of Erlangen-Nuremberg
+// Copyright (c) 2012, Siemens AG
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#include "hipacc.hpp"
+
+#include <fstream>
+#include <png.h> // for png image size
+#include <iostream>
+#include <string>
+#include <vector>
+
+#define WIDTH 4032
+#define HEIGHT 3024
+#define IMAGE "../../common/img/fuerte_ship.png"
+
+
+using namespace hipacc;
+using namespace hipacc::math;
+
+
+// filter in Hipacc
+/*class LBP : public Kernel<uchar> {
+  private:
+    Accessor<uchar> &input;
+    Mask<int> &mask;
+
+  public:
+    LBP(IterationSpace<uchar> &iter, Accessor<uchar> &input,
+        Mask<int> &mask)
+          : Kernel(iter), input(input), mask(mask) {
+      add_accessor(&input);
+    }
+
+    void kernel() {
+      int result = 0;
+      for (int y = -1; y <= 1; ++y) {
+        for (int x = -1; x <= 1; ++x) {
+          result += mask(x, y) * (input(x, y) > input(0, 0) ? 1 : 0);
+        }
+      }
+      output() = (uchar) result;
+      output() = (uchar)(convolve(mask, Reduce::SUM, [&] () -> float {
+          return mask() * (input(mask) > input(0, 0) ? 1 : 0);
+        }));
+    }
+};*/
+
+class LBP : public Kernel<uchar> {
+  private:
+    Accessor<uchar> &input;
+    Mask<int> &mask;
+
+  public:
+    LBP(IterationSpace<uchar> &iter, Accessor<uchar> &input,
+        Mask<int> &mask)
+          : Kernel(iter), input(input), mask(mask) {
+      add_accessor(&input);
+    }
+
+    void kernel() {
+      output() = (uchar)(convolve(mask, Reduce::SUM, [&] () -> float {
+          return mask() * (input(mask) > input(0, 0) ? 1 : 0);
+        }));
+    }
+};
+
+/*
+class LTP : public Kernel<uchar4> {
+  private:
+    Accessor<uchar> &input;
+    Mask<int> &mask;
+
+  public:
+    LTP(IterationSpace<uchar4> &iter, Accessor<uchar> &input,
+        Mask<int> &mask)
+          : Kernel(iter), input(input), mask(mask) {
+      add_accessor(&input);
+    }
+
+    void kernel() {
+      int ltpu = 0;
+      int ltpl = 0;
+      const int th = 10;
+      for (int y = -1; y <= 1; ++y) {
+        for (int x = -1; x <= 1; ++x) {
+          if (input(x, y) >= input(0, 0) + th) {
+            ltpu += mask(x, y);
+          } else if (input(x, y) <= input(0, 0) - th) {
+            ltpl += mask(x, y);
+          }
+        }
+      }
+      uchar4 result = (uchar4){(uchar)ltpu, (uchar)ltpl, 0, 0};
+      output() = result;
+    }
+};
+*/
+
+// Kernel description in Hipacc
+class ColorConversionY : public Kernel<uchar> {
+  private:
+    Accessor<uchar4> &in;
+
+  public:
+    ColorConversionY(IterationSpace<uchar> &iter, Accessor<uchar4> &acc)
+          : Kernel(iter), in(acc) {
+      add_accessor(&in);
+    }
+
+    void kernel() {
+      uchar4 pixel = in();
+      output() = .299f*pixel.x + .587f*pixel.y + .114f*pixel.z;
+    }
+};
+
+// Kernel description in Hipacc
+class ColorConversionCb : public Kernel<uchar> {
+  private:
+    Accessor<uchar4> &in;
+
+  public:
+    ColorConversionCb(IterationSpace<uchar> &iter, Accessor<uchar4> &acc)
+          : Kernel(iter), in(acc) {
+      add_accessor(&in);
+    }
+
+    void kernel() {
+      uchar4 pixel = in();
+      output() = -.168736f*pixel.x + -.331264f*pixel.y + .5f*pixel.z + 128;
+    }
+};
+
+// Kernel description in Hipacc
+class ColorConversionCr : public Kernel<uchar> {
+  private:
+    Accessor<uchar4> &in;
+
+  public:
+    ColorConversionCr(IterationSpace<uchar> &iter, Accessor<uchar4> &acc)
+          : Kernel(iter), in(acc) {
+      add_accessor(&in);
+    }
+
+    void kernel() {
+      uchar4 pixel = in();
+      output() = .5f*pixel.x + .418688f*pixel.y + -.081312f*pixel.z + 128;
+    }
+};
+
+
+/*************************************************************************
+ * Main function                                                         *
+ *************************************************************************/
+int main(int argc, const char **argv) {
+  std::string imagePath = IMAGE;
+  // get width and height for png format
+  const int width = WIDTH;
+  const int height = HEIGHT;
+
+  float timing = 0;
+
+  // convolution filter mask
+  const int weights[3][3] = {
+      { 32, 64, 128 },
+      { 16,  0,   1 },
+      {  8,  4,   2 }
+  };
+
+  std::cout << "Calculating color Conversion filter ..."
+            << std::endl;
+
+  //************************************************************************//
+
+
+  // ColorConversion
+
+  // host memory for image of width x height pixels
+  uchar4 *input = (uchar4*)load_data<uchar>(width, height, 4, IMAGE);
+  float *inputFloat = (float*)load_data<float>(width, height, 1, IMAGE);
+
+
+
+
+  // input and output image of width x height pixels
+  Image<uchar4> in(width, height, input);
+  Image<uchar> outY(width, height);
+  Image<uchar> outCb(width, height);
+  Image<uchar> outCr(width, height);
+
+  Accessor<uchar4> acc(in);
+
+  IterationSpace<uchar> iterY(outY);
+  ColorConversionY filterY(iterY, acc);
+
+  IterationSpace<uchar> iterCb(outCb);
+  ColorConversionCb filterCb(iterCb, acc);
+
+  IterationSpace<uchar> iterCr(outCr);
+  ColorConversionCr filterCr(iterCr, acc);
+
+  filterY.execute();
+  filterCb.execute();
+  filterCr.execute();
+
+  uchar *outputY = outY.data();
+  uchar *outputCb = outCb.data();
+  uchar *outputCr = outCr.data();
+
+  save_data(width, height, 1, outputY, "outputY.png");
+  save_data(width, height, 1, outputCb, "outputCb.png");
+  save_data(width, height, 1, outputCr, "outputCr.png");
+
+
+  // input and output image of width x height pixels
+  Image<uchar> inLP(width, height, outputCr);
+  Image<uchar> outLBP(width, height);
+  //Image<uchar4> outLTP(width, height);
+
+  // define Mask for filter
+  Mask<int> mask(weights);
+
+  BoundaryCondition<uchar> bound(inLP, mask, Boundary::CLAMP);
+  Accessor<uchar> acc2(bound);
+
+  IterationSpace<uchar> iterLBP(outLBP);
+  LBP filterLBP(iterLBP, acc2, mask);
+
+  std::cout << "Calculating LBP ..." << std::endl;
+  filterLBP.execute();
+
+  //IterationSpace<uchar4> iterLTP(outLTP);
+  //LTP filterLTP(iterLTP, acc2, mask);
+
+  //std::cout << "Calculating LTP ..." << std::endl;
+  //filterLTP.execute();
+
+  timing = hipacc_last_kernel_timing();
+
+  Image<uchar> outDCT(width, height);
+
+  float *dctResult = (float *)dct(outLBP);
+  dctToMagnitude(dctResult, outDCT);
+
+
+  // get pointer to result data
+  uchar *output = outDCT.data();
+
+  //************************************************************************//
+
+  std::cout << "Hipacc: " << timing << " ms, "
+            << (width*height/timing)/1000 << " Mpixel/s" << std::endl;
+
+  save_data(width, height, 1, output, "outputDCT.png");
+  //save_data(width, height, 4, outputLTP, "outputLTP.png");
+
+
+
+  // free memory
+  delete[] input;
+
+  return EXIT_SUCCESS;
+}
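Note on the LBP kernel above: the weights matrix assigns each of the 8 neighbors a distinct power of two (32 64 128 / 16 - 1 / 8 4 2), so the "neighbor brighter than center" comparisons pack into a single byte. A self-contained worked example on one 3x3 patch (values are made up for illustration):

    #include <cstdio>

    int main() {
      const int w[3][3]     = {{32, 64, 128}, {16, 0, 1}, {8, 4, 2}};
      const int patch[3][3] = {{10, 90, 20}, {80, 50, 60}, {30, 40, 70}};
      int code = 0;
      for (int y = 0; y < 3; ++y)
        for (int x = 0; x < 3; ++x)
          code += w[y][x] * (patch[y][x] > patch[1][1] ? 1 : 0);
      std::printf("LBP code: %d\n", code); // 64 + 16 + 1 + 2 = 83
    }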
diff --git a/apps/7_FFT/Makefile b/apps/7_FFT/Makefile
new file mode 100644
index 00000000..0353920e
--- /dev/null
+++ b/apps/7_FFT/Makefile
@@ -0,0 +1,167 @@
+# Search binaries
+CXX        := $(shell which c++)
+NVCC       := $(shell which nvcc)
+ADB        := $(shell which adb)
+NDK_BUILD  := $(shell which ndk-build)
+PKG_CONFIG := $(shell which pkg-config)
+
+################################################################################
+
+# Search Hipacc (prefer relative path over package path over PATH env var)
+HIPACC := $(shell if [ -f $(CURDIR)/../../../bin/hipacc ]; then \
+                      echo $(CURDIR)/../../../bin/hipacc; \
+                  elif [ -f /bin/hipacc ]; then \
+                      echo /bin/hipacc; \
+                  else \
+                      echo $$(which hipacc); \
+                  fi)
+ifndef HIPACC
+  $(error "Error: Could not find binary 'hipacc' in PATH")
+endif
+
+################################################################################
+
+# Set paths for build tools
+HIPACC_PATH := $(shell dirname $(HIPACC))/..
+ifdef NVCC
+CUDA_PATH := $(shell dirname $(NVCC))/..
+endif
+
+################################################################################
+
+# Set compiler options
+OS := $(shell uname)
+CXX_FLAGS = -std=c++11 -O2
+CXX_INCLUDE = -I ../../common \
+              -I $(HIPACC_PATH)/include
+
+# standalone runtime disabled by default
+#CXX_LIB_DIR = -L $(HIPACC_PATH)/lib
+#CXX_LINK = -lhipaccRuntime
+
+CXX_LINK = -lfftw3_omp -lfftw3 -lfftw3f_omp -lfftw3f -lm
+CXX_LINK_EXCL = -fopenmp
+
+OCL_INCLUDE = $(CXX_INCLUDE)
+OCL_LIB_DIR = $(CXX_LIB_DIR)
+ifeq ($(OS),Darwin)
+OCL_LINK = $(CXX_LINK) -framework OpenCL
+else
+OCL_LINK = $(CXX_LINK) -lOpenCL
+endif
+
+ifdef NVCC
+NVCC_FLAGS = $(CXX_FLAGS) -x cu
+NVCC_INCLUDE = $(CXX_INCLUDE)
+NVCC_LIB_DIR = $(CXX_LIB_DIR)
+ifeq ($(OS),Darwin)
+NVCC_LINK = -Xlinker $(CXX_LINK) -Xlinker -lnvrtc -Xlinker -framework,CUDA
+else
+NVCC_LINK = $(CXX_LINK) -lcuda -lcudart -lcufft -lnvrtc -lnvidia-ml
+OCL_INCLUDE += -I $(CUDA_PATH)/include
+OCL_LIB_DIR += -L $(CUDA_PATH)/lib64
+endif
+endif
+
+HIPACC_FLAGS = -std=c++11 -nostdinc++
+ifeq ($(OS),Darwin)
+HIPACC_FLAGS += -isystem `xcrun --sdk macosx --show-sdk-path`/usr/include
+endif
+DSL_INCLUDE = -I ../../common \
+              -I $(HIPACC_PATH)/include/dsl
+HIPACC_INCLUDE = $(DSL_INCLUDE) \
+                 -I $(HIPACC_PATH)/include/c++/v1 \
+                 -I $(HIPACC_PATH)/include/clang
+HIPACC_OPTS = -emit-$(1) \
+              $(shell if [ -f $(1).conf ]; then \
+                          cat $(1).conf | xargs; \
+                      elif [ -f ../../common/config/$(1).conf ]; then \
+                          cat ../../common/config/$(1).conf | xargs; \
+                      fi)
+
+ifdef PKG_CONFIG
+# Check for OpenCV
+OPENCV=$(shell $(PKG_CONFIG) --list-all | grep opencv | head -n1 | cut -d" " -f1)
+ifneq ($(OPENCV),)
+HIPACC_FLAGS += $(shell $(PKG_CONFIG) --cflags-only-other $(OPENCV)) -DUSE_OPENCV
+HIPACC_INCLUDE += $(shell $(PKG_CONFIG) --cflags-only-I $(OPENCV))
+CXX_FLAGS += $(shell $(PKG_CONFIG) --cflags-only-other $(OPENCV)) -DUSE_OPENCV
+CXX_INCLUDE += $(shell $(PKG_CONFIG) --cflags-only-I $(OPENCV))
+CXX_LIB_DIR += $(shell $(PKG_CONFIG) --libs-only-L $(OPENCV))
+CXX_LINK += $(shell $(PKG_CONFIG) --libs-only-l $(OPENCV)) -lpthread
+endif
+endif
+
+################################################################################
+
+# Turn on GNU Make feature for using automatic variables in dependencies
+.SECONDEXPANSION:
+.PHONY: cpu cuda opencl-acc opencl-cpu opencl-gpu renderscript filterscript
+.PRECIOUS: main_%.cc
+
+################################################################################
+
+# Target rules
+all: cpu
+
+# Run CPU/CUDA/OpenCL binary
+dsl cpu cuda opencl-acc opencl-cpu opencl-gpu: main_$$@
+	./main_$@
+
+# Run Renderscript/Filterscript binary on Android device
+renderscript filterscript: main_$$@
+ifdef ADB
+	$(ADB) shell mkdir -p /data/local/tmp
+	$(ADB) push main_$@ /data/local/tmp
+	$(ADB) shell /data/local/tmp/main_$@
+else
+	@echo "Abort: Could not find binary 'adb' in PATH"
+	@exit 1
+endif
+
+# Run Hipacc
+main_%.cc: src/main.cpp
+	$(HIPACC) $(HIPACC_FLAGS) -use-fft fast $(call HIPACC_OPTS,$*) $(HIPACC_INCLUDE) $(CURDIR)/$< -o $@
+
+# Build DSL
+main_dsl: src/main.cpp
+	$(CXX) $(CXX_FLAGS) $< $(DSL_INCLUDE) -o $@
+
+# Build CPU
+main_cpu: $$@.cc
+	$(CXX) $(CXX_FLAGS) $< $(CXX_INCLUDE) $(CXX_LIB_DIR) $(CXX_LINK_EXCL) $(CXX_LINK) -o $@
+
+# Build CUDA
+main_cuda: $$@.cc
+ifdef NVCC
+	$(NVCC) $(NVCC_FLAGS) $< $(NVCC_INCLUDE) $(NVCC_LIB_DIR) $(NVCC_LINK) -o $@
+else
+	@echo "Abort: Could not find binary 'nvcc' in PATH"
+	@exit 1
+endif
+
+# Build OpenCL-CPU or OpenCL-GPU
+main_opencl-%: $$@.cc
+	$(CXX) $(CXX_FLAGS) $< $(OCL_INCLUDE) $(OCL_LIB_DIR) $(OCL_LINK) -o $@
+
+# Build Renderscript or Filterscript
+main_%: $$@.cc
+	mkdir -p jni
+	cp ../../common/Android.mk jni/Android.mk
+	cp ../../common/Application.mk jni/Application.mk
+ifdef NDK_BUILD
+	export HIPACC_MAIN=$@.cc; \
+	export HIPACC_INCLUDE=$(HIPACC_PATH)/include; \
+	$(NDK_BUILD) -B
+	cp libs/armeabi-v7a/main_renderscript ./$@
+else
+	@echo "Abort: Could not find binary 'ndk-build' in PATH"
+	@exit 1
+endif
+
+clean:
+	rm -rf jni libs obj
+	rm -f *cc *.cu *.cubin *.rs *.fs *.jpg
+
+distclean: clean
+	rm -f main_* *.cl
diff --git a/apps/7_FFT/img/halftone-face.jpg b/apps/7_FFT/img/halftone-face.jpg
new file mode 100644
index 00000000..39969e26
Binary files /dev/null and b/apps/7_FFT/img/halftone-face.jpg differ
diff --git a/apps/7_FFT/img/halftone-printing-process.jpg b/apps/7_FFT/img/halftone-printing-process.jpg
new file mode 100644
index 00000000..9b15ea92
Binary files /dev/null and b/apps/7_FFT/img/halftone-printing-process.jpg differ
diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
new file mode 100644
index 00000000..6b2762a3
--- /dev/null
+++ b/apps/CMakeLists.txt
@@ -0,0 +1,19 @@
+file(GLOB SAMPLE_DIRS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/[0-9]*)
+foreach(SAMPLE_DIR IN LISTS SAMPLE_DIRS)
+  # install samples
+  install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR} DESTINATION samples COMPONENT samples)
+
+  # deploy build tool for samples
+  file(GLOB SAMPLES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR}/*)
+  foreach(SAMPLE IN LISTS SAMPLES)
+    if(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR}/${SAMPLE})
+      if(UNIX)
+        install(FILES ${CMAKE_BINARY_DIR}/samples/buildtools/unix/Makefile DESTINATION samples/${SAMPLE_DIR}/${SAMPLE} COMPONENT samples)
+      elseif(WIN32)
+        file(GLOB NMAKE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/buildtools/nmake/*)
+        file(GLOB VS2015_FILES ${CMAKE_CURRENT_SOURCE_DIR}/buildtools/vs2015/*)
+        install(FILES ${NMAKE_FILES} ${VS2015_FILES} DESTINATION samples/${SAMPLE_DIR}/${SAMPLE} COMPONENT samples)
+      endif()
+    endif()
+  endforeach()
+endforeach()
diff --git a/apps/readme.txt b/apps/readme.txt
new file mode 100644
index 00000000..5cf2dccd
--- /dev/null
+++ b/apps/readme.txt
@@ -0,0 +1,7 @@
+Application folder for new samples
+
+How to enable stream-based code generation for apps:
+1. Follow the standard getting started guide at https://hipacc-lang.org/install.html
+2. The apps should be copied automatically to the same directory as the samples
+3. Enable the Hipacc flag `-use-stream n` in cuda.conf
+4. Use `-use-stream 1` for single-stream and `-use-stream 2` for multi-stream
diff --git a/cmake/modules/FindFFTW.cmake b/cmake/modules/FindFFTW.cmake
new file mode 100644
index 00000000..dfdf8d07
--- /dev/null
+++ b/cmake/modules/FindFFTW.cmake
@@ -0,0 +1,20 @@
+# Find the FFTW includes and library
+#
+# Once done this will define
+#  FFTW_INCLUDE_DIRS - where to find FFTW include files
+#  FFTW_LIBRARIES    - where to find FFTW libs
+#  FFTW_FOUND        - True if FFTW is found
+
+find_path(FFTW_INCLUDE_DIR fftw3.h PATHS ${FFTW_INCLUDE_DIRS})
+find_library(FFTW_LIBRARY fftw3)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(FFTW DEFAULT_MSG FFTW_INCLUDE_DIR FFTW_LIBRARY)
+
+set(FFTW_INCLUDE_DIRS ${FFTW_INCLUDE_DIR})
+set(FFTW_LIBRARIES ${FFTW_LIBRARY})
+if(NOT FFTW_LIBRARIES)
+  set(FFTW_LIBRARIES "")
+endif()
+
+mark_as_advanced(FFTW_INCLUDE_DIR FFTW_LIBRARY)
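Note on `-use-stream 2` from the readme above: independent kernel launches (e.g. per pyramid level) are distributed over multiple CUDA streams so they can overlap on the device. The sketch below shows the plain CUDA-runtime host pattern this corresponds to; names and the use of async copies as stand-ins for the generated per-level kernels are illustrative, not the actual generated code:

    #include <cuda_runtime.h>

    // Process 'levels' independent buffers, alternating between two streams.
    void process_levels(float **dst, float **src, const size_t *bytes, int levels) {
      cudaStream_t streams[2];
      for (int s = 0; s < 2; ++s) cudaStreamCreate(&streams[s]);
      for (int l = 0; l < levels; ++l) {
        // work issued to different streams may execute concurrently
        cudaMemcpyAsync(dst[l], src[l], bytes[l], cudaMemcpyDeviceToDevice,
                        streams[l % 2]);
      }
      cudaDeviceSynchronize(); // join both streams before results are consumed
      for (int s = 0; s < 2; ++s) cudaStreamDestroy(streams[s]);
    }

With `-use-stream 1` everything goes through a single stream (launches are still asynchronous with respect to the host, but serialized on the device).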
diff --git a/compiler/hipacc.cpp b/compiler/hipacc.cpp
index d278f1f4..336f34a1 100644
--- a/compiler/hipacc.cpp
+++ b/compiler/hipacc.cpp
@@ -61,51 +61,81 @@ void printCopyright()
 
 void printUsage() {
-  llvm::errs() << "OVERVIEW: Hipacc - Heterogeneous Image Processing Acceleration framework\n\n"
-               << "USAGE: hipacc [options] <input>\n\n"
-               << "OPTIONS:\n\n"
-               << "  -emit-cpu               Emit C++ code\n"
-               << "  -emit-cuda              Emit CUDA code for GPU devices\n"
-               << "  -emit-opencl-acc        Emit OpenCL code for Accelerator devices\n"
-               << "  -emit-opencl-cpu        Emit OpenCL code for CPU devices\n"
-               << "  -emit-opencl-gpu        Emit OpenCL code for GPU devices\n"
-               << "  -emit-renderscript      Emit Renderscript code for Android\n"
-               << "  -emit-filterscript      Emit Filterscript code for Android\n"
-               << "  -emit-padding <n>       Emit CUDA/OpenCL/Renderscript image padding, using alignment of <n> bytes for GPU devices\n"
-               << "  -target <n>             Generate code for GPUs with code name <n>.\n"
-               << "                          Code names for CUDA/OpenCL on NVIDIA devices are:\n"
-               << "                              'Fermi-20' and 'Fermi-21' for Fermi architecture.\n"
-               << "                              'Kepler-30', 'Kepler-32', 'Kepler-35', and 'Kepler-37' for Kepler architecture.\n"
-               << "                              'Maxwell-50', 'Maxwell-52', and 'Maxwell-53' for Maxwell architecture.\n"
-               << "                          Code names for for OpenCL on AMD devices are:\n"
-               << "                              'Evergreen' for Evergreen architecture (Radeon HD5xxx).\n"
-               << "                              'NorthernIsland' for Northern Island architecture (Radeon HD6xxx).\n"
-               << "                          Code names for for OpenCL on ARM devices are:\n"
-               << "                              'Midgard' for Mali-T6xx' for Mali.\n"
-               << "                          Code names for for OpenCL on Intel Xeon Phi devices are:\n"
-               << "                              'KnightsCorner' for Knights Corner Many Integrated Cores architecture.\n"
-               << "  -explore-config         Emit code that explores all possible kernel configuration and print its performance\n"
-               << "  -use-config <nxm>       Emit code that uses a configuration of nxm threads, e.g. 128x1\n"
-               << "  -reduce-config <n:m>    Emit code that uses a multi-dimensional reduction configuration of\n"
-               << "                              n warps per block (affects block size and shared memory size)\n"
-               << "                              m partial histograms (affects number of blocks)\n"
-               << "  -time-kernels           Emit code that executes each kernel multiple times to get accurate timings\n"
-               << "  -use-textures <o>       Enable/disable usage of textures (cached) in CUDA/OpenCL to read/write image pixels - for GPU devices only\n"
-               << "                          Valid values for CUDA on NVIDIA devices: 'off', 'Linear1D', 'Linear2D', 'Array2D', and 'Ldg'\n"
-               << "                          Valid values for OpenCL: 'off' and 'Array2D'\n"
-               << "  -use-local <o>          Enable/disable usage of shared/local memory in CUDA/OpenCL to stage image pixels to scratchpad\n"
-               << "                          Valid values: 'on' and 'off'\n"
-               << "  -vectorize <o>          Enable/disable vectorization of generated CUDA/OpenCL code\n"
-               << "                          Valid values: 'on' and 'off'\n"
-               << "  -pixels-per-thread <n>  Specify how many pixels should be calculated per thread\n"
-               << "  -rs-package <string>    Specify Renderscript package name. (default: \"org.hipacc.rs\")\n"
-               << "  -o <file>               Write output to <file>\n"
-               << "  --help                  Display available options\n"
-               << "  --version               Display version information\n";
+  llvm::errs()
+      << "OVERVIEW: Hipacc - Heterogeneous Image Processing Acceleration "
+         "framework\n\n"
+      << "USAGE: hipacc [options] <input>\n\n"
+      << "OPTIONS:\n\n"
+      << "  -emit-cpu               Emit C++ code\n"
+      << "  -emit-cuda              Emit CUDA code for GPU devices\n"
+      << "  -emit-opencl-acc        Emit OpenCL code for Accelerator devices\n"
+      << "  -emit-opencl-cpu        Emit OpenCL code for CPU devices\n"
+      << "  -emit-opencl-gpu        Emit OpenCL code for GPU devices\n"
+      << "  -emit-renderscript      Emit Renderscript code for Android\n"
+      << "  -emit-filterscript      Emit Filterscript code for Android\n"
+      << "  -emit-padding <n>       Emit CUDA/OpenCL/Renderscript image padding, "
+         "using alignment of <n> bytes for GPU devices\n"
+      << "  -target <n>             Generate code for GPUs with code name <n>.\n"
+      << "                          Code names for CUDA/OpenCL on NVIDIA devices "
+         "are:\n"
+      << "                              'Fermi-20' and 'Fermi-21' for Fermi "
+         "architecture.\n"
+      << "                              'Kepler-30', 'Kepler-32', 'Kepler-35', and "
+         "'Kepler-37' for Kepler architecture.\n"
+      << "                              'Maxwell-50', 'Maxwell-52', and "
+         "'Maxwell-53' for Maxwell architecture.\n"
+      << "                          Code names for for OpenCL on AMD devices "
+         "are:\n"
+      << "                              'Evergreen' for Evergreen "
+         "architecture (Radeon HD5xxx).\n"
+      << "                              'NorthernIsland' for Northern Island "
+         "architecture (Radeon HD6xxx).\n"
+      << "                          Code names for for OpenCL on ARM devices "
+         "are:\n"
+      << "                              'Midgard' for Mali-T6xx' for Mali.\n"
+      << "                          Code names for for OpenCL on Intel Xeon Phi "
+         "devices are:\n"
+      << "                              'KnightsCorner' for Knights Corner Many "
+         "Integrated Cores architecture.\n"
+      << "  -explore-config         Emit code that explores all possible kernel "
+         "configuration and print its performance\n"
+      << "  -use-config <nxm>       Emit code that uses a configuration of nxm "
+         "threads, e.g. 128x1\n"
+      << "  -reduce-config <n:m>    Emit code that uses a multi-dimensional "
+         "reduction configuration of\n"
+      << "                              n warps per block (affects block size "
+         "and shared memory size)\n"
+      << "                              m partial histograms (affects number of "
+         "blocks)\n"
+      << "  -time-kernels           Emit code that executes each kernel multiple "
+         "times to get accurate timings\n"
+      << "  -use-textures <o>       Enable/disable usage of textures (cached) in "
+         "CUDA/OpenCL to read/write image pixels - for GPU devices only\n"
+      << "                          Valid values for CUDA on NVIDIA devices: "
+         "'off', 'Linear1D', 'Linear2D', 'Array2D', and 'Ldg'\n"
+      << "                          Valid values for OpenCL: 'off' and "
+         "'Array2D'\n"
+      << "  -use-local <o>          Enable/disable usage of shared/local memory "
+         "in CUDA/OpenCL to stage image pixels to scratchpad\n"
+      << "                          Valid values: 'on' and 'off'\n"
+      << "  -vectorize <o>          Enable/disable vectorization of generated "
+         "CUDA/OpenCL code\n"
+      << "                          Valid values: 'on' and 'off'\n"
+      << "  -pixels-per-thread <n>  Specify how many pixels should be calculated "
+         "per thread\n"
+      << "  -rs-package <string>    Specify Renderscript package name. (default: "
+         "\"org.hipacc.rs\")\n"
+      << "  -o <file>               Write output to <file>\n"
+      << "  --help                  Display available options\n"
+      << "  --version               Display version information\n"
+      << "  -use-stream <n>         Async kernel launch with streams\n"
+      << "  -use-fft <o>            Enable/disable usage of convolution with fft "
+         "for CPU and CUDA targets\n"
+      << "                          Valid values: 'on', 'fast' and 'off'\n";
 }
 
-void printVersion() {
+void printVersion(){
   llvm::errs() << "hipacc version " << HIPACC_VERSION
                << " (" << GIT_REPOSITORY " " << GIT_VERSION << ")\n";
 }
@@ -247,6 +277,20 @@ int main(int argc, char *argv[]) {
       compilerOptions.setTimeKernels(USER_ON);
       continue;
     }
+    if (StringRef(argv[i]) == "-use-stream") {
+      assert(i<(argc-1) && "Mandatory integer parameter for -use-stream switch missing.");
+      std::istringstream buffer(argv[i+1]);
+      int val;
+      buffer >> val;
+      if (buffer.fail()) {
+        llvm::errs() << "ERROR: Expected integer parameter for -use-stream switch.\n\n";
+        printUsage();
+        return EXIT_FAILURE;
+      }
+      compilerOptions.setStreamAsyncKernelLaunch(val);
+      ++i;
+      continue;
+    }
     if (StringRef(argv[i]) == "-use-textures") {
       assert(i<(argc-1) && "Mandatory texture memory specification for -use-textures switch missing.");
       if (StringRef(argv[i+1]) == "off") {
@@ -295,6 +339,22 @@ int main(int argc, char *argv[]) {
       ++i;
       continue;
     }
+    if (StringRef(argv[i]) == "-use-fft") {
+      assert(i<(argc-1) && "Mandatory specification for -use-fft switch missing.");
+      if (StringRef(argv[i+1]) == "off") {
+        compilerOptions.setUseFFT(OFF);
+      } else if (StringRef(argv[i+1]) == "on") {
+        compilerOptions.setUseFFT(ON);
+      } else if (StringRef(argv[i + 1]) == "fast") {
+        compilerOptions.setUseFFT(FAST);
+      } else {
+        llvm::errs() << "ERROR: Expected valid specification for -use-fft switch.\n\n";
+        printUsage();
+        return EXIT_FAILURE;
+      }
+      ++i;
+      continue;
+    }
     if (StringRef(argv[i]) == "-pixels-per-thread") {
       assert(i<(argc-1) && "Mandatory integer parameter for -pixels-per-thread switch missing.");
       std::istringstream buffer(argv[i+1]);
@@ -421,6 +481,13 @@ int main(int argc, char *argv[]) {
     // kernels are timed internally by the runtime in case of exploration
     compilerOptions.setTimeKernels(OFF);
   }
+  // Enable CUDA streams
+  if (!compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch()) {
+    llvm::errs() << "ERROR: async (stream) support is only available for CUDA!\n\n";
+    printUsage();
+    return EXIT_FAILURE;
+  }
+
   // print summary of compiler options
   compilerOptions.printSummary(targetDevice.getTargetDeviceName());
diff --git a/dsl/fft.hpp b/dsl/fft.hpp
new file mode 100644
index 00000000..68917085
--- /dev/null
+++ b/dsl/fft.hpp
@@ -0,0 +1,109 @@
+//
+// Copyright (c) 2012, University of Erlangen-Nuremberg
+// Copyright (c) 2012, Siemens AG
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#ifndef __FFT_HPP__
+#define __FFT_HPP__
+
+#include <type_traits>
+
+namespace hipacc {
+
+// Host-side FFT/DCT placeholders: the bodies are intentionally empty, the
+// Hipacc compiler replaces calls to them with FFTW/cuFFT runtime code.
+template <typename TPrecision>
+void fftTransform(TPrecision *in, int width, int height, TPrecision *out,
+                  bool forward = true, bool scale = false) {}
+template <typename TPrecision>
+void fftTransformDevice(TPrecision *in, int width, int height, TPrecision *out,
+                        bool forward = true, bool scale = false) {}
+template <typename TPrecision>
+void dctTransform(TPrecision *in, int width, int height, TPrecision *out,
+                  bool forward = true) {}
+template <typename TPrecision>
+void dctTransformDevice(TPrecision *in, int width, int height, TPrecision *out,
+                        bool forward = true) {}
+template <typename TPrecision, typename T>
+TPrecision *fft(Image<T> &in) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+  return nullptr;
+}
+template <typename TPrecision, typename T>
+void ifft(TPrecision *in, Image<T> &out) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+template <typename TPrecision, typename T>
+void fftToMagnitude(TPrecision *in, Image<T> &mag) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+template <typename TPrecision, typename T>
+void dctToMagnitude(TPrecision *in, Image<T> &mag) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+template <typename TPrecision, typename T>
+void fftScaleMagnitude(TPrecision *in, Image<T> &mag) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+template <typename TPrecision, typename T>
+void fftResetMask(Image<T> &mag, int radius, bool low, int window = 0) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+template <typename TPrecision, typename T>
+void fftApplyPassFilter(Image<T> &mag, int radius, bool low, int window = 0) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+template <typename TPrecision, typename T>
+void fftShift(Image<T> &mag) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+template <typename TPrecision, typename T>
+void ifftShift(Image<T> &mag) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+
+template <typename TPrecision, typename T>
+TPrecision *dct(Image<T> &in) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+  return nullptr;
+}
+template <typename TPrecision, typename T>
+void idct(TPrecision *in, Image<T> &out) {
+  static_assert(std::is_same<T, TPrecision>::value,
+                "Type of Image and Memory have to be the same!");
+}
+
+} // namespace hipacc
+
+#endif // __FFT_HPP__
\ No newline at end of file
diff --git a/dsl/hipacc.hpp b/dsl/hipacc.hpp
index 02d5413f..ebbcc9d5 100644
--- a/dsl/hipacc.hpp
+++ b/dsl/hipacc.hpp
@@ -39,6 +39,7 @@
 #include "kernel.hpp"
 #include "mask.hpp"
 #include "pyramid.hpp"
+#include "fft.hpp"

 namespace hipacc {
 class HipaccEoP{};
diff --git a/dsl/kernel.hpp b/dsl/kernel.hpp
index 58449dd0..cda317eb 100644
--- a/dsl/kernel.hpp
+++ b/dsl/kernel.hpp
@@ -226,8 +226,11 @@ class Kernel {
     auto reduce(Domain &domain, Reduce mode, const Function &fun)
         -> decltype(fun());
     template <typename Function>
     void iterate(Domain &domain, const Function &fun);
+    void convolveFFT();
 };

+template <typename data_t>
+void Kernel<data_t>::convolveFFT() {}

 template <typename data_t>
 template <typename Function>
 auto Kernel<data_t>::convolve(Mask &mask, Reduce mode, const
Function& fun) -> decltype(fun()) { diff --git a/include/hipacc/Config/CompilerOptions.h b/include/hipacc/Config/CompilerOptions.h index aa9d8e70..c4364546 100644 --- a/include/hipacc/Config/CompilerOptions.h +++ b/include/hipacc/Config/CompilerOptions.h @@ -50,11 +50,12 @@ namespace hipacc { // compiler option possibilities enum CompilerOption { - AUTO = 0x1, - ON = 0x2, - OFF = 0x4, - USER_ON = 0x8, - USER_OFF = 0x10 + AUTO = 0x1, + ON = 0x2, + OFF = 0x4, + USER_ON = 0x8, + USER_OFF = 0x10, + FAST = 0x20 }; // target language specification @@ -84,11 +85,14 @@ class CompilerOptions { CompilerOption local_memory; CompilerOption multiple_pixels; CompilerOption vectorize_kernels; + CompilerOption use_fft; + CompilerOption async_kernels; // user defined values for target code features int kernel_config_x, kernel_config_y; int reduce_config_num_warps, reduce_config_num_hists; int align_bytes; int pixels_per_thread; + int num_streams; Texture texture_type; std::string rs_package_name, rs_directory; @@ -104,6 +108,9 @@ class CompilerOptions { case AUTO: llvm::errs() << "AUTO - determined by the framework"; break; + case FAST: + llvm::errs() << "ENABLED - FAST"; + break; case ON: llvm::errs() << "ENABLED"; break; @@ -126,12 +133,15 @@ class CompilerOptions { local_memory(AUTO), multiple_pixels(AUTO), vectorize_kernels(OFF), + use_fft(OFF), + async_kernels(OFF), kernel_config_x(128), kernel_config_y(1), reduce_config_num_warps(16), reduce_config_num_hists(16), align_bytes(0), pixels_per_thread(1), + num_streams(1), texture_type(Texture::None), rs_package_name("org.hipacc.rs"), rs_directory("/data/local/tmp") @@ -155,6 +165,7 @@ class CompilerOptions { static const auto option_ou = static_cast( ON|USER_ON); static const auto option_aou = static_cast(AUTO|ON|USER_ON); + static const auto option_of = static_cast(ON | FAST); bool exploreConfig(CompilerOption option=option_ou) { return explore_config & option; @@ -190,6 +201,9 @@ class CompilerOptions { bool vectorizeKernels(CompilerOption option=option_ou) { return vectorize_kernels & option; } + bool asyncKernelLaunch(CompilerOption option=option_ou) { + return async_kernels & option; + } bool multiplePixelsPerThread(CompilerOption option=option_ou) { return multiple_pixels & option; } @@ -208,6 +222,17 @@ class CompilerOptions { void setLocalMemory(CompilerOption o) { local_memory = o; } void setVectorizeKernels(CompilerOption o) { vectorize_kernels = o; } + void setUseFFT(CompilerOption o) { use_fft = o; } + bool getUseFFT(CompilerOption option = option_of) { return use_fft & option; } + + void setStreamAsyncKernelLaunch(int stream) { + assert((stream > 0) && "Number of streams must be larger than 0"); + num_streams = stream; + async_kernels = USER_ON; + } + + int getNumStreams() { return num_streams; } + void setTextureMemory(Texture type) { texture_type = type; if (type == Texture::None) texture_memory = USER_OFF; @@ -304,6 +329,8 @@ class CompilerOptions { getOptionAsString(multiple_pixels, pixels_per_thread); llvm::errs() << "\n Vectorization of kernels: "; getOptionAsString(vectorize_kernels); + llvm::errs() << "\n Usage of FFT: "; + getOptionAsString(use_fft); llvm::errs() << "\n\n"; } }; diff --git a/include/hipacc/DSL/ClassRepresentation.h b/include/hipacc/DSL/ClassRepresentation.h index c4952147..40321678 100644 --- a/include/hipacc/DSL/ClassRepresentation.h +++ b/include/hipacc/DSL/ClassRepresentation.h @@ -44,6 +44,7 @@ #include #include #include +#include namespace clang { namespace hipacc { @@ -85,6 +86,12 @@ enum class Interpolate : 
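// Since the CompilerOption values above are one-hot bits, the query helpers
// accept a mask of acceptable states; getUseFFT() defaults to
// option_of = ON | FAST. A minimal illustration (hypothetical usage, not part
// of the patch):
//
//   CompilerOptions opts;
//   opts.setUseFFT(FAST);
//   bool any_fft  = opts.getUseFFT();     // true: FAST is contained in ON | FAST
//   bool fast_fft = opts.getUseFFT(FAST); // true: the FAST bit is set
//   bool on_fft   = opts.getUseFFT(ON);   // false: the ON bit is not set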
uint8_t { L3 }; +// basic pyramid operations +enum class PyramidOperation : uint8_t { + REDUCE = 0, + FILTER, + EXPAND +}; // common base class for images, masks and pyramids class HipaccSize { @@ -143,24 +150,149 @@ class HipaccMemory : public HipaccSize { class HipaccImage : public HipaccMemory { private: ASTContext &Ctx; + std::string streamStr; public: HipaccImage(ASTContext &Ctx, VarDecl *VD, QualType QT) : HipaccMemory(VD, VD->getNameAsString(), QT), - Ctx(Ctx) + Ctx(Ctx), streamStr("stream") {} unsigned getPixelSize() { return Ctx.getTypeSize(type)/8; } std::string getTextureType(); std::string getImageReadFunction(); + void setStream(std::string s) { streamStr = s; } + std::string getStream() { return streamStr; } }; class HipaccPyramid : public HipaccImage { + private: + unsigned depth; public: HipaccPyramid(ASTContext &Ctx, VarDecl *VD, QualType QT) : - HipaccImage(Ctx, VD, QT) + HipaccImage(Ctx, VD, QT), depth(0) {} + void setDepth(unsigned d) { depth = d; } + unsigned getDepth() { return depth; } +}; + + +// TODO, generalize this to an abstract class interface +class HipaccPyramidPipeline : public HipaccDevice { + private: + CompilerOptions &options; + + bool singleStream; + bool multiStream; + bool pipelineKernelLaunch; + unsigned depth; + + SmallVector baseImgs; + std::string global_level_str; + llvm::DenseMap KernelDeclMap; + llvm::DenseMap KernelMap; + + // resource usage in the pipeline + struct PyrResource { + unsigned reg = 0; + unsigned smem = 0; + unsigned thr = 0; + unsigned blk = 0; + }; + PyrResource UsedResourcePyr; + + public: + HipaccPyramidPipeline(CompilerOptions &options) : + HipaccDevice(options), + options(options), + pipelineKernelLaunch(false), + depth(0) + { + if (options.getNumStreams() == 1) { + singleStream = true; + multiStream = false; + } else if (options.getNumStreams() > 1) { + singleStream = false; + multiStream = true; + } + resetResourceUsage(); + } + + void resetResourceUsage() { + UsedResourcePyr.reg = 0; + UsedResourcePyr.smem = 0; + UsedResourcePyr.thr = 0; + UsedResourcePyr.blk = 0; + } + + bool hasResourceAvailable(PyrResource &r) { + bool reg_available = (max_total_registers - UsedResourcePyr.reg) > r.reg; + bool smem_available = (max_total_shared_memory - UsedResourcePyr.smem) > r.smem; + bool thr_available = (max_threads_per_multiprocessor - UsedResourcePyr.thr) > r.thr; + bool blk_available = (max_blocks_per_multiprocessor - UsedResourcePyr.blk) > r.blk; + return (reg_available && smem_available && thr_available && blk_available); + } + + bool useResource(PyrResource &r) { + if (hasResourceAvailable(r)) { + UsedResourcePyr.reg += r.reg; + UsedResourcePyr.smem += r.smem; + UsedResourcePyr.thr += r.thr; + UsedResourcePyr.blk += r.blk; + return true; + } else { + return false; + } + } + + std::string getPyramidOperationStr(PyramidOperation opr) { + switch(opr) + { + case PyramidOperation::REDUCE : return "REDUCE"; + case PyramidOperation::FILTER : return "FILTER"; + case PyramidOperation::EXPAND : return "EXPAND"; + default: return "PyramidOperation{" + std::to_string(int(opr)) + '}'; + } + } + + unsigned getNumWaves(HipaccKernel *K, unsigned w, unsigned h, bool print=0); + + void updateDepth(unsigned d) { + depth = (d > depth) ? 
d : depth; + } + unsigned getDepth() { return depth; } + void recordBaseImg(HipaccImage *img) { + baseImgs.push_back(img); + } + void setGlobalLevelStr(std::string level) { + if (global_level_str.empty()) { + global_level_str = level; + } + } + std::string getGlobalLevelStr() { + return global_level_str; + } + bool isSingleStream() { return singleStream; } + bool isMultiStream() { return multiStream; } + void setSingleStream(bool s) { singleStream = s; } + void setMultiStream(bool s) { multiStream = s; } + void recordKernel(ValueDecl *VD, HipaccKernel *K, PyramidOperation pyrOp) { + KernelDeclMap[VD] = pyrOp; + KernelMap[K] = pyrOp; + } + bool isRecordedKernel(ValueDecl *VD) { return KernelDeclMap.count(VD); } + PyramidOperation getPyramidOperation(ValueDecl *VD) { + assert(isRecordedKernel(VD) && "Kernel is not recorded in the pyramid pipeline."); + return KernelDeclMap[VD]; + } + void enablePipelineKernelLaunch() { + pipelineKernelLaunch = true; + } + bool isPipelineKernelLaunch() { + return pipelineKernelLaunch; + } + void printStreamPipelineInfo(); }; @@ -385,6 +517,7 @@ class HipaccKernelClass { std::string name; CXXMethodDecl *kernelFunction, *reduceFunction, *binningFunction; QualType pixelType, binType; + PyramidOperation pyrOp; KernelStatistics *kernelStatistics; // kernel member information SmallVector members; @@ -426,6 +559,9 @@ class HipaccKernelClass { QualType getPixelType() { return pixelType; } QualType getBinType() { return binType; } + void setPyramidOperation(PyramidOperation op) { pyrOp = op; } + PyramidOperation getPyramidOperation() { return pyrOp; } + KernelStatistics &getKernelStatistics(void) { return *kernelStatistics; } @@ -574,7 +710,9 @@ class HipaccKernel : public HipaccKernelFeatures { std::string name; std::string kernelName, reduceName, binningName; std::string fileName; - std::string reduceStr, binningStr, infoStr, numBinsStr; + std::string reduceStr, binningStr, infoStr, numBinsStr, streamStr; + bool waitEvent, recordEvent; + std::string waitEventStr, recordEventStr; unsigned infoStrCnt, binningStrCnt; HipaccIterationSpace *iterationSpace; std::map imgMap; @@ -614,7 +752,8 @@ class HipaccKernel : public HipaccKernelFeatures { reduceName(options.getTargetPrefix() + KC->getName() + name + "Reduce"), binningName(options.getTargetPrefix() + KC->getName() + name + "Binning"), fileName(options.getTargetPrefix() + KC->getName() + VD->getNameAsString()), - reduceStr(), binningStr(), infoStr(), numBinsStr(), + reduceStr(), binningStr(), infoStr(), numBinsStr(), streamStr("stream"), + waitEvent(false), recordEvent(false), waitEventStr("event"), recordEventStr("event"), infoStrCnt(0), binningStrCnt(0), iterationSpace(nullptr), imgMap(), @@ -751,6 +890,8 @@ class HipaccKernel : public HipaccKernelFeatures { // recreate parameter information createArgInfo(); } + unsigned getResourceUsageReg() { return num_reg; } + unsigned getResourceUsageSmem() { return num_smem; } void setDefaultConfig(); @@ -803,6 +944,16 @@ class HipaccKernel : public HipaccKernelFeatures { unsigned getPixelsPerThreadReduce() { return pixels_per_thread[GlobalOperator]; } + void setStream(std::string s) { streamStr = s; } + std::string getStream() { return streamStr; } + void setWaitEvent(bool e) { waitEvent = e; } + bool hasWaitEvent() { return waitEvent; } + void setRecordEvent(bool e) { recordEvent = e; } + bool hasRecordEvent() { return recordEvent; } + void setWaitEventStr(std::string s) { waitEventStr = s; } + std::string getWaitEventStr() { return waitEventStr; } + void 
setRecordEventStr(std::string s) { recordEventStr = s; } + std::string getRecordEventStr() { return recordEventStr; } }; } // namespace hipacc } // namespace clang diff --git a/include/hipacc/Device/TargetDescription.h b/include/hipacc/Device/TargetDescription.h index abfb701b..560fc3b7 100644 --- a/include/hipacc/Device/TargetDescription.h +++ b/include/hipacc/Device/TargetDescription.h @@ -78,6 +78,7 @@ class HipaccDeviceOptions { case Device::Maxwell_50: case Device::Maxwell_52: case Device::Maxwell_53: + case Device::Turing_75: alignment = 256; if (options.emitCUDA()) local_memory_threshold = 6; else local_memory_threshold = 11; @@ -206,6 +207,7 @@ class HipaccDevice : public HipaccDeviceOptions { // NVIDIA only device properties unsigned num_alus; unsigned num_sfus; + unsigned num_sms; public: explicit HipaccDevice(CompilerOptions &options) : @@ -214,7 +216,8 @@ class HipaccDevice : public HipaccDeviceOptions { max_threads_per_warp(32), max_blocks_per_multiprocessor(8), num_alus(0), - num_sfus(0) + num_sfus(0), + num_sms(1) { switch (target_device) { case Device::CPU: @@ -261,10 +264,16 @@ class HipaccDevice : public HipaccDeviceOptions { if (target_device==Device::Kepler_30) { max_register_per_thread = 63; } + if (target_device==Device::Kepler_30) { + num_sms = 8; + } else if (target_device==Device::Kepler_35) { + num_sms = 13; + } break; case Device::Maxwell_50: case Device::Maxwell_52: case Device::Maxwell_53: + case Device::Turing_75: max_blocks_per_multiprocessor = 32; max_threads_per_block = 1024; max_warps_per_multiprocessor = 64; @@ -274,6 +283,7 @@ class HipaccDevice : public HipaccDeviceOptions { max_register_per_thread = 255; num_alus = 128; num_sfus = 32; + num_sms = 46; if (target_device==Device::Maxwell_52) { max_total_shared_memory = 98304; } @@ -343,7 +353,7 @@ class HipaccDevice : public HipaccDeviceOptions { bool isNVIDIAGPU() { return target_device >= Device::Fermi_20 && - target_device <= Device::Maxwell_53; + target_device <= Device::Turing_75; } std::string getTargetDeviceName() { @@ -358,6 +368,7 @@ class HipaccDevice : public HipaccDeviceOptions { case Device::Maxwell_50: return "NVIDIA Maxwell (50)"; case Device::Maxwell_52: return "NVIDIA Maxwell (52)"; case Device::Maxwell_53: return "NVIDIA Maxwell (53)"; + case Device::Turing_75: return "NVIDIA Turing (75)"; case Device::Evergreen: return "AMD Evergreen"; case Device::NorthernIsland: return "AMD Northern Island"; //case Device::SouthernIsland: return "AMD Southern Island"; diff --git a/include/hipacc/Device/TargetDevices.h b/include/hipacc/Device/TargetDevices.h index 1a990ee6..04034720 100644 --- a/include/hipacc/Device/TargetDevices.h +++ b/include/hipacc/Device/TargetDevices.h @@ -49,6 +49,7 @@ enum class Device { Maxwell_50 = 50, Maxwell_52 = 52, Maxwell_53 = 53, + Turing_75 = 75, Evergreen = 58, NorthernIsland = 69, //SouthernIsland = 79 diff --git a/include/hipacc/Rewrite/CreateHostStrings.h b/include/hipacc/Rewrite/CreateHostStrings.h index 170fd575..05505e9c 100644 --- a/include/hipacc/Rewrite/CreateHostStrings.h +++ b/include/hipacc/Rewrite/CreateHostStrings.h @@ -88,7 +88,9 @@ class CreateHostStrings { MemoryTransferDirection direction, std::string &resultStr); void writeMemoryTransferDomainFromMask(HipaccMask *Domain, HipaccMask *Mask, std::string &resultStr); - void writeKernelCall(HipaccKernel *K, std::string &resultStr); + void writeFFTConvolutionCall(HipaccKernel *K, std::string &resultStr, + ASTContext &Context, bool fast = true); + void writeKernelCall(HipaccKernel *K, std::string 
&resultStr, HipaccPyramidPipeline *pyrPipe=nullptr); void writeReduceCall(HipaccKernel *K, std::string &resultStr); void writeBinningCall(HipaccKernel *K, std::string &resultStr); std::string getInterpolationDefinition(HipaccKernel *K, HipaccAccessor *Acc, diff --git a/lib/DSL/ClassRepresentation.cpp b/lib/DSL/ClassRepresentation.cpp index 3442b654..fe16092a 100644 --- a/lib/DSL/ClassRepresentation.cpp +++ b/lib/DSL/ClassRepresentation.cpp @@ -651,5 +651,170 @@ void HipaccKernel::createHostArgInfo(ArrayRef hostArgs, std::string } } +unsigned HipaccPyramidPipeline::getNumWaves(HipaccKernel *K, unsigned w, unsigned h, bool print) { + unsigned sx = K->getNumThreadsX(); + unsigned sy = K->getNumThreadsY(); + unsigned r_reg = std::max(K->getResourceUsageReg(), (unsigned)1); + unsigned r_smem = std::max(K->getResourceUsageSmem(), (unsigned)1); + unsigned k_blk = std::ceil((float)w / sx) * std::ceil((float)h / sy); + + unsigned n_blk_reg = std::floor((float)max_total_registers / (r_reg * sx * sy)); + unsigned n_blk_smem = std::floor((float)max_total_shared_memory / r_smem); + unsigned n_blk_thr = std::floor((float)max_threads_per_multiprocessor / (sx*sy)); + unsigned n_blk = std::min({n_blk_reg, n_blk_smem, n_blk_thr, max_blocks_per_multiprocessor}); + + if (print) { + llvm::errs() << " [wave-info] r_reg:" << r_reg << " r_smem:" << r_smem << " k_blk:" << k_blk << "\n"; + llvm::errs() << " n_blk_reg:" << n_blk_reg << " n_blk_smem:" << n_blk_smem << " n_blk_thr:" + << n_blk_thr << " n_blk:" << n_blk << "\n"; + } + + unsigned n_wave = std::ceil((float)k_blk / (n_blk * num_sms)); + return n_wave; +} + +void HipaccPyramidPipeline::printStreamPipelineInfo() { + llvm::errs() << "\nInformation for Pyramid cuda stream scheduling\n"; + std::string deviceInfo_str, singleStream_str, multiStream_str; + deviceInfo_str += " GPU arch: " + getTargetDeviceName() + "\n"; + deviceInfo_str += " max_register_per_sm: " + std::to_string(max_total_registers) + "\n"; + deviceInfo_str += " max_shared_memory_per_sm: " + std::to_string(max_total_shared_memory) + "\n"; + deviceInfo_str += " max_threads_per_sm: " + std::to_string(max_threads_per_multiprocessor) + "\n"; + deviceInfo_str += " max_blocks_per_sm: " + std::to_string(max_blocks_per_multiprocessor) + "\n"; + deviceInfo_str += " num_sm: " + std::to_string(num_sms) + "\n"; + llvm::errs() << deviceInfo_str; + + HipaccKernel *K_r, *K_f, *K_e; // Unpack kernels + for (auto kp : KernelMap) { + if (kp.second == PyramidOperation::REDUCE) { K_r = kp.first; } + else if (kp.second == PyramidOperation::FILTER) { K_f = kp.first; } + else if (kp.second == PyramidOperation::EXPAND) { K_e = kp.first; } + } + assert(K_r && "no reduce kernel for multi stream pyramid execution"); + assert(K_f && "no filter kernel for multi stream pyramid execution"); + assert(K_e && "no expand kernel for multi stream pyramid execution"); + + unsigned imgSzX = baseImgs.front()->getSizeX(); // assume same-sized base images + unsigned imgSzY = baseImgs.front()->getSizeY(); + + // single stream analysis + singleStream_str += " -single-stream wave estimation\n"; + std::vector numWavesDS; + std::vector numWavesLP; + std::vector numWavesUS; + + for (size_t d=0; d 0) { + // Downsampling + numWavesDS.push_back(getNumWaves(K_r, imgSzX, imgSzY)); + } + if (d < depth - 1) { + // level processing + numWavesLP.push_back(getNumWaves(K_f, imgSzX, imgSzY)); + // Upsampling + numWavesUS.push_back(getNumWaves(K_e, imgSzX, imgSzY)); + } + imgSzX /= 2; imgSzY /= 2; + } + + assert((numWavesDS.size() == depth-1) && "number 
of waves in reduce does not match depth\n"); + assert((numWavesLP.size() == depth-1) && "number of waves in filter does not match depth\n"); + assert((numWavesUS.size() == depth-1) && "number of waves in expand does not match depth\n"); + + unsigned numWavesDSSum = std::accumulate(numWavesDS.begin(), numWavesDS.end(), 0); + unsigned numWavesLPSum = std::accumulate(numWavesLP.begin(), numWavesLP.end(), 0); + unsigned numWavesUSSum = std::accumulate(numWavesUS.begin(), numWavesUS.end(), 0); + + singleStream_str += " Reduce kernel " + K_r->getName() + " takes " + std::to_string(numWavesDSSum) + " waves\n"; + singleStream_str += " Filter kernel " + K_f->getName() + " takes " + std::to_string(numWavesLPSum) + " waves\n"; + singleStream_str += " Expand kernel " + K_e->getName() + " takes " + std::to_string(numWavesUSSum) + " waves\n"; + llvm::errs() << singleStream_str; + + // multi stream info + imgSzX = baseImgs.front()->getSizeX(); // assume same-sized base images + imgSzY = baseImgs.front()->getSizeY(); + multiStream_str += " -multi-stream wave estimation\n"; + + unsigned n_wave_seq_r = 0; + unsigned n_wave_seq_f = 0; + unsigned n_wave_expand = 0; + unsigned n_p = 0; + std::vector S_p; + + // downsampling + for (size_t d=0; dgetNumThreadsX(); + unsigned sy = K_f->getNumThreadsY(); + unsigned r_reg = std::max(K_f->getResourceUsageReg(), (unsigned)1); + unsigned r_smem = std::max(K_f->getResourceUsageSmem(), (unsigned)1); + unsigned k_blk = std::ceil((float)imgSzX / sx) * std::ceil((float)imgSzY / sy); + unsigned n_blk_reg = std::floor((float)max_total_registers / (r_reg * sx * sy)); + unsigned n_blk_smem = std::floor((float)max_total_shared_memory / r_smem); + unsigned n_blk_thr = std::floor((float)max_threads_per_multiprocessor / (sx*sy)); + unsigned n_blk = std::min({n_blk_reg, n_blk_smem, n_blk_thr, max_blocks_per_multiprocessor}); + + unsigned n_wave = std::ceil((float)k_blk / (n_blk * num_sms)); + unsigned n_blk_tail = k_blk % (n_blk * num_sms); + unsigned n_blk_tail_per_sm = std::ceil((float)n_blk_tail / num_sms); + + if (n_wave > 1) { // more than one wave, fine-grained level execution + resetResourceUsage(); + n_wave_seq_f += n_wave; + n_wave_seq_r += getNumWaves(K_r, imgSzX / 2, imgSzY / 2); + } else if (n_blk_tail == 0) { // exactly one wave, fine-grained level execution + resetResourceUsage(); + n_wave_seq_f += n_wave; + n_wave_seq_r += getNumWaves(K_r, imgSzX / 2, imgSzY / 2); + } else if (n_blk_tail > 0) { // less than one wave, coarse-grained level execution + PyrResource res; + res.reg = n_blk_tail_per_sm * (r_reg * sx * sy); + res.smem = n_blk_tail_per_sm * r_smem; + res.thr = n_blk_tail_per_sm * (sx * sy); + res.blk = n_blk_tail_per_sm; + + if (hasResourceAvailable(res)) { // resource is sufficient, same wave + n_p++; + } else { // resource not sufficient, new wave + S_p.push_back(n_p); + n_p = 0; + resetResourceUsage(); + } + useResource(res); + } + imgSzX /= 2; + imgSzY /= 2; + } + + // upsampling + imgSzX = baseImgs.front()->getSizeX(); // assume same-sized base images + imgSzY = baseImgs.front()->getSizeY(); + for (size_t d=0; dgetName() + " takes " + std::to_string(n_wave_seq_r + 1) + " waves\n"; + multiStream_str += " Filter kernel " + K_f->getName() + " takes " + std::to_string(n_wave_seq_f) + " waves\n"; + multiStream_str += " Coarse-grained level execution:\n"; + if (S_p.size()) { + multiStream_str += + " takes " + std::to_string(S_p.size() + 1) + " waves and "; + for (auto np : S_p) { + multiStream_str += std::to_string(np) + " "; + } + multiStream_str += 
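// Worked example for the wave estimate above (hypothetical numbers): a
// 2048x2048 level with 128x1 blocks (sx=128, sy=1), r_reg=32 registers per
// thread, negligible shared memory, and a device with 65536 registers,
// 2048 resident threads, and 32 resident blocks per SM across num_sms=46:
//   k_blk     = ceil(2048/128) * ceil(2048/1)  = 16 * 2048 = 32768
//   n_blk_reg = floor(65536 / (32 * 128 * 1))  = 16
//   n_blk_thr = floor(2048 / 128)              = 16
//   n_blk     = min(16, n_blk_smem, 16, 32)    = 16
//   n_wave    = ceil(32768 / (16 * 46))        = 45
// Halving the level extents divides k_blk by four, so deeper pyramid levels
// finish in ever fewer waves, which is what the multi-stream schedule exploits.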
std::to_string(n_p) + " parallel streams\n"; + } else { + multiStream_str += + " takes 1 wave and " + std::to_string(n_p) + " parallel streams\n"; + } + multiStream_str += " Expand execution: takes " + std::to_string(n_wave_expand) + " waves\n"; + llvm::errs() << multiStream_str; +} + + + // vim: set ts=2 sw=2 sts=2 et ai: diff --git a/lib/Rewrite/CreateHostStrings.cpp b/lib/Rewrite/CreateHostStrings.cpp index 9a618d06..7d2e777c 100644 --- a/lib/Rewrite/CreateHostStrings.cpp +++ b/lib/Rewrite/CreateHostStrings.cpp @@ -41,9 +41,15 @@ using namespace hipacc; void CreateHostStrings::writeHeaders(std::string &resultStr) { switch (options.getTargetLang()) { case Language::C99: - resultStr += "#include \"hipacc_cpu_standalone.hpp\"\n\n"; break; + resultStr += "#include \"hipacc_cpu_standalone.hpp\"\n\n"; + if (options.getUseFFT()) + resultStr += "#include \"hipacc_cpu_fftw.hpp\"\n\n"; + break; case Language::CUDA: - resultStr += "#include \"hipacc_cu_standalone.hpp\"\n\n"; break; + resultStr += "#include \"hipacc_cu_standalone.hpp\"\n\n"; + if (options.getUseFFT()) + resultStr += "#include \"hipacc_cu_cufft.hpp\"\n\n"; + break; case Language::OpenCLACC: case Language::OpenCLCPU: case Language::OpenCLGPU: @@ -260,11 +266,19 @@ void CreateHostStrings::writeMemoryTransfer(HipaccImage *Img, std::string mem, case HOST_TO_DEVICE: resultStr += "hipaccWriteMemory("; resultStr += Img->getName(); - resultStr += ", " + mem + ");"; + resultStr += ", " + mem; + if (options.asyncKernelLaunch()) { + resultStr += ", " + Img->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_HOST: resultStr += "hipaccReadMemory<" + Img->getTypeStr() + ">("; - resultStr += Img->getName() + ");"; + resultStr += Img->getName(); + if (options.asyncKernelLaunch()) { + resultStr += ", " + Img->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_DEVICE: resultStr += "hipaccCopyMemory("; @@ -284,11 +298,19 @@ void CreateHostStrings::writeMemoryTransfer(HipaccPyramid *Pyr, std::string idx, case HOST_TO_DEVICE: resultStr += "hipaccWriteMemory("; resultStr += Pyr->getName() + "(" + idx + ")"; - resultStr += ", " + mem + ");"; + resultStr += ", " + mem; + if (options.asyncKernelLaunch()) { + resultStr += ", " + Pyr->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_HOST: resultStr += "hipaccReadMemory<" + Pyr->getTypeStr() + ">("; - resultStr += Pyr->getName() + "(" + idx + "));"; + resultStr += Pyr->getName() + "(" + idx + ")"; + if (options.asyncKernelLaunch()) { + resultStr += ", " + Pyr->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_DEVICE: resultStr += "hipaccCopyMemory("; @@ -386,8 +408,91 @@ void CreateHostStrings::writeMemoryTransferDomainFromMask( } } +void CreateHostStrings::writeFFTConvolutionCall(HipaccKernel *K, + std::string &resultStr, + ASTContext &Context, bool fast) { + if (options.getUseFFT() && (options.getTargetLang() == Language::C99 || + options.getTargetLang() == Language::CUDA)) { + std::string precision = "double"; + if (fast) { + precision = "float"; + } + switch (options.getTargetLang()) { + case Language::C99: + resultStr += "fftwConvolution<" + precision + ">("; + break; + case Language::CUDA: + resultStr += "cufftConvolution_device<" + precision + ">("; + break; + default: + assert(false && "Convolution with FFT is only supported for C99 and CUDA"); + return; + } + // get Iterationspace + std::string iterName = K->getIterationSpace()->getName(); + // get Accessor + HipaccAccessor *Acc; + std::string accName; + for (auto img : K->getKernelClass()->getImgFields()) { + 
Acc = K->getImgFromMapping(img); + accName = Acc->getName(); + if (accName != iterName) + break; + } + // get Mask + HipaccMask *Mask; + std::string mskName; + for (auto msk : K->getKernelClass()->getMaskFields()) { + Mask = K->getMaskFromMapping(msk); + mskName = Mask->getName(); + if (Mask->getHostMemName().empty()) { + assert(false && "FFT Convolution without mask not supported"); + } + } + // put function arguments + resultStr += "(" + Acc->getImage()->getTypeStr() + " *)(" + Acc->getName() + + ".img->mem)"; + // resultStr += ", " + K->getIterationSpace()->getImage()->getSizeXStr() + ", + // " + K->getIterationSpace()->getImage()->getSizeYStr(); + resultStr += ", " + K->getIterationSpace()->getName() + ".width, " + + K->getIterationSpace()->getName() + ".height"; + resultStr += ", " + Mask->getHostMemName() + ", " + Mask->getSizeXStr() + + ", " + Mask->getSizeYStr(); + resultStr += ", (" + K->getIterationSpace()->getImage()->getTypeStr() + + " *)(" + K->getIterationSpace()->getName() + ".img->mem)"; + resultStr += ", " + std::to_string(device.alignment); + std::string boundary_linear; + if (Acc->getBoundaryMode() == Boundary::REPEAT) { + boundary_linear = "false"; // circular convolution + } else if (Acc->getBoundaryMode() == Boundary::CONSTANT) { + boundary_linear = "true"; // linear convolution + } else { + assert(false && "Only REPEAT and CONSTANT boundary modes supported"); + } + resultStr += ", " + boundary_linear; + if (Acc->getBoundaryMode() == Boundary::CONSTANT) { + std::string constVal = "0"; + clang::Expr::EvalResult Result; + if (Acc->getConstExpr()->EvaluateAsRValue(Result, Context)) { + constVal = Result.Val.getAsString(Context, K->getKernelClass()->getPixelType()); + } + resultStr += ", " + constVal; // boundary constant + } + + resultStr += ");\n"; + } else { + if (options.getUseFFT()) { + assert(false && "Convolution with FFT is only supported for C99 and CUDA"); + return; + } else { + assert(false && "Convolution with FFT can not be used without -use-fft switch"); + return; + } + } +} -void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) { +void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr, + HipaccPyramidPipeline *pyrPipe) { auto argTypeNames = K->getArgTypeNames(); auto deviceArgNames = K->getDeviceArgNames(); auto hostArgNames = K->getHostArgNames(); @@ -728,6 +833,13 @@ void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) } resultStr += "\n" + indent; + // insert cuda stream and wait event + if (options.emitCUDA() && options.asyncKernelLaunch() && K->hasWaitEvent()) { + resultStr += "cudaStreamWaitEvent(" + K->getStream() + ", "; + resultStr += K->getWaitEventStr() + ", 0);"; + resultStr += "\n" + indent; + } + // launch kernel if (options.exploreConfig() || options.timeKernels()) { switch (options.getTargetLang()) { @@ -805,6 +917,34 @@ void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) resultStr += ", " + gridStr; resultStr += ", " + blockStr; resultStr += ", _args" + kernel_name + ".data()"; + + if (options.asyncKernelLaunch()) { + if (pyrPipe) { // pyramid app + if (pyrPipe->isSingleStream()) { + resultStr += ", " + K->getStream(); + } else if (pyrPipe->isMultiStream()) { + switch (pyrPipe->getPyramidOperation(K->getDecl())) { + case PyramidOperation::REDUCE: + resultStr += ", stream[" + pyrPipe->getGlobalLevelStr() + " - 1]"; + resultStr += ", events[" + pyrPipe->getGlobalLevelStr() + " - 1]"; // wait event + resultStr += ", events[" + 
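// For reference, the host call assembled by writeFFTConvolutionCall above has
// the following shape for a CUDA target, -use-fft fast, a 5x5 mask, and
// Boundary::CONSTANT (the identifiers acc, is, and mask0 are placeholders):
//
//   cufftConvolution_device<float>((float *)(acc.img->mem), is.width,
//       is.height, mask0, 5, 5, (float *)(is.img->mem), 256, true, 0);
//
// where 256 is the device alignment, true selects linear (zero-padded rather
// than circular) convolution, and the trailing 0 is the evaluated boundary
// constant that is appended only for Boundary::CONSTANT.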
pyrPipe->getGlobalLevelStr() + "]"; // record event + break; + case PyramidOperation::FILTER: + resultStr += ", stream[" + pyrPipe->getGlobalLevelStr() + " - 1]"; + break; + case PyramidOperation::EXPAND: + resultStr += ", stream[" + pyrPipe->getGlobalLevelStr() + "]"; + resultStr += ", events[" + std::to_string(pyrPipe->getDepth()) + " * 2";// wait event + resultStr += " - " + pyrPipe->getGlobalLevelStr() + " - 2]"; + resultStr += ", events[" + std::to_string(pyrPipe->getDepth()) + " * 2";// record event + resultStr += " - " + pyrPipe->getGlobalLevelStr() + " - 1]"; + break; + } + } + } else { // non-pyramid app + resultStr += ", " + K->getStream(); + } + } resultStr += ");"; break; case Language::Renderscript: @@ -825,8 +965,14 @@ void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) break; } } -} + // insert cuda stream and record event + if (options.emitCUDA() && options.asyncKernelLaunch() && K->hasRecordEvent()) { + resultStr += "cudaEventRecord(" + K->getRecordEventStr() + ", "; + resultStr += K->getStream() + ");"; + resultStr += "\n" + indent; + } +} void CreateHostStrings::writeReduceCall(HipaccKernel *K, std::string &resultStr) { std::string typeStr(K->getIterationSpace()->getImage()->getTypeStr()); diff --git a/lib/Rewrite/Rewrite.cpp b/lib/Rewrite/Rewrite.cpp index 0f295033..266f10ec 100644 --- a/lib/Rewrite/Rewrite.cpp +++ b/lib/Rewrite/Rewrite.cpp @@ -100,6 +100,9 @@ class Rewrite : public ASTConsumer, public RecursiveASTVisitor { // store interpolation methods required for CUDA SmallVector InterpolationDefinitionsGlobal; + // pipeline global config for pyramid + HipaccPyramidPipeline *pyramidPipe; + // pointer to main function FunctionDecl *mainFD; FileID mainFileID; @@ -120,6 +123,7 @@ class Rewrite : public ASTConsumer, public RecursiveASTVisitor { builtins(CI.getASTContext()), stringCreator(CreateHostStrings(options, targetDevice)), compilerClasses(CompilerKnownClasses()), + pyramidPipe(nullptr), mainFD(nullptr), literalCount(0), skipTransfer(false) @@ -376,11 +380,52 @@ void Rewrite::HandleTranslationUnit(ASTContext &) { CompoundStmt *CS = dyn_cast(mainFD->getBody()); assert(CS->size() && "CompoundStmt has no statements."); - std::string initStr; + std::string initStr, cleanupStr; // get initialization string for run-time stringCreator.writeInitialization(initStr); + // print pyramid pipeline info for cuda + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch() && + pyramidPipe->isPipelineKernelLaunch()) { + pyramidPipe->printStreamPipelineInfo(); + } + + // write CUDA streams + if (compilerOptions.asyncKernelLaunch()) { + if (pyramidPipe->isSingleStream()) { + initStr += "cudaStream_t stream;"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "cudaStreamCreate(&stream);"; + initStr += "\n" + stringCreator.getIndent(); + } else if (pyramidPipe->isMultiStream()) { + initStr += "cudaStream_t stream["; + initStr += std::to_string(pyramidPipe->getDepth()) + " - 1];"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "for (int i = 0; i < "; + initStr += std::to_string(pyramidPipe->getDepth()) + " - 1; i++) {"; + initStr += "\n" + stringCreator.getIndent(); + initStr += " cudaStreamCreate(&stream[i]);"; + initStr += "\n" + stringCreator.getIndent() + "}"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "cudaEvent_t events["; + initStr += std::to_string(pyramidPipe->getDepth()) + " * 2];"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "for (int i = 0; i < "; + initStr += 
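// Hypothetical shape of the generated host code for one REDUCE launch at
// pyramid level l under multi-stream execution (the comment stands in for the
// kernel-launch string emitted above):
//
//   cudaStreamWaitEvent(stream[l - 1], events[l - 1], 0); // wait on producer
//   /* kernel launch on stream[l - 1] */
//   cudaEventRecord(events[l], stream[l - 1]);            // signal consumer
//
// FILTER launches reuse stream[l - 1] without events, while EXPAND launches
// run on stream[l] bracketed by the mirrored pair events[2*depth - l - 2]
// (wait) and events[2*depth - l - 1] (record).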
std::to_string(pyramidPipe->getDepth()) + " * 2; i++) {"; + initStr += "\n" + stringCreator.getIndent(); + initStr += " cudaEventCreate(&events[i]);"; + initStr += "\n" + stringCreator.getIndent() + "}"; + initStr += "\n" + stringCreator.getIndent(); + } else { + // + } + } + // load OpenCL kernel files and compile the OpenCL kernels if (!compilerOptions.exploreConfig()) { for (auto map : KernelDeclMap) @@ -411,6 +456,31 @@ void Rewrite::HandleTranslationUnit(ASTContext &) { // insert initialization before first statement TextRewriter.InsertTextBefore(CS->body_front()->getBeginLoc(), initStr); + // insert cleanup before last statement + if (compilerOptions.asyncKernelLaunch()) { + if (pyramidPipe->isSingleStream()) { + cleanupStr = "cudaStreamDestroy(stream);"; + cleanupStr += "\n" + stringCreator.getIndent(); + } else if (pyramidPipe->isMultiStream()) { + cleanupStr += "for (int i = 0; i < "; + cleanupStr += std::to_string(pyramidPipe->getDepth()) + " - 1; i++) {"; + cleanupStr += "\n" + stringCreator.getIndent(); + cleanupStr += " cudaStreamDestroy(stream[i]);"; + cleanupStr += "\n" + stringCreator.getIndent() + "}"; + cleanupStr += "\n" + stringCreator.getIndent(); + + cleanupStr += "for (int i = 0; i < "; + cleanupStr += std::to_string(pyramidPipe->getDepth()) + " * 2; i++) {"; + cleanupStr += "\n" + stringCreator.getIndent(); + cleanupStr += " cudaEventDestroy(events[i]);"; + cleanupStr += "\n" + stringCreator.getIndent() + "}"; + cleanupStr += "\n" + stringCreator.getIndent(); + } else { + // + } + TextRewriter.InsertTextBefore(CS->body_back()->getBeginLoc(), cleanupStr); + } + // get buffer of main file id. If we haven't changed it, then we are done. if (auto RewriteBuf = TextRewriter.getRewriteBufferFor(mainFileID)) { *Out << std::string(RewriteBuf->begin(), RewriteBuf->end()); @@ -627,6 +697,20 @@ bool Rewrite::VisitCXXRecordDecl(CXXRecordDecl *D) { KC->setBinningFunction(method); continue; } + + // pyramid functions + if (method->getNameAsString() == "_operatePyramidReduce") { + KC->setPyramidOperation(PyramidOperation::REDUCE); + continue; + } + if (method->getNameAsString() == "_operatePyramidFilter") { + KC->setPyramidOperation(PyramidOperation::FILTER); + continue; + } + if (method->getNameAsString() == "_operatePyramidExpand") { + KC->setPyramidOperation(PyramidOperation::EXPAND); + continue; + } } } @@ -683,7 +767,25 @@ bool Rewrite::VisitDeclStmt(DeclStmt *D) { std::string width_str = convertToString(CCE->getArg(0)); std::string height_str = convertToString(CCE->getArg(1)); - if (compilerOptions.emitC99()) { + if (compilerOptions.asyncKernelLaunch() && compilerOptions.emitCUDA()) { + // extract depth level from pyramid + unsigned IDConstant = Diags.getCustomDiagID(DiagnosticsEngine::Error, + "Constant expression for %0 argument of Image %1 required (Pyramid stream CUDA only)."); + if (!CCE->getArg(0)->isEvaluatable(Context)) { + Diags.Report(CCE->getArg(0)->getExprLoc(), IDConstant) << "width" + << Img->getName(); + } + if (!CCE->getArg(1)->isEvaluatable(Context)) { + Diags.Report(CCE->getArg(1)->getExprLoc(), IDConstant) << "height" + << Img->getName(); + } + + int64_t img_stride = CCE->getArg(0)->EvaluateKnownConstInt(Context).getSExtValue(); + int64_t img_height = CCE->getArg(1)->EvaluateKnownConstInt(Context).getSExtValue(); + + Img->setSizeX(img_stride); + Img->setSizeY(img_height); + } else if (compilerOptions.emitC99()) { // check if the parameter can be resolved to a constant unsigned IDConstant = Diags.getCustomDiagID(DiagnosticsEngine::Error, "Constant expression 
for %0 argument of Image %1 required (C/C++ only)."); @@ -746,6 +848,36 @@ bool Rewrite::VisitDeclStmt(DeclStmt *D) { std::string image_str = convertToString(CCE->getArg(0)); std::string depth_str = convertToString(CCE->getArg(1)); + if (compilerOptions.asyncKernelLaunch() && compilerOptions.emitCUDA()) { + // extract depth level from pyramid + unsigned IDConstant = Diags.getCustomDiagID(DiagnosticsEngine::Error, + "Constant expression for %0 argument of Pyramid %1 required (CUDA only)."); + if (!CCE->getArg(1)->isEvaluatable(Context)) { + Diags.Report(CCE->getArg(1)->getExprLoc(), IDConstant) << "depth" + << Pyr->getName(); + } + int64_t pyr_depth = CCE->getArg(1)->EvaluateKnownConstInt(Context).getSExtValue(); + Pyr->setDepth(pyr_depth); + + pyramidPipe->updateDepth(pyr_depth); + std::string pyr_name = VD->getName(); + pyramidPipe->setGlobalLevelStr(pyr_name + ".level()"); + + // TODO, generalize this to async h2d + auto arg = CCE->getArg(0)->IgnoreParenCasts(); + HipaccImage *Img = nullptr; + if (auto DRE = dyn_cast(arg)) { + // check if the argument specifies the image + if (ImgDeclMap.count(DRE->getDecl())) { + Img = ImgDeclMap[DRE->getDecl()]; + if (pyramidPipe->isMultiStream()) { + Img->setStream("stream[0]"); + } + pyramidPipe->recordBaseImg(Img); + } + } + } + // create memory allocation string std::string newStr; stringCreator.writePyramidAllocation(VD->getName(), @@ -1244,6 +1376,20 @@ bool Rewrite::VisitDeclStmt(DeclStmt *D) { HipaccKernel *K = new HipaccKernel(Context, VD, KC, compilerOptions); KernelDeclMap[VD] = K; + if (compilerOptions.asyncKernelLaunch() && compilerOptions.emitCUDA()) { + switch (KC->getPyramidOperation()) { + case PyramidOperation::REDUCE: + pyramidPipe->recordKernel(VD, K, PyramidOperation::REDUCE); + break; + case PyramidOperation::FILTER: + pyramidPipe->recordKernel(VD, K, PyramidOperation::FILTER); + break; + case PyramidOperation::EXPAND: + pyramidPipe->recordKernel(VD, K, PyramidOperation::EXPAND); + break; + } + } + // remove kernel declaration TextRewriter.RemoveText(D->getSourceRange()); @@ -1342,6 +1488,10 @@ bool Rewrite::VisitFunctionDecl(FunctionDecl *D) { assert(D->getBody() && "main function has no body."); assert(isa(D->getBody()) && "CompoundStmt for main body expected."); mainFD = D; + // TODO, triggle pyramid config + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch()) { + pyramidPipe = new HipaccPyramidPipeline(compilerOptions); + } } return true; @@ -1624,7 +1774,35 @@ bool Rewrite::VisitCXXMemberCallExpr(CXXMemberCallExpr *E) { // TODO: handle the case when only reduce function is specified // // create kernel call string - stringCreator.writeKernelCall(K, newStr); + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch() && + pyramidPipe->isPipelineKernelLaunch()) { + stringCreator.writeKernelCall(K, newStr, pyramidPipe); + } else { + stringCreator.writeKernelCall(K, newStr); + } + + // rewrite kernel invocation + replaceText(E->getBeginLoc(), E->getBeginLoc(), ';', newStr); + } + } else if (!KernelDeclMap.empty() && + E->getDirectCallee()->getNameAsString() == "convolveFFT") { + // get the user Kernel class + if (KernelDeclMap.count(DRE->getDecl())) { + HipaccKernel *K = KernelDeclMap[DRE->getDecl()]; + VarDecl *VD = K->getDecl(); + std::string newStr; + + // this was checked before, when the user class was parsed + CXXConstructExpr *CCE = dyn_cast(VD->getInit()); + assert(CCE->getNumArgs() == K->getKernelClass()->getMembers().size() && + "number of arguments doesn't match!"); + + // set host 
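// Hypothetical DSL fragment that this pyramid handling targets (names follow
// dsl/pyramid.hpp; the exact signatures are assumptions). With -emit-cuda and
// -use-stream, the diagnostics above require constant extents and depth:
//
//   Image<float> in(2048, 2048);   // width/height must be constant expressions
//   Pyramid<float> pyr(in, 5);     // depth must be a constant expression
//   traverse(pyr, [&] { /* per-level kernel executions */ });
//
// traverse(...) is rewritten to hipaccTraverse(...), and kernels whose classes
// define _operatePyramidReduce/_operatePyramidFilter/_operatePyramidExpand are
// recorded as REDUCE/FILTER/EXPAND for the stream schedule.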
argument names and retrieve literals stored to temporaries + K->setHostArgNames(llvm::makeArrayRef(CCE->getArgs(), + CCE->getNumArgs()), newStr, literalCount); + + bool fast = compilerOptions.getUseFFT(FAST); + stringCreator.writeFFTConvolutionCall(K, newStr, Context, fast); // rewrite kernel invocation replaceText(E->getBeginLoc(), E->getBeginLoc(), ';', newStr); @@ -1741,6 +1919,10 @@ bool Rewrite::VisitCallExpr (CallExpr *E) { SourceRange range(E->getBeginLoc(), E->getBeginLoc().getLocWithOffset(std::string("traverse").length()-1)); TextRewriter.ReplaceText(range, "hipaccTraverse"); + + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch()) { + pyramidPipe->enablePipelineKernelLaunch(); + } } } } diff --git a/runtime/cuda_complex.hpp b/runtime/cuda_complex.hpp new file mode 100644 index 00000000..633c7736 --- /dev/null +++ b/runtime/cuda_complex.hpp @@ -0,0 +1,1335 @@ +// An implementation of C++ std::complex for use on CUDA devices. +// Written by John C. Travers (2012) +// +// Missing: +// - long double support (not supported on CUDA) +// - some integral pow functions (due to lack of C++11 support on CUDA) +// +// Heavily derived from the LLVM libcpp project (svn revision 147853). +// Based on libcxx/include/complex. +// The git history contains the complete change history from the original. +// The modifications are licensed as per the original LLVM license below. +// +// -*- C++ -*- +//===--------------------------- complex ----------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef CUDA_COMPLEX_HPP +#define CUDA_COMPLEX_HPP + +#ifdef __CUDACC__ +#define CUDA_CALLABLE_MEMBER __host__ __device__ +#else +#define CUDA_CALLABLE_MEMBER +#endif + +/* + complex synopsis + +template +class complex +{ +public: + typedef T value_type; + + complex(const T& re = T(), const T& im = T()); + complex(const complex&); + template complex(const complex&); + + T real() const; + T imag() const; + + void real(T); + void imag(T); + + complex& operator= (const T&); + complex& operator+=(const T&); + complex& operator-=(const T&); + complex& operator*=(const T&); + complex& operator/=(const T&); + + complex& operator=(const complex&); + template complex& operator= (const complex&); + template complex& operator+=(const complex&); + template complex& operator-=(const complex&); + template complex& operator*=(const complex&); + template complex& operator/=(const complex&); +}; + +template<> +class complex +{ +public: + typedef float value_type; + + constexpr complex(float re = 0.0f, float im = 0.0f); + explicit constexpr complex(const complex&); + + constexpr float real() const; + void real(float); + constexpr float imag() const; + void imag(float); + + complex& operator= (float); + complex& operator+=(float); + complex& operator-=(float); + complex& operator*=(float); + complex& operator/=(float); + + complex& operator=(const complex&); + template complex& operator= (const complex&); + template complex& operator+=(const complex&); + template complex& operator-=(const complex&); + template complex& operator*=(const complex&); + template complex& operator/=(const complex&); +}; + +template<> +class complex +{ +public: + typedef double value_type; + + constexpr complex(double re = 0.0, double im = 0.0); + constexpr complex(const complex&); + + 
constexpr double real() const; + void real(double); + constexpr double imag() const; + void imag(double); + + complex& operator= (double); + complex& operator+=(double); + complex& operator-=(double); + complex& operator*=(double); + complex& operator/=(double); + complex& operator=(const complex&); + + template complex& operator= (const complex&); + template complex& operator+=(const complex&); + template complex& operator-=(const complex&); + template complex& operator*=(const complex&); + template complex& operator/=(const complex&); +}; + +// 26.3.6 operators: +template complex operator+(const complex&, const complex&); +template complex operator+(const complex&, const T&); +template complex operator+(const T&, const complex&); +template complex operator-(const complex&, const complex&); +template complex operator-(const complex&, const T&); +template complex operator-(const T&, const complex&); +template complex operator*(const complex&, const complex&); +template complex operator*(const complex&, const T&); +template complex operator*(const T&, const complex&); +template complex operator/(const complex&, const complex&); +template complex operator/(const complex&, const T&); +template complex operator/(const T&, const complex&); +template complex operator+(const complex&); +template complex operator-(const complex&); +template bool operator==(const complex&, const complex&); +template bool operator==(const complex&, const T&); +template bool operator==(const T&, const complex&); +template bool operator!=(const complex&, const complex&); +template bool operator!=(const complex&, const T&); +template bool operator!=(const T&, const complex&); + +template + basic_istream& + operator>>(basic_istream&, complex&); +template + basic_ostream& + operator<<(basic_ostream&, const complex&); + +// 26.3.7 values: + +template T real(const complex&); + double real(double); +template double real(T); + float real(float); + +template T imag(const complex&); + double imag(double); +template double imag(T); + float imag(float); + +template T abs(const complex&); + +template T arg(const complex&); + double arg(double); +template double arg(T); + float arg(float); + +template T norm(const complex&); + double norm(double); +template double norm(T); + float norm(float); + +template complex conj(const complex&); + complex conj(double); +template complex conj(T); + complex conj(float); + +template complex proj(const complex&); + complex proj(double); +template complex proj(T); + complex proj(float); + +template complex polar(const T&, const T& = 0); + +// 26.3.8 transcendentals: +template complex acos(const complex&); +template complex asin(const complex&); +template complex atan(const complex&); +template complex acosh(const complex&); +template complex asinh(const complex&); +template complex atanh(const complex&); +template complex cos (const complex&); +template complex cosh (const complex&); +template complex exp (const complex&); +template complex log (const complex&); +template complex log10(const complex&); + +template complex pow(const complex&, const T&); +template complex pow(const complex&, const complex&); +template complex pow(const T&, const complex&); + +template complex sin (const complex&); +template complex sinh (const complex&); +template complex sqrt (const complex&); +template complex tan (const complex&); +template complex tanh (const complex&); + +template + basic_istream& + operator>>(basic_istream& is, complex& x); + +template + basic_ostream& + operator<<(basic_ostream& o, const 
complex& x); + +*/ + +#include +#include + +template class complex; + +template complex<_Tp> operator*(const complex<_Tp>& __z, const complex<_Tp>& __w); +template complex<_Tp> operator/(const complex<_Tp>& __x, const complex<_Tp>& __y); + +template +class complex +{ +public: + typedef _Tp value_type; +private: + value_type __re_; + value_type __im_; +public: + CUDA_CALLABLE_MEMBER + complex(const value_type& __re = value_type(), const value_type& __im = value_type()) + : __re_(__re), __im_(__im) {} + template CUDA_CALLABLE_MEMBER + complex(const complex<_Xp>& __c) + : __re_(__c.real()), __im_(__c.imag()) {} + + CUDA_CALLABLE_MEMBER value_type real() const {return __re_;} + CUDA_CALLABLE_MEMBER value_type imag() const {return __im_;} + + CUDA_CALLABLE_MEMBER void real(value_type __re) {__re_ = __re;} + CUDA_CALLABLE_MEMBER void imag(value_type __im) {__im_ = __im;} + + CUDA_CALLABLE_MEMBER complex& operator= (const value_type& __re) + {__re_ = __re; __im_ = value_type(); return *this;} + CUDA_CALLABLE_MEMBER complex& operator+=(const value_type& __re) {__re_ += __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator-=(const value_type& __re) {__re_ -= __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator*=(const value_type& __re) {__re_ *= __re; __im_ *= __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator/=(const value_type& __re) {__re_ /= __re; __im_ /= __re; return *this;} + + template CUDA_CALLABLE_MEMBER complex& operator= (const complex<_Xp>& __c) + { + __re_ = __c.real(); + __im_ = __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator+=(const complex<_Xp>& __c) + { + __re_ += __c.real(); + __im_ += __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator-=(const complex<_Xp>& __c) + { + __re_ -= __c.real(); + __im_ -= __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator*=(const complex<_Xp>& __c) + { + *this = *this * __c; + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator/=(const complex<_Xp>& __c) + { + *this = *this / __c; + return *this; + } +}; + +template<> class complex; + +template<> +class complex +{ + float __re_; + float __im_; +public: + typedef float value_type; + + /*constexpr*/ CUDA_CALLABLE_MEMBER complex(float __re = 0.0f, float __im = 0.0f) + : __re_(__re), __im_(__im) {} + explicit /*constexpr*/ complex(const complex& __c); + + /*constexpr*/ CUDA_CALLABLE_MEMBER float real() const {return __re_;} + /*constexpr*/ CUDA_CALLABLE_MEMBER float imag() const {return __im_;} + + CUDA_CALLABLE_MEMBER void real(value_type __re) {__re_ = __re;} + CUDA_CALLABLE_MEMBER void imag(value_type __im) {__im_ = __im;} + + CUDA_CALLABLE_MEMBER complex& operator= (float __re) + {__re_ = __re; __im_ = value_type(); return *this;} + CUDA_CALLABLE_MEMBER complex& operator+=(float __re) {__re_ += __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator-=(float __re) {__re_ -= __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator*=(float __re) {__re_ *= __re; __im_ *= __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator/=(float __re) {__re_ /= __re; __im_ /= __re; return *this;} + + template CUDA_CALLABLE_MEMBER complex& operator= (const complex<_Xp>& __c) + { + __re_ = __c.real(); + __im_ = __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator+=(const complex<_Xp>& __c) + { + __re_ += __c.real(); + __im_ += __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator-=(const complex<_Xp>& __c) + { 
+ __re_ -= __c.real(); + __im_ -= __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator*=(const complex<_Xp>& __c) + { + *this = *this * __c; + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator/=(const complex<_Xp>& __c) + { + *this = *this / __c; + return *this; + } +}; + +template<> +class complex +{ + double __re_; + double __im_; +public: + typedef double value_type; + + /*constexpr*/ CUDA_CALLABLE_MEMBER complex(double __re = 0.0, double __im = 0.0) + : __re_(__re), __im_(__im) {} + /*constexpr*/ complex(const complex& __c); + + /*constexpr*/ CUDA_CALLABLE_MEMBER double real() const {return __re_;} + /*constexpr*/ CUDA_CALLABLE_MEMBER double imag() const {return __im_;} + + CUDA_CALLABLE_MEMBER void real(value_type __re) {__re_ = __re;} + CUDA_CALLABLE_MEMBER void imag(value_type __im) {__im_ = __im;} + + CUDA_CALLABLE_MEMBER complex& operator= (double __re) + {__re_ = __re; __im_ = value_type(); return *this;} + CUDA_CALLABLE_MEMBER complex& operator+=(double __re) {__re_ += __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator-=(double __re) {__re_ -= __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator*=(double __re) {__re_ *= __re; __im_ *= __re; return *this;} + CUDA_CALLABLE_MEMBER complex& operator/=(double __re) {__re_ /= __re; __im_ /= __re; return *this;} + + template CUDA_CALLABLE_MEMBER complex& operator= (const complex<_Xp>& __c) + { + __re_ = __c.real(); + __im_ = __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator+=(const complex<_Xp>& __c) + { + __re_ += __c.real(); + __im_ += __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator-=(const complex<_Xp>& __c) + { + __re_ -= __c.real(); + __im_ -= __c.imag(); + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator*=(const complex<_Xp>& __c) + { + *this = *this * __c; + return *this; + } + template CUDA_CALLABLE_MEMBER complex& operator/=(const complex<_Xp>& __c) + { + *this = *this / __c; + return *this; + } +}; + +//constexpr +inline CUDA_CALLABLE_MEMBER +complex::complex(const complex& __c) + : __re_(__c.real()), __im_(__c.imag()) {} + +//constexpr +inline CUDA_CALLABLE_MEMBER +complex::complex(const complex& __c) + : __re_(__c.real()), __im_(__c.imag()) {} + +// 26.3.6 operators: + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator+(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__x); + __t += __y; + return __t; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator+(const complex<_Tp>& __x, const _Tp& __y) +{ + complex<_Tp> __t(__x); + __t += __y; + return __t; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator+(const _Tp& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__y); + __t += __x; + return __t; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator-(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__x); + __t -= __y; + return __t; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator-(const complex<_Tp>& __x, const _Tp& __y) +{ + complex<_Tp> __t(__x); + __t -= __y; + return __t; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator-(const _Tp& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(-__y); + __t += __x; + return __t; +} + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) +{ + _Tp __a = __z.real(); + _Tp __b = __z.imag(); + _Tp __c = __w.real(); + _Tp __d = __w.imag(); + _Tp 
__ac = __a * __c; + _Tp __bd = __b * __d; + _Tp __ad = __a * __d; + _Tp __bc = __b * __c; + _Tp __x = __ac - __bd; + _Tp __y = __ad + __bc; + if (isnan(__x) && isnan(__y)) + { + bool __recalc = false; + if (isinf(__a) || isinf(__b)) + { + __a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a); + __b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b); + if (isnan(__c)) + __c = copysign(_Tp(0), __c); + if (isnan(__d)) + __d = copysign(_Tp(0), __d); + __recalc = true; + } + if (isinf(__c) || isinf(__d)) + { + __c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c); + __d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d); + if (isnan(__a)) + __a = copysign(_Tp(0), __a); + if (isnan(__b)) + __b = copysign(_Tp(0), __b); + __recalc = true; + } + if (!__recalc && (isinf(__ac) || isinf(__bd) || + isinf(__ad) || isinf(__bc))) + { + if (isnan(__a)) + __a = copysign(_Tp(0), __a); + if (isnan(__b)) + __b = copysign(_Tp(0), __b); + if (isnan(__c)) + __c = copysign(_Tp(0), __c); + if (isnan(__d)) + __d = copysign(_Tp(0), __d); + __recalc = true; + } + if (__recalc) + { + __x = _Tp(INFINITY) * (__a * __c - __b * __d); + __y = _Tp(INFINITY) * (__a * __d + __b * __c); + } + } + return complex<_Tp>(__x, __y); +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator*(const complex<_Tp>& __x, const _Tp& __y) +{ + complex<_Tp> __t(__x); + __t *= __y; + return __t; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator*(const _Tp& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__y); + __t *= __x; + return __t; +} + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) +{ + int __ilogbw = 0; + _Tp __a = __z.real(); + _Tp __b = __z.imag(); + _Tp __c = __w.real(); + _Tp __d = __w.imag(); + _Tp __logbw = logb(fmax(fabs(__c), fabs(__d))); + if (isfinite(__logbw)) + { + __ilogbw = static_cast(__logbw); + __c = scalbn(__c, -__ilogbw); + __d = scalbn(__d, -__ilogbw); + } + _Tp __denom = __c * __c + __d * __d; + _Tp __x = scalbn((__a * __c + __b * __d) / __denom, -__ilogbw); + _Tp __y = scalbn((__b * __c - __a * __d) / __denom, -__ilogbw); + if (isnan(__x) && isnan(__y)) + { + if ((__denom == _Tp(0)) && (!isnan(__a) || !isnan(__b))) + { + __x = copysign(_Tp(INFINITY), __c) * __a; + __y = copysign(_Tp(INFINITY), __c) * __b; + } + else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) + { + __a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a); + __b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b); + __x = _Tp(INFINITY) * (__a * __c + __b * __d); + __y = _Tp(INFINITY) * (__b * __c - __a * __d); + } + else if (isinf(__logbw) && __logbw > _Tp(0) && isfinite(__a) && isfinite(__b)) + { + __c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c); + __d = copysign(isinf(__d) ? 
_Tp(1) : _Tp(0), __d); + __x = _Tp(0) * (__a * __c + __b * __d); + __y = _Tp(0) * (__b * __c - __a * __d); + } + } + return complex<_Tp>(__x, __y); +} + +template<> +CUDA_CALLABLE_MEMBER +complex +operator/(const complex& __z, const complex& __w) +{ + int __ilogbw = 0; + float __a = __z.real(); + float __b = __z.imag(); + float __c = __w.real(); + float __d = __w.imag(); + float __logbw = logbf(fmaxf(fabsf(__c), fabsf(__d))); + if (isfinite(__logbw)) + { + __ilogbw = static_cast(__logbw); + __c = scalbnf(__c, -__ilogbw); + __d = scalbnf(__d, -__ilogbw); + } + float __denom = __c * __c + __d * __d; + float __x = scalbnf((__a * __c + __b * __d) / __denom, -__ilogbw); + float __y = scalbnf((__b * __c - __a * __d) / __denom, -__ilogbw); + if (isnan(__x) && isnan(__y)) + { + if ((__denom == float(0)) && (!isnan(__a) || !isnan(__b))) + { +#pragma warning(suppress: 4756) // Ignore INFINITY related warning + __x = copysignf(INFINITY, __c) * __a; +#pragma warning(suppress: 4756) // Ignore INFINITY related warning + __y = copysignf(INFINITY, __c) * __b; + } + else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) + { + __a = copysignf(isinf(__a) ? float(1) : float(0), __a); + __b = copysignf(isinf(__b) ? float(1) : float(0), __b); +#pragma warning(suppress: 4756) // Ignore INFINITY related warning + __x = INFINITY * (__a * __c + __b * __d); +#pragma warning(suppress: 4756) // Ignore INFINITY related warning + __y = INFINITY * (__b * __c - __a * __d); + } + else if (isinf(__logbw) && __logbw > float(0) && isfinite(__a) && isfinite(__b)) + { + __c = copysignf(isinf(__c) ? float(1) : float(0), __c); + __d = copysignf(isinf(__d) ? float(1) : float(0), __d); + __x = float(0) * (__a * __c + __b * __d); + __y = float(0) * (__b * __c - __a * __d); + } + } + return complex(__x, __y); +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator/(const complex<_Tp>& __x, const _Tp& __y) +{ + return complex<_Tp>(__x.real() / __y, __x.imag() / __y); +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator/(const _Tp& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__x); + __t /= __y; + return __t; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator+(const complex<_Tp>& __x) +{ + return __x; +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +operator-(const complex<_Tp>& __x) +{ + return complex<_Tp>(-__x.real(), -__x.imag()); +} + +template +inline CUDA_CALLABLE_MEMBER +bool +operator==(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + return __x.real() == __y.real() && __x.imag() == __y.imag(); +} + +template +inline CUDA_CALLABLE_MEMBER +bool +operator==(const complex<_Tp>& __x, const _Tp& __y) +{ + return __x.real() == __y && __x.imag() == 0; +} + +template +inline CUDA_CALLABLE_MEMBER +bool +operator==(const _Tp& __x, const complex<_Tp>& __y) +{ + return __x == __y.real() && 0 == __y.imag(); +} + +template +inline CUDA_CALLABLE_MEMBER +bool +operator!=(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + return !(__x == __y); +} + +template +inline CUDA_CALLABLE_MEMBER +bool +operator!=(const complex<_Tp>& __x, const _Tp& __y) +{ + return !(__x == __y); +} + +template +inline CUDA_CALLABLE_MEMBER +bool +operator!=(const _Tp& __x, const complex<_Tp>& __y) +{ + return !(__x == __y); +} + +// 26.3.7 values: + +// real + +template +inline CUDA_CALLABLE_MEMBER +_Tp +real(const complex<_Tp>& __c) +{ + return __c.real(); +} + +inline CUDA_CALLABLE_MEMBER +double +real(double __re) +{ + return __re; +} + +inline CUDA_CALLABLE_MEMBER +float 
+real(float __re) +{ + return __re; +} + +// imag + +template +inline CUDA_CALLABLE_MEMBER +_Tp +imag(const complex<_Tp>& __c) +{ + return __c.imag(); +} + +inline CUDA_CALLABLE_MEMBER +double +imag(double __re) +{ + return 0; +} + +inline CUDA_CALLABLE_MEMBER +float +imag(float __re) +{ + return 0; +} + +// abs + +template +inline CUDA_CALLABLE_MEMBER +_Tp +abs(const complex<_Tp>& __c) +{ + return hypot(__c.real(), __c.imag()); +} + +// arg + +template +inline CUDA_CALLABLE_MEMBER +_Tp +arg(const complex<_Tp>& __c) +{ + return atan2(__c.imag(), __c.real()); +} + +inline CUDA_CALLABLE_MEMBER +double +arg(double __re) +{ + return atan2(0., __re); +} + +inline CUDA_CALLABLE_MEMBER +float +arg(float __re) +{ + return atan2f(0.F, __re); +} + +// norm + +template +inline CUDA_CALLABLE_MEMBER +_Tp +norm(const complex<_Tp>& __c) +{ + if (isinf(__c.real())) + return fabs(__c.real()); + if (isinf(__c.imag())) + return fabs(__c.imag()); + return __c.real() * __c.real() + __c.imag() * __c.imag(); +} + +inline CUDA_CALLABLE_MEMBER +double +norm(double __re) +{ + return __re * __re; +} + +inline CUDA_CALLABLE_MEMBER +float +norm(float __re) +{ + return __re * __re; +} + +// conj + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +conj(const complex<_Tp>& __c) +{ + return complex<_Tp>(__c.real(), -__c.imag()); +} + +inline CUDA_CALLABLE_MEMBER +complex +conj(double __re) +{ + return complex(__re); +} + +inline CUDA_CALLABLE_MEMBER +complex +conj(float __re) +{ + return complex(__re); +} + +// proj + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +proj(const complex<_Tp>& __c) +{ + complex<_Tp> __r = __c; + if (isinf(__c.real()) || isinf(__c.imag())) + __r = complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag())); + return __r; +} + +inline CUDA_CALLABLE_MEMBER +complex +proj(double __re) +{ + if (isinf(__re)) + __re = fabs(__re); + return complex(__re); +} + +inline CUDA_CALLABLE_MEMBER +complex +proj(float __re) +{ + if (isinf(__re)) + __re = fabs(__re); + return complex(__re); +} + +// polar + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +polar(const _Tp& __rho, const _Tp& __theta = _Tp(0)) +{ + if (isnan(__rho) || signbit(__rho)) + return complex<_Tp>(_Tp(NAN), _Tp(NAN)); + if (isnan(__theta)) + { + if (isinf(__rho)) + return complex<_Tp>(__rho, __theta); + return complex<_Tp>(__theta, __theta); + } + if (isinf(__theta)) + { + if (isinf(__rho)) + return complex<_Tp>(__rho, _Tp(NAN)); + return complex<_Tp>(_Tp(NAN), _Tp(NAN)); + } + _Tp __x = __rho * cos(__theta); + if (isnan(__x)) + __x = 0; + _Tp __y = __rho * sin(__theta); + if (isnan(__y)) + __y = 0; + return complex<_Tp>(__x, __y); +} + +// log + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +log(const complex<_Tp>& __x) +{ + return complex<_Tp>(log(abs(__x)), arg(__x)); +} + +// log10 + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +log10(const complex<_Tp>& __x) +{ + return log(__x) / log(_Tp(10)); +} + +// sqrt + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +sqrt(const complex<_Tp>& __x) +{ + if (isinf(__x.imag())) + return complex<_Tp>(_Tp(INFINITY), __x.imag()); + if (isinf(__x.real())) + { + if (__x.real() > _Tp(0)) + return complex<_Tp>(__x.real(), isnan(__x.imag()) ? __x.imag() : copysign(_Tp(0), __x.imag())); + return complex<_Tp>(isnan(__x.imag()) ? 
__x.imag() : _Tp(0), copysign(__x.real(), __x.imag())); + } + return polar(sqrt(abs(__x)), arg(__x) / _Tp(2)); +} + +// exp + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +exp(const complex<_Tp>& __x) +{ + _Tp __i = __x.imag(); + if (isinf(__x.real())) + { + if (__x.real() < _Tp(0)) + { + if (!isfinite(__i)) + __i = _Tp(1); + } + else if (__i == 0 || !isfinite(__i)) + { + if (isinf(__i)) + __i = _Tp(NAN); + return complex<_Tp>(__x.real(), __i); + } + } + else if (isnan(__x.real()) && __x.imag() == 0) + return __x; + _Tp __e = exp(__x.real()); + return complex<_Tp>(__e * cos(__i), __e * sin(__i)); +} + +// pow + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +pow(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + return exp(__y * log(__x)); +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +pow(const complex<_Tp>& __x, const _Tp& __y) +{ + return pow(__x, complex<_Tp>(__y)); +} + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +pow(const _Tp& __x, const complex<_Tp>& __y) +{ + return pow(complex<_Tp>(__x), __y); +} + +// asinh + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +asinh(const complex<_Tp>& __x) +{ + const _Tp __pi(atan2(+0., -0.)); + if (isinf(__x.real())) + { + if (isnan(__x.imag())) + return __x; + if (isinf(__x.imag())) + return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag())); + return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag())); + } + if (isnan(__x.real())) + { + if (isinf(__x.imag())) + return complex<_Tp>(__x.imag(), __x.real()); + if (__x.imag() == 0) + return __x; + return complex<_Tp>(__x.real(), __x.real()); + } + if (isinf(__x.imag())) + return complex<_Tp>(copysign(__x.imag(), __x.real()), copysign(__pi/_Tp(2), __x.imag())); + complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) + _Tp(1))); + return complex<_Tp>(copysign(__z.real(), __x.real()), copysign(__z.imag(), __x.imag())); +} + +// acosh + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +acosh(const complex<_Tp>& __x) +{ + const _Tp __pi(atan2(+0., -0.)); + if (isinf(__x.real())) + { + if (isnan(__x.imag())) + return complex<_Tp>(fabs(__x.real()), __x.imag()); + if (isinf(__x.imag())) + if (__x.real() > 0) + return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag())); + else + return complex<_Tp>(-__x.real(), copysign(__pi * _Tp(0.75), __x.imag())); + if (__x.real() < 0) + return complex<_Tp>(-__x.real(), copysign(__pi, __x.imag())); + return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag())); + } + if (isnan(__x.real())) + { + if (isinf(__x.imag())) + return complex<_Tp>(fabs(__x.imag()), __x.real()); + return complex<_Tp>(__x.real(), __x.real()); + } + if (isinf(__x.imag())) + return complex<_Tp>(fabs(__x.imag()), copysign(__pi/_Tp(2), __x.imag())); + complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1))); + return complex<_Tp>(copysign(__z.real(), _Tp(0)), copysign(__z.imag(), __x.imag())); +} + +// atanh + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +atanh(const complex<_Tp>& __x) +{ + const _Tp __pi(atan2(+0., -0.)); + if (isinf(__x.imag())) + { + return complex<_Tp>(copysign(_Tp(0), __x.real()), copysign(__pi/_Tp(2), __x.imag())); + } + if (isnan(__x.imag())) + { + if (isinf(__x.real()) || __x.real() == 0) + return complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag()); + return complex<_Tp>(__x.imag(), __x.imag()); + } + if (isnan(__x.real())) + { + return complex<_Tp>(__x.real(), __x.real()); + } + if (isinf(__x.real())) + { + return complex<_Tp>(copysign(_Tp(0), __x.real()), copysign(__pi/_Tp(2), __x.imag())); + } + if 
(fabs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) + { + return complex<_Tp>(copysign(_Tp(INFINITY), __x.real()), copysign(_Tp(0), __x.imag())); + } + complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2); + return complex<_Tp>(copysign(__z.real(), __x.real()), copysign(__z.imag(), __x.imag())); +} + +// sinh + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +sinh(const complex<_Tp>& __x) +{ + if (isinf(__x.real()) && !isfinite(__x.imag())) + return complex<_Tp>(__x.real(), _Tp(NAN)); + if (__x.real() == 0 && !isfinite(__x.imag())) + return complex<_Tp>(__x.real(), _Tp(NAN)); + if (__x.imag() == 0 && !isfinite(__x.real())) + return __x; + return complex<_Tp>(sinh(__x.real()) * cos(__x.imag()), cosh(__x.real()) * sin(__x.imag())); +} + +// cosh + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +cosh(const complex<_Tp>& __x) +{ + if (isinf(__x.real()) && !isfinite(__x.imag())) + return complex<_Tp>(fabs(__x.real()), _Tp(NAN)); + if (__x.real() == 0 && !isfinite(__x.imag())) + return complex<_Tp>(_Tp(NAN), __x.real()); + if (__x.real() == 0 && __x.imag() == 0) + return complex<_Tp>(_Tp(1), __x.imag()); + if (__x.imag() == 0 && !isfinite(__x.real())) + return complex<_Tp>(fabs(__x.real()), __x.imag()); + return complex<_Tp>(cosh(__x.real()) * cos(__x.imag()), sinh(__x.real()) * sin(__x.imag())); +} + +// tanh + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +tanh(const complex<_Tp>& __x) +{ + if (isinf(__x.real())) + { + if (!isfinite(__x.imag())) + return complex<_Tp>(_Tp(1), _Tp(0)); + return complex<_Tp>(_Tp(1), copysign(_Tp(0), sin(_Tp(2) * __x.imag()))); + } + if (isnan(__x.real()) && __x.imag() == 0) + return __x; + _Tp __2r(_Tp(2) * __x.real()); + _Tp __2i(_Tp(2) * __x.imag()); + _Tp __d(cosh(__2r) + cos(__2i)); + return complex<_Tp>(sinh(__2r)/__d, sin(__2i)/__d); +} + +// asin + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +asin(const complex<_Tp>& __x) +{ + complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real())); + return complex<_Tp>(__z.imag(), -__z.real()); +} + +// acos + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +acos(const complex<_Tp>& __x) +{ + const _Tp __pi(atan2(+0., -0.)); + if (isinf(__x.real())) + { + if (isnan(__x.imag())) + return complex<_Tp>(__x.imag(), __x.real()); + if (isinf(__x.imag())) + { + if (__x.real() < _Tp(0)) + return complex<_Tp>(_Tp(0.75) * __pi, -__x.imag()); + return complex<_Tp>(_Tp(0.25) * __pi, -__x.imag()); + } + if (__x.real() < _Tp(0)) + return complex<_Tp>(__pi, signbit(__x.imag()) ? -__x.real() : __x.real()); + return complex<_Tp>(_Tp(0), signbit(__x.imag()) ? 
__x.real() : -__x.real()); + } + if (isnan(__x.real())) + { + if (isinf(__x.imag())) + return complex<_Tp>(__x.real(), -__x.imag()); + return complex<_Tp>(__x.real(), __x.real()); + } + if (isinf(__x.imag())) + return complex<_Tp>(__pi/_Tp(2), -__x.imag()); + if (__x.real() == 0) + return complex<_Tp>(__pi/_Tp(2), -__x.imag()); + complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1))); + if (signbit(__x.imag())) + return complex<_Tp>(fabs(__z.imag()), fabs(__z.real())); + return complex<_Tp>(fabs(__z.imag()), -fabs(__z.real())); +} + +// atan + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +atan(const complex<_Tp>& __x) +{ + complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real())); + return complex<_Tp>(__z.imag(), -__z.real()); +} + +// sin + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +sin(const complex<_Tp>& __x) +{ + complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real())); + return complex<_Tp>(__z.imag(), -__z.real()); +} + +// cos + +template +inline CUDA_CALLABLE_MEMBER +complex<_Tp> +cos(const complex<_Tp>& __x) +{ + return cosh(complex<_Tp>(-__x.imag(), __x.real())); +} + +// tan + +template +CUDA_CALLABLE_MEMBER +complex<_Tp> +tan(const complex<_Tp>& __x) +{ + complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real())); + return complex<_Tp>(__z.imag(), -__z.real()); +} + +template +std::basic_istream<_CharT, _Traits>& +operator>>(std::basic_istream<_CharT, _Traits>& __is, complex<_Tp>& __x) +{ + if (__is.good()) + { + ws(__is); + if (__is.peek() == _CharT('(')) + { + __is.get(); + _Tp __r; + __is >> __r; + if (!__is.fail()) + { + ws(__is); + _CharT __c = __is.peek(); + if (__c == _CharT(',')) + { + __is.get(); + _Tp __i; + __is >> __i; + if (!__is.fail()) + { + ws(__is); + __c = __is.peek(); + if (__c == _CharT(')')) + { + __is.get(); + __x = complex<_Tp>(__r, __i); + } + else + __is.setstate(std::ios_base::failbit); + } + else + __is.setstate(std::ios_base::failbit); + } + else if (__c == _CharT(')')) + { + __is.get(); + __x = complex<_Tp>(__r, _Tp(0)); + } + else + __is.setstate(std::ios_base::failbit); + } + else + __is.setstate(std::ios_base::failbit); + } + else + { + _Tp __r; + __is >> __r; + if (!__is.fail()) + __x = complex<_Tp>(__r, _Tp(0)); + else + __is.setstate(std::ios_base::failbit); + } + } + else + __is.setstate(std::ios_base::failbit); + return __is; +} + +template +std::basic_ostream<_CharT, _Traits>& +operator<<(std::basic_ostream<_CharT, _Traits>& __os, const complex<_Tp>& __x) +{ + std::basic_ostringstream<_CharT, _Traits> __s; + __s.flags(__os.flags()); + __s.imbue(__os.getloc()); + __s.precision(__os.precision()); + __s << '(' << __x.real() << ',' << __x.imag() << ')'; + return __os << __s.str(); +} + +//} // close namespace cuda_complex + +#endif // CUDA_COMPLEX_HPP diff --git a/runtime/hipacc_cpu_fftw.hpp b/runtime/hipacc_cpu_fftw.hpp new file mode 100644 index 00000000..b048ffe0 --- /dev/null +++ b/runtime/hipacc_cpu_fftw.hpp @@ -0,0 +1,604 @@ +#include +#include + +#include "hipacc_fft_helper.hpp" + +#include +#include +#include + +#include +#include +#include + +template struct FFT {}; +// fftw wrapper functinos for single precision +template <> struct FFT { + template static void *fft_malloc(TArgs... args) { + return fftwf_malloc(args...); + } + template + static fftwf_plan fft_plan_dft_r2c_2d(TArgs... args) { + return fftwf_plan_dft_r2c_2d(args...); + } + template + static fftwf_plan fft_plan_dft_c2r_2d(TArgs... args) { + return fftwf_plan_dft_c2r_2d(args...); + } + template static fftwf_plan fft_plan_r2r_2d(TArgs... 
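+  // [Editor's note] FFT<float> here and FFT<double> below forward to the
+  // fftwf_* and fftw_* entry points respectively, so the routines in this
+  // file can stay precision-agnostic via FFT<TPrecision>::... calls.
+  // A minimal sketch of the pattern (hypothetical sizes and buffers):
+  //
+  //   float *buf = (float *)FFT<float>::fft_malloc(sizeof(float) * n);
+  //   fftwf_plan p = FFT<float>::fft_plan_dft_r2c_2d(h, w, buf, spec,
+  //                                                  FFTW_ESTIMATE);
+  //   FFT<float>::fft_execute(p);
+  //   FFT<float>::fft_destroy_plan(p);
+  //   FFT<float>::fft_free(buf);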
args) {
+    return fftwf_plan_r2r_2d(args...);
+  }
+  template <typename... TArgs> static void fft_execute(TArgs... args) {
+    fftwf_execute(args...);
+  }
+  template <typename... TArgs> static void fft_destroy_plan(TArgs... args) {
+    fftwf_destroy_plan(args...);
+  }
+  template <typename... TArgs> static void fft_free(TArgs... args) {
+    fftwf_free(args...);
+  }
+  template <typename... TArgs> static void fft_flops(TArgs... args) {
+    fftwf_flops(args...);
+  }
+  template <typename... TArgs> static void fft_init_threads(TArgs... args) {
+    fftwf_init_threads(args...);
+  }
+  template <typename... TArgs> static void fft_set_timelimit(TArgs... args) {
+    fftwf_set_timelimit(args...);
+  }
+  template <typename... TArgs> static void fft_plan_with_nthreads(TArgs... args) {
+    fftwf_plan_with_nthreads(args...);
+  }
+  template <typename... TArgs> static void fft_cleanup_threads(TArgs... args) {
+    fftwf_cleanup_threads(args...);
+  }
+};
+// fftw wrapper functions for double precision
+template <> struct FFT<double> {
+  template <typename... TArgs> static void *fft_malloc(TArgs... args) {
+    return fftw_malloc(args...);
+  }
+  template <typename... TArgs>
+  static fftw_plan fft_plan_dft_r2c_2d(TArgs... args) {
+    return fftw_plan_dft_r2c_2d(args...);
+  }
+  template <typename... TArgs>
+  static fftw_plan fft_plan_dft_c2r_2d(TArgs... args) {
+    return fftw_plan_dft_c2r_2d(args...);
+  }
+  template <typename... TArgs> static fftw_plan fft_plan_r2r_2d(TArgs... args) {
+    return fftw_plan_r2r_2d(args...);
+  }
+  template <typename... TArgs> static void fft_execute(TArgs... args) {
+    fftw_execute(args...);
+  }
+  template <typename... TArgs> static void fft_destroy_plan(TArgs... args) {
+    fftw_destroy_plan(args...);
+  }
+  template <typename... TArgs> static void fft_free(TArgs... args) {
+    fftw_free(args...);
+  }
+  template <typename... TArgs> static void fft_flops(TArgs... args) {
+    fftw_flops(args...);
+  }
+  template <typename... TArgs> static void fft_init_threads(TArgs... args) {
+    fftw_init_threads(args...);
+  }
+  template <typename... TArgs> static void fft_set_timelimit(TArgs... args) {
+    fftw_set_timelimit(args...);
+  }
+  template <typename... TArgs> static void fft_plan_with_nthreads(TArgs... args) {
+    fftw_plan_with_nthreads(args...);
+  }
+  template <typename... TArgs> static void fft_cleanup_threads(TArgs... args) {
+    fftw_cleanup_threads(args...);
+  }
+};
+
+template <typename TPrecision, typename V>
+void fftConvolve(TPrecision *in, int width, int height, V *k, int k_w, int k_h,
+                 TPrecision *out) {
+  typedef
+      typename std::conditional<std::is_same<TPrecision, float>::value,
+                                fftwf_complex, fftw_complex>::type fft_complex;
+  typedef typename std::conditional<std::is_same<TPrecision, float>::value,
+                                    fftwf_plan, fftw_plan>::type fft_plan;
+  bool floatPrecision = false;
+  if (std::is_same<TPrecision, float>::value) {
+    floatPrecision = true;
+  }
+
+  FFT<TPrecision>::fft_init_threads();
+  FFT<TPrecision>::fft_plan_with_nthreads(2);
+  const unsigned int PLANNER_FLAG = FFTW_MEASURE; // FFTW_ESTIMATE or FFTW_MEASURE
+  FFT<TPrecision>::fft_set_timelimit(0.001);
+
+  int image_width = width;
+  int image_height = height;
+  int matsize = image_width * image_height;
+  int intermediateSize = image_height * (image_width / 2 + 1);
+
+  // setup buffers
+  TPrecision *kernel =
+      (TPrecision *)FFT<TPrecision>::fft_malloc(sizeof(TPrecision) * matsize);
+  fft_complex *image_fft = (fft_complex *)FFT<TPrecision>::fft_malloc(
+      sizeof(fft_complex) * intermediateSize);
+  fft_complex *kernel_fft = (fft_complex *)FFT<TPrecision>::fft_malloc(
+      sizeof(fft_complex) * intermediateSize);
+
+  memset(kernel, 0, sizeof(TPrecision) * matsize);
+
+  // prepare kernel
+  putKernel(k, kernel, k_w, k_h, image_width, image_height);
+
+  // create plans
+  fft_plan plan_image = FFT<TPrecision>::fft_plan_dft_r2c_2d(
+      image_height, image_width, in, image_fft, PLANNER_FLAG);
+  fft_plan plan_kernel = FFT<TPrecision>::fft_plan_dft_r2c_2d(
+      image_height, image_width, kernel, kernel_fft, PLANNER_FLAG);
+  fft_plan plan_inverse = FFT<TPrecision>::fft_plan_dft_c2r_2d(
+      image_height, image_width, image_fft, out, PLANNER_FLAG);
+
+  // start
+  auto start_time = std::chrono::system_clock::now();
+
+  //#pragma omp parallel sections
+  {
+    //#pragma omp section
+    { FFT<TPrecision>::fft_execute(plan_image); }
+    //#pragma omp section
+    { FFT<TPrecision>::fft_execute(plan_kernel); }
+  }
+
+// Pointwise Multiplication in Frequency Domain
+#pragma omp parallel for
+  for (int ind = 0; ind < intermediateSize; ++ind) {
+    reinterpret_cast<std::complex<TPrecision> *>(image_fft)[ind] *=
+        reinterpret_cast<std::complex<TPrecision> *>(kernel_fft)[ind];
+  }
+
+  FFT<TPrecision>::fft_execute(plan_inverse);
+
+// scale
+#pragma omp parallel for
+  for (int ind = 0; ind < matsize; ++ind) {
+    out[ind] /= (image_height * image_width);
+  }
+
+  // stop
+  auto end_time = std::chrono::system_clock::now();
+  auto elapsed_time = std::chrono::duration_cast<std::chrono::microseconds>(
+      end_time - start_time);
+  std::cout << "h: " << height << " w: " << width << std::endl;
+  std::cout << (float)elapsed_time.count() / 1000.0 << " ms" << std::endl;
+
+  // cleanup
+  FFT<TPrecision>::fft_destroy_plan(plan_image);
+  FFT<TPrecision>::fft_destroy_plan(plan_kernel);
+  FFT<TPrecision>::fft_destroy_plan(plan_inverse);
+  FFT<TPrecision>::fft_free(kernel);
+  FFT<TPrecision>::fft_free(image_fft);
+  FFT<TPrecision>::fft_free(kernel_fft);
+  FFT<TPrecision>::fft_cleanup_threads();
+}
+
+bool estimateConvolutionExecutionTime(int width, int height, int padWidth,
+                                      int padHeight, int k_w, int k_h,
+                                      bool linear) {
+  // assumed 4 GFLOPS per core
+  const double FLOPMS = 4000000; // FLOP per ms
+  const double THREADS = 4;
+
+  double N1 = padWidth;
+  double N2 = padHeight;
+
+  double singleFFTcost = 7.0 * N1 * N2 * log2(N1 * N2);
+
+  // double FFTConvolutionCost1 = (singleFFTcost1 * 3 + (N1*N2*(3+1))) / THREADS;
+  // double FFTConvolutionCost2 = (singleFFTcost2 * 3 + (N1*N2*(3+1))) / THREADS;
+
+  double FFTConvolutionCost = (singleFFTcost * 3 + (N1 * N2 * (3 + 1))) / THREADS;
+  std::cout << "FFTConvolutionCost: " << FFTConvolutionCost / FLOPMS << std::endl;
+
+  double flopPerPixelInHipaccKernel; // depends on border condition
+  if (linear) {
+    flopPerPixelInHipaccKernel = 4;
+  } else {
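+    // [Editor's note] Worked example of this cost model (illustrative only):
+    // for a 1024x1024 image and a 5x5 kernel,
+    //   singleFFTcost      = 7 * 2^20 * 20           ~= 1.47e8 FLOP
+    //   FFTConvolutionCost = (3*1.47e8 + 4*2^20) / 4 ~= 1.11e8 FLOP (~27.8 ms)
+    //   simpleHipaccCost   = 2^20 * 25 * (2 + 12)    ~= 3.67e8 FLOP (~91.8 ms)
+    // so the comparison below picks the FFT path. Note that the comparison
+    // uses simpleHipaccCost rather than the parallelHipaccCost derived below.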
+    flopPerPixelInHipaccKernel = 12;
+  }
+  double simpleHipaccCost =
+      (height * width) * (k_w * k_h) * (2 + flopPerPixelInHipaccKernel);
+  double parallelHipaccCost = simpleHipaccCost / THREADS;
+  std::cout << "simpleHipaccCost: " << simpleHipaccCost / FLOPMS << std::endl;
+
+  if (FFTConvolutionCost < simpleHipaccCost) {
+    std::cout << "FFT Convolution will be faster" << std::endl;
+    return true;
+  } else {
+    std::cout << "Hipacc Kernel will be faster" << std::endl;
+    return false;
+  }
+}
+
+template <typename TPrecision, typename T, typename U, typename V,
+          typename B = int, size_t rows, size_t cols>
+void fftwConvolution(T *in, int width, int height, const V (&kernel)[rows][cols],
+                     int k_w, int k_h, U *out, int alignment, bool linear,
+                     B boundaryConstant = 0) {
+  assert(rows == k_h && cols == k_w);
+
+  int width_in = alignedWidth(width, alignment);
+  int width_out = alignedWidth(width, alignment);
+
+  int padWidth = width;
+  int padHeight = height;
+  if (linear) {
+    // required padding for linear convolution
+    padWidth = width + k_w / 2;
+    padHeight = height + k_h / 2;
+    // additional padding for performance
+    padWidth = upperPowerOfTwo(padWidth); // width has more influence
+    padHeight = nextDiv(padHeight, 8);
+  }
+  int padSize = padWidth * padHeight;
+
+  /*
+  estimateConvolutionExecutionTime(width, height, padWidth, padHeight, k_w, k_h,
+                                   linear);
+  */
+
+  // prepare input buffer
+  TPrecision *input = new TPrecision[padSize];
+  if (linear) {
+    std::fill_n(input, padSize, (TPrecision)boundaryConstant);
+  }
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      input[y * padWidth + x] = (TPrecision)in[y * width_in + x];
+    }
+  }
+  // prepare output buffer
+  TPrecision *output = new TPrecision[padSize];
+  fftConvolve(input, padWidth, padHeight, (V *)(&kernel[0][0]), k_w, k_h, output);
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      out[y * width_out + x] = (U)output[y * padWidth + x];
+    }
+  }
+
+  delete[] input; // new[]-allocated buffers must be released with delete[]
+  delete[] output;
+}
+
+template <typename TPrecision>
+void fftTransform(TPrecision *in, int width, int height, TPrecision *out,
+                  bool forward = true, bool scale = false) {
+  typedef
+      typename std::conditional<std::is_same<TPrecision, float>::value,
+                                fftwf_complex, fftw_complex>::type fft_complex;
+  typedef typename std::conditional<std::is_same<TPrecision, float>::value,
+                                    fftwf_plan, fftw_plan>::type fft_plan;
+  bool floatPrecision = false;
+  if (std::is_same<TPrecision, float>::value) {
+    floatPrecision = true;
+  }
+
+  FFT<TPrecision>::fft_init_threads();
+  FFT<TPrecision>::fft_plan_with_nthreads(2);
+  const unsigned int PLANNER_FLAG = FFTW_MEASURE; // FFTW_ESTIMATE or FFTW_MEASURE
+  FFT<TPrecision>::fft_set_timelimit(0.001);
+
+  // create plans
+  fft_plan plan_image;
+  if (forward) {
+    fft_complex *fft = reinterpret_cast<fft_complex *>(out);
+    plan_image = FFT<TPrecision>::fft_plan_dft_r2c_2d(height, width, in, fft,
+                                                      PLANNER_FLAG);
+  } else {
+    fft_complex *fft = reinterpret_cast<fft_complex *>(in);
+    plan_image = FFT<TPrecision>::fft_plan_dft_c2r_2d(height, width, fft, out,
+                                                      PLANNER_FLAG);
+  }
+
+  FFT<TPrecision>::fft_execute(plan_image);
+
+  if (!forward && scale) {
+    // scale
+#pragma omp parallel for
+    for (int ind = 0; ind < width * height; ++ind) {
+      out[ind] /= (width * height);
+    }
+  }
+
+  // cleanup
+  FFT<TPrecision>::fft_destroy_plan(plan_image);
+  FFT<TPrecision>::fft_cleanup_threads();
+}
+
+template <typename TPrecision>
+void fftTransformDevice(TPrecision *in, int width, int height, TPrecision *out,
+                        bool forward = true, bool scale = false) {
+  // static_assert(false, "fftTransformDevice not available for cpu!");
+  std::cerr << "fftTransformDevice is not available for cpu! Using fftTransform "
+               "instead."
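+               // [Editor's note] On this CPU backend, device buffers are
+               // presumably ordinary host memory, so delegating to the host
+               // fftTransform below is a safe fallback.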
+            << std::endl;
+  fftTransform(in, width, height, out, forward, scale);
+}
+
+template <typename TPrecision>
+void dctTransformDevice(TPrecision *in, int width, int height, TPrecision *out,
+                        bool forward = true) {
+  // static_assert(false, "dctTransformDevice not available for cpu!");
+  std::cerr << "dctTransformDevice is not available for cpu! Using dctTransform "
+               "instead."
+            << std::endl;
+  dctTransform(in, width, height, out, forward);
+}
+
+template <typename TPrecision>
+void dctTransform(TPrecision *in, int width, int height, TPrecision *out,
+                  bool forward = true) {
+  typedef
+      typename std::conditional<std::is_same<TPrecision, float>::value,
+                                fftwf_complex, fftw_complex>::type fft_complex;
+  typedef typename std::conditional<std::is_same<TPrecision, float>::value,
+                                    fftwf_plan, fftw_plan>::type fft_plan;
+  bool floatPrecision = false;
+  if (std::is_same<TPrecision, float>::value) {
+    floatPrecision = true;
+  }
+
+  FFT<TPrecision>::fft_init_threads();
+  FFT<TPrecision>::fft_plan_with_nthreads(2);
+  const unsigned int PLANNER_FLAG = FFTW_MEASURE; // FFTW_ESTIMATE or FFTW_MEASURE
+  FFT<TPrecision>::fft_set_timelimit(0.001);
+
+  // setup buffers
+  // create plans
+  fft_plan plan_image;
+  if (forward) {
+    TPrecision *fft = reinterpret_cast<TPrecision *>(out);
+    plan_image = FFT<TPrecision>::fft_plan_r2r_2d(
+        height, width, in, fft, FFTW_REDFT10, FFTW_REDFT10, PLANNER_FLAG);
+  } else {
+    TPrecision *fft = reinterpret_cast<TPrecision *>(in);
+    plan_image = FFT<TPrecision>::fft_plan_r2r_2d(
+        height, width, fft, out, FFTW_REDFT01, FFTW_REDFT01, PLANNER_FLAG);
+  }
+
+  FFT<TPrecision>::fft_execute(plan_image);
+
+  if (!forward) {
+    // scale
+#pragma omp parallel for
+    for (int ind = 0; ind < width * height; ++ind) {
+      out[ind] /= (2 * width * 2 * height);
+    }
+  }
+
+  // cleanup
+  FFT<TPrecision>::fft_destroy_plan(plan_image);
+  FFT<TPrecision>::fft_cleanup_threads();
+}
+
+// forward FFT
+template <typename TPrecision, typename T> TPrecision *fft(HipaccImage &in) {
+  int width = in->width;
+  int height = in->height;
+  int width_in = alignedWidth(width, in->alignment);
+
+  // prepare input buffer
+  TPrecision *input = new TPrecision[width * height];
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      input[y * width + x] = (TPrecision)((T *)(in->mem))[y * width_in + x];
+    }
+  }
+  // prepare output buffer
+  TPrecision *output = new TPrecision[2 * (width / 2 + 1) * height];
+
+  fftTransform(input, width, height, output);
+
+  delete[] input;
+
+  return output;
+}
+
+// inverse FFT
+template <typename TPrecision, typename T>
+void ifft(TPrecision *in, HipaccImage &out, bool scaled = true) {
+  int width = out->width;
+  int height = out->height;
+  int width_out = alignedWidth(width, out->alignment);
+
+  // prepare output buffer
+  TPrecision *output = new TPrecision[width * height];
+
+  fftTransform(in, width, height, output, false, scaled);
+
+  // truncate values outside of range 0-255
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      T val = output[y * width + x];
+      if (output[y * width + x] < 0) {
+        val = 0;
+      } else if (output[y * width + x] > 255) {
+        val = 255;
+      }
+      // convert output
+      ((T *)(out->mem))[y * width_out + x] = (T)(val);
+    }
+  }
+
+  delete[] output;
+}
+
+// create magnitude from fft
+template <typename TPrecision, typename T>
+void fftToMagnitude(TPrecision *in, HipaccImage &mag) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_out = alignedWidth(width, mag->alignment);
+
+  T *tmp = new T[width * height];
+  calcMagnitude(reinterpret_cast<std::complex<TPrecision> *>(in), tmp, width,
+                height);
+
+  shiftFFT(tmp, width, height);
+
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      ((T *)(mag->mem))[y * width_out + x] = (T)tmp[y * width + x];
+    }
+  }
+
+  delete[] tmp;
+}
+
+// create magnitude from dct
+template <typename TPrecision, typename T>
+void dctToMagnitude(TPrecision *in, HipaccImage &mag) {
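+  // [Editor's note] The normalization below makes small DCT coefficients
+  // visible: |coef| is scaled by 1/max and compressed with a fourth root
+  // before mapping to 0..255. For example, a coefficient at 1/16 of the
+  // maximum maps to 255 * (1/16)^(1/4) = 255 * 0.5 ~= 128 rather than ~16.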
+ int width = mag->width; + int height = mag->height; + int width_out = alignedWidth(width, mag->alignment); + + T *tmp = new T[width * height]; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + tmp[y * width + x] = std::abs(in[y * width + x]); + } + } + + float max = 0.0; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + if (tmp[y * width + x] > max) + max = tmp[y * width + x]; + } + } + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + tmp[y * width + x] = + (T)(255.0 * pow(tmp[y * width + x] * (1.0 / max), 1.0 / 4.0)); + } + } + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + ((T *)(mag->mem))[y * width_out + x] = (T)tmp[y * width + x]; + } + } + + free(tmp); +} + +// apply mask mag to result of FFT in +// and ignore values for frequencies lower than r +template +void fftScaleMagnitude(TPrecision *in, HipaccImage &mag) { + int width = mag->width; + int height = mag->height; + int width_in = alignedWidth(width, mag->alignment); + + T *scale = new T[width_in * height]; + memcpy(scale, mag->mem, sizeof(T) * width_in * height); + + iShiftFFT(scale, width, height, mag->alignment); + + for (int y = 0; y < height; y++) { + for (int x = 0; x < (width / 2 + 1); x++) { + in[(y * (width / 2 + 1) + x) * 2] *= + (TPrecision)scale[y * width_in + x] / 255; + in[(y * (width / 2 + 1) + x) * 2 + 1] *= + (TPrecision)scale[y * width_in + x] / 255; + } + } + + free(scale); +} + +// forward DCT +template TPrecision *dct(HipaccImage &in) { + int width = in->width; + int height = in->height; + int width_in = alignedWidth(width, in->alignment); + + // prepare input buffer + TPrecision *input = new TPrecision[width * height]; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + input[y * width + x] = (TPrecision)((T *)(in->mem))[y * width_in + x]; + } + } + // prepare output buffer + TPrecision *output = new TPrecision[width * height]; + + dctTransform(input, width, height, output); + + free(input); + + return output; +} + +// inverse DCT +template void idct(TPrecision *in, HipaccImage &out) { + int width = out->width; + int height = out->height; + int width_out = alignedWidth(width, out->alignment); + + // prepare output buffer + TPrecision *output = new TPrecision[width * height]; + + dctTransform(in, width, height, output, false); + + // truncate values outside of range 0-255 + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + T val = output[y * width + x]; + if (output[y * width + x] < 0) { + val = 0; + } else if (output[y * width + x] > 255) { + val = 255; + } + // convert output + ((T *)(out->mem))[y * width_out + x] = (T)(val); + } + } + + free(output); +} + +// function wrappers for images + +template void fftShift(HipaccImage &mag) { + int width = mag->width; + int height = mag->height; + int width_in = alignedWidth(width, mag->alignment); + + shiftFFT((T *)(mag->mem), width, height, width_in); +} + +template void ifftShift(HipaccImage &mag) { + int width = mag->width; + int height = mag->height; + int width_in = alignedWidth(width, mag->alignment); + + iShiftFFT((T *)(mag->mem), width, height, width_in); +} + +template +void fftResetMask(HipaccImage &mag, int radius, bool low, int window = 0) { + int width = mag->width; + int height = mag->height; + int width_in = alignedWidth(width, mag->alignment); + + magResetFreq((T *)(mag->mem), width, height, width_in, radius, window, low); +} + +template +void fftApplyPassFilter(HipaccImage &mag, int radius, bool low, 
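+                        // [Editor's note] Sketch of how these spectrum
+                        // helpers compose (hypothetical images img/magImg;
+                        // uchar and the template argument order follow the
+                        // reconstructions above):
+                        //   float *freq = fft<float, uchar>(img);
+                        //   fftToMagnitude<float, uchar>(freq, magImg);
+                        //   fftApplyPassFilter<uchar>(magImg, 32, true);
+                        //   fftScaleMagnitude<float, uchar>(freq, magImg);
+                        //   ifft<float, uchar>(freq, img);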
int window = 0) { + int width = mag->width; + int height = mag->height; + int width_in = alignedWidth(width, mag->alignment); + + magPassFilter((T *)(mag->mem), width, height, width_in, radius, window, low); +} diff --git a/runtime/hipacc_cu.hpp.cmake b/runtime/hipacc_cu.hpp.cmake index 4d0f579b..f083ca1a 100644 --- a/runtime/hipacc_cu.hpp.cmake +++ b/runtime/hipacc_cu.hpp.cmake @@ -181,6 +181,7 @@ void hipaccInitCUDA(); void hipaccCopyMemory(const HipaccImage &src, HipaccImage &dst); void hipaccCopyMemoryRegion(const HipaccAccessor &src, const HipaccAccessor &dst); void hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, void **args, bool print_timing=true); +void hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, void **args, cudaStream_t stream, cudaEvent_t waitEvent=nullptr, cudaEvent_t recordEvent=nullptr); void hipaccLaunchKernelBenchmark(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, std::vector args, bool print_timing=true); void hipaccLaunchKernelExploration(std::string filename, std::string kernel, std::vector args, std::vector smems, std::vector consts, std::vector texs, @@ -207,8 +208,12 @@ HipaccImage hipaccCreatePyramidImage(const HipaccImage &base, size_t width, size template void hipaccWriteMemory(HipaccImage &img, T *host_mem); template +void hipaccWriteMemory(HipaccImage &img, T *host_mem, cudaStream_t stream); +template T *hipaccReadMemory(const HipaccImage &img); template +T *hipaccReadMemory(const HipaccImage &img, cudaStream_t stream); +template void hipaccBindTexture(hipaccMemoryType mem_type, const textureReference *tex, const HipaccImage &img); template void hipaccBindSurface(hipaccMemoryType mem_type, const surfaceReference *surf, const HipaccImage &img); diff --git a/runtime/hipacc_cu.tpp b/runtime/hipacc_cu.tpp index 36ce05e9..88d43fbc 100644 --- a/runtime/hipacc_cu.tpp +++ b/runtime/hipacc_cu.tpp @@ -122,6 +122,33 @@ void hipaccWriteMemory(HipaccImage &img, T *host_mem) { } +// Write to memory async +template +void hipaccWriteMemory(HipaccImage &img, T *host_mem, cudaStream_t stream) { + if (host_mem == NULL) return; + + size_t width = img->width; + size_t height = img->height; + size_t stride = img->stride; + + if ((char *)host_mem != img->host) + std::copy(host_mem, host_mem + width*height, (T*)img->host); + + if (img->mem_type >= Array2D) { + cudaError_t err = cudaMemcpy2DToArrayAsync((cudaArray *)img->mem, 0, 0, host_mem, stride*sizeof(T), width*sizeof(T), height, cudaMemcpyHostToDevice, stream); + checkErr(err, "cudaMemcpy2DToArray()"); + } else { + if (stride > width) { + cudaError_t err = cudaMemcpy2DAsync(img->mem, stride*sizeof(T), host_mem, width*sizeof(T), width*sizeof(T), height, cudaMemcpyHostToDevice, stream); + checkErr(err, "cudaMemcpy2D()"); + } else { + cudaError_t err = cudaMemcpyAsync(img->mem, host_mem, sizeof(T)*width*height, cudaMemcpyHostToDevice, stream); + checkErr(err, "cudaMemcpy()"); + } + } +} + + // Read from memory template T *hipaccReadMemory(const HipaccImage &img) { @@ -146,6 +173,30 @@ T *hipaccReadMemory(const HipaccImage &img) { } +// Read from memory async +template +T *hipaccReadMemory(const HipaccImage &img, cudaStream_t stream) { + size_t width = img->width; + size_t height = img->height; + size_t stride = img->stride; + + if (img->mem_type >= Array2D) { + cudaError_t err = cudaMemcpy2DFromArrayAsync((T*)img->host, stride*sizeof(T), (cudaArray *)img->mem, 0, 0, width*sizeof(T), height, cudaMemcpyDeviceToHost, stream); + 
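+        // [Editor's note] This copy is only enqueued on `stream`; img->host
+        // may be read only after the stream has been synchronized. A minimal
+        // usage sketch of the asynchronous API added by this patch (kernel
+        // handle, launch geometry and images are hypothetical):
+        //
+        //   cudaStream_t s;
+        //   cudaStreamCreate(&s);
+        //   hipaccWriteMemory(in_img, host_in, s);            // enqueue H2D
+        //   hipaccLaunchKernel((const void *)&cu_kernel, "cu_kernel",
+        //                      grid, block, args, s);
+        //   float *res = hipaccReadMemory<float>(out_img, s); // enqueue D2H
+        //   cudaStreamSynchronize(s); // res is safe to read from here on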
checkErr(err, "cudaMemcpy2DFromArray()"); + } else { + if (stride > width) { + cudaError_t err = cudaMemcpy2DAsync((T*)img->host, width*sizeof(T), img->mem, stride*sizeof(T), width*sizeof(T), height, cudaMemcpyDeviceToHost, stream); + checkErr(err, "cudaMemcpy2D()"); + } else { + cudaError_t err = cudaMemcpyAsync((T*)img->host, img->mem, sizeof(T)*width*height, cudaMemcpyDeviceToHost, stream); + checkErr(err, "cudaMemcpy()"); + } + } + + return (T*)img->host; +} + + // Bind memory to texture template void hipaccBindTexture(hipaccMemoryType mem_type, const textureReference *tex, const HipaccImage &img) { diff --git a/runtime/hipacc_cu_cufft.hpp b/runtime/hipacc_cu_cufft.hpp new file mode 100644 index 00000000..8b0ddf91 --- /dev/null +++ b/runtime/hipacc_cu_cufft.hpp @@ -0,0 +1,1501 @@ +#include +#include +#include + +#include "cuda_complex.hpp" +#include "hipacc_base_standalone.hpp" +#include "hipacc_fft_helper.hpp" + +#include +#include + +#define gpuErrchk(ans) \ + { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, + bool abort = true) { + if (code != cudaSuccess) { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, + line); + if (abort) + exit(code); + } +} + +void Handle_Error(cudaError_t err, const char *file, int line) { + if (err != cudaSuccess) { + std::string cu = std::string(cudaGetErrorString(err)) + " in " + + std::string(file) + " at line: " + std::to_string(line) + + "\n"; + std::cout << cu; + } +} +#define HANDLE_ERROR(err) (Handle_Error(err, __FILE__, __LINE__)) + +// Complex data type +struct d2 { + double x; + double y; +}; +typedef d2 DoubleComplex; +// typedef float2 Complex; +struct f2 { + float x; + float y; +}; +typedef f2 Complex; + +// Mul +__device__ cufftComplex C_Mul(cufftComplex a, cufftComplex b) { + cufftComplex temp; + temp.x = a.x * b.x - a.y * b.y; + temp.y = a.y * b.x + b.y * a.x; + return temp; +} +// Mul double +__device__ cufftDoubleComplex C_Mul(cufftDoubleComplex a, cufftDoubleComplex b) { + cufftDoubleComplex temp; + temp.x = a.x * b.x - a.y * b.y; + temp.y = a.y * b.x + b.y * a.x; + return temp; +} +// Scale +template __device__ C C_Scale(C a, float s) { + C c; + c.x = s * a.x; + c.y = s * a.y; + return c; +} +// Mul and scale +__global__ void MulScalePw(cufftDoubleComplex *image, cufftDoubleComplex *filter, + int Nx, int Ny, double scale) { + int tix = threadIdx.x + blockIdx.x * blockDim.x; + int tiy = threadIdx.y + blockIdx.y * blockDim.y; + int tid = tix + Nx * tiy; + + if (tid >= Nx * Ny) + return; + + if ((tix < Nx) && (tiy < Ny)) { + image[tid] = C_Scale(C_Mul(image[tid], filter[tid]), scale); + } +} +// Mul and scale float +__global__ void MulScalePw(cufftComplex *image, cufftComplex *filter, int Nx, + int Ny, float scale) { + int tix = threadIdx.x + blockIdx.x * blockDim.x; + int tiy = threadIdx.y + blockIdx.y * blockDim.y; + int tid = tix + Nx * tiy; + + if (tid >= Nx * Ny) + return; + + if ((tix < Nx) && (tiy < Ny)) { + image[tid] = C_Scale(C_Mul(image[tid], filter[tid]), scale); + } +} + +// scale +template +__global__ void ScalePw(C *image, int Nx, int Ny, double scale) { + int tix = threadIdx.x + blockIdx.x * blockDim.x; + int tiy = threadIdx.y + blockIdx.y * blockDim.y; + int tid = tix + Nx * tiy; + + if (tid >= Nx * Ny) + return; + + if ((tix < Nx) && (tiy < Ny)) { + image[tid] = C_Scale(image[tid], scale); + } +} + +// scale element-wise +template +__global__ void ScaleE(T *image, int Nx, int Ny, TScale scale) { + int tix = threadIdx.x + blockIdx.x * 
blockDim.x; + int tiy = threadIdx.y + blockIdx.y * blockDim.y; + int tid = tix + Nx * tiy; + + if (tid >= Nx * Ny) + return; + + if ((tix < Nx) && (tiy < Ny)) { + image[tid] = image[tid] * scale; + } +} + +// Makhoul reorder +template +__global__ void Reorder(T *image, U *out, int width, int height, int width_in, + int blockX, int blockY) { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + int tid = x + width_in * y; + + if (tid >= width_in * height) + return; + + if ((x < width) && (y < height)) { + int dstX; + if (x % 2 == 0) { + dstX = x / 2; + } else { + dstX = (width - (x / 2) - 1); + } + int dstY; + if (y % 2 == 0) { + dstY = y / 2; + } else { + dstY = (width - (y / 2) - 1); + } + out[dstY * width + dstX] = (U)image[x + width_in * y]; + } +} + +// Makhoul inverse reorder +template +__global__ void iReorder(T *in, U *image, int width, int height, int width_out, + int blockX, int blockY) { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + int tid = x + width_out * y; + + if (tid >= width_out * height) + return; + + if ((x < width) && (y < height)) { + int dstX; + if (x % 2 == 0) { + dstX = x / 2; + } else { + dstX = (width - (x / 2) - 1); + } + int dstY; + if (y % 2 == 0) { + dstY = y / 2; + } else { + dstY = (width - (y / 2) - 1); + } + image[x + width_out * y] = (T)in[dstY * width + dstX]; + } +} + +// Makhoul factor multiplication +template +__global__ void Factoring(complex *in, T *out, int width, int height, + int blockX, int blockY, int out_width) { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + int tid = x + width * y; + + if (tid >= width * height) + return; + + const complex i(0.0, 1.0); + complex factor = exp(-i * complex(M_PI * y / (2 * height))); + complex factor2 = exp(-i * complex(M_PI * x / (2 * width))); + complex factor3 = exp(-i * complex(M_PI * (-x) / (2 * width))); + complex factor4 = exp(-i * complex(M_PI * (-y) / (2 * height))); + + if ((x < width / 2 + 1) && (y < height)) { + if (x == 0 && y == 0) { + out[y * width + x] = + 4.0 * (factor * (factor2 * in[y * out_width + x])).real(); + } else if (x == 0) { + out[y * width + x] = + 2.0 * (factor2 * (factor * in[y * out_width + x] + + factor4 * in[(height - y) * out_width + x])) + .real(); + } else if (y == 0) { + out[y * width + x] = + 2.0 * (factor * (factor2 * in[y * out_width + x] + + factor3 * conj(in[y * out_width + x]))) + .real(); + } else { + out[y * width + x] = + 2.0 * (factor * (factor2 * in[y * out_width + x] + + factor3 * conj(in[(height - y) * out_width + x]))) + .real(); + } + } else if ((x < width) && (y < height)) { + if (x == 0 && y == 0) { + out[y * width + x] = + 4.0 * (factor * (factor2 * conj(in[y * out_width + width - x]))).real(); + } else if (x == 0) { + out[y * width + x] = + 2.0 * + (factor2 * (factor * conj(in[(height - y) * out_width + width - x]) + + factor4 * conj(in[y * out_width + width - x]))) + .real(); + } else if (y == 0) { + out[y * width + x] = + 2.0 * (factor * (factor2 * conj(in[y * out_width + width - x]) + + factor3 * in[y * out_width + width - x])) + .real(); + } else { + out[y * width + x] = + 2.0 * + (factor * (factor2 * conj(in[(height - y) * out_width + width - x]) + + factor3 * in[y * out_width + width - x])) + .real(); + } + } +} + +// Makhoul inverse factor multiplication +template +__global__ void iFactoring(T *in, complex *out, int width, int height, + int blockX, int blockY) { + int x = threadIdx.x + blockIdx.x * 
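+  // [Editor's note] Reorder/Factoring and iReorder/iFactoring implement the
+  // FFT-based DCT evaluation after Makhoul: the signal is permuted so that
+  // even indices come first and reversed odd indices last, a real FFT is
+  // taken, and each bin is rotated by exp(-i*pi*k/(2N)); the factor/factor2
+  // terms above are the separable row and column twiddles of the 2-D case.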
blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + int tid = x + width * y; + + if (tid >= width * height) + return; + + int in_width = width / 2 + 1; + + const complex i(0.0, 1.0); + + if ((x < in_width) && (y < height)) { + complex factor3 = exp(-i * complex(M_PI * (-x) / (2 * width))); + complex factor4 = exp(-i * complex(M_PI * (-y) / (2 * height))); + if (x == 0 && y == 0) { + out[y * in_width + x] = complex(0.25, 0.0) * factor4 * factor3 * + complex(in[y * width + x]); + } else if (y == 0) { + out[y * in_width + x] = complex(0.25, 0.0) * factor4 * factor3 * + (complex(in[y * width + x]) - + i * complex(in[y * width + width - x])); + } else if (x == 0) { + out[y * in_width + x] = complex(0.25, 0.0) * factor4 * factor3 * + (complex(in[y * width + x]) - + i * complex(in[(height - y) * width + x])); + } else { + out[y * in_width + x] = + complex(0.25, 0.0) * factor4 * factor3 * + ((complex(in[y * width + x]) - + complex(in[(height - y) * width + width - x])) - + i * (complex(in[(height - y) * width + x]) + + complex(in[y * width + width - x]))); + } + } +} + +template +__global__ void CopyAligned(T *in, U *out, int width, int height, int width_in, + int width_out) { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + int tid = x + width * y; + + if (tid >= width_in * height) + return; + + if ((x < width) && (y < height)) { + out[y * width_out + x] = (U)in[x + width_in * y]; + } +} + +template +__global__ void FillMem(T *mem, U val, int width, int height) { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + int tid = x + width * y; + + if (tid >= width * height) + return; + + if ((x < width) && (y < height)) { + mem[y * width + x] = (T)val; + } +} + +template +__global__ void PutKernel(T *kernel, U *dest, int kw, int kh, int nx, int ny) { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + int tid = x + kw * y; + + if (tid >= kw * kh) + return; + + if ((x < kw) && (y < kh)) { + int kx = x - kw / 2; + int ky = y - kh / 2; + + dest[nx * ((ny + ky) % ny) + ((nx + kx) % nx)] = + (U)kernel[kw * (kh / 2 + ky) + (kw / 2 + kx)]; + } +} + +template +void fftConvolve(TPrecision *in, int width, int height, V *k, int k_w, int k_h, + TPrecision *out) { + typedef typename std::conditional::value, + cufftComplex, cufftDoubleComplex>::type + cufft_Complex; + typedef typename std::conditional::value, + cufftReal, cufftDoubleReal>::type cufft_Real; + bool floatPrecision = false; + if (std::is_same::value) { + floatPrecision = true; + } + + int image_width = width; + int image_height = height; + + int matsize = image_width * image_height; + int intermediateWidth = image_width / 2 + 1; + int intermediateSize = image_height * intermediateWidth; + + // setup buffers + TPrecision *kernel = (TPrecision *)malloc(sizeof(TPrecision) * matsize); + + memset(kernel, 0, sizeof(TPrecision) * matsize); + + // prepare kernel + putKernel(k, kernel, k_w, k_h, image_width, image_height); + + // Allocate device memory + TPrecision *d_in, *d_kernel, *d_out; + cufft_Complex *d_image_fft, *d_kernel_fft; + cudaMalloc(reinterpret_cast(&d_in), sizeof(TPrecision) * matsize); + cudaMalloc(reinterpret_cast(&d_kernel), sizeof(TPrecision) * matsize); + cudaMalloc(reinterpret_cast(&d_image_fft), + sizeof(cufft_Complex) * intermediateSize); + cudaMalloc(reinterpret_cast(&d_kernel_fft), + sizeof(cufft_Complex) * intermediateSize); + cudaMalloc(reinterpret_cast(&d_out), 
sizeof(TPrecision) * matsize); + // Copy host memory to device + cudaMemcpy(d_in, in, sizeof(TPrecision) * matsize, cudaMemcpyHostToDevice); + cudaMemcpy(d_kernel, kernel, sizeof(TPrecision) * matsize, + cudaMemcpyHostToDevice); + + cudaDeviceSynchronize(); // only for time measure + // start + auto start_time = std::chrono::system_clock::now(); + + // create plans + cufftHandle plan_forward_many, plan_reverse_many; + int n[2] = {image_height, image_width}; + if (floatPrecision) { + cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_R2C, 1); + cufftPlanMany(&plan_reverse_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_C2R, 1); + } else { + cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_D2Z, 1); + cufftPlanMany(&plan_reverse_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_Z2D, 1); + } + + // Transform signal and kernel + if (floatPrecision) { + cufftExecR2C(plan_forward_many, reinterpret_cast(d_in), + reinterpret_cast(d_image_fft)); + cufftExecR2C(plan_forward_many, reinterpret_cast(d_kernel), + reinterpret_cast(d_kernel_fft)); + } else { + cufftExecD2Z(plan_forward_many, reinterpret_cast(d_in), + reinterpret_cast(d_image_fft)); + cufftExecD2Z(plan_forward_many, reinterpret_cast(d_kernel), + reinterpret_cast(d_kernel_fft)); + } + + // define dimensions + dim3 threadsPerBlock(32, 32); // 1024 threads + dim3 numBlocks(image_width / threadsPerBlock.x + 1, + image_height / threadsPerBlock.y + 1); + + // Pointwise Multiplication in Frequency Domain + MulScalePw<<>>( + reinterpret_cast(d_image_fft), + reinterpret_cast(d_kernel_fft), intermediateWidth, + image_height, 1.0 / (image_height * image_width)); + HANDLE_ERROR(cudaGetLastError()); + + if (floatPrecision) { + cufftExecC2R(plan_reverse_many, reinterpret_cast(d_image_fft), + reinterpret_cast(d_out)); + } else { + cufftExecZ2D(plan_reverse_many, + reinterpret_cast(d_image_fft), + reinterpret_cast(d_out)); + } + + cudaDeviceSynchronize(); // only for time measure + // stop + auto end_time = std::chrono::system_clock::now(); + auto elapsed_time = std::chrono::duration_cast( + end_time - start_time); + std::cout << (float)elapsed_time.count() / 1000.0 << " ms" << std::endl; + + // Copy device memory to host + cudaMemcpy(out, d_out, sizeof(TPrecision) * matsize, cudaMemcpyDeviceToHost); + + // cleanup + cufftDestroy(plan_forward_many); + cufftDestroy(plan_reverse_many); + free(kernel); + cudaFree(d_in); + cudaFree(d_kernel); + cudaFree(d_image_fft); + cudaFree(d_kernel_fft); + cudaFree(d_out); +} + +template +void fftConvolve_device(TPrecision *in, int width, int height, TPrecision *k, + TPrecision *out) { + typedef typename std::conditional::value, + cufftComplex, cufftDoubleComplex>::type + cufft_Complex; + typedef typename std::conditional::value, + cufftReal, cufftDoubleReal>::type cufft_Real; + bool floatPrecision = false; + if (std::is_same::value) { + floatPrecision = true; + } + + int image_width = width; + int image_height = height; + + std::cout << width << " x " << height << std::endl; + + int intermediateWidth = image_width / 2 + 1; + int intermediateSize = image_height * intermediateWidth; + + // Allocate device memory + cufft_Complex *d_image_fft, *d_kernel_fft; + cudaMalloc(reinterpret_cast(&d_image_fft), + sizeof(cufft_Complex) * intermediateSize); + cudaMalloc(reinterpret_cast(&d_kernel_fft), + sizeof(cufft_Complex) * intermediateSize); + + // create plans + cufftHandle plan_forward_many, plan_reverse_many; + int n[2] = {image_height, image_width}; + if 
(floatPrecision) { + cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_R2C, 1); + cufftPlanMany(&plan_reverse_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_C2R, 1); + } else { + cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_D2Z, 1); + cufftPlanMany(&plan_reverse_many, 2, n, nullptr, 1, 1, nullptr, 1, 1, + CUFFT_Z2D, 1); + } + + cudaDeviceSynchronize(); // only for time measure + // start + auto start_time = std::chrono::system_clock::now(); + + // Transform signal and kernel + if (floatPrecision) { + cufftExecR2C(plan_forward_many, reinterpret_cast(in), + reinterpret_cast(d_image_fft)); + cufftExecR2C(plan_forward_many, reinterpret_cast(k), + reinterpret_cast(d_kernel_fft)); + } else { + cufftExecD2Z(plan_forward_many, reinterpret_cast(in), + reinterpret_cast(d_image_fft)); + cufftExecD2Z(plan_forward_many, reinterpret_cast(k), + reinterpret_cast(d_kernel_fft)); + } + + // define dimensions + dim3 threadsPerBlock(32, 32); // 1024 threads + dim3 numBlocks(image_width / threadsPerBlock.x + 1, + image_height / threadsPerBlock.y + 1); + + // Pointwise Multiplication in Frequency Domain + MulScalePw<<>>( + reinterpret_cast(d_image_fft), + reinterpret_cast(d_kernel_fft), intermediateWidth, + image_height, 1.0 / (image_height * image_width)); + HANDLE_ERROR(cudaGetLastError()); + + if (floatPrecision) { + cufftExecC2R(plan_reverse_many, reinterpret_cast(d_image_fft), + reinterpret_cast(out)); + } else { + cufftExecZ2D(plan_reverse_many, + reinterpret_cast(d_image_fft), + reinterpret_cast(out)); + } + + cudaDeviceSynchronize(); // only for time measure + // stop + auto end_time = std::chrono::system_clock::now(); + auto elapsed_time = std::chrono::duration_cast( + end_time - start_time); + std::cout << (float)elapsed_time.count() / 1000.0 << " ms" << std::endl; + + // cleanup + cufftDestroy(plan_forward_many); + cufftDestroy(plan_reverse_many); + cudaFree(d_image_fft); + cudaFree(d_kernel_fft); +} + +bool estimateConvolutionExecutionTime(int width, int height, int padWidth, + int padHeight, int k_w, int k_h, + bool linear) { + // assumed: + // 2496 CUDA cores + // 3.52 TFLOPS single, 1.17 TFLOPS double + const double FLOPMS = 3500000000; // FLOP per MS + + double N1 = padWidth; + double N2 = padHeight; + + double singleFFTcost = 8.0 * N1 * N2 * log2(N1 * N2); + + double FFTConvolutionCost = (singleFFTcost * 3 + (N1 * N2 * (3 + 1))) * 80; + std::cout << "FFTConvolutionCost: " << FFTConvolutionCost / FLOPMS << std::endl; + + double simpleHipaccCost = (height * width) * (k_w * k_h) * 2 * 22; + std::cout << "simpleHipaccCost: " << simpleHipaccCost / FLOPMS << std::endl; + + if (FFTConvolutionCost < simpleHipaccCost) { + std::cout << "FFT Convolution will be faster" << std::endl; + return true; + } else { + std::cout << "Hipacc Kernel will be faster" << std::endl; + return false; + } +} + +template +void cufftConvolution(T *in, int width, int height, const V (&kernel)[rows][cols], + int k_w, int k_h, U *out, int alignment, bool linear, + B boundaryConstant = 0) { + assert(rows == k_h && cols == k_w); + + int width_in = alignedWidth(width, alignment); + int width_out = alignedWidth(width, alignment); + + int padWidth = width; + int padHeight = height; + if (linear) { + padWidth = width + k_w / 2; + padHeight = height + k_h / 2; + } + int padSize = padWidth * padHeight; + + /* + estimateConvolutionExecutionTime(width, height, padWidth, padHeight, k_w, k_h, + linear); + */ + + // prepare input buffer + TPrecision *input = new 
TPrecision[padSize]; + if (linear) { + std::fill_n(input, padSize, (TPrecision)boundaryConstant); + } + T *input_d = new T[width_in * height]; + // convert input + gpuErrchk(cudaMemcpy(input_d, in, sizeof(T) * width_in * height, + cudaMemcpyDeviceToHost)); + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + input[y * padWidth + x] = (TPrecision)input_d[y * width_in + x]; + } + } + // prepare output buffer + TPrecision *output = new TPrecision[padSize]; + fftConvolve(input, padWidth, padHeight, (V *)(&kernel[0][0]), k_w, k_h, output); + // convert output + U *out_d = new U[width_out * height]; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + out_d[y * width_out + x] = (U)output[y * padWidth + x]; + } + } + cudaMemcpy(out, out_d, sizeof(U) * width_out * height, cudaMemcpyHostToDevice); + + free(input); + free(input_d); + free(output); + free(out_d); +} + +template +void cufftConvolution_device(T *in, int width, int height, + const V (&kernel)[rows][cols], int k_w, int k_h, + U *out, int alignment, bool linear, + B boundaryConstant = 0) { + assert(rows == k_h && cols == k_w); + + int width_in = alignedWidth(width, alignment); + int width_out = alignedWidth(width, alignment); + + int padWidth = width; + int padHeight = height; + if (linear) { + padWidth = width + k_w / 2; + padHeight = height + k_h / 2; + } + int padSize = padWidth * padHeight; + + /* + estimateConvolutionExecutionTime(width, height, padWidth, padHeight, k_w, k_h, + linear); + */ + + V *kernel_in; + cudaMalloc(reinterpret_cast(&kernel_in), sizeof(V) * k_w * k_h); + gpuErrchk(cudaMemcpy(kernel_in, (V *)(&kernel[0][0]), sizeof(V) * k_w * k_h, + cudaMemcpyHostToDevice)); + + TPrecision *d_in, *d_kernel, *d_out; + cudaMalloc(reinterpret_cast(&d_in), sizeof(TPrecision) * padSize); + cudaMalloc(reinterpret_cast(&d_kernel), sizeof(TPrecision) * padSize); + cudaMalloc(reinterpret_cast(&d_out), sizeof(TPrecision) * padSize); + + dim3 threadsPerBlock(32, 32); // 1024 threads + dim3 numBlocks(width / threadsPerBlock.x + 1, height / threadsPerBlock.y + 1); + dim3 threadsPerBlock2(32, 32); // 1024 threads + dim3 numBlocks2(padWidth / threadsPerBlock.x + 1, + padHeight / threadsPerBlock.y + 1); + + // prepare image buffer + if (linear) { + FillMem<<>>(d_in, (TPrecision)boundaryConstant, + padWidth, padHeight); + HANDLE_ERROR(cudaGetLastError()); + } + CopyAligned<<>>(in, d_in, width, height, width_in, + padWidth); + HANDLE_ERROR(cudaGetLastError()); + + // prepare kernel buffer + cudaMemset(d_kernel, 0, sizeof(TPrecision) * padSize); + PutKernel<<>>(kernel_in, d_kernel, k_w, k_h, + padWidth, padHeight); + + // prepare output buffer + // TPrecision *output = new TPrecision[padSize]; + fftConvolve_device(d_in, padWidth, padHeight, d_kernel, d_out); + + // convert output + + CopyAligned<<>>(d_out, out, width, height, padWidth, + width_out); + HANDLE_ERROR(cudaGetLastError()); + + // clean up + cudaFree(d_in); + cudaFree(d_kernel); + cudaFree(d_out); +} + +template +void fftTransform(TPrecision *in, int width, int height, TPrecision *out, + bool forward = true, bool scale = false) { + typedef typename std::conditional::value, + cufftComplex, cufftDoubleComplex>::type + cufft_Complex; + typedef typename std::conditional::value, + cufftReal, cufftDoubleReal>::type cufft_Real; + bool floatPrecision = false; + if (std::is_same::value) { + floatPrecision = true; + } + + int image_width = width; + int image_height = height; + + int matsize = image_width * image_height; + int intermediateWidth = 
+
+template <class TPrecision>
+void fftTransform(TPrecision *in, int width, int height, TPrecision *out,
+                  bool forward = true, bool scale = false) {
+  typedef typename std::conditional<std::is_same<TPrecision, float>::value,
+                                    cufftComplex, cufftDoubleComplex>::type
+      cufft_Complex;
+  typedef typename std::conditional<std::is_same<TPrecision, float>::value,
+                                    cufftReal, cufftDoubleReal>::type cufft_Real;
+  bool floatPrecision = false;
+  if (std::is_same<TPrecision, float>::value) {
+    floatPrecision = true;
+  }
+
+  int image_width = width;
+  int image_height = height;
+
+  int matsize = image_width * image_height;
+  int intermediateWidth = image_width / 2 + 1;
+  int intermediateSize = image_height * intermediateWidth;
+
+  // Allocate device memory
+  cufft_Real *d_in;
+  cufft_Complex *d_image_fft;
+  gpuErrchk(cudaMalloc(reinterpret_cast<void **>(&d_in),
+                       sizeof(cufft_Real) * matsize));
+  gpuErrchk(cudaMalloc(reinterpret_cast<void **>(&d_image_fft),
+                       sizeof(cufft_Complex) * intermediateSize));
+  // Copy host memory to device
+  if (forward) {
+    gpuErrchk(cudaMemcpy(d_in, in, sizeof(cufft_Real) * matsize,
+                         cudaMemcpyHostToDevice));
+  } else {
+    gpuErrchk(cudaMemcpy(d_image_fft, in,
+                         sizeof(cufft_Complex) * intermediateSize,
+                         cudaMemcpyHostToDevice));
+  }
+  // create plans
+  cufftHandle plan_forward_many;
+  int n[2] = {image_height, image_width};
+  if (floatPrecision) {
+    if (forward) {
+      cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1,
+                    CUFFT_R2C, 1);
+    } else {
+      cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1,
+                    CUFFT_C2R, 1);
+    }
+  } else {
+    if (forward) {
+      cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1,
+                    CUFFT_D2Z, 1);
+    } else {
+      cufftPlanMany(&plan_forward_many, 2, n, nullptr, 1, 1, nullptr, 1, 1,
+                    CUFFT_Z2D, 1);
+    }
+  }
+
+  // define dimensions
+  dim3 threadsPerBlock(32, 32); // 1024 threads
+  dim3 numBlocks(image_width / threadsPerBlock.x + 1,
+                 image_height / threadsPerBlock.y + 1);
+
+  // Transform signal
+  if (forward) {
+    if (floatPrecision) {
+      cufftExecR2C(plan_forward_many, reinterpret_cast<cufftReal *>(d_in),
+                   reinterpret_cast<cufftComplex *>(d_image_fft));
+    } else {
+      cufftExecD2Z(plan_forward_many, reinterpret_cast<cufftDoubleReal *>(d_in),
+                   reinterpret_cast<cufftDoubleComplex *>(d_image_fft));
+    }
+  } else {
+    if (floatPrecision) {
+      cufftExecC2R(plan_forward_many,
+                   reinterpret_cast<cufftComplex *>(d_image_fft),
+                   reinterpret_cast<cufftReal *>(d_in));
+    } else {
+      cufftExecZ2D(plan_forward_many,
+                   reinterpret_cast<cufftDoubleComplex *>(d_image_fft),
+                   reinterpret_cast<cufftDoubleReal *>(d_in));
+    }
+  }
+
+  if (!forward && scale) {
+    // Pointwise Scale
+    ScaleE<<<numBlocks, threadsPerBlock>>>(
+        reinterpret_cast<cufft_Real *>(d_in), image_width, image_height,
+        1.0 / (image_height * image_width));
+    HANDLE_ERROR(cudaGetLastError());
+  }
+
+  // Copy device memory to host
+  if (forward) {
+    gpuErrchk(cudaMemcpy(out, d_image_fft,
+                         sizeof(cufft_Complex) * intermediateSize,
+                         cudaMemcpyDeviceToHost));
+  } else {
+    gpuErrchk(cudaMemcpy(out, d_in, sizeof(cufft_Real) * matsize,
+                         cudaMemcpyDeviceToHost));
+  }
+
+  // cleanup
+  cufftDestroy(plan_forward_many);
+  cudaFree(d_in);
+  cudaFree(d_image_fft);
+}
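The intermediate buffer exploits the Hermitian symmetry of a real-to-complex transform: an HxW real image yields only Hx(W/2+1) distinct complex bins. A small sizing sketch with assumed dimensions:

    // 640x480 float image
    float *data = new float[640 * 480];               // spatial domain
    float *spec = new float[2 * (640 / 2 + 1) * 480]; // interleaved re/im
    fftTransform(data, 640, 480, spec, true);         // forward R2C
    // 480 * 321 complex bins carry the full information of 480*640 reals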
+
+template <class TPrecision>
+void fftTransformDevice(TPrecision *in, int width, int height, TPrecision *out,
+                        bool forward = true, bool scale = false,
+                        int alignment = 0) {
+  if (alignment == 0)
+    alignment = width;
+  typedef typename std::conditional<std::is_same<TPrecision, float>::value,
+                                    cufftComplex, cufftDoubleComplex>::type
+      cufft_Complex;
+  typedef typename std::conditional<std::is_same<TPrecision, float>::value,
+                                    cufftReal, cufftDoubleReal>::type cufft_Real;
+  bool floatPrecision = false;
+  if (std::is_same<TPrecision, float>::value) {
+    floatPrecision = true;
+  }
+
+  int image_width = width;
+  int image_height = height;
+
+  // int matsize = image_width * image_height;
+
+  // create plans
+  cufftHandle plan_forward_many;
+  int n[2] = {image_height, image_width};
+  int inembed[2] = {height, alignment};
+  int onembed[2] = {height, width / 2 + 1};
+  int idist = alignment * height;
+  int odist = (width / 2 + 1) * height;
+
+  if (floatPrecision) {
+    if (forward) {
+      cufftPlanMany(&plan_forward_many, 2, n, inembed, 1, idist, onembed, 1,
+                    odist, CUFFT_R2C, 1);
+    } else {
+      cufftPlanMany(&plan_forward_many, 2, n, onembed, 1, odist, inembed, 1,
+                    idist, CUFFT_C2R, 1);
+    }
+  } else {
+    if (forward) {
+      cufftPlanMany(&plan_forward_many, 2, n, inembed, 1, idist, onembed, 1,
+                    odist, CUFFT_D2Z, 1);
+    } else {
+      cufftPlanMany(&plan_forward_many, 2, n, onembed, 1, odist, inembed, 1,
+                    idist, CUFFT_Z2D, 1);
+    }
+  }
+
+  // define dimensions
+  dim3 threadsPerBlock(32, 32); // 1024 threads
+  dim3 numBlocks(alignment / threadsPerBlock.x + 1,
+                 image_height / threadsPerBlock.y + 1);
+
+  // Transform signal
+  if (forward) {
+    if (floatPrecision) {
+      cufftExecR2C(plan_forward_many, reinterpret_cast<cufftReal *>(in),
+                   reinterpret_cast<cufftComplex *>(out));
+    } else {
+      cufftExecD2Z(plan_forward_many, reinterpret_cast<cufftDoubleReal *>(in),
+                   reinterpret_cast<cufftDoubleComplex *>(out));
+    }
+  } else {
+    if (floatPrecision) {
+      cufftExecC2R(plan_forward_many, reinterpret_cast<cufftComplex *>(in),
+                   reinterpret_cast<cufftReal *>(out));
+    } else {
+      cufftExecZ2D(plan_forward_many,
+                   reinterpret_cast<cufftDoubleComplex *>(in),
+                   reinterpret_cast<cufftDoubleReal *>(out));
+    }
+  }
+
+  if (!forward && scale) {
+    // Pointwise Scale
+    ScaleE<<<numBlocks, threadsPerBlock>>>(reinterpret_cast<cufft_Real *>(out),
+                                           alignment, image_height,
+                                           1.0 / (image_height * image_width));
+    HANDLE_ERROR(cudaGetLastError());
+  }
+
+  // cleanup
+  cufftDestroy(plan_forward_many);
+}
+
+// old fft part without device functions
+/*
+// forward FFT
+template <class T, class TPrecision> TPrecision *fft(HipaccImage &in) {
+  int width = in->width;
+  int height = in->height;
+  int width_in = alignedWidth<T>(width, in->alignment);
+
+  // prepare input buffer
+  TPrecision *input = new TPrecision[width * height];
+  T *input_d = new T[width_in * height];
+  // convert input
+  gpuErrchk(cudaMemcpy(input_d, in->mem, sizeof(T) * width_in * height,
+                       cudaMemcpyDeviceToHost));
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      input[y * width + x] = (TPrecision)input_d[y * width_in + x];
+    }
+  }
+
+  // prepare output buffer
+  TPrecision *output = new TPrecision[2 * (width / 2 + 1) * height];
+  fftTransform(input, width, height, output, true);
+
+  free(input);
+  free(input_d);
+
+  return output;
+}
+
+// inverse FFT
+template <class T, class TPrecision>
+void ifft(TPrecision *in, HipaccImage &out) {
+  int width = out->width;
+  int height = out->height;
+  int width_out = alignedWidth<T>(width, out->alignment);
+
+  // prepare output buffer
+  TPrecision *output = new TPrecision[width * height];
+
+  fftTransform(in, width, height, output, false, true);
+
+  // truncate values outside of range 0-255
+  T *out_d = new T[width_out * height];
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      T val = output[y * width + x];
+      if (output[y * width + x] < 0) {
+        val = 0;
+      } else if (output[y * width + x] > 255) {
+        val = 255;
+      }
+      // convert output
+      out_d[y * width_out + x] = (T)(val);
+    }
+  }
+
+  gpuErrchk(cudaMemcpy(out->mem, out_d, sizeof(T) * width_out * height,
+                       cudaMemcpyHostToDevice));
+
+  free(output);
+  free(out_d);
+}
+*/
+
+// new fft part
+
+// forward FFT
+template <class T, class TPrecision> TPrecision *fft(HipaccImage &in) {
+  int width = in->width;
+  int height = in->height;
+  int width_in = alignedWidth<T>(width, in->alignment);
+
+  // prepare output buffer
+  TPrecision *output;
+  cudaMalloc((void **)&output, sizeof(TPrecision) * 2 * (width / 2 + 1) * height);
+  TPrecision *input;
+  if (std::is_same<T, TPrecision>::value) {
+    input = (TPrecision *)(in->mem);
+  } else {
+    gpuErrchk(
+        cudaMalloc((void **)&input, sizeof(TPrecision) * width_in * height));
+    // define dimensions
+    dim3 threadsPerBlock(32, 32); // 1024 threads
+    dim3 numBlocks(width / threadsPerBlock.x + 1, height / threadsPerBlock.y + 1);
+
+    // convert the input image to TPrecision
+    CopyAligned<<<numBlocks, threadsPerBlock>>>((T *)(in->mem), input, width,
+                                                height, width_in, width_in);
+    HANDLE_ERROR(cudaGetLastError());
+  }
+  fftTransformDevice(input, width, height, output, true, false, width_in);
+  HANDLE_ERROR(cudaGetLastError());
+  if (!std::is_same<T, TPrecision>::value) {
+    cudaFree(input); // release the conversion buffer again
+  }
+  return output;
+}
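A hypothetical round trip through the device-resident wrappers; the pixel type uchar and the image img are assumptions, and the template argument order follows the reconstruction above:

    float *freq = fft<uchar, float>(img); // device buffer: 2*(w/2+1)*h floats
    // ... operate on freq in the frequency domain ...
    ifft<uchar>(freq, img);               // inverse FFT, scaled by 1/(w*h)
    cudaFree(freq);                       // freq is owned by the caller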
+
+// inverse FFT
+template <class T, class TPrecision>
+void ifft(TPrecision *in, HipaccImage &out) {
+  int width = out->width;
+  int height = out->height;
+  int width_out = alignedWidth<T>(width, out->alignment);
+
+  // prepare output buffer
+  TPrecision *output;
+  if (std::is_same<T, TPrecision>::value) {
+    output = (TPrecision *)(out->mem);
+  } else {
+    gpuErrchk(
+        cudaMalloc((void **)&output, sizeof(TPrecision) * width_out * height));
+  }
+  fftTransformDevice(in, width, height, output, false, true, width_out);
+
+  if (!std::is_same<T, TPrecision>::value) {
+    // define dimensions
+    dim3 threadsPerBlock(32, 32); // 1024 threads
+    dim3 numBlocks(width / threadsPerBlock.x + 1, height / threadsPerBlock.y + 1);
+
+    // convert the result back to the image's pixel type
+    CopyAligned<<<numBlocks, threadsPerBlock>>>(output, (T *)(out->mem), width,
+                                                height, width_out, width_out);
+    HANDLE_ERROR(cudaGetLastError());
+    cudaFree(output); // release the conversion buffer again
+  }
+}
+
+// create magnitude from fft
+template <class T, class TPrecision>
+void fftToMagnitude(TPrecision *in, HipaccImage &mag) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_out = alignedWidth<T>(width, mag->alignment);
+
+  T *tmp = new T[width * height];
+  std::complex<TPrecision> *input =
+      new std::complex<TPrecision>[(width / 2 + 1) * height];
+  gpuErrchk(cudaMemcpy(
+      input, in, sizeof(std::complex<TPrecision>) * (width / 2 + 1) * height,
+      cudaMemcpyDeviceToHost));
+  calcMagnitude(reinterpret_cast<std::complex<TPrecision> *>(input), tmp, width,
+                height);
+
+  shiftFFT(tmp, width, height);
+
+  T *out = new T[width_out * height];
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      out[y * width_out + x] = (T)tmp[y * width + x];
+    }
+  }
+
+  gpuErrchk(cudaMemcpy(mag->mem, out, sizeof(T) * width_out * height,
+                       cudaMemcpyHostToDevice));
+  delete[] tmp;
+  delete[] out;
+  delete[] input;
+}
+
+// create magnitude from dct
+template <class T, class TPrecision>
+void dctToMagnitude(TPrecision *in, HipaccImage &mag) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_out = alignedWidth<T>(width, mag->alignment);
+
+  T *tmp = new T[width * height];
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      tmp[y * width + x] = std::abs(in[y * width + x]);
+    }
+  }
+
+  float max = 0.0;
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      if (tmp[y * width + x] > max)
+        max = tmp[y * width + x];
+    }
+  }
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      tmp[y * width + x] =
+          (T)(255.0 * pow(tmp[y * width + x] * (1.0 / max), 1.0 / 4.0));
+    }
+  }
+
+  T *out = new T[width_out * height];
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      out[y * width_out + x] = (T)tmp[y * width + x];
+    }
+  }
+
+  gpuErrchk(cudaMemcpy(mag->mem, out, sizeof(T) * width_out * height,
+                       cudaMemcpyHostToDevice));
+  delete[] tmp;
+  delete[] out;
+}
+
+// apply mask mag to result of FFT in
+template <class T, class TPrecision>
+void fftScaleMagnitude(TPrecision *in, HipaccImage &mag) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_in = alignedWidth<T>(width, mag->alignment);
+  T *scale = new T[width_in * height];
+  std::complex<TPrecision> *input =
+      new std::complex<TPrecision>[(width / 2 + 1) * height];
+  gpuErrchk(cudaMemcpy(
+      input, in, sizeof(std::complex<TPrecision>) * (width / 2 + 1) * height,
+      cudaMemcpyDeviceToHost));
+  gpuErrchk(cudaMemcpy(scale, mag->mem, sizeof(T) * width_in * height,
+                       cudaMemcpyDeviceToHost));
+
+  iShiftFFT(scale, width, height, mag->alignment);
+
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < (width / 2 + 1); x++) {
+      input[(y * (width / 2 + 1) + x)] *=
+          (TPrecision)scale[y * width_in + x] / 255;
+    }
+  }
+
+  gpuErrchk(cudaMemcpy(
+      in, input, sizeof(std::complex<TPrecision>) * (width / 2 + 1) * height,
+      cudaMemcpyHostToDevice));
+
+  delete[] scale;
+  delete[] input;
+}
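Taken together, these helpers support a simple frequency-domain filtering pipeline. A sketch with assumed images img (uchar source) and mag (uchar magnitude visualization):

    float *freq = fft<uchar, float>(img);     // forward FFT
    fftToMagnitude<uchar>(freq, mag);         // shifted magnitude into mag
    fftApplyPassFilter<uchar>(mag, 40, true); // low pass: keep radius < 40
    fftScaleMagnitude<uchar>(freq, mag);      // reuse mag as per-bin mask
    ifft<uchar>(freq, img);                   // back to the spatial domain
    cudaFree(freq);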
+
+template <class TPrecision>
+void dct_forward_device(TPrecision *in, int width, int height, TPrecision *out,
+                        int width_in = 0) {
+  if (width_in == 0) {
+    width_in = width;
+  }
+  // define dimensions
+  dim3 threadsPerBlock(32, 32); // 1024 threads
+  dim3 numBlocks(width / threadsPerBlock.x + 1, height / threadsPerBlock.y + 1);
+
+  TPrecision *reorder;
+  gpuErrchk(cudaMalloc((void **)&reorder, sizeof(TPrecision) * width * height));
+
+  Reorder<<<numBlocks, threadsPerBlock>>>(in, reorder, width, height, width_in,
+                                          width, height);
+  HANDLE_ERROR(cudaGetLastError());
+
+  // prepare output buffer
+  int out_width = width / 2 + 1;
+  std::complex<TPrecision> *output;
+  gpuErrchk(cudaMalloc((void **)&output,
+                       sizeof(std::complex<TPrecision>) * out_width * height));
+
+  fftTransformDevice(reorder, width, height,
+                     reinterpret_cast<TPrecision *>(output), true, false);
+
+  Factoring<<<numBlocks, threadsPerBlock>>>(
+      reinterpret_cast<std::complex<TPrecision> *>(output), out, width, height,
+      width, height, out_width);
+  cudaDeviceSynchronize();
+  HANDLE_ERROR(cudaGetLastError());
+
+  cudaFree(reorder);
+  cudaFree(output);
+}
+
+template <class T, class TPrecision>
+void dct_forward(T *in, int width, int height, TPrecision *out,
+                 int width_in = 0) {
+  if (width_in == 0) {
+    width_in = width;
+  }
+
+  TPrecision *input = new TPrecision[width * height];
+
+  for (int y = 0; y < height / 2; y++) {
+    for (int x = 0; x < width / 2; x++) {
+      input[y * width + x] = (TPrecision)in[(2 * y) * width_in + (2 * x)];
+      input[y * width + (x + width / 2)] =
+          (TPrecision)in[(2 * y) * width_in + (2 * (width / 2 - x - 1) + 1)];
+      input[(y + height / 2) * width + x] =
+          (TPrecision)in[(2 * (height / 2 - y - 1) + 1) * width_in + (2 * x)];
+      input[(y + height / 2) * width + (x + width / 2)] =
+          (TPrecision)in[(2 * (height / 2 - y - 1) + 1) * width_in +
+                         (2 * (width / 2 - x - 1) + 1)];
+    }
+  }
+
+  // prepare output buffer
+  int out_width = width / 2 + 1;
+  std::complex<TPrecision> *output =
+      new std::complex<TPrecision>[out_width * height];
+
+  fftTransform(input, width, height, reinterpret_cast<TPrecision *>(output),
+               true);
+
+  const std::complex<TPrecision> i(0.0, 1.0);
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width / 2 + 1; x++) {
+      std::complex<TPrecision> factor =
+          std::exp(-i * std::complex<TPrecision>(M_PI * y / (2 * height)));
+      std::complex<TPrecision> factor2 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * x / (2 * width)));
+      std::complex<TPrecision> factor3 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * (-x) / (2 * width)));
+      std::complex<TPrecision> factor4 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * (-y) / (2 * height)));
+      if (x == 0 && y == 0) {
+        out[y * width + x] =
+            4.0 * (factor * (factor2 * output[y * out_width + x])).real();
+      } else if (x == 0) {
+        out[y * width + x] =
+            2.0 * (factor2 * (factor * output[y * out_width + x] +
+                              factor4 * output[(height - y) * out_width + x]))
+                      .real();
+      } else if (y == 0) {
+        out[y * width + x] =
+            2.0 * (factor * (factor2 * output[y * out_width + x] +
+                             factor3 * std::conj(output[y * out_width + x])))
+                      .real();
+      } else {
+        out[y * width + x] =
+            2.0 *
+            (factor * (factor2 * output[y * out_width + x] +
+                       factor3 * std::conj(output[(height - y) * out_width + x])))
+                .real();
+      }
+    }
+  }
+
+  for (int y = 0; y < height; y++) {
+    for (int x = width / 2 + 1; x < width; x++) {
+      std::complex<TPrecision> factor =
+          std::exp(-i * std::complex<TPrecision>(M_PI * y / (2 * height)));
+      std::complex<TPrecision> factor2 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * x / (2 * width)));
+      std::complex<TPrecision> factor3 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * (-x) / (2 * width)));
+      std::complex<TPrecision> factor4 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * (-y) / (2 * height)));
+      if (x == 0 && y == 0) {
+        out[y * width + x] =
+            4.0 *
+            (factor * (factor2 * std::conj(output[y * out_width + width - x])))
+                .real();
+      } else if (x == 0) {
+        out[y * width + x] =
+            2.0 *
+            (factor2 *
+             (factor * std::conj(output[(height - y) * out_width + width - x]) +
+              factor4 * std::conj(output[y * out_width + width - x])))
+                .real();
+      } else if (y == 0) {
+        out[y * width + x] =
+            2.0 *
+            (factor * (factor2 * std::conj(output[y * out_width + width - x]) +
+                       factor3 * output[y * out_width + width - x]))
+                .real();
+      } else {
+        out[y * width + x] =
+            2.0 *
+            (factor *
+             (factor2 * std::conj(output[(height - y) * out_width + width - x]) +
+              factor3 * output[y * out_width + width - x]))
+                .real();
+      }
+    }
+  }
+
+  delete[] input;
+  delete[] output;
+}
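The forward path above is Makhoul's even/odd reordering scheme for computing a DCT-II with a single FFT of the same length. In one dimension, with the reordering used here,

    v_n = x_{2n},  v_{N-1-n} = x_{2n+1},    0 <= n < N/2
    X_k = 2 Re( e^{-i*pi*k/(2N)} * V_k ),   V = FFT(v)

The factor/factor2 twiddles apply this separably in y and x, and the conjugate branches reconstruct the bins that the R2C transform does not store explicitly.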
+
+template <class TPrecision, class T>
+void dct_inverse_device(TPrecision *in, int width, int height, T *out,
+                        int width_out = 0) {
+  if (width_out == 0) {
+    width_out = width;
+  }
+
+  int in_width = width / 2 + 1;
+
+  // define dimensions
+  dim3 threadsPerBlock(32, 32); // 1024 threads
+  dim3 numBlocks(width_out / threadsPerBlock.x + 1,
+                 height / threadsPerBlock.y + 1);
+
+  // prepare buffers
+  std::complex<TPrecision> *input;
+  gpuErrchk(cudaMalloc((void **)&input,
+                       sizeof(std::complex<TPrecision>) * in_width * height));
+  TPrecision *output;
+  gpuErrchk(cudaMalloc((void **)&output, sizeof(TPrecision) * width * height));
+
+  iFactoring<<<numBlocks, threadsPerBlock>>>(
+      in, reinterpret_cast<std::complex<TPrecision> *>(input), width, height,
+      width, height);
+  cudaDeviceSynchronize();
+  HANDLE_ERROR(cudaGetLastError());
+
+  fftTransformDevice(reinterpret_cast<TPrecision *>(input), width, height,
+                     output, false, true);
+
+  iReorder<<<numBlocks, threadsPerBlock>>>(output, out, width, height, width_out,
+                                           width, height);
+  cudaDeviceSynchronize();
+  HANDLE_ERROR(cudaGetLastError());
+
+  // release the intermediate buffers
+  cudaFree(input);
+  cudaFree(output);
+}
+
+template <class TPrecision, class T>
+void dct_inverse(TPrecision *in, int width, int height, T *out,
+                 int width_out = 0) {
+  if (width_out == 0) {
+    width_out = width;
+  }
+
+  int in_width = width / 2 + 1;
+
+  std::complex<TPrecision> *input =
+      new std::complex<TPrecision>[in_width * height];
+
+  const std::complex<TPrecision> i(0.0, 1.0);
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < in_width; x++) {
+      std::complex<TPrecision> factor3 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * (-x) / (2 * width)));
+      std::complex<TPrecision> factor4 =
+          std::exp(-i * std::complex<TPrecision>(M_PI * (-y) / (2 * height)));
+      if (x == 0 && y == 0) {
+        input[y * in_width + x] = std::complex<TPrecision>(0.25, 0.0) * factor4 *
+                                  factor3 *
+                                  std::complex<TPrecision>(in[y * width + x]);
+      } else if (y == 0) {
+        input[y * in_width + x] =
+            std::complex<TPrecision>(0.25, 0.0) * factor4 * factor3 *
+            (std::complex<TPrecision>(in[y * width + x]) -
+             i * std::complex<TPrecision>(in[y * width + width - x]));
+      } else if (x == 0) {
+        input[y * in_width + x] =
+            std::complex<TPrecision>(0.25, 0.0) * factor4 * factor3 *
+            (std::complex<TPrecision>(in[y * width + x]) -
+             i * std::complex<TPrecision>(in[(height - y) * width + x]));
+      } else {
+        input[y * in_width + x] =
+            std::complex<TPrecision>(0.25, 0.0) * factor4 * factor3 *
+            ((std::complex<TPrecision>(in[y * width + x]) -
+              std::complex<TPrecision>(in[(height - y) * width + width - x])) -
+             i * (std::complex<TPrecision>(in[(height - y) * width + x]) +
+                  std::complex<TPrecision>(in[y * width + width - x])));
+      }
+    }
+  }
+
+  // prepare output buffer
+  TPrecision *output = new TPrecision[width * height];
+
+  fftTransform(reinterpret_cast<TPrecision *>(input), width, height, output,
+               false, true);
+
+  for (int y = 0; y < height / 2; y++) {
+    for (int x = 0; x < width / 2; x++) {
+      out[(2 * y) * width_out + (2 * x)] = (T)output[y * width + x];
+      out[(2 * y) * width_out + (2 * (width / 2 - x - 1) + 1)] =
+          (T)output[y * width + (x + width / 2)];
+      out[(2 * (height / 2 - y - 1) + 1) * width_out + (2 * x)] =
+          (T)output[(y + height / 2) * width + x];
+      out[(2 * (height / 2 - y - 1) + 1) * width_out +
+          (2 * (width / 2 - x - 1) + 1)] =
+          (T)output[(y + height / 2) * width + (x + width / 2)];
+    }
+  }
+
+  delete[] input;
+  delete[] output;
+}
+
+template <class TPrecision>
+void dctTransform(TPrecision *in, int width, int height, TPrecision *out,
+                  bool forward = true) {
+  if (forward) {
+    dct_forward(in, width, height, out);
+  } else {
+    dct_inverse(in, width, height, out);
+  }
+}
+
+template <class TPrecision>
+void dctTransformDevice(TPrecision *in, int width, int height, TPrecision *out,
+                        bool forward = true) {
+  if (forward) {
+    dct_forward_device(in, width, height, out);
+  } else {
+    dct_inverse_device(in, width, height, out);
+  }
+}
+
+// forward FCT
+template <class T, class TPrecision> TPrecision *dct(HipaccImage &in) {
+  int width = in->width;
+  int height = in->height;
+  int width_in = alignedWidth<T>(width, in->alignment);
+
+  // prepare input buffer
+  T *input_d = new T[width_in * height];
+  // convert input
+  gpuErrchk(cudaMemcpy(input_d, in->mem, sizeof(T) * width_in * height,
+                       cudaMemcpyDeviceToHost));
+
+  TPrecision *out = new TPrecision[width * height];
+  dct_forward(input_d, width, height, out, width_in);
+
+  delete[] input_d;
+
+  return out;
+}
+
+// inverse FCT
+template <class T, class TPrecision>
+void idct(TPrecision *in, HipaccImage &out) {
+  int width = out->width;
+  int height = out->height;
+  int width_out = alignedWidth<T>(width, out->alignment);
+
+  T *out_d = new T[width_out * height];
+
+  dct_inverse(in, width, height, out_d, width_out);
+
+  gpuErrchk(cudaMemcpy(out->mem, out_d, sizeof(T) * width_out * height,
+                       cudaMemcpyHostToDevice));
+
+  delete[] out_d;
+}
+
+/*
+template <class T, class TPrecision> TPrecision *dct(HipaccImage &in) {
+  int width = in->width;
+  int height = in->height;
+  int width_in = alignedWidth<T>(width, in->alignment);
+
+  // prepare output buffer
+  TPrecision *output;
+  gpuErrchk(cudaMalloc((void **)&output, sizeof(TPrecision) * width * height));
+  dct_forward_device((TPrecision *)(in->mem), width, height, output, width_in);
+
+  return output;
+}
+
+// inverse FCT
+template <class T, class TPrecision>
+void idct(TPrecision *in, HipaccImage &out) {
+  int width = out->width;
+  int height = out->height;
+  int width_out = alignedWidth<T>(width, out->alignment);
+
+  //dct_inverse_device(in, width, height, (TPrecision *)(out->mem), width_out);
+  dct_inverse_device(in, 1000, 100, (TPrecision *)(out->mem));
+}
+*/
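Hypothetical use of the host-side DCT wrappers (img and mag are assumed HipaccImages of uchar, and the template argument order follows the reconstruction above):

    float *coeff = dct<uchar, float>(img); // host buffer, w*h coefficients
    dctToMagnitude<uchar>(coeff, mag);     // visualize the coefficient spectrum
    idct<uchar>(coeff, img);               // inverse DCT back into img
    delete[] coeff;                        // dct() allocates with new[]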
+
+// function wrappers for images
+
+template <class T> void fftShift(HipaccImage &mag) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_in = alignedWidth<T>(width, mag->alignment);
+  T *input = new T[width_in * height];
+  gpuErrchk(cudaMemcpy(input, mag->mem, sizeof(T) * width_in * height,
+                       cudaMemcpyDeviceToHost));
+
+  shiftFFT(input, width, height, width_in);
+
+  gpuErrchk(cudaMemcpy(mag->mem, input, sizeof(T) * width_in * height,
+                       cudaMemcpyHostToDevice));
+  delete[] input;
+}
+
+template <class T> void ifftShift(HipaccImage &mag) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_in = alignedWidth<T>(width, mag->alignment);
+  T *input = new T[width_in * height];
+  gpuErrchk(cudaMemcpy(input, mag->mem, sizeof(T) * width_in * height,
+                       cudaMemcpyDeviceToHost));
+
+  iShiftFFT(input, width, height, width_in);
+
+  gpuErrchk(cudaMemcpy(mag->mem, input, sizeof(T) * width_in * height,
+                       cudaMemcpyHostToDevice));
+  delete[] input;
+}
+
+template <class T>
+void fftResetMask(HipaccImage &mag, int radius, bool low, int window = 0) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_in = alignedWidth<T>(width, mag->alignment);
+  T *input = new T[width_in * height];
+  gpuErrchk(cudaMemcpy(input, mag->mem, sizeof(T) * width_in * height,
+                       cudaMemcpyDeviceToHost));
+
+  magResetFreq(input, width, height, width_in, radius, window, low);
+
+  gpuErrchk(cudaMemcpy(mag->mem, input, sizeof(T) * width_in * height,
+                       cudaMemcpyHostToDevice));
+  delete[] input;
+}
+
+template <class T>
+void fftApplyPassFilter(HipaccImage &mag, int radius, bool low, int window = 0) {
+  int width = mag->width;
+  int height = mag->height;
+  int width_in = alignedWidth<T>(width, mag->alignment);
+  T *input = new T[width_in * height];
+  gpuErrchk(cudaMemcpy(input, mag->mem, sizeof(T) * width_in * height,
+                       cudaMemcpyDeviceToHost));
+
+  magPassFilter(input, width, height, width_in, radius, window, low);
+
+  gpuErrchk(cudaMemcpy(mag->mem, input, sizeof(T) * width_in * height,
+                       cudaMemcpyHostToDevice));
+  delete[] input;
+}
diff --git a/runtime/hipacc_cu_standalone.hpp.cmake b/runtime/hipacc_cu_standalone.hpp.cmake
index d2c54166..68ef83b7 100644
--- a/runtime/hipacc_cu_standalone.hpp.cmake
+++ b/runtime/hipacc_cu_standalone.hpp.cmake
@@ -234,6 +234,19 @@ void hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid,
 }
 
+// Launch kernel async
+void hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, void **args, cudaStream_t stream, cudaEvent_t waitEvent, cudaEvent_t recordEvent) {
+  if (waitEvent) {
+    cudaStreamWaitEvent(stream, waitEvent, 0);
+  }
+  cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, stream);
+  checkErr(err, "cudaLaunchKernel(" + kernel_name + ")");
+  if (recordEvent) {
+    cudaEventRecord(recordEvent, stream);
+  }
+}
+
+
 // Benchmark timing for a kernel call
 void hipaccLaunchKernelBenchmark(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, std::vector<void *> args, bool print_timing) {
   std::vector<float> times;
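The asynchronous overload chains launches through events instead of blocking the host. A minimal usage sketch, with kernel_ptr, grid, block, and the argument buffers assumed to exist:

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaEvent_t done;
    cudaEventCreate(&done);
    void *args[] = {&d_in, &d_out, &width, &height};
    // launch without waiting on any event, record `done` afterwards
    hipaccLaunchKernel(kernel_ptr, "my_kernel", grid, block, args,
                       stream, nullptr, done);
    cudaEventSynchronize(done); // host blocks only when the result is needed
    cudaEventDestroy(done);
    cudaStreamDestroy(stream);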
diff --git a/runtime/hipacc_fft_helper.hpp b/runtime/hipacc_fft_helper.hpp
new file mode 100644
index 00000000..b1271114
--- /dev/null
+++ b/runtime/hipacc_fft_helper.hpp
@@ -0,0 +1,263 @@
+// This is a helper file for both hipacc_cpu_fftw.hpp and hipacc_cu_cufft.hpp
+
+#include <cmath>
+#include <complex>
+#include <cstdlib>
+#include <cstring>
+
+// https://stackoverflow.com/questions/8204645/implementing-gaussian-blur-how-to-calculate-convolution-matrix-kernel#8204880
+void calcGauss(float *kernel, int W) {
+  double sigma = 1;
+  double mean = W / 2;
+  double sum = 0.0; // For accumulating the kernel values
+  for (int x = 0; x < W; ++x) {
+    for (int y = 0; y < W; ++y) {
+      kernel[x * W + y] = exp(-0.5 * (pow((x - mean) / sigma, 2.0) +
+                                      pow((y - mean) / sigma, 2.0))) /
+                          (2 * M_PI * sigma * sigma);
+
+      // Accumulate the kernel values
+      sum += kernel[x * W + y];
+    }
+  }
+  // Normalize the kernel
+  for (int x = 0; x < W; ++x)
+    for (int y = 0; y < W; ++y)
+      kernel[x * W + y] /= sum;
+}
+// boxblur
+void calcBox(float *kernel, int W) {
+  double count = W * W;
+  for (int x = 0; x < count; ++x) {
+    kernel[x] = 1.0 / count;
+  }
+}
+
+template <class T, class U>
+void putKernel(T *kernel, U *dest, int kx, int ky, int nx, int ny) {
+  // centered
+  /*for (int y = -ky/2; y <= ky/2; y++) {
+    for (int x = -kx/2; x <= kx/2; x++) {
+      dest[nx * (ny/2 + y) + (nx/2 + x)] = kernel[kx * (ky/2+y) + (kx/2+x)];
+    }
+  }*/
+
+  // edges wrap-around
+  for (int y = -ky / 2; y <= ky / 2; y++) {
+    for (int x = -kx / 2; x <= kx / 2; x++) {
+      dest[nx * ((ny + y) % ny) + ((nx + x) % nx)] =
+          kernel[kx * (ky / 2 + y) + (kx / 2 + x)];
+    }
+  }
+  // dest[0] = 1.0;
+}
+
+template <class T, class U>
+void putKernelComplex(T *kernel, std::complex<U> *dest, int kx, int ky, int nx,
+                      int ny) {
+  // edges wrap-around
+  for (int y = -ky / 2; y <= ky / 2; y++) {
+    for (int x = -kx / 2; x <= kx / 2; x++) {
+      dest[nx * ((ny + y) % ny) + ((nx + x) % nx)].real(
+          kernel[kx * (ky / 2 + y) + (kx / 2 + x)]);
+      dest[nx * ((ny + y) % ny) + ((nx + x) % nx)].imag(0);
+    }
+  }
+}
+
+// only unshifted
+template <class T, class U>
+void calcMagnitude(std::complex<T> *in, U *out, int width, int height) {
+  // in is result of fft (r2c: only left half (width/2+1)*height)
+  int width_in = width / 2 + 1;
+  int N = width * height;
+  float *magnitude = (float *)malloc(sizeof(float) * width * height);
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width_in; x++) {
+      magnitude[y * width + x] = std::abs(in[y * width_in + x]);
+      if (x < width / 2)
+        magnitude[width * height - (y * width + x) - 1] =
+            std::abs(in[y * width_in + x + 1]); // point mirror
+    }
+  }
+  float max = 0.0;
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width_in; x++) {
+      if (magnitude[y * width + x] > max)
+        max = magnitude[y * width + x];
+    }
+  }
+  for (int i = 0; i < N; ++i) {
+    out[i] = (U)(255.0 * pow(magnitude[i] * (1.0 / max), 1.0 / 4.0));
+  }
+  free(magnitude);
+}
+
+// 2d access to 1d array
+inline int linearize(int w, int x, int y) { return y * w + x; }
+
+// get the next width that is aligned to `alignment` bytes for elements of type T
+template <class T> int alignedWidth(int width, int alignment) {
+  alignment /= sizeof(T);
+  int res = width;
+  int rest = res % alignment;
+  if (rest != 0) {
+    res += alignment - (rest);
+  }
+  return res;
+}
+
+// get the smallest int >= val that is divisible by div
+int nextDiv(int val, int div) {
+  int newVal = val;
+  int rest = newVal % div;
+  if (rest != 0) {
+    newVal += div - (rest);
+  }
+  return newVal;
+}
+
+// https://stackoverflow.com/questions/466204/rounding-up-to-next-power-of-2
+unsigned long upperPowerOfTwo(unsigned long v) {
+  v--;
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  v++;
+  return v;
+}
+
+// https://stackoverflow.com/questions/4424374/determining-if-a-number-is-prime
+bool isPrime(int number) {
+  if (number < 2)
+    return false;
+  if (number == 2)
+    return true;
+  if (number % 2 == 0)
+    return false;
+  for (int i = 3; (i * i) <= number; i += 2) {
+    if (number % i == 0)
+      return false;
+  }
+  return true;
+}
+
+// shift the zero-frequency component to the center of the image
+template <class T>
+void shiftFFT(T *image, int width, int height, int alignment = 0) {
+  int width_align = width;
+  if (alignment) {
+    width_align = alignedWidth<T>(width, alignment);
+  }
+  T *tmp = new T[width_align * height];
+
+  int y_inc = height / 2;
+  int x_inc = width / 2;
+  int cpy_len_l = sizeof(T) * (width / 2);
+  int cpy_len_r = sizeof(T) * (width / 2);
+  if (width % 2) {
+    cpy_len_l += sizeof(T); // the left half is one element wider for odd widths
+  }
+  int src_x_l = 0;
+  int src_x_r = width / 2;
+  if (width % 2) {
+    src_x_r += 1;
+  }
+
+  for (int y = 0; y < height; y++) {
+    memcpy(&tmp[linearize(width_align, (src_x_l + x_inc) % width,
+                          (y + y_inc) % height)],
+           &image[linearize(width_align, src_x_l, y)], cpy_len_l);
+    memcpy(&tmp[linearize(width_align, (src_x_r + x_inc) % width,
+                          (y + y_inc) % height)],
+           &image[linearize(width_align, src_x_r, y)], cpy_len_r);
+  }
+
+  memcpy(image, tmp, sizeof(T) * width_align * height);
+  delete[] tmp;
+}
+
+// inverse shift the zero-frequency
+template <class T>
+void iShiftFFT(T *image, int width, int height, int alignment = 0) {
+  int width_align = width;
+  if (alignment) {
+    width_align = alignedWidth<T>(width, alignment);
+  }
+  T *tmp = new T[width_align * height];
+
+  int y_inc = height / 2;
+  int x_inc = width / 2;
+  if (width % 2) {
+    y_inc += 1;
+    x_inc += 1;
+  }
+  int cpy_len_l = sizeof(T) * (width / 2);
+  int cpy_len_r = sizeof(T) * (width / 2);
+  if (width % 2) {
+    cpy_len_r += sizeof(T); // the right half is one element wider for odd widths
+  }
+  int src_x_l = 0;
+  int src_x_r = width / 2;
+
+  for (int y = 0; y < height; y++) {
+    memcpy(&tmp[linearize(width_align, (src_x_l + x_inc) % width,
+                          (y + y_inc) % height)],
+           &image[linearize(width_align, src_x_l, y)], cpy_len_l);
+    memcpy(&tmp[linearize(width_align, (src_x_r + x_inc) % width,
+                          (y + y_inc) % height)],
+           &image[linearize(width_align, src_x_r, y)], cpy_len_r);
+  }
+
+  memcpy(image, tmp, sizeof(T) * width_align * height);
+  delete[] tmp;
+}
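alignedWidth converts a byte alignment into a row pitch in elements, rounding the width up to the next multiple. Worked examples (values computed from the definition above):

    alignedWidth<float>(640, 256);  // -> 640  (64 floats per 256 B; 640 = 10*64)
    alignedWidth<float>(1000, 256); // -> 1024 (1000 % 64 == 40, round up by 24)
    alignedWidth<uchar>(1000, 256); // -> 1024 (256 elements per 256 B chunk)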
+
+// hann window of size N for cos weighted gradient
+float hannWindow(float N, float x) {
+  if (x > N)
+    return 1;
+  N *= 2;
+  return 0.5 * (1.0 - cos(2.0 * M_PI * x / N));
+}
+
+// reset mask values for low frequencies below r with gradient blur of size w
+template <class T>
+void magResetFreq(T *image, int width, int height, int alignment, float r,
+                  float w, bool low) {
+  int width_align = alignedWidth<T>(width, alignment);
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      int xd = x - width / 2;
+      int yd = y - height / 2;
+      float d = sqrt(xd * xd + yd * yd);
+      if (low && d < r) {
+        image[y * width_align + x] =
+            255 - (255 - image[y * width_align + x]) * (1 - hannWindow(w, r - d));
+      } else if (!low && d > r) {
+        image[y * width_align + x] =
+            255 - (255 - image[y * width_align + x]) * (1 - hannWindow(w, d - r));
+      }
+    }
+  }
+}
+
+// pass filter: keep frequencies below (low) or above (high) radius r,
+// with gradient blur of size w
+template <class T>
+void magPassFilter(T *image, int width, int height, int alignment, int r,
+                   int w, bool low) {
+  int width_align = alignedWidth<T>(width, alignment);
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      int xd = x - width / 2;
+      int yd = y - height / 2;
+      float d = sqrt(xd * xd + yd * yd);
+      if (low && d > r) {
+        image[y * width_align + x] *= (1 - hannWindow(w, d - r));
+      } else if (!low && d