diff --git a/CMakeLists.txt b/CMakeLists.txt
index 84a7191a..6ad0f6b9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -184,3 +184,8 @@ endif()
 if(EXISTS ${CMAKE_SOURCE_DIR}/samples/CMakeLists.txt)
   add_subdirectory(samples)
 endif()
+
+# add apps if available
+if(EXISTS ${CMAKE_SOURCE_DIR}/apps/CMakeLists.txt)
+  add_subdirectory(apps)
+endif()
diff --git a/LICENSE b/LICENSE
index 3f7f789a..3cc17a66 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,3 +1,4 @@
+Copyright (c) 2020, University of Erlangen-Nuremberg
 Copyright (c) 2014, Saarland University
 Copyright (c) 2012, University of Erlangen-Nuremberg
 Copyright (c) 2012, Siemens AG
diff --git a/README.md b/README.md
index 9dc7d367..d4e4db88 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,8 @@
+This is a fork of the [Hipacc](http://hipacc-lang.org) project, with primarily experimental features:
+* Asynchronous runtime APIs
+* Multi-stream implementation for multiresolution filters in CUDA
+
+
 # Hipacc

 A domain-specific language and compiler for image processing
diff --git a/_clang-format b/_clang-format
new file mode 100644
index 00000000..a81aaae7
--- /dev/null
+++ b/_clang-format
@@ -0,0 +1,121 @@
+---
+Language: Cpp
+# BasedOnStyle: LLVM
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Right
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: MultiLine
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterClass: false
+  AfterControlStatement: false
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: false
+  BeforeElse: false
+  IndentBraces: false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit: 82
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks: Preserve
+IncludeCategories:
+  - Regex: '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority: 2
+  - Regex: '^(<|"(gtest|gmock|isl|json)/)'
+    Priority: 3
+  - Regex: '.*'
+    Priority: 1
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+IndentPPDirectives: None
+IndentWidth: 2
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Right +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseTab: Never +... + diff --git a/apps/6_Multiresolution_Filters/Laplacian_Pyramid_Encoding/src/main.cpp b/apps/6_Multiresolution_Filters/Laplacian_Pyramid_Encoding/src/main.cpp new file mode 100644 index 00000000..b69dcaad --- /dev/null +++ b/apps/6_Multiresolution_Filters/Laplacian_Pyramid_Encoding/src/main.cpp @@ -0,0 +1,212 @@ +// +// Copyright (c) 2020, University of Erlangen-Nuremberg +// Copyright (c) 2012, University of Erlangen-Nuremberg +// Copyright (c) 2012, Siemens AG +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// + +#include "hipacc.hpp" +#include +#include + +#define SIZE_X 3 +#define SIZE_Y 3 +#define WIDTH 1024 +#define HEIGHT 1024 + +using namespace hipacc; +using namespace hipacc::math; + + +class Gaussian : public Kernel { + private: + Accessor &input; + Mask &mask; + + public: + Gaussian(IterationSpace &iter, Accessor &input, + Mask &mask) + : Kernel(iter), input(input), mask(mask) { + add_accessor(&input); + } + + void kernel() { + output() = convolve(mask, Reduce::SUM, [&] () { + return input(mask) * mask(); + }); + } + + void _operatePyramidReduce() {} +}; + +class DifferenceOfGaussian : public Kernel { + private: + Accessor &input1; + Accessor &input2; + + public: + DifferenceOfGaussian(IterationSpace &iter, Accessor &input1, + Accessor &input2) + : Kernel(iter), input1(input1), input2(input2) { + add_accessor(&input1); + add_accessor(&input2); + } + + void kernel() { + output() = input1() - input2(); + } + + void _operatePyramidFilter() {} +}; + +class Blend : public Kernel { + private: + Accessor &input1; + Accessor &input2; + + public: + Blend(IterationSpace &iter, Accessor &input1, + Accessor &input2) + : Kernel(iter), input1(input1), input2(input2) { + add_accessor(&input1); + add_accessor(&input2); + } + + void kernel() { + output() = (short)input1() + (short)input2() / 2; + } + + void _operatePyramidExpand() {} +}; + + +/************************************************************************* + * Main function * + *************************************************************************/ +int main(int argc, const char **argv) { + const int width = WIDTH; + const int height = HEIGHT; + const int size_x = SIZE_X; + const int size_y = SIZE_Y; + float timing = 0; + + // only filter kernel sizes 3x3, 5x5, and 7x7 implemented + if (size_x != size_y || !(size_x == 3 || size_x == 5 || size_x == 7)) { + std::cout << "Wrong filter kernel size. " + << "Currently supported values: 3x3, 5x5, and 7x7!" + << std::endl; + exit(EXIT_FAILURE); + } + + // convolution filter mask + const float coef[SIZE_Y][SIZE_X] = { +#if SIZE_X == 3 + { 0.057118f, 0.124758f, 0.057118f }, + { 0.124758f, 0.272496f, 0.124758f }, + { 0.057118f, 0.124758f, 0.057118f } +#endif +#if SIZE_X == 5 + { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f }, + { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f }, + { 0.026151f, 0.090339f, 0.136565f, 0.090339f, 0.026151f }, + { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f }, + { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f } +#endif +#if SIZE_X == 7 + { 0.000841f, 0.003010f, 0.006471f, 0.008351f, 0.006471f, 0.003010f, 0.000841f }, + { 0.003010f, 0.010778f, 0.023169f, 0.029902f, 0.023169f, 0.010778f, 0.003010f }, + { 0.006471f, 0.023169f, 0.049806f, 0.064280f, 0.049806f, 0.023169f, 0.006471f }, + { 0.008351f, 0.029902f, 0.064280f, 0.082959f, 0.064280f, 0.029902f, 0.008351f }, + { 0.006471f, 0.023169f, 0.049806f, 0.064280f, 0.049806f, 0.023169f, 0.006471f }, + { 0.003010f, 0.010778f, 0.023169f, 0.029902f, 0.023169f, 0.010778f, 0.003010f }, + { 0.000841f, 0.003010f, 0.006471f, 0.008351f, 0.006471f, 0.003010f, 0.000841f } +#endif + }; + + // host memory for random generated image of width x height pixels + char *input = new char[width * height]; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + input[y * width + x] = (char)((y * width + x) % 19); + } + } + + std::cout << "Calculating Hipacc Gaussian Laplacian pyramid ..." 
<< std::endl; + + //************************************************************************// + + // input and output image of width x height pixels + Image gaus(width, height); + Image lap(width, height); + Mask mask(coef); + + const int depth = 10; + Pyramid pgaus(gaus, depth); + Pyramid plap(lap, depth); + + gaus = input; + traverse(pgaus, plap, [&] () { + if (!pgaus.is_top_level()) { + // construct Gaussian pyramid + BoundaryCondition bound(pgaus(-1), mask, Boundary::CLAMP); + Accessor acc1(bound); + IterationSpace iter1(pgaus(0)); + Gaussian blur(iter1, acc1, mask); + blur.execute(); + + // construct Laplacian pyramid + Accessor acc3(pgaus(-1)); + Accessor acc4(pgaus(0), Interpolate::LF); + IterationSpace iter3(plap(-1)); + DifferenceOfGaussian DoG(iter3, acc3, acc4); + DoG.execute(); + } + + traverse(); + + // collapse pyramids + if (!pgaus.is_bottom_level()) { + // blend final output image from Laplacian pyramid + Accessor acc3(plap(1), Interpolate::LF); + Accessor acc4(plap(0)); + IterationSpace iter2(plap(0)); + Blend blend(iter2, acc3, acc4); + blend.execute(); + } + }); + + // get pointer to result data + char *output = lap.data(); + + //************************************************************************// + + // convert to uchar for visualization + for (int p = 0; p < width*height; ++p) { + output[p] = (char)(output[p] + 127); + } + + // free memory + delete[] input; + + return EXIT_SUCCESS; +} diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt new file mode 100644 index 00000000..6b2762a3 --- /dev/null +++ b/apps/CMakeLists.txt @@ -0,0 +1,19 @@ +file(GLOB SAMPLE_DIRS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/[0-9]*) +foreach(SAMPLE_DIR IN LISTS SAMPLE_DIRS) + # install samples + install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR} DESTINATION samples COMPONENT samples) + + # deploy build tool for samples + file(GLOB SAMPLES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR}/*) + foreach(SAMPLE IN LISTS SAMPLES) + if(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${SAMPLE_DIR}/${SAMPLE}) + if(UNIX) + install(FILES ${CMAKE_BINARY_DIR}/samples/buildtools/unix/Makefile DESTINATION samples/${SAMPLE_DIR}/${SAMPLE} COMPONENT samples) + elseif(WIN32) + file(GLOB NMAKE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/buildtools/nmake/*) + file(GLOB VS2015_FILES ${CMAKE_CURRENT_SOURCE_DIR}/buildtools/vs2015/*) + install(FILES ${NMAKE_FILES} ${VS2015_FILES} DESTINATION samples/${SAMPLE_DIR}/${SAMPLE} COMPONENT samples) + endif() + endif() + endforeach() +endforeach() diff --git a/apps/readme.txt b/apps/readme.txt new file mode 100644 index 00000000..5cf2dccd --- /dev/null +++ b/apps/readme.txt @@ -0,0 +1,7 @@ +Application folder for new samples + +How to enable stream-based code generation for apps: +1 follow the standard getting started guide at (https://hipacc-lang.org/install.html) +2 the apps should be copied automatically to the same directory as samples +3 enable the Hipacc flag `-use-stream n` in cuda.conf +4 `-use-stream 1` for single-stream and `-use-stream 2` for multi-stream diff --git a/compiler/hipacc.cpp b/compiler/hipacc.cpp index d278f1f4..aea76368 100644 --- a/compiler/hipacc.cpp +++ b/compiler/hipacc.cpp @@ -101,7 +101,8 @@ void printUsage() { << " -rs-package Specify Renderscript package name. 
(default: \"org.hipacc.rs\")\n" << " -o Write output to \n" << " --help Display available options\n" - << " --version Display version information\n"; + << " --version Display version information\n" + << " -use-stream Async kernel launch with streams\n"; } @@ -247,6 +248,20 @@ int main(int argc, char *argv[]) { compilerOptions.setTimeKernels(USER_ON); continue; } + if (StringRef(argv[i]) == "-use-stream") { + assert(i<(argc-1) && "Mandatory integer parameter for -use-stream switch missing."); + std::istringstream buffer(argv[i+1]); + int val; + buffer >> val; + if (buffer.fail()) { + llvm::errs() << "ERROR: Expected integer parameter for -use-stream switch.\n\n"; + printUsage(); + return EXIT_FAILURE; + } + compilerOptions.setStreamAsyncKernelLaunch(val); + ++i; + continue; + } if (StringRef(argv[i]) == "-use-textures") { assert(i<(argc-1) && "Mandatory texture memory specification for -use-textures switch missing."); if (StringRef(argv[i+1]) == "off") { @@ -421,6 +436,13 @@ int main(int argc, char *argv[]) { // kernels are timed internally by the runtime in case of exploration compilerOptions.setTimeKernels(OFF); } + // Enable CUDA streams + if (!compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch()) { + llvm::errs() << "ERROR: async (stream) support is only available for CUDA!\n\n"; + printUsage(); + return EXIT_FAILURE; + } + // print summary of compiler options compilerOptions.printSummary(targetDevice.getTargetDeviceName()); diff --git a/include/hipacc/Config/CompilerOptions.h b/include/hipacc/Config/CompilerOptions.h index aa9d8e70..27bd6c47 100644 --- a/include/hipacc/Config/CompilerOptions.h +++ b/include/hipacc/Config/CompilerOptions.h @@ -84,11 +84,13 @@ class CompilerOptions { CompilerOption local_memory; CompilerOption multiple_pixels; CompilerOption vectorize_kernels; + CompilerOption async_kernels; // user defined values for target code features int kernel_config_x, kernel_config_y; int reduce_config_num_warps, reduce_config_num_hists; int align_bytes; int pixels_per_thread; + int num_streams; Texture texture_type; std::string rs_package_name, rs_directory; @@ -126,12 +128,14 @@ class CompilerOptions { local_memory(AUTO), multiple_pixels(AUTO), vectorize_kernels(OFF), + async_kernels(OFF), kernel_config_x(128), kernel_config_y(1), reduce_config_num_warps(16), reduce_config_num_hists(16), align_bytes(0), pixels_per_thread(1), + num_streams(1), texture_type(Texture::None), rs_package_name("org.hipacc.rs"), rs_directory("/data/local/tmp") @@ -190,6 +194,9 @@ class CompilerOptions { bool vectorizeKernels(CompilerOption option=option_ou) { return vectorize_kernels & option; } + bool asyncKernelLaunch(CompilerOption option=option_ou) { + return async_kernels & option; + } bool multiplePixelsPerThread(CompilerOption option=option_ou) { return multiple_pixels & option; } @@ -207,6 +214,13 @@ class CompilerOptions { void setTimeKernels(CompilerOption o) { time_kernels = o; } void setLocalMemory(CompilerOption o) { local_memory = o; } void setVectorizeKernels(CompilerOption o) { vectorize_kernels = o; } + void setStreamAsyncKernelLaunch(int stream) { + assert((stream > 0) && "Number of streams must be larger than 0"); + num_streams = stream; + async_kernels = USER_ON; + } + + int getNumStreams() { return num_streams; } void setTextureMemory(Texture type) { texture_type = type; diff --git a/include/hipacc/DSL/ClassRepresentation.h b/include/hipacc/DSL/ClassRepresentation.h index c4952147..40321678 100644 --- a/include/hipacc/DSL/ClassRepresentation.h +++ 
b/include/hipacc/DSL/ClassRepresentation.h @@ -44,6 +44,7 @@ #include #include #include +#include namespace clang { namespace hipacc { @@ -85,6 +86,12 @@ enum class Interpolate : uint8_t { L3 }; +// basic pyramid operations +enum class PyramidOperation : uint8_t { + REDUCE = 0, + FILTER, + EXPAND +}; // common base class for images, masks and pyramids class HipaccSize { @@ -143,24 +150,149 @@ class HipaccMemory : public HipaccSize { class HipaccImage : public HipaccMemory { private: ASTContext &Ctx; + std::string streamStr; public: HipaccImage(ASTContext &Ctx, VarDecl *VD, QualType QT) : HipaccMemory(VD, VD->getNameAsString(), QT), - Ctx(Ctx) + Ctx(Ctx), streamStr("stream") {} unsigned getPixelSize() { return Ctx.getTypeSize(type)/8; } std::string getTextureType(); std::string getImageReadFunction(); + void setStream(std::string s) { streamStr = s; } + std::string getStream() { return streamStr; } }; class HipaccPyramid : public HipaccImage { + private: + unsigned depth; public: HipaccPyramid(ASTContext &Ctx, VarDecl *VD, QualType QT) : - HipaccImage(Ctx, VD, QT) + HipaccImage(Ctx, VD, QT), depth(0) {} + void setDepth(unsigned d) { depth = d; } + unsigned getDepth() { return depth; } +}; + + +// TODO, generalize this to an abstract class interface +class HipaccPyramidPipeline : public HipaccDevice { + private: + CompilerOptions &options; + + bool singleStream; + bool multiStream; + bool pipelineKernelLaunch; + unsigned depth; + + SmallVector baseImgs; + std::string global_level_str; + llvm::DenseMap KernelDeclMap; + llvm::DenseMap KernelMap; + + // resource usage in the pipeline + struct PyrResource { + unsigned reg = 0; + unsigned smem = 0; + unsigned thr = 0; + unsigned blk = 0; + }; + PyrResource UsedResourcePyr; + + public: + HipaccPyramidPipeline(CompilerOptions &options) : + HipaccDevice(options), + options(options), + pipelineKernelLaunch(false), + depth(0) + { + if (options.getNumStreams() == 1) { + singleStream = true; + multiStream = false; + } else if (options.getNumStreams() > 1) { + singleStream = false; + multiStream = true; + } + resetResourceUsage(); + } + + void resetResourceUsage() { + UsedResourcePyr.reg = 0; + UsedResourcePyr.smem = 0; + UsedResourcePyr.thr = 0; + UsedResourcePyr.blk = 0; + } + + bool hasResourceAvailable(PyrResource &r) { + bool reg_available = (max_total_registers - UsedResourcePyr.reg) > r.reg; + bool smem_available = (max_total_shared_memory - UsedResourcePyr.smem) > r.smem; + bool thr_available = (max_threads_per_multiprocessor - UsedResourcePyr.thr) > r.thr; + bool blk_available = (max_blocks_per_multiprocessor - UsedResourcePyr.blk) > r.blk; + return (reg_available && smem_available && thr_available && blk_available); + } + + bool useResource(PyrResource &r) { + if (hasResourceAvailable(r)) { + UsedResourcePyr.reg += r.reg; + UsedResourcePyr.smem += r.smem; + UsedResourcePyr.thr += r.thr; + UsedResourcePyr.blk += r.blk; + return true; + } else { + return false; + } + } + + std::string getPyramidOperationStr(PyramidOperation opr) { + switch(opr) + { + case PyramidOperation::REDUCE : return "REDUCE"; + case PyramidOperation::FILTER : return "FILTER"; + case PyramidOperation::EXPAND : return "EXPAND"; + default: return "PyramidOperation{" + std::to_string(int(opr)) + '}'; + } + } + + unsigned getNumWaves(HipaccKernel *K, unsigned w, unsigned h, bool print=0); + + void updateDepth(unsigned d) { + depth = (d > depth) ? 
d : depth; + } + unsigned getDepth() { return depth; } + void recordBaseImg(HipaccImage *img) { + baseImgs.push_back(img); + } + void setGlobalLevelStr(std::string level) { + if (global_level_str.empty()) { + global_level_str = level; + } + } + std::string getGlobalLevelStr() { + return global_level_str; + } + bool isSingleStream() { return singleStream; } + bool isMultiStream() { return multiStream; } + void setSingleStream(bool s) { singleStream = s; } + void setMultiStream(bool s) { multiStream = s; } + void recordKernel(ValueDecl *VD, HipaccKernel *K, PyramidOperation pyrOp) { + KernelDeclMap[VD] = pyrOp; + KernelMap[K] = pyrOp; + } + bool isRecordedKernel(ValueDecl *VD) { return KernelDeclMap.count(VD); } + PyramidOperation getPyramidOperation(ValueDecl *VD) { + assert(isRecordedKernel(VD) && "Kernel is not recorded in the pyramid pipeline."); + return KernelDeclMap[VD]; + } + void enablePipelineKernelLaunch() { + pipelineKernelLaunch = true; + } + bool isPipelineKernelLaunch() { + return pipelineKernelLaunch; + } + void printStreamPipelineInfo(); }; @@ -385,6 +517,7 @@ class HipaccKernelClass { std::string name; CXXMethodDecl *kernelFunction, *reduceFunction, *binningFunction; QualType pixelType, binType; + PyramidOperation pyrOp; KernelStatistics *kernelStatistics; // kernel member information SmallVector members; @@ -426,6 +559,9 @@ class HipaccKernelClass { QualType getPixelType() { return pixelType; } QualType getBinType() { return binType; } + void setPyramidOperation(PyramidOperation op) { pyrOp = op; } + PyramidOperation getPyramidOperation() { return pyrOp; } + KernelStatistics &getKernelStatistics(void) { return *kernelStatistics; } @@ -574,7 +710,9 @@ class HipaccKernel : public HipaccKernelFeatures { std::string name; std::string kernelName, reduceName, binningName; std::string fileName; - std::string reduceStr, binningStr, infoStr, numBinsStr; + std::string reduceStr, binningStr, infoStr, numBinsStr, streamStr; + bool waitEvent, recordEvent; + std::string waitEventStr, recordEventStr; unsigned infoStrCnt, binningStrCnt; HipaccIterationSpace *iterationSpace; std::map imgMap; @@ -614,7 +752,8 @@ class HipaccKernel : public HipaccKernelFeatures { reduceName(options.getTargetPrefix() + KC->getName() + name + "Reduce"), binningName(options.getTargetPrefix() + KC->getName() + name + "Binning"), fileName(options.getTargetPrefix() + KC->getName() + VD->getNameAsString()), - reduceStr(), binningStr(), infoStr(), numBinsStr(), + reduceStr(), binningStr(), infoStr(), numBinsStr(), streamStr("stream"), + waitEvent(false), recordEvent(false), waitEventStr("event"), recordEventStr("event"), infoStrCnt(0), binningStrCnt(0), iterationSpace(nullptr), imgMap(), @@ -751,6 +890,8 @@ class HipaccKernel : public HipaccKernelFeatures { // recreate parameter information createArgInfo(); } + unsigned getResourceUsageReg() { return num_reg; } + unsigned getResourceUsageSmem() { return num_smem; } void setDefaultConfig(); @@ -803,6 +944,16 @@ class HipaccKernel : public HipaccKernelFeatures { unsigned getPixelsPerThreadReduce() { return pixels_per_thread[GlobalOperator]; } + void setStream(std::string s) { streamStr = s; } + std::string getStream() { return streamStr; } + void setWaitEvent(bool e) { waitEvent = e; } + bool hasWaitEvent() { return waitEvent; } + void setRecordEvent(bool e) { recordEvent = e; } + bool hasRecordEvent() { return recordEvent; } + void setWaitEventStr(std::string s) { waitEventStr = s; } + std::string getWaitEventStr() { return waitEventStr; } + void 
setRecordEventStr(std::string s) { recordEventStr = s; } + std::string getRecordEventStr() { return recordEventStr; } }; } // namespace hipacc } // namespace clang diff --git a/include/hipacc/Device/TargetDescription.h b/include/hipacc/Device/TargetDescription.h index abfb701b..560fc3b7 100644 --- a/include/hipacc/Device/TargetDescription.h +++ b/include/hipacc/Device/TargetDescription.h @@ -78,6 +78,7 @@ class HipaccDeviceOptions { case Device::Maxwell_50: case Device::Maxwell_52: case Device::Maxwell_53: + case Device::Turing_75: alignment = 256; if (options.emitCUDA()) local_memory_threshold = 6; else local_memory_threshold = 11; @@ -206,6 +207,7 @@ class HipaccDevice : public HipaccDeviceOptions { // NVIDIA only device properties unsigned num_alus; unsigned num_sfus; + unsigned num_sms; public: explicit HipaccDevice(CompilerOptions &options) : @@ -214,7 +216,8 @@ class HipaccDevice : public HipaccDeviceOptions { max_threads_per_warp(32), max_blocks_per_multiprocessor(8), num_alus(0), - num_sfus(0) + num_sfus(0), + num_sms(1) { switch (target_device) { case Device::CPU: @@ -261,10 +264,16 @@ class HipaccDevice : public HipaccDeviceOptions { if (target_device==Device::Kepler_30) { max_register_per_thread = 63; } + if (target_device==Device::Kepler_30) { + num_sms = 8; + } else if (target_device==Device::Kepler_35) { + num_sms = 13; + } break; case Device::Maxwell_50: case Device::Maxwell_52: case Device::Maxwell_53: + case Device::Turing_75: max_blocks_per_multiprocessor = 32; max_threads_per_block = 1024; max_warps_per_multiprocessor = 64; @@ -274,6 +283,7 @@ class HipaccDevice : public HipaccDeviceOptions { max_register_per_thread = 255; num_alus = 128; num_sfus = 32; + num_sms = 46; if (target_device==Device::Maxwell_52) { max_total_shared_memory = 98304; } @@ -343,7 +353,7 @@ class HipaccDevice : public HipaccDeviceOptions { bool isNVIDIAGPU() { return target_device >= Device::Fermi_20 && - target_device <= Device::Maxwell_53; + target_device <= Device::Turing_75; } std::string getTargetDeviceName() { @@ -358,6 +368,7 @@ class HipaccDevice : public HipaccDeviceOptions { case Device::Maxwell_50: return "NVIDIA Maxwell (50)"; case Device::Maxwell_52: return "NVIDIA Maxwell (52)"; case Device::Maxwell_53: return "NVIDIA Maxwell (53)"; + case Device::Turing_75: return "NVIDIA Turing (75)"; case Device::Evergreen: return "AMD Evergreen"; case Device::NorthernIsland: return "AMD Northern Island"; //case Device::SouthernIsland: return "AMD Southern Island"; diff --git a/include/hipacc/Device/TargetDevices.h b/include/hipacc/Device/TargetDevices.h index 1a990ee6..04034720 100644 --- a/include/hipacc/Device/TargetDevices.h +++ b/include/hipacc/Device/TargetDevices.h @@ -49,6 +49,7 @@ enum class Device { Maxwell_50 = 50, Maxwell_52 = 52, Maxwell_53 = 53, + Turing_75 = 75, Evergreen = 58, NorthernIsland = 69, //SouthernIsland = 79 diff --git a/include/hipacc/Rewrite/CreateHostStrings.h b/include/hipacc/Rewrite/CreateHostStrings.h index 170fd575..2d18c3e0 100644 --- a/include/hipacc/Rewrite/CreateHostStrings.h +++ b/include/hipacc/Rewrite/CreateHostStrings.h @@ -88,7 +88,7 @@ class CreateHostStrings { MemoryTransferDirection direction, std::string &resultStr); void writeMemoryTransferDomainFromMask(HipaccMask *Domain, HipaccMask *Mask, std::string &resultStr); - void writeKernelCall(HipaccKernel *K, std::string &resultStr); + void writeKernelCall(HipaccKernel *K, std::string &resultStr, HipaccPyramidPipeline *pyrPipe=nullptr); void writeReduceCall(HipaccKernel *K, std::string &resultStr); void 
writeBinningCall(HipaccKernel *K, std::string &resultStr); std::string getInterpolationDefinition(HipaccKernel *K, HipaccAccessor *Acc, diff --git a/lib/DSL/ClassRepresentation.cpp b/lib/DSL/ClassRepresentation.cpp index 3442b654..fe16092a 100644 --- a/lib/DSL/ClassRepresentation.cpp +++ b/lib/DSL/ClassRepresentation.cpp @@ -651,5 +651,170 @@ void HipaccKernel::createHostArgInfo(ArrayRef hostArgs, std::string } } +unsigned HipaccPyramidPipeline::getNumWaves(HipaccKernel *K, unsigned w, unsigned h, bool print) { + unsigned sx = K->getNumThreadsX(); + unsigned sy = K->getNumThreadsY(); + unsigned r_reg = std::max(K->getResourceUsageReg(), (unsigned)1); + unsigned r_smem = std::max(K->getResourceUsageSmem(), (unsigned)1); + unsigned k_blk = std::ceil((float)w / sx) * std::ceil((float)h / sy); + + unsigned n_blk_reg = std::floor((float)max_total_registers / (r_reg * sx * sy)); + unsigned n_blk_smem = std::floor((float)max_total_shared_memory / r_smem); + unsigned n_blk_thr = std::floor((float)max_threads_per_multiprocessor / (sx*sy)); + unsigned n_blk = std::min({n_blk_reg, n_blk_smem, n_blk_thr, max_blocks_per_multiprocessor}); + + if (print) { + llvm::errs() << " [wave-info] r_reg:" << r_reg << " r_smem:" << r_smem << " k_blk:" << k_blk << "\n"; + llvm::errs() << " n_blk_reg:" << n_blk_reg << " n_blk_smem:" << n_blk_smem << " n_blk_thr:" + << n_blk_thr << " n_blk:" << n_blk << "\n"; + } + + unsigned n_wave = std::ceil((float)k_blk / (n_blk * num_sms)); + return n_wave; +} + +void HipaccPyramidPipeline::printStreamPipelineInfo() { + llvm::errs() << "\nInformation for Pyramid cuda stream scheduling\n"; + std::string deviceInfo_str, singleStream_str, multiStream_str; + deviceInfo_str += " GPU arch: " + getTargetDeviceName() + "\n"; + deviceInfo_str += " max_register_per_sm: " + std::to_string(max_total_registers) + "\n"; + deviceInfo_str += " max_shared_memory_per_sm: " + std::to_string(max_total_shared_memory) + "\n"; + deviceInfo_str += " max_threads_per_sm: " + std::to_string(max_threads_per_multiprocessor) + "\n"; + deviceInfo_str += " max_blocks_per_sm: " + std::to_string(max_blocks_per_multiprocessor) + "\n"; + deviceInfo_str += " num_sm: " + std::to_string(num_sms) + "\n"; + llvm::errs() << deviceInfo_str; + + HipaccKernel *K_r, *K_f, *K_e; // Unpack kernels + for (auto kp : KernelMap) { + if (kp.second == PyramidOperation::REDUCE) { K_r = kp.first; } + else if (kp.second == PyramidOperation::FILTER) { K_f = kp.first; } + else if (kp.second == PyramidOperation::EXPAND) { K_e = kp.first; } + } + assert(K_r && "no reduce kernel for multi stream pyramid execution"); + assert(K_f && "no filter kernel for multi stream pyramid execution"); + assert(K_e && "no expand kernel for multi stream pyramid execution"); + + unsigned imgSzX = baseImgs.front()->getSizeX(); // assume same-sized base images + unsigned imgSzY = baseImgs.front()->getSizeY(); + + // single stream analysis + singleStream_str += " -single-stream wave estimation\n"; + std::vector numWavesDS; + std::vector numWavesLP; + std::vector numWavesUS; + + for (size_t d=0; d 0) { + // Downsampling + numWavesDS.push_back(getNumWaves(K_r, imgSzX, imgSzY)); + } + if (d < depth - 1) { + // level processing + numWavesLP.push_back(getNumWaves(K_f, imgSzX, imgSzY)); + // Upsampling + numWavesUS.push_back(getNumWaves(K_e, imgSzX, imgSzY)); + } + imgSzX /= 2; imgSzY /= 2; + } + + assert((numWavesDS.size() == depth-1) && "number of waves in reduce does not match depth\n"); + assert((numWavesLP.size() == depth-1) && "number of waves in filter does 
not match depth\n"); + assert((numWavesUS.size() == depth-1) && "number of waves in expand does not match depth\n"); + + unsigned numWavesDSSum = std::accumulate(numWavesDS.begin(), numWavesDS.end(), 0); + unsigned numWavesLPSum = std::accumulate(numWavesLP.begin(), numWavesLP.end(), 0); + unsigned numWavesUSSum = std::accumulate(numWavesUS.begin(), numWavesUS.end(), 0); + + singleStream_str += " Reduce kernel " + K_r->getName() + " takes " + std::to_string(numWavesDSSum) + " waves\n"; + singleStream_str += " Filter kernel " + K_f->getName() + " takes " + std::to_string(numWavesLPSum) + " waves\n"; + singleStream_str += " Expand kernel " + K_e->getName() + " takes " + std::to_string(numWavesUSSum) + " waves\n"; + llvm::errs() << singleStream_str; + + // multi stream info + imgSzX = baseImgs.front()->getSizeX(); // assume same-sized base images + imgSzY = baseImgs.front()->getSizeY(); + multiStream_str += " -multi-stream wave estimation\n"; + + unsigned n_wave_seq_r = 0; + unsigned n_wave_seq_f = 0; + unsigned n_wave_expand = 0; + unsigned n_p = 0; + std::vector S_p; + + // downsampling + for (size_t d=0; dgetNumThreadsX(); + unsigned sy = K_f->getNumThreadsY(); + unsigned r_reg = std::max(K_f->getResourceUsageReg(), (unsigned)1); + unsigned r_smem = std::max(K_f->getResourceUsageSmem(), (unsigned)1); + unsigned k_blk = std::ceil((float)imgSzX / sx) * std::ceil((float)imgSzY / sy); + unsigned n_blk_reg = std::floor((float)max_total_registers / (r_reg * sx * sy)); + unsigned n_blk_smem = std::floor((float)max_total_shared_memory / r_smem); + unsigned n_blk_thr = std::floor((float)max_threads_per_multiprocessor / (sx*sy)); + unsigned n_blk = std::min({n_blk_reg, n_blk_smem, n_blk_thr, max_blocks_per_multiprocessor}); + + unsigned n_wave = std::ceil((float)k_blk / (n_blk * num_sms)); + unsigned n_blk_tail = k_blk % (n_blk * num_sms); + unsigned n_blk_tail_per_sm = std::ceil((float)n_blk_tail / num_sms); + + if (n_wave > 1) { // more than one wave, fine-grained level execution + resetResourceUsage(); + n_wave_seq_f += n_wave; + n_wave_seq_r += getNumWaves(K_r, imgSzX / 2, imgSzY / 2); + } else if (n_blk_tail == 0) { // exactly one wave, fine-grained level execution + resetResourceUsage(); + n_wave_seq_f += n_wave; + n_wave_seq_r += getNumWaves(K_r, imgSzX / 2, imgSzY / 2); + } else if (n_blk_tail > 0) { // less than one wave, coarse-grained level execution + PyrResource res; + res.reg = n_blk_tail_per_sm * (r_reg * sx * sy); + res.smem = n_blk_tail_per_sm * r_smem; + res.thr = n_blk_tail_per_sm * (sx * sy); + res.blk = n_blk_tail_per_sm; + + if (hasResourceAvailable(res)) { // resource is sufficient, same wave + n_p++; + } else { // resource not sufficient, new wave + S_p.push_back(n_p); + n_p = 0; + resetResourceUsage(); + } + useResource(res); + } + imgSzX /= 2; + imgSzY /= 2; + } + + // upsampling + imgSzX = baseImgs.front()->getSizeX(); // assume same-sized base images + imgSzY = baseImgs.front()->getSizeY(); + for (size_t d=0; dgetName() + " takes " + std::to_string(n_wave_seq_r + 1) + " waves\n"; + multiStream_str += " Filter kernel " + K_f->getName() + " takes " + std::to_string(n_wave_seq_f) + " waves\n"; + multiStream_str += " Coarse-grained level execution:\n"; + if (S_p.size()) { + multiStream_str += + " takes " + std::to_string(S_p.size() + 1) + " waves and "; + for (auto np : S_p) { + multiStream_str += std::to_string(np) + " "; + } + multiStream_str += std::to_string(n_p) + " parallel streams\n"; + } else { + multiStream_str += + " takes 1 wave and " + std::to_string(n_p) + " 
parallel streams\n"; + } + multiStream_str += " Expand execution: takes " + std::to_string(n_wave_expand) + " waves\n"; + llvm::errs() << multiStream_str; +} + + + // vim: set ts=2 sw=2 sts=2 et ai: diff --git a/lib/Rewrite/CreateHostStrings.cpp b/lib/Rewrite/CreateHostStrings.cpp index 9a618d06..d7f90cbd 100644 --- a/lib/Rewrite/CreateHostStrings.cpp +++ b/lib/Rewrite/CreateHostStrings.cpp @@ -260,11 +260,19 @@ void CreateHostStrings::writeMemoryTransfer(HipaccImage *Img, std::string mem, case HOST_TO_DEVICE: resultStr += "hipaccWriteMemory("; resultStr += Img->getName(); - resultStr += ", " + mem + ");"; + resultStr += ", " + mem; + if (options.asyncKernelLaunch()) { + resultStr += ", " + Img->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_HOST: resultStr += "hipaccReadMemory<" + Img->getTypeStr() + ">("; - resultStr += Img->getName() + ");"; + resultStr += Img->getName(); + if (options.asyncKernelLaunch()) { + resultStr += ", " + Img->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_DEVICE: resultStr += "hipaccCopyMemory("; @@ -284,11 +292,19 @@ void CreateHostStrings::writeMemoryTransfer(HipaccPyramid *Pyr, std::string idx, case HOST_TO_DEVICE: resultStr += "hipaccWriteMemory("; resultStr += Pyr->getName() + "(" + idx + ")"; - resultStr += ", " + mem + ");"; + resultStr += ", " + mem; + if (options.asyncKernelLaunch()) { + resultStr += ", " + Pyr->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_HOST: resultStr += "hipaccReadMemory<" + Pyr->getTypeStr() + ">("; - resultStr += Pyr->getName() + "(" + idx + "));"; + resultStr += Pyr->getName() + "(" + idx + ")"; + if (options.asyncKernelLaunch()) { + resultStr += ", " + Pyr->getStream(); + } + resultStr += ");"; break; case DEVICE_TO_DEVICE: resultStr += "hipaccCopyMemory("; @@ -387,7 +403,7 @@ void CreateHostStrings::writeMemoryTransferDomainFromMask( } -void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) { +void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr, HipaccPyramidPipeline *pyrPipe) { auto argTypeNames = K->getArgTypeNames(); auto deviceArgNames = K->getDeviceArgNames(); auto hostArgNames = K->getHostArgNames(); @@ -728,6 +744,13 @@ void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) } resultStr += "\n" + indent; + // insert cuda stream and wait event + if (options.emitCUDA() && options.asyncKernelLaunch() && K->hasWaitEvent()) { + resultStr += "cudaStreamWaitEvent(" + K->getStream() + ", "; + resultStr += K->getWaitEventStr() + ", 0);"; + resultStr += "\n" + indent; + } + // launch kernel if (options.exploreConfig() || options.timeKernels()) { switch (options.getTargetLang()) { @@ -805,6 +828,34 @@ void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) resultStr += ", " + gridStr; resultStr += ", " + blockStr; resultStr += ", _args" + kernel_name + ".data()"; + + if (options.asyncKernelLaunch()) { + if (pyrPipe) { // pyramid app + if (pyrPipe->isSingleStream()) { + resultStr += ", " + K->getStream(); + } else if (pyrPipe->isMultiStream()) { + switch (pyrPipe->getPyramidOperation(K->getDecl())) { + case PyramidOperation::REDUCE: + resultStr += ", stream[" + pyrPipe->getGlobalLevelStr() + " - 1]"; + resultStr += ", events[" + pyrPipe->getGlobalLevelStr() + " - 1]"; // wait event + resultStr += ", events[" + pyrPipe->getGlobalLevelStr() + "]"; // record event + break; + case PyramidOperation::FILTER: + resultStr += ", stream[" + pyrPipe->getGlobalLevelStr() + " - 1]"; + break; + case 
PyramidOperation::EXPAND: + resultStr += ", stream[" + pyrPipe->getGlobalLevelStr() + "]"; + resultStr += ", events[" + std::to_string(pyrPipe->getDepth()) + " * 2";// wait event + resultStr += " - " + pyrPipe->getGlobalLevelStr() + " - 2]"; + resultStr += ", events[" + std::to_string(pyrPipe->getDepth()) + " * 2";// record event + resultStr += " - " + pyrPipe->getGlobalLevelStr() + " - 1]"; + break; + } + } + } else { // non-pyramid app + resultStr += ", " + K->getStream(); + } + } resultStr += ");"; break; case Language::Renderscript: @@ -825,6 +876,13 @@ void CreateHostStrings::writeKernelCall(HipaccKernel *K, std::string &resultStr) break; } } + + // insert cuda stream and record event + if (options.emitCUDA() && options.asyncKernelLaunch() && K->hasRecordEvent()) { + resultStr += "cudaEventRecord(" + K->getRecordEventStr() + ", "; + resultStr += K->getStream() + ");"; + resultStr += "\n" + indent; + } } diff --git a/lib/Rewrite/Rewrite.cpp b/lib/Rewrite/Rewrite.cpp index 0f295033..dd219600 100644 --- a/lib/Rewrite/Rewrite.cpp +++ b/lib/Rewrite/Rewrite.cpp @@ -100,6 +100,9 @@ class Rewrite : public ASTConsumer, public RecursiveASTVisitor { // store interpolation methods required for CUDA SmallVector InterpolationDefinitionsGlobal; + // pipeline global config for pyramid + HipaccPyramidPipeline *pyramidPipe; + // pointer to main function FunctionDecl *mainFD; FileID mainFileID; @@ -120,6 +123,7 @@ class Rewrite : public ASTConsumer, public RecursiveASTVisitor { builtins(CI.getASTContext()), stringCreator(CreateHostStrings(options, targetDevice)), compilerClasses(CompilerKnownClasses()), + pyramidPipe(nullptr), mainFD(nullptr), literalCount(0), skipTransfer(false) @@ -376,11 +380,52 @@ void Rewrite::HandleTranslationUnit(ASTContext &) { CompoundStmt *CS = dyn_cast(mainFD->getBody()); assert(CS->size() && "CompoundStmt has no statements."); - std::string initStr; + std::string initStr, cleanupStr; // get initialization string for run-time stringCreator.writeInitialization(initStr); + // print pyramid pipeline info for cuda + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch() && + pyramidPipe->isPipelineKernelLaunch()) { + pyramidPipe->printStreamPipelineInfo(); + } + + // write CUDA streams + if (compilerOptions.asyncKernelLaunch()) { + if (pyramidPipe->isSingleStream()) { + initStr += "cudaStream_t stream;"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "cudaStreamCreate(&stream);"; + initStr += "\n" + stringCreator.getIndent(); + } else if (pyramidPipe->isMultiStream()) { + initStr += "cudaStream_t stream["; + initStr += std::to_string(pyramidPipe->getDepth()) + " - 1];"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "for (int i = 0; i < "; + initStr += std::to_string(pyramidPipe->getDepth()) + " - 1; i++) {"; + initStr += "\n" + stringCreator.getIndent(); + initStr += " cudaStreamCreate(&stream[i]);"; + initStr += "\n" + stringCreator.getIndent() + "}"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "cudaEvent_t events["; + initStr += std::to_string(pyramidPipe->getDepth()) + " * 2];"; + initStr += "\n" + stringCreator.getIndent(); + + initStr += "for (int i = 0; i < "; + initStr += std::to_string(pyramidPipe->getDepth()) + " * 2; i++) {"; + initStr += "\n" + stringCreator.getIndent(); + initStr += " cudaEventCreate(&events[i]);"; + initStr += "\n" + stringCreator.getIndent() + "}"; + initStr += "\n" + stringCreator.getIndent(); + } else { + // + } + } + // load OpenCL kernel files and compile the OpenCL kernels if 
(!compilerOptions.exploreConfig()) { for (auto map : KernelDeclMap) @@ -411,6 +456,31 @@ void Rewrite::HandleTranslationUnit(ASTContext &) { // insert initialization before first statement TextRewriter.InsertTextBefore(CS->body_front()->getBeginLoc(), initStr); + // insert cleanup before last statement + if (compilerOptions.asyncKernelLaunch()) { + if (pyramidPipe->isSingleStream()) { + cleanupStr = "cudaStreamDestroy(stream);"; + cleanupStr += "\n" + stringCreator.getIndent(); + } else if (pyramidPipe->isMultiStream()) { + cleanupStr += "for (int i = 0; i < "; + cleanupStr += std::to_string(pyramidPipe->getDepth()) + " - 1; i++) {"; + cleanupStr += "\n" + stringCreator.getIndent(); + cleanupStr += " cudaStreamDestroy(stream[i]);"; + cleanupStr += "\n" + stringCreator.getIndent() + "}"; + cleanupStr += "\n" + stringCreator.getIndent(); + + cleanupStr += "for (int i = 0; i < "; + cleanupStr += std::to_string(pyramidPipe->getDepth()) + " * 2; i++) {"; + cleanupStr += "\n" + stringCreator.getIndent(); + cleanupStr += " cudaEventDestroy(events[i]);"; + cleanupStr += "\n" + stringCreator.getIndent() + "}"; + cleanupStr += "\n" + stringCreator.getIndent(); + } else { + // + } + TextRewriter.InsertTextBefore(CS->body_back()->getBeginLoc(), cleanupStr); + } + // get buffer of main file id. If we haven't changed it, then we are done. if (auto RewriteBuf = TextRewriter.getRewriteBufferFor(mainFileID)) { *Out << std::string(RewriteBuf->begin(), RewriteBuf->end()); @@ -627,6 +697,20 @@ bool Rewrite::VisitCXXRecordDecl(CXXRecordDecl *D) { KC->setBinningFunction(method); continue; } + + // pyramid functions + if (method->getNameAsString() == "_operatePyramidReduce") { + KC->setPyramidOperation(PyramidOperation::REDUCE); + continue; + } + if (method->getNameAsString() == "_operatePyramidFilter") { + KC->setPyramidOperation(PyramidOperation::FILTER); + continue; + } + if (method->getNameAsString() == "_operatePyramidExpand") { + KC->setPyramidOperation(PyramidOperation::EXPAND); + continue; + } } } @@ -683,7 +767,25 @@ bool Rewrite::VisitDeclStmt(DeclStmt *D) { std::string width_str = convertToString(CCE->getArg(0)); std::string height_str = convertToString(CCE->getArg(1)); - if (compilerOptions.emitC99()) { + if (compilerOptions.asyncKernelLaunch() && compilerOptions.emitCUDA()) { + // extract depth level from pyramid + unsigned IDConstant = Diags.getCustomDiagID(DiagnosticsEngine::Error, + "Constant expression for %0 argument of Image %1 required (Pyramid stream CUDA only)."); + if (!CCE->getArg(0)->isEvaluatable(Context)) { + Diags.Report(CCE->getArg(0)->getExprLoc(), IDConstant) << "width" + << Img->getName(); + } + if (!CCE->getArg(1)->isEvaluatable(Context)) { + Diags.Report(CCE->getArg(1)->getExprLoc(), IDConstant) << "height" + << Img->getName(); + } + + int64_t img_stride = CCE->getArg(0)->EvaluateKnownConstInt(Context).getSExtValue(); + int64_t img_height = CCE->getArg(1)->EvaluateKnownConstInt(Context).getSExtValue(); + + Img->setSizeX(img_stride); + Img->setSizeY(img_height); + } else if (compilerOptions.emitC99()) { // check if the parameter can be resolved to a constant unsigned IDConstant = Diags.getCustomDiagID(DiagnosticsEngine::Error, "Constant expression for %0 argument of Image %1 required (C/C++ only)."); @@ -746,6 +848,36 @@ bool Rewrite::VisitDeclStmt(DeclStmt *D) { std::string image_str = convertToString(CCE->getArg(0)); std::string depth_str = convertToString(CCE->getArg(1)); + if (compilerOptions.asyncKernelLaunch() && compilerOptions.emitCUDA()) { + // extract depth level from 
pyramid + unsigned IDConstant = Diags.getCustomDiagID(DiagnosticsEngine::Error, + "Constant expression for %0 argument of Pyramid %1 required (CUDA only)."); + if (!CCE->getArg(1)->isEvaluatable(Context)) { + Diags.Report(CCE->getArg(1)->getExprLoc(), IDConstant) << "depth" + << Pyr->getName(); + } + int64_t pyr_depth = CCE->getArg(1)->EvaluateKnownConstInt(Context).getSExtValue(); + Pyr->setDepth(pyr_depth); + + pyramidPipe->updateDepth(pyr_depth); + std::string pyr_name = VD->getName(); + pyramidPipe->setGlobalLevelStr(pyr_name + ".level()"); + + // TODO, generalize this to async h2d + auto arg = CCE->getArg(0)->IgnoreParenCasts(); + HipaccImage *Img = nullptr; + if (auto DRE = dyn_cast(arg)) { + // check if the argument specifies the image + if (ImgDeclMap.count(DRE->getDecl())) { + Img = ImgDeclMap[DRE->getDecl()]; + if (pyramidPipe->isMultiStream()) { + Img->setStream("stream[0]"); + } + pyramidPipe->recordBaseImg(Img); + } + } + } + // create memory allocation string std::string newStr; stringCreator.writePyramidAllocation(VD->getName(), @@ -1244,6 +1376,20 @@ bool Rewrite::VisitDeclStmt(DeclStmt *D) { HipaccKernel *K = new HipaccKernel(Context, VD, KC, compilerOptions); KernelDeclMap[VD] = K; + if (compilerOptions.asyncKernelLaunch() && compilerOptions.emitCUDA()) { + switch (KC->getPyramidOperation()) { + case PyramidOperation::REDUCE: + pyramidPipe->recordKernel(VD, K, PyramidOperation::REDUCE); + break; + case PyramidOperation::FILTER: + pyramidPipe->recordKernel(VD, K, PyramidOperation::FILTER); + break; + case PyramidOperation::EXPAND: + pyramidPipe->recordKernel(VD, K, PyramidOperation::EXPAND); + break; + } + } + // remove kernel declaration TextRewriter.RemoveText(D->getSourceRange()); @@ -1342,6 +1488,10 @@ bool Rewrite::VisitFunctionDecl(FunctionDecl *D) { assert(D->getBody() && "main function has no body."); assert(isa(D->getBody()) && "CompoundStmt for main body expected."); mainFD = D; + // TODO, triggle pyramid config + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch()) { + pyramidPipe = new HipaccPyramidPipeline(compilerOptions); + } } return true; @@ -1624,7 +1774,12 @@ bool Rewrite::VisitCXXMemberCallExpr(CXXMemberCallExpr *E) { // TODO: handle the case when only reduce function is specified // // create kernel call string - stringCreator.writeKernelCall(K, newStr); + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch() && + pyramidPipe->isPipelineKernelLaunch()) { + stringCreator.writeKernelCall(K, newStr, pyramidPipe); + } else { + stringCreator.writeKernelCall(K, newStr); + } // rewrite kernel invocation replaceText(E->getBeginLoc(), E->getBeginLoc(), ';', newStr); @@ -1741,6 +1896,10 @@ bool Rewrite::VisitCallExpr (CallExpr *E) { SourceRange range(E->getBeginLoc(), E->getBeginLoc().getLocWithOffset(std::string("traverse").length()-1)); TextRewriter.ReplaceText(range, "hipaccTraverse"); + + if (compilerOptions.emitCUDA() && compilerOptions.asyncKernelLaunch()) { + pyramidPipe->enablePipelineKernelLaunch(); + } } } } diff --git a/runtime/hipacc_cu.hpp.cmake b/runtime/hipacc_cu.hpp.cmake index 4d0f579b..f083ca1a 100644 --- a/runtime/hipacc_cu.hpp.cmake +++ b/runtime/hipacc_cu.hpp.cmake @@ -181,6 +181,7 @@ void hipaccInitCUDA(); void hipaccCopyMemory(const HipaccImage &src, HipaccImage &dst); void hipaccCopyMemoryRegion(const HipaccAccessor &src, const HipaccAccessor &dst); void hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, void **args, bool print_timing=true); +void 
hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, void **args, cudaStream_t stream, cudaEvent_t waitEvent=nullptr, cudaEvent_t recordEvent=nullptr); void hipaccLaunchKernelBenchmark(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, std::vector args, bool print_timing=true); void hipaccLaunchKernelExploration(std::string filename, std::string kernel, std::vector args, std::vector smems, std::vector consts, std::vector texs, @@ -207,8 +208,12 @@ HipaccImage hipaccCreatePyramidImage(const HipaccImage &base, size_t width, size template void hipaccWriteMemory(HipaccImage &img, T *host_mem); template +void hipaccWriteMemory(HipaccImage &img, T *host_mem, cudaStream_t stream); +template T *hipaccReadMemory(const HipaccImage &img); template +T *hipaccReadMemory(const HipaccImage &img, cudaStream_t stream); +template void hipaccBindTexture(hipaccMemoryType mem_type, const textureReference *tex, const HipaccImage &img); template void hipaccBindSurface(hipaccMemoryType mem_type, const surfaceReference *surf, const HipaccImage &img); diff --git a/runtime/hipacc_cu.tpp b/runtime/hipacc_cu.tpp index 36ce05e9..88d43fbc 100644 --- a/runtime/hipacc_cu.tpp +++ b/runtime/hipacc_cu.tpp @@ -122,6 +122,33 @@ void hipaccWriteMemory(HipaccImage &img, T *host_mem) { } +// Write to memory async +template +void hipaccWriteMemory(HipaccImage &img, T *host_mem, cudaStream_t stream) { + if (host_mem == NULL) return; + + size_t width = img->width; + size_t height = img->height; + size_t stride = img->stride; + + if ((char *)host_mem != img->host) + std::copy(host_mem, host_mem + width*height, (T*)img->host); + + if (img->mem_type >= Array2D) { + cudaError_t err = cudaMemcpy2DToArrayAsync((cudaArray *)img->mem, 0, 0, host_mem, stride*sizeof(T), width*sizeof(T), height, cudaMemcpyHostToDevice, stream); + checkErr(err, "cudaMemcpy2DToArray()"); + } else { + if (stride > width) { + cudaError_t err = cudaMemcpy2DAsync(img->mem, stride*sizeof(T), host_mem, width*sizeof(T), width*sizeof(T), height, cudaMemcpyHostToDevice, stream); + checkErr(err, "cudaMemcpy2D()"); + } else { + cudaError_t err = cudaMemcpyAsync(img->mem, host_mem, sizeof(T)*width*height, cudaMemcpyHostToDevice, stream); + checkErr(err, "cudaMemcpy()"); + } + } +} + + // Read from memory template T *hipaccReadMemory(const HipaccImage &img) { @@ -146,6 +173,30 @@ T *hipaccReadMemory(const HipaccImage &img) { } +// Read from memory async +template +T *hipaccReadMemory(const HipaccImage &img, cudaStream_t stream) { + size_t width = img->width; + size_t height = img->height; + size_t stride = img->stride; + + if (img->mem_type >= Array2D) { + cudaError_t err = cudaMemcpy2DFromArrayAsync((T*)img->host, stride*sizeof(T), (cudaArray *)img->mem, 0, 0, width*sizeof(T), height, cudaMemcpyDeviceToHost, stream); + checkErr(err, "cudaMemcpy2DFromArray()"); + } else { + if (stride > width) { + cudaError_t err = cudaMemcpy2DAsync((T*)img->host, width*sizeof(T), img->mem, stride*sizeof(T), width*sizeof(T), height, cudaMemcpyDeviceToHost, stream); + checkErr(err, "cudaMemcpy2D()"); + } else { + cudaError_t err = cudaMemcpyAsync((T*)img->host, img->mem, sizeof(T)*width*height, cudaMemcpyDeviceToHost, stream); + checkErr(err, "cudaMemcpy()"); + } + } + + return (T*)img->host; +} + + // Bind memory to texture template void hipaccBindTexture(hipaccMemoryType mem_type, const textureReference *tex, const HipaccImage &img) { diff --git a/runtime/hipacc_cu_standalone.hpp.cmake b/runtime/hipacc_cu_standalone.hpp.cmake index 
d2c54166..68ef83b7 100644 --- a/runtime/hipacc_cu_standalone.hpp.cmake +++ b/runtime/hipacc_cu_standalone.hpp.cmake @@ -234,6 +234,19 @@ void hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid, } +// Launch kernel async +void hipaccLaunchKernel(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, void **args, cudaStream_t stream, cudaEvent_t waitEvent, cudaEvent_t recordEvent) { + if (waitEvent) { + cudaStreamWaitEvent(stream, waitEvent, 0); + } + cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, stream); + checkErr(err, "cudaLaunchKernel(" + kernel_name + ")"); + if (recordEvent) { + cudaEventRecord(recordEvent, stream); + } +} + + // Benchmark timing for a kernel call void hipaccLaunchKernelBenchmark(const void *kernel, std::string kernel_name, dim3 grid, dim3 block, std::vector args, bool print_timing) { std::vector times;
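
How the pieces above fit together: the new `-use-stream <n>` option (compiler/hipacc.cpp) turns on asynchronous kernel launches, with `-use-stream 1` selecting a single CUDA stream and `-use-stream 2` selecting the multi-stream pyramid pipeline. For the multi-stream case, Rewrite.cpp and CreateHostStrings.cpp emit stream/event setup and per-kernel launch arguments along the lines sketched below. This is an illustrative outline only, not verbatim compiler output; DEPTH stands for the pyramid depth constant and the kernel/argument names are placeholders.

#include <cuda_runtime.h>

// Illustrative shape of the host code generated for a multi-stream pyramid.
// Kernel launches are shown as comments because grid/block/argument setup is
// generated per kernel and omitted here.
void multi_stream_setup_sketch() {
  const int DEPTH = 10;                       // pyramid depth from the DSL program

  cudaStream_t stream[DEPTH - 1];             // one stream per pyramid level
  for (int i = 0; i < DEPTH - 1; i++) cudaStreamCreate(&stream[i]);

  cudaEvent_t events[DEPTH * 2];              // events ordering REDUCE/EXPAND work
  for (int i = 0; i < DEPTH * 2; i++) cudaEventCreate(&events[i]);

  // Per kernel launch, with lvl = <pyramid>.level():
  //   REDUCE : hipaccLaunchKernel(..., stream[lvl - 1], events[lvl - 1], events[lvl]);
  //   FILTER : hipaccLaunchKernel(..., stream[lvl - 1]);
  //   EXPAND : hipaccLaunchKernel(..., stream[lvl],
  //                               events[DEPTH * 2 - lvl - 2],
  //                               events[DEPTH * 2 - lvl - 1]);

  for (int i = 0; i < DEPTH - 1; i++) cudaStreamDestroy(stream[i]);
  for (int i = 0; i < DEPTH * 2; i++) cudaEventDestroy(events[i]);
}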
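
The runtime additions in hipacc_cu.hpp.cmake, hipacc_cu.tpp, and hipacc_cu_standalone.hpp.cmake can also be exercised directly. A minimal sketch, assuming an already-created HipaccImage `img`, a compiled kernel function pointer `kernel`, a prepared argument array `args`, and the generated hipacc_cu.hpp on the include path (none of these are shown in this diff; "my_kernel" is only a label used for error reporting):

#include "hipacc_cu.hpp"   // declares the async overloads added above

// Sketch of the asynchronous transfer/launch path on one user-managed stream.
void async_roundtrip_sketch(HipaccImage &img, unsigned char *host_in,
                            const void *kernel, void **args,
                            dim3 grid, dim3 block) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  hipaccWriteMemory(img, host_in, stream);                          // async host-to-device copy
  hipaccLaunchKernel(kernel, "my_kernel", grid, block, args,        // async launch, no events
                     stream);
  unsigned char *host_out = hipaccReadMemory<unsigned char>(img, stream);  // async device-to-host copy

  cudaStreamSynchronize(stream);                                    // results valid after this
  cudaStreamDestroy(stream);
  (void)host_out;
}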
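
The wave counts printed by HipaccPyramidPipeline::printStreamPipelineInfo follow the occupancy-style calculation in getNumWaves: resident blocks per SM are limited by registers, shared memory, threads, and the per-SM block cap, and the wave count is the total block count divided by what fits on the whole device at once. A small worked example with hypothetical kernel and device numbers (they do not correspond to any specific GPU in TargetDescription.h):

#include <algorithm>
#include <cmath>
#include <cstdio>

// Hypothetical numbers, purely to illustrate the formula in getNumWaves().
int main() {
  unsigned sx = 32, sy = 4;               // block size (threads)
  unsigned r_reg = 32, r_smem = 4096;     // registers per thread, shared mem per block
  unsigned w = 1024, h = 1024;            // image size at the current pyramid level

  unsigned max_regs = 65536, max_smem = 65536;   // per-SM limits (hypothetical)
  unsigned max_thr = 2048, max_blk = 32, num_sms = 46;

  unsigned k_blk  = (unsigned)(std::ceil(w / (float)sx) * std::ceil(h / (float)sy));
  unsigned n_blk  = std::min({max_regs / (r_reg * sx * sy), max_smem / r_smem,
                              max_thr / (sx * sy), max_blk});
  unsigned n_wave = (unsigned)std::ceil(k_blk / (float)(n_blk * num_sms));

  std::printf("blocks=%u, resident blocks per SM=%u, waves=%u\n", k_blk, n_blk, n_wave);
  // blocks=8192, resident blocks per SM=min(16,16,16,32)=16, waves=ceil(8192/(16*46))=12
  return 0;
}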