diff --git a/include/hipacc/AST/ASTFuse.h b/include/hipacc/AST/ASTFuse.h index a5ce8aa3..49aea6ed 100644 --- a/include/hipacc/AST/ASTFuse.h +++ b/include/hipacc/AST/ASTFuse.h @@ -114,10 +114,21 @@ class ASTFuse { SubListPosition Local2PointLoc = Undefined; SubListPosition Point2LocalLoc = Undefined; SubListPosition Local2LocalLoc = Undefined; + FusiblePartitionBlock::PatternType patternType; + + FusionTypeTags(FusiblePartitionBlock::PatternType patternType) : patternType(patternType) {} + }; + + struct KernelListLocation { + // The location of the block in a set of partitionBlockNames + unsigned blockLocation; + + // The location of the respective kernel list in a partitionBlockNames + unsigned listLocation; }; + std::map FusibleKernelSubListPosMap; - std::map> FusibleKernelBlockLocation; - std::set>> fusibleSetNames; + std::map FusibleKernelBlockLocation; std::vector *> fusibleKernelSet; // member functions @@ -128,12 +139,32 @@ class ASTFuse { FunctionDecl *createFusedKernelDecl(std::list *l); void insertPrologFusedKernel(); void insertEpilogFusedKernel(); - void createReg4FusionVarDecl(QualType QT); - void createIdx4FusionVarDecl(); - void createGidVarDecl(); + void createReg4FusionVarDecl(QualType QT, unsigned int ppt); + void createIdx4FusionVarDecl(unsigned int ppt); + void createGidVarDecl(unsigned int ppt); void markKernelPositionSublist(std::list *l); void recomputeMemorySizeLocalFusion(std::list *l); + const FusiblePartitionBlock& getPartitionBlockFor(std::list *l) { + hipacc_require((!l->empty()), "There is no fusion type for empty lists."); + + auto fusibleBlocks = dataDeps->getFusiblePartitionBlocks(); + auto block = fusibleBlocks.end(); + + for (auto k : *l) { + // get iterator + auto innerBlock = FusiblePartitionBlock::findForKernel(k, fusibleBlocks); + if (block != fusibleBlocks.end()) { + hipacc_require((block == innerBlock), "The given kernel list contains kernels of distinct partition blocks."); + } else { + block = innerBlock; + } + } + + 
hipacc_require(block != fusibleBlocks.end(), "The given kernel list did not correspond to a partition block."); + return *block; + } + public: ASTFuse(ASTContext& Ctx, DiagnosticsEngine &Diags, hipacc::Builtin::Context &builtins, CompilerOptions &options, PrintingPolicy Policy, HostDataDeps *dataDeps) : @@ -150,24 +181,30 @@ class ASTFuse { fusionRegVarCount(0), fusionIdxVarCount(0) { - fusibleSetNames = dataDeps->getFusibleSetNames(); + unsigned nFusibleKernelBlockLocations = 0; + for (const auto& fusibleBlock : dataDeps->getFusiblePartitionBlocks()) { // block level + if (!fusibleBlock.isPatternFusible()) { + continue; + } - // unpack fusible kernel info, one kernel per PB - // TODO, merge parallel kernels - unsigned PBlockID; - PBlockID = 0; - for (auto PBN : fusibleSetNames) { // block level unsigned KernelVecID = 0; - for (auto sL : PBN) { // vector level - auto pos = std::make_tuple(PBlockID, KernelVecID); - auto nam = sL.front(); + for (const auto& part : fusibleBlock.getParts()) { // vector level + KernelListLocation pos = { + nFusibleKernelBlockLocations, + KernelVecID + }; + + auto nam = part.front().getName(); + bool locExists = FusibleKernelBlockLocation.find(nam) != FusibleKernelBlockLocation.end(); + hipacc_require(!locExists, "Kernel lists cannot be added twice"); + FusibleKernelBlockLocation[nam] = pos; KernelVecID++; } // create a list for each partion block std::list *list = new std::list; fusibleKernelSet.push_back(list); - PBlockID++; + nFusibleKernelBlockLocations++; } } diff --git a/include/hipacc/AST/ASTTranslate.h b/include/hipacc/AST/ASTTranslate.h index d89dcbdd..c1a5cf8d 100644 --- a/include/hipacc/AST/ASTTranslate.h +++ b/include/hipacc/AST/ASTTranslate.h @@ -197,20 +197,24 @@ class ASTTranslate : public StmtVisitor { class KernelFusionVars { public: bool bSkipGidDecl; - Expr *exprOutput; + VarDecl *exprOutput; bool bReplaceExprOutput; - Expr *exprInput; + bool multipleInputs; + std::map exprInputs; + VarDecl *exprInput; + VarDecl 
*exprInputAccess; + bool bInputAccessProduce; bool bReplaceExprInput; bool bP2LReplaceExprInputIdx; - Expr *exprP2LInputIdx; + VarDecl *exprP2LInputIdx; bool bP2LReplaceInputExprs; Stmt *stmtP2LProducerBody; Expr *exprSharedImgReg; std::string exprSharedImgName; bool bL2LInsertKernelBody; bool bL2LInsertBeforeSmem; - Expr *exprL2LIdXShift; - Expr *exprL2LIdYShift; + VarDecl *exprL2LIdXShift; + VarDecl *exprL2LIdYShift; int curL2LIdXShift; int curL2LIdYShift; bool bL2LRecordBorder; @@ -230,7 +234,11 @@ class ASTTranslate : public StmtVisitor { bSkipGidDecl(true), exprOutput(nullptr), bReplaceExprOutput(false), + multipleInputs(false), + exprInputs(), exprInput(nullptr), + exprInputAccess(nullptr), + bInputAccessProduce(false), bReplaceExprInput(false), bP2LReplaceExprInputIdx(false), exprP2LInputIdx(nullptr), @@ -255,6 +263,8 @@ class ASTTranslate : public StmtVisitor { }; KernelFusionVars fusionVars; + size_t currentPptIndex; + template T *Clone(T *S) { if (S==nullptr) @@ -280,6 +290,8 @@ class ASTTranslate : public StmtVisitor { } } + Expr* createPptVarRefExpr(VarDecl *VD) const; + VarDecl *CloneVarDecl(VarDecl *VD); VarDecl *CloneParmVarDecl(ParmVarDecl *PVD); VarDecl *CloneDeclTex(ParmVarDecl *D, std::string prefix); @@ -458,7 +470,8 @@ class ASTTranslate : public StmtVisitor { tileVars(), lidYRef(nullptr), gidYRef(nullptr), - fusionVars(kernel) { + fusionVars(kernel), + currentPptIndex(0) { // get 'hipacc' namespace context for lookups auto hipacc_ident = &Ctx.Idents.get("hipacc"); for (auto *decl : Ctx.getTranslationUnitDecl()->lookup(hipacc_ident)) @@ -519,7 +532,9 @@ class ASTTranslate : public StmtVisitor { // Kernel Fusion getters and setters void setFusionSkipGidDecl(bool b) { fusionVars.bSkipGidDecl = b; } void setFusionP2PSrcOperator(VarDecl *VD); + void setFusionNP2PSrcOperator(VarDecl *inVD, VarDecl *outVD, bool produce); void setFusionP2PDestOperator(VarDecl *VD); + void setFusionNP2PDestOperator(const std::map& imgVarDeclMap); void 
setFusionP2PIntermOperator(VarDecl *VDIn, VarDecl *VDOut); void setFusionL2PDestOperator(VarDecl *VD, VarDecl *VDSharedImg, std::string nam); void setFusionL2PIntermOperator(VarDecl *VDIn, VarDecl *VDOut, VarDecl *VDSharedImg, std::string nam); diff --git a/include/hipacc/Analysis/HostDataDeps.h b/include/hipacc/Analysis/HostDataDeps.h index 7b3a2b29..e2d0ee7f 100644 --- a/include/hipacc/Analysis/HostDataDeps.h +++ b/include/hipacc/Analysis/HostDataDeps.h @@ -58,6 +58,7 @@ #include #include #include +#include //#define PRINT_DEBUG @@ -118,10 +119,11 @@ class DependencyTracker : public StmtVisitor { } }; - +class FusiblePartitionBlock; class HostDataDeps : public ManagedAnalysis { friend class DependencyTracker; + friend class FusiblePartitionBlock; private: static const bool DEBUG; @@ -166,7 +168,7 @@ class HostDataDeps : public ManagedAnalysis { using partitionBlock = std::vector *>; partitionBlock applicationGraph; using partitionBlockNames = std::vector>; - std::set fusibleSetNames; + std::set fusiblePartitionBlocks; using edgeWeight = std::map, unsigned>; edgeWeight edgeWeightMap_; @@ -485,10 +487,44 @@ class HostDataDeps : public ManagedAnalysis { std::string getMemcpyNodeName(std::string imgDst, std::string imgSrc, std::string direction); std::string getKernelNodeName(std::string kernelName); + // helper to convert a partitionBlock to a block of the respective kernel names + static partitionBlockNames convertToNames(const partitionBlock* pB) { + partitionBlockNames PBNam; + llvm::errs() << " [ "; + for (auto pL : *pB) { + llvm::errs() << "{"; + std::list lNam; + for (auto p : *pL) { + std::string kname = p->getKernel()->getName(); + llvm::errs() << " --> " << kname; + lNam.push_back(kname); + } + llvm::errs() << "} "; + PBNam.push_back(lNam); + } + llvm::errs() << "] \n"; + + return PBNam; + } + + static bool partitionBlockNamesContains( + const std::set& haystack, + const std::string& needle + ) { + for (const auto& PBN : haystack) { + if 
(std::any_of(PBN.begin(), PBN.end(), [&](std::list lNam){ + return (std::find(lNam.begin(), lNam.end(), needle) != lNam.end()) && + (lNam.size() > 1);})) { + return true; + } + } + return false; + } + // kernel fusion analysis void computeGraphWeight(); void fusibilityAnalysis(); - void fusibilityAnalysisLinear(); + void fusibilityAnalysisLinearAndParallel(); void minCutGlobal(partitionBlock PB, partitionBlock &PBRet0, partitionBlock &PBRet1); unsigned minCutPhase(partitionBlock &PB, edgeWeight &curEdgeWeightMap, std::pair &ST); @@ -499,7 +535,7 @@ class HostDataDeps : public ManagedAnalysis { std::string getSharedISName(HipaccKernel *K); bool isSrc(Process *P); bool isDest(Process *P); - std::set getFusibleSetNames() const; + const std::set& getFusiblePartitionBlocks() const; std::string getGraphMemcpyNodeName(std::string dst, std::string src, std::string dir); std::string getGraphKernelNodeName(std::string kernelName); std::set getGraphMemcpyNodeDepOn(std::string dst, std::string src, std::string dir); @@ -507,7 +543,6 @@ class HostDataDeps : public ManagedAnalysis { std::map> getGraphNodeDepMap() const; std::vector getOutputImageNames(); - static HostDataDeps *parse(ASTContext &Context, PrintingPolicy &Policy, AnalysisDeclContext &analysisContext, @@ -522,7 +557,7 @@ class HostDataDeps : public ManagedAnalysis { DependencyTracker DT(Context, Policy, analysisContext, compilerClasses, dataDeps); dataDeps.generateSchedule(); if (dataDeps.compilerOptions->fuseKernels()) { - dataDeps.fusibilityAnalysisLinear(); + dataDeps.fusibilityAnalysisLinearAndParallel(); } if (dataDeps.compilerOptions->useGraph()) { dataDeps.buildGraphDependency(); @@ -532,6 +567,89 @@ class HostDataDeps : public ManagedAnalysis { } }; +class FusiblePartitionBlock { + public: + class KernelInfo; + using Part = std::vector; + + enum class PatternType { + Linear, + Parallel + }; + + enum class Pattern { + // Linear patterns + Linear, + + // Parallel patterns + + // Parallel points to point + 
NP2P, + // Parallel locals to point + NL2P, + // Parallel mixed locals/points to point + Mixed2P, + // Parallel points to local + NP2L, + // Parallel locals to local + NL2L, + // Parallel mixed locals/points to local + Mixed2L + }; + + struct KernelInfo { + std::string name; + + const std::string& getName() const; + + bool operator < ( const KernelInfo& rhs ) const; + }; + + private: + Pattern pattern; + std::vector parts; + std::unordered_set kernelNames; + + public: + FusiblePartitionBlock(PatternType patternType, HostDataDeps::partitionBlock& inBlock); + + static std::set::iterator findForKernel( + const HipaccKernel* kernel, + const std::set& fusibleBlocks + ) { + return std::find_if( + fusibleBlocks.begin(), + fusibleBlocks.end(), + [&](const FusiblePartitionBlock& block) { + return block.hasKernel(kernel); + } + ); + } + + /** + * Check whether the pattern of this block is fusible. + */ + bool isPatternFusible() const { + // Return true if the pattern is fusible by the current ASTFuse tool, false otherwise. 
+ + switch (pattern) { + case FusiblePartitionBlock::Pattern::Linear: + case FusiblePartitionBlock::Pattern::NP2P: + return true; + default: + return false; + } + } + + PatternType getPatternType() const; + Pattern getPattern() const; + const std::vector& getParts() const; + bool hasKernelName(const std::string& name) const; + bool hasKernel(const HipaccKernel* kernel) const; + + bool operator < ( const FusiblePartitionBlock& rhs ) const; +}; + } } diff --git a/lib/AST/ASTFuse.cpp b/lib/AST/ASTFuse.cpp index 8b87c0dc..2c2551c5 100644 --- a/lib/AST/ASTFuse.cpp +++ b/lib/AST/ASTFuse.cpp @@ -56,87 +56,102 @@ void ASTFuse::insertEpilogFusedKernel() { void ASTFuse::markKernelPositionSublist(std::list *l) { + const auto& partitionBlock = getPartitionBlockFor(l); + FusiblePartitionBlock::PatternType patternType = partitionBlock.getPatternType(); + // initialize kernel location tags for (auto K : *l) { - FusionTypeTags *tags = new FusionTypeTags; + FusionTypeTags *tags = new FusionTypeTags(patternType); FusibleKernelSubListPosMap[K] = tags; } - // sub-list indexing - auto itSrc = l->begin(); - auto itLastLocal = l->begin(); - for (auto it = l->begin(); it != l->end(); ++it) { - HipaccKernelClass *KC = (*it)->getKernelClass(); - // local to local fusion, e.g., l -> l - if ((KC->getKernelType() == LocalOperator) && (it == itSrc) && (l->size() == 2) && - (l->back()->getKernelClass()->getKernelType() == LocalOperator)) { - FusionTypeTags *PKTag = FusibleKernelSubListPosMap[l->front()]; - PKTag->Local2LocalLoc = Source; - FusionTypeTags *CKTag = FusibleKernelSubListPosMap[l->back()]; - CKTag->Local2LocalLoc = Destination; - break; - } - - if ((KC->getKernelType() == LocalOperator) && (it != itSrc)) { - // first search all the local kernels in the list - // point to local fusion, e.g., p -> p -> ... 
-> l - for (auto itSub = itSrc; itSub != it; ++itSub) { - HipaccKernel *K = *itSub; - FusionTypeTags *KTag = FusibleKernelSubListPosMap[K]; - if (itSub == itSrc) { - KTag->Point2LocalLoc = Source; - } else { - KTag->Point2LocalLoc = Intermediate; - } + if (patternType == FusiblePartitionBlock::PatternType::Linear) { + // sub-list indexing + auto itSrc = l->begin(); + auto itLastLocal = l->begin(); + for (auto it = l->begin(); it != l->end(); ++it) { + HipaccKernelClass *KC = (*it)->getKernelClass(); + // local to local fusion, e.g., l -> l + if ((KC->getKernelType() == LocalOperator) && (it == itSrc) && (l->size() == 2) && + (l->back()->getKernelClass()->getKernelType() == LocalOperator)) { + FusionTypeTags *PKTag = FusibleKernelSubListPosMap[l->front()]; + PKTag->Local2LocalLoc = Source; + FusionTypeTags *CKTag = FusibleKernelSubListPosMap[l->back()]; + CKTag->Local2LocalLoc = Destination; + break; } - HipaccKernel *KL = *it; - FusionTypeTags *KTagL = FusibleKernelSubListPosMap[KL]; - KTagL->Point2LocalLoc = Destination; - itSrc = std::next(it); - itLastLocal = it; - } else if (std::next(it) == l->end()) { - // after found all the local kernels in the list - // trace back to perform point-based kernels - HipaccKernelClass *KC = (*itLastLocal)->getKernelClass(); - if (KC->getKernelType() == LocalOperator) { - // local to point fusion, e.g., l -> p -> ... -> p - for (auto itSub = itLastLocal; itSub != l->end(); ++itSub) { + + if ((KC->getKernelType() == LocalOperator) && (it != itSrc)) { + // first search all the local kernels in the list + // point to local fusion, e.g., p -> p -> ... 
-> l + for (auto itSub = itSrc; itSub != it; ++itSub) { HipaccKernel *K = *itSub; FusionTypeTags *KTag = FusibleKernelSubListPosMap[K]; - if (itSub == itLastLocal) { - KTag->Local2PointLoc = Source; - } else if (std::next(itSub) == l->end()) { - KTag->Local2PointLoc = Destination; + if (itSub == itSrc) { + KTag->Point2LocalLoc = Source; } else { - KTag->Local2PointLoc = Intermediate; + KTag->Point2LocalLoc = Intermediate; } } - } else { - // point to point fusion, e.g., p -> p -> ... -> p - for (auto itSub = itLastLocal; itSub != l->end(); ++itSub) { - HipaccKernel *K = *itSub; - FusionTypeTags *KTag = FusibleKernelSubListPosMap[K]; - if (itSub == itLastLocal) { - KTag->Point2PointLoc = Source; - } else if (std::next(itSub) == l->end()) { - KTag->Point2PointLoc = Destination; - } else { - KTag->Point2PointLoc = Intermediate; + HipaccKernel *KL = *it; + FusionTypeTags *KTagL = FusibleKernelSubListPosMap[KL]; + KTagL->Point2LocalLoc = Destination; + itSrc = std::next(it); + itLastLocal = it; + } else if (std::next(it) == l->end()) { + // after found all the local kernels in the list + // trace back to perform point-based kernels + HipaccKernelClass *KC = (*itLastLocal)->getKernelClass(); + if (KC->getKernelType() == LocalOperator) { + // local to point fusion, e.g., l -> p -> ... -> p + for (auto itSub = itLastLocal; itSub != l->end(); ++itSub) { + HipaccKernel *K = *itSub; + FusionTypeTags *KTag = FusibleKernelSubListPosMap[K]; + if (itSub == itLastLocal) { + KTag->Local2PointLoc = Source; + } else if (std::next(itSub) == l->end()) { + KTag->Local2PointLoc = Destination; + } else { + KTag->Local2PointLoc = Intermediate; + } + } + } else { + // point to point fusion, e.g., p -> p -> ... 
-> p + for (auto itSub = itLastLocal; itSub != l->end(); ++itSub) { + HipaccKernel *K = *itSub; + FusionTypeTags *KTag = FusibleKernelSubListPosMap[K]; + if (itSub == itLastLocal) { + KTag->Point2PointLoc = Source; + } else if (std::next(itSub) == l->end()) { + KTag->Point2PointLoc = Destination; + } else { + KTag->Point2PointLoc = Intermediate; + } } } } } - } - if (DEBUG) { - std::cout << "[Kernel Fusion INFO] fusible sublist position:\n"; - for (auto K : *l) { - FusionTypeTags *tags = FusibleKernelSubListPosMap[K]; - std::cout << " " << K->getKernelClass()->getName() + K->getName() << ":"; - std::cout << " Point2PointLoc(" << tags->Point2PointLoc << "),"; - std::cout << " Local2PointLoc(" << tags->Local2PointLoc << "),"; - std::cout << " Point2LocalLoc(" << tags->Point2LocalLoc << "),"; - std::cout << " Local2LocalLoc(" << tags->Local2LocalLoc << ")\n"; + if (DEBUG) { + std::cout << "[Kernel Fusion INFO] fusible sublist position:\n"; + for (auto K : *l) { + FusionTypeTags *tags = FusibleKernelSubListPosMap[K]; + std::cout << " " << K->getKernelClass()->getName() + K->getName() << ":"; + std::cout << " Point2PointLoc(" << tags->Point2PointLoc << "),"; + std::cout << " Local2PointLoc(" << tags->Local2PointLoc << "),"; + std::cout << " Point2LocalLoc(" << tags->Point2LocalLoc << "),"; + std::cout << " Local2LocalLoc(" << tags->Local2LocalLoc << ")\n"; + } + } + } else if (patternType == FusiblePartitionBlock::PatternType::Parallel) { + HipaccKernel* consumer = l->back(); + for (auto kernel : *l) { + FusionTypeTags* tags = FusibleKernelSubListPosMap[kernel]; + if (kernel == consumer) { + tags->Point2PointLoc = Destination; + } else { + tags->Point2PointLoc = Source; + } } } } @@ -211,9 +226,24 @@ void ASTFuse::initKernelFusion() { void ASTFuse::HipaccFusion(std::list *l) { hipacc_require((l->size() >=2), "at least two kernels shoud be recorded for fusion"); + + unsigned int ppt = 0; + for (const auto& kernel : *l) { + if (ppt != 0) { + 
hipacc_require((kernel->getPixelsPerThread() == ppt), "kernels in one fusion list must all have the same PPT"); + } else { + ppt = kernel->getPixelsPerThread(); + } + } + + hipacc_require((ppt >= 1), "The PPT of the given fusion list could not be determined. This indicates a bug in the compiler implementation."); + + const auto& partitionBlock = getPartitionBlockFor(l); + FusiblePartitionBlock::PatternType patternType = partitionBlock.getPatternType(); + initKernelFusion(); curFusedKernelDecl = createFusedKernelDecl(l); - createGidVarDecl(); + createGidVarDecl(ppt); markKernelPositionSublist(l); @@ -231,12 +261,17 @@ void ASTFuse::HipaccFusion(std::list *l) { if (DEBUG) { std::cout << "[Kernel Fusion INFO] domain-specific fusion:\n"; } + + std::map parallelOutImageMap; + VarDecl* parallelInput = nullptr; + // fused kernel body generation for (auto it = (l->begin()); it != l->end(); ++it) { Stmt *curFusionBody = nullptr; HipaccKernel *K = *it; HipaccKernelClass *KC = K->getKernelClass(); KernelType KernelType = KC->getKernelType(); + HipaccIterationSpace* iterSpace = K->getIterationSpace(); if (DEBUG) { std::cout << " Kernel " << KC->getName() + K->getName() << " executes "; } FusionTypeTags *KTag = FusibleKernelSubListPosMap[K]; FunctionDecl *kernelDecl = createFunctionDecl(Ctx, @@ -245,140 +280,190 @@ void ASTFuse::HipaccFusion(std::list *l) { ASTTranslate *Hipacc = new ASTTranslate(Ctx, kernelDecl, K, KC, builtins, compilerOptions); - // Point-to-Point Transformation - switch(KTag->Point2PointLoc) { - default: - break; - case Source: - if (DEBUG) { std::cout << "P2P source generate"; } - hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); - createReg4FusionVarDecl(KC->getOutField()->getType()); - Hipacc->setFusionP2PSrcOperator(fusionRegVarDecls.back()); - curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); - break; - case Destination: - if (DEBUG) { std::cout << "P2P Destination generate"; } - hipacc_require(KernelType 
== PointOperator, "Mismatch kernel type for fusion"); - Hipacc->setFusionP2PDestOperator(fusionRegVarDecls.back()); - curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); - break; - case Intermediate: - if (DEBUG) { std::cout << "P2P Intermediate generate"; } - hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); - VarDecl *VDIn = fusionRegVarDecls.back(); - createReg4FusionVarDecl(KC->getOutField()->getType()); - VarDecl *VDOut = fusionRegVarDecls.back(); - Hipacc->setFusionP2PIntermOperator(VDIn, VDOut); - curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); - break; - } - - // Local-to-Point Transformation - switch(KTag->Local2PointLoc) { - default: - break; - case Source: - hipacc_require(KernelType == LocalOperator, "Mismatch kernel type for fusion"); - if (DEBUG) { std::cout << "L2P source generate"; } - if (dataDeps->hasSharedIS(K)) { - createReg4FusionVarDecl(KC->getOutField()->getType()); - regVDSImg = fusionRegVarDecls.back(); - } - createReg4FusionVarDecl(KC->getOutField()->getType()); - Hipacc->setFusionP2PSrcOperator(fusionRegVarDecls.back()); - curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); - if (dataDeps->hasSharedIS(K)) { - fusionRegSharedStmts.push_back(Hipacc->getFusionSharedInputStmt(regVDSImg)); - } - break; - case Destination: - hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); - if (DEBUG) { std::cout << "L2P Destination generate"; } - if (dataDeps->hasSharedIS(K)) { - Hipacc->setFusionL2PDestOperator(fusionRegVarDecls.back(), regVDSImg, - dataDeps->getSharedISName(K)); - } else { + if (patternType == FusiblePartitionBlock::PatternType::Linear) { + // Point-to-Point Transformation + switch(KTag->Point2PointLoc) { + default: + break; + case Source: + if (DEBUG) { std::cout << "P2P source generate"; } + hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); + createReg4FusionVarDecl(KC->getOutField()->getType(), 
ppt); + Hipacc->setFusionP2PSrcOperator(fusionRegVarDecls.back()); + curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); + break; + case Destination: + if (DEBUG) { std::cout << "P2P Destination generate"; } + hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); Hipacc->setFusionP2PDestOperator(fusionRegVarDecls.back()); - } - curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); - break; - case Intermediate: - hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); - if (DEBUG) { std::cout << "L2P Intermediate generate"; } - VarDecl *VDIn = fusionRegVarDecls.back(); - createReg4FusionVarDecl(KC->getOutField()->getType()); - VarDecl *VDOut = fusionRegVarDecls.back(); - if (dataDeps->hasSharedIS(K)) { - Hipacc->setFusionL2PIntermOperator(VDIn, VDOut, regVDSImg, dataDeps->getSharedISName(K)); - } else { + curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); + break; + case Intermediate: + if (DEBUG) { std::cout << "P2P Intermediate generate"; } + hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); + VarDecl *VDIn = fusionRegVarDecls.back(); + createReg4FusionVarDecl(KC->getOutField()->getType(), ppt); + VarDecl *VDOut = fusionRegVarDecls.back(); Hipacc->setFusionP2PIntermOperator(VDIn, VDOut); - } - curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); - break; - } + curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); + break; + } - // Point-to-Local Transformation - switch(KTag->Point2LocalLoc) { - default: - break; - case Source: - hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); - if (DEBUG) { std::cout << "P2L source generate"; } - createIdx4FusionVarDecl(); - createReg4FusionVarDecl(KC->getOutField()->getType()); - Hipacc->setFusionP2LSrcOperator(fusionRegVarDecls.back(), fusionIdxVarDecls.back()); - vecProducerP2LBody.clear(); - 
vecProducerP2LBody.push_back(Hipacc->Hipacc(KC->getKernelFunction()->getBody())); - break; - case Destination: - hipacc_require(KernelType == LocalOperator, "Mismatch kernel type for fusion"); - if (DEBUG) { std::cout << "P2L Destination generate"; } - Hipacc->setFusionP2LDestOperator(fusionRegVarDecls.back(), fusionIdxVarDecls.back(), - createCompoundStmt(Ctx, vecProducerP2LBody)); - curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); - break; - case Intermediate: - hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); - if (DEBUG) { std::cout << "P2L Intermediate generate"; } - VarDecl *VDIn = fusionRegVarDecls.back(); - createReg4FusionVarDecl(KC->getOutField()->getType()); - VarDecl *VDOut = fusionRegVarDecls.back(); - Hipacc->setFusionP2PIntermOperator(VDIn, VDOut); - vecProducerP2LBody.push_back(Hipacc->Hipacc(KC->getKernelFunction()->getBody())); - break; - } + // Local-to-Point Transformation + switch(KTag->Local2PointLoc) { + default: + break; + case Source: + hipacc_require(KernelType == LocalOperator, "Mismatch kernel type for fusion"); + if (DEBUG) { std::cout << "L2P source generate"; } + if (dataDeps->hasSharedIS(K)) { + createReg4FusionVarDecl(KC->getOutField()->getType(), ppt); + regVDSImg = fusionRegVarDecls.back(); + } + createReg4FusionVarDecl(KC->getOutField()->getType(), ppt); + Hipacc->setFusionP2PSrcOperator(fusionRegVarDecls.back()); + curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); + if (dataDeps->hasSharedIS(K)) { + fusionRegSharedStmts.push_back(Hipacc->getFusionSharedInputStmt(regVDSImg)); + } + break; + case Destination: + hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); + if (DEBUG) { std::cout << "L2P Destination generate"; } + if (dataDeps->hasSharedIS(K)) { + Hipacc->setFusionL2PDestOperator(fusionRegVarDecls.back(), regVDSImg, + dataDeps->getSharedISName(K)); + } else { + Hipacc->setFusionP2PDestOperator(fusionRegVarDecls.back()); + } + 
curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); + break; + case Intermediate: + hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); + if (DEBUG) { std::cout << "L2P Intermediate generate"; } + VarDecl *VDIn = fusionRegVarDecls.back(); + createReg4FusionVarDecl(KC->getOutField()->getType(), ppt); + VarDecl *VDOut = fusionRegVarDecls.back(); + if (dataDeps->hasSharedIS(K)) { + Hipacc->setFusionL2PIntermOperator(VDIn, VDOut, regVDSImg, dataDeps->getSharedISName(K)); + } else { + Hipacc->setFusionP2PIntermOperator(VDIn, VDOut); + } + curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); + break; + } - // Local-to-Local Transformation - switch(KTag->Local2LocalLoc) { - default: - break; - case Source: - hipacc_require(KernelType == LocalOperator, "Mismatch kernel type for fusion"); - if (DEBUG) { std::cout << "L2L source generate"; } - createReg4FusionVarDecl(KC->getOutField()->getType()); - createIdx4FusionVarDecl(); - idxXFused = fusionIdxVarDecls.back(); - createIdx4FusionVarDecl(); - idxYFused = fusionIdxVarDecls.back(); - Hipacc->setFusionL2LSrcOperator(fusionRegVarDecls.back(), idxXFused, idxYFused, + // Point-to-Local Transformation + switch(KTag->Point2LocalLoc) { + default: + break; + case Source: + hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); + if (DEBUG) { std::cout << "P2L source generate"; } + createIdx4FusionVarDecl(ppt); + createReg4FusionVarDecl(KC->getOutField()->getType(), ppt); + Hipacc->setFusionP2LSrcOperator(fusionRegVarDecls.back(), fusionIdxVarDecls.back()); + vecProducerP2LBody.clear(); + vecProducerP2LBody.push_back(Hipacc->Hipacc(KC->getKernelFunction()->getBody())); + break; + case Destination: + hipacc_require(KernelType == LocalOperator, "Mismatch kernel type for fusion"); + if (DEBUG) { std::cout << "P2L Destination generate"; } + Hipacc->setFusionP2LDestOperator(fusionRegVarDecls.back(), fusionIdxVarDecls.back(), + createCompoundStmt(Ctx, 
vecProducerP2LBody)); + curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); + break; + case Intermediate: + hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); + if (DEBUG) { std::cout << "P2L Intermediate generate"; } + VarDecl *VDIn = fusionRegVarDecls.back(); + createReg4FusionVarDecl(KC->getOutField()->getType(), ppt); + VarDecl *VDOut = fusionRegVarDecls.back(); + Hipacc->setFusionP2PIntermOperator(VDIn, VDOut); + vecProducerP2LBody.push_back(Hipacc->Hipacc(KC->getKernelFunction()->getBody())); + break; + } + + // Local-to-Local Transformation + switch(KTag->Local2LocalLoc) { + default: + break; + case Source: + hipacc_require(KernelType == LocalOperator, "Mismatch kernel type for fusion"); + if (DEBUG) { std::cout << "L2L source generate"; } + createReg4FusionVarDecl(KC->getOutField()->getType(), ppt); + createIdx4FusionVarDecl(ppt); + idxXFused = fusionIdxVarDecls.back(); + createIdx4FusionVarDecl(ppt); + idxYFused = fusionIdxVarDecls.back(); + Hipacc->setFusionL2LSrcOperator(fusionRegVarDecls.back(), idxXFused, idxYFused, + std::get<1>(localKernelMemorySizeMap[K])); + Hipacc->Hipacc(KC->getKernelFunction()->getBody()); + stmtsL2LProducerKernel = Hipacc->getFusionLocalKernelBody(); + KLocalSrc = K; + break; + case Destination: + hipacc_require(KernelType == LocalOperator, "Mismatch kernel type for fusion"); + if (DEBUG) { std::cout << "L2L Destination generate"; } + Hipacc->setFusionL2LDestOperator(stmtsL2LProducerKernel, + fusionRegVarDecls.back(), idxXFused, idxYFused, std::get<1>(localKernelMemorySizeMap[K])); - Hipacc->Hipacc(KC->getKernelFunction()->getBody()); - stmtsL2LProducerKernel = Hipacc->getFusionLocalKernelBody(); - KLocalSrc = K; - break; - case Destination: - hipacc_require(KernelType == LocalOperator, "Mismatch kernel type for fusion"); - if (DEBUG) { std::cout << "L2L Destination generate"; } - Hipacc->setFusionL2LDestOperator(stmtsL2LProducerKernel, - fusionRegVarDecls.back(), idxXFused, 
idxYFused, - std::get<1>(localKernelMemorySizeMap[K])); - Hipacc->Hipacc(KC->getKernelFunction()->getBody()); - stmtsL2LConsumerKernel = Hipacc->getFusionLocalKernelBody(); - Local2LocalEndInsertion = true; - break; - case Intermediate: - hipacc_require(0, "Only two local kernels can be fused"); - break; + Hipacc->Hipacc(KC->getKernelFunction()->getBody()); + stmtsL2LConsumerKernel = Hipacc->getFusionLocalKernelBody(); + Local2LocalEndInsertion = true; + break; + case Intermediate: + hipacc_require(0, "Only two local kernels can be fused"); + break; + } + } else if (patternType == FusiblePartitionBlock::PatternType::Parallel) { + auto imgFields = KC->getImgFields(); + + FieldDecl* srcAccessorDecl = nullptr; + FieldDecl* srcIterDecl = nullptr; + HipaccAccessor* srcAccessor = nullptr; + bool srcProduce = (parallelInput == nullptr); + + switch(KTag->Point2PointLoc) { + default: + break; + case Source: + if (DEBUG) { std::cout << "P2P source generate"; } + hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); + + srcIterDecl = KC->getOutField(); + for (FieldDecl* fieldDecl : imgFields) { + if (fieldDecl != srcIterDecl) { + hipacc_require(srcAccessorDecl == nullptr, + "source kernel in nP2P fusion may not have more than one accessor"); + srcAccessorDecl = fieldDecl; + } + } + hipacc_require(srcAccessorDecl != nullptr, + "source kernel in nP2P fusion must have exactly one accessor"); + + srcAccessor = K->getImgFromMapping(srcAccessorDecl); + + if (parallelInput == nullptr) { + createReg4FusionVarDecl(srcAccessor->getImage()->getType(), ppt); + parallelInput = fusionRegVarDecls.back(); + } + + createReg4FusionVarDecl(KC->getOutField()->getType(), ppt); + parallelOutImageMap[iterSpace->getImage()] = fusionRegVarDecls.back(); + Hipacc->setFusionNP2PSrcOperator(parallelInput, fusionRegVarDecls.back(), srcProduce); + curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); + break; + case Destination: + if (DEBUG) { std::cout << "P2P 
Destination generate"; } + hipacc_require(KernelType == PointOperator, "Mismatch kernel type for fusion"); + Hipacc->setFusionNP2PDestOperator(parallelOutImageMap); + curFusionBody = Hipacc->Hipacc(KC->getKernelFunction()->getBody()); + break; + case Intermediate: + hipacc_require(false, "Invalid kernel position for parallel fusion"); + break; + } } if (curFusionBody) { @@ -424,24 +509,28 @@ bool ASTFuse::parseFusibleKernel(HipaccKernel *K) { if (!dataDeps->isFusible(K)) { return false; } // prepare fusible kernel list - unsigned PBlockID, KernelVecID; std::string kernelName = K->getKernelClass()->getName() + K->getName(); hipacc_require(FusibleKernelBlockLocation.count(kernelName), "Kernel name has no record"); - std::tie(PBlockID, KernelVecID) = FusibleKernelBlockLocation[kernelName]; - auto PBl = fusibleKernelSet[PBlockID]; + + auto loc = FusibleKernelBlockLocation[kernelName]; + + auto PBl = fusibleKernelSet[loc.blockLocation]; PBl->push_back(K); // fusion starts whenever a fusible block is ready - auto PBNam = *std::next(fusibleSetNames.begin(), PBlockID); - if (PBl->size() == PBNam.size()) { + + auto fusibleParts = std::next(dataDeps->getFusiblePartitionBlocks().begin(), loc.blockLocation)->getParts(); + if (PBl->size() == fusibleParts.size()) { // sort fusible list based on data dependence PBl->sort([&] (HipaccKernel *ka, HipaccKernel *kb) -> bool { std::string kaNam = ka->getKernelClass()->getName() + ka->getName(); - auto itKa = std::find_if(PBNam.begin(), PBNam.end(), - [&](std::list ls) { return ls.front() == kaNam; }); - hipacc_require(itKa != PBNam.end(), "Kernel cannot be sorted"); + auto itKa = std::find_if(fusibleParts.begin(), fusibleParts.end(), + [&](const FusiblePartitionBlock::Part& part) { return part.front().getName() == kaNam; }); + hipacc_require(itKa != fusibleParts.end(), "Kernel cannot be sorted"); std::string kbNam = kb->getKernelClass()->getName() + kb->getName(); - return (std::find(itKa->begin(), itKa->end(), kbNam) != itKa->end()); 
+ return (std::find_if(itKa->begin(), itKa->end(), [&](const FusiblePartitionBlock::KernelInfo& info) { + return info.getName() == kbNam; + }) != itKa->end()); }); if (DEBUG) { @@ -460,27 +549,30 @@ bool ASTFuse::parseFusibleKernel(HipaccKernel *K) { // getters bool ASTFuse::isSrcKernel(HipaccKernel *K) { - unsigned PBlockID, KernelVecID; std::string kernelName = K->getKernelClass()->getName() + K->getName(); hipacc_require(FusibleKernelBlockLocation.count(kernelName), "Kernel name has no record"); - std::tie(PBlockID, KernelVecID) = FusibleKernelBlockLocation[kernelName]; - return fusibleKernelSet[PBlockID]->front() == K; + + auto loc = FusibleKernelBlockLocation[kernelName]; + + return fusibleKernelSet[loc.blockLocation]->front() == K; } bool ASTFuse::isDestKernel(HipaccKernel *K) { - unsigned PBlockID, KernelVecID; std::string kernelName = K->getKernelClass()->getName() + K->getName(); hipacc_require(FusibleKernelBlockLocation.count(kernelName), "Kernel name has no record"); - std::tie(PBlockID, KernelVecID) = FusibleKernelBlockLocation[kernelName]; - return fusibleKernelSet[PBlockID]->back() == K; + + auto loc = FusibleKernelBlockLocation[kernelName]; + + return fusibleKernelSet[loc.blockLocation]->back() == K; } HipaccKernel *ASTFuse::getProducerKernel(HipaccKernel *K) { - unsigned PBlockID, KernelVecID; std::string kernelName = K->getKernelClass()->getName() + K->getName(); hipacc_require(FusibleKernelBlockLocation.count(kernelName), "Kernel name has no record"); - std::tie(PBlockID, KernelVecID) = FusibleKernelBlockLocation[kernelName]; - auto PBl = fusibleKernelSet[PBlockID]; + + auto loc = FusibleKernelBlockLocation[kernelName]; + + auto PBl = fusibleKernelSet[loc.blockLocation]; auto it = std::find(PBl->begin(), PBl->end(), K); return (it == PBl->begin()) ? 
nullptr : *std::prev(it); } @@ -492,19 +584,25 @@ SmallVector ASTFuse::getFusedFileNamesAll() const { std::string ASTFuse::getFusedKernelName(HipaccKernel *K) { return fusedKernelNameMap[K]; } unsigned ASTFuse::getNewYSizeLocalKernel(HipaccKernel *K) { return std::get<1>(fusedLocalKernelMemorySizeMap[K]); } -void ASTFuse::createReg4FusionVarDecl(QualType QT) { - std::string Name = "_reg_fusion" + std::to_string(fusionRegVarCount++); - VarDecl *VD = createVarDecl(Ctx, Ctx.getTranslationUnitDecl(), Name, QT); - fusionRegVarDecls.push_back(VD); +void ASTFuse::createReg4FusionVarDecl(QualType QT, unsigned int ppt) { + std::string regCount = std::to_string(fusionRegVarCount++); + for (unsigned int i = 0; i < ppt; ++i) { + std::string Name = "_reg_fusion" + regCount + "_" + std::to_string(i); + VarDecl *VD = createVarDecl(Ctx, Ctx.getTranslationUnitDecl(), Name, QT); + fusionRegVarDecls.push_back(VD); + } } -void ASTFuse::createIdx4FusionVarDecl() { - std::string Name = "_idx_fusion" + std::to_string(fusionIdxVarCount++); - VarDecl *VD = createVarDecl(Ctx, Ctx.getTranslationUnitDecl(), Name, Ctx.IntTy); - fusionIdxVarDecls.push_back(VD); +void ASTFuse::createIdx4FusionVarDecl(unsigned int ppt) { + std::string idxCount = std::to_string(fusionIdxVarCount++); + for (unsigned int i = 0; i < ppt; ++i) { + std::string Name = "_idx_fusion" + idxCount + "_" + std::to_string(i); + VarDecl *VD = createVarDecl(Ctx, Ctx.getTranslationUnitDecl(), Name, Ctx.IntTy); + fusionIdxVarDecls.push_back(VD); + } } -void ASTFuse::createGidVarDecl() { +void ASTFuse::createGidVarDecl(unsigned int ppt) { SmallVector uintDeclTypes; SmallVector uintDeclNames; uintDeclTypes.push_back(Ctx.UnsignedIntTy); @@ -539,6 +637,13 @@ void ASTFuse::createGidVarDecl() { block_id_x, BO_Mul, Ctx.IntTy), local_id_x, BO_Add, Ctx.IntTy)); Expr *YE = createBinaryOperator(Ctx, local_size_y, block_id_y, BO_Mul, Ctx.IntTy); + + // Adapt YE according to PPT + if (ppt > 1) { + YE = createBinaryOperator(Ctx, YE, 
createIntegerLiteral(Ctx, + static_cast(ppt)), BO_Mul, Ctx.IntTy); + } + VarDecl *gid_y = createVarDecl(Ctx, curFusedKernelDecl, "gid_y", Ctx.getConstType(Ctx.IntTy), createBinaryOperator(Ctx, YE, local_id_y, BO_Add, Ctx.IntTy)); fusionRegVarDecls.push_back(gid_x); diff --git a/lib/AST/ASTTranslate.cpp b/lib/AST/ASTTranslate.cpp index 54a93d13..d0f5153d 100644 --- a/lib/AST/ASTTranslate.cpp +++ b/lib/AST/ASTTranslate.cpp @@ -31,6 +31,8 @@ // //===----------------------------------------------------------------------===// +#include + #include "hipacc/AST/ASTTranslate.h" using namespace clang; @@ -1086,6 +1088,8 @@ Stmt *ASTTranslate::Hipacc(Stmt *S) { } for (size_t p=0; pgetPixelsPerThread(); ++p) { + currentPptIndex = p; + // clear all stored decls before cloning, otherwise existing // VarDecls will be reused and we will miss declarations KernelDeclMap.clear(); @@ -1169,6 +1173,8 @@ Stmt *ASTTranslate::Hipacc(Stmt *S) { } } + currentPptIndex = 0; + // add label statement if needed (boundary handling), else add body if (border_handling) { LabelStmt *LS = createLabelStmt(Ctx, LDS[ld_count++], @@ -1197,6 +1203,14 @@ Stmt *ASTTranslate::Hipacc(Stmt *S) { } +Expr *ASTTranslate::createPptVarRefExpr(VarDecl *VD) const { + std::string originalIdent = VD->getNameAsString(); + std::string newIdent = originalIdent.substr(0, originalIdent.rfind("_")) + "_" + std::to_string(currentPptIndex); + VarDecl* toRef = createVarDecl(Ctx, Ctx.getTranslationUnitDecl(), newIdent, VD->getType()); + return createDeclRefExpr(Ctx, toRef); +} + + VarDecl *ASTTranslate::CloneVarDecl(VarDecl *VD) { VarDecl *result = KernelDeclMap[VD]; @@ -1351,10 +1365,10 @@ Stmt *ASTTranslate::VisitCompoundStmtTranslate(CompoundStmt *S) { if (Kernel->isFusible() && fusionVars.bL2LInsertKernelBody && fusionVars.bL2LInsertBeforeSmem) { Expr *offset_x, *offset_y; - offset_x = createBinaryOperator(Ctx, fusionVars.exprL2LIdXShift, + offset_x = createBinaryOperator(Ctx, createPptVarRefExpr(fusionVars.exprL2LIdXShift), 
createIntegerLiteral(Ctx, fusionVars.curL2LIdXShift), BO_Assign, Ctx.IntTy); - offset_y = createBinaryOperator(Ctx, fusionVars.exprL2LIdYShift, + offset_y = createBinaryOperator(Ctx, createPptVarRefExpr(fusionVars.exprL2LIdYShift), createIntegerLiteral(Ctx, fusionVars.curL2LIdYShift), BO_Assign, Ctx.IntTy); body.push_back(offset_x); @@ -1633,9 +1647,16 @@ Expr *ASTTranslate::VisitMemberExprTranslate(MemberExpr *E) { setExprProps(E, result); if (Kernel->isFusible() && fusionVars.bReplaceExprInput && - KernelClass->getKernelType() == PointOperator && - Kernel->getImgFromMapping(dyn_cast(VD))) { - return fusionVars.exprInput; + KernelClass->getKernelType() == PointOperator) { + HipaccAccessor* accessor = Kernel->getImgFromMapping(dyn_cast(VD)); + if (accessor != nullptr) { + if (fusionVars.multipleInputs) { + HipaccImage* img = accessor->getImage(); + return createPptVarRefExpr(fusionVars.exprInputs[img]); + } else { + return createPptVarRefExpr(fusionVars.exprInput); + } + } } return result; @@ -1957,13 +1978,13 @@ Expr *ASTTranslate::VisitCXXOperatorCallExprTranslate(CXXOperatorCallExpr *E) { } else { TXOld = createIntegerLiteral(Ctx, 0); } - TX = createBinaryOperator(Ctx, fusionVars.exprL2LIdXShift, TXOld, BO_Add, Ctx.IntTy); + TX = createBinaryOperator(Ctx, createPptVarRefExpr(fusionVars.exprL2LIdXShift), TXOld, BO_Add, Ctx.IntTy); if (acc->getSizeY() > 1) { SYOld = createIntegerLiteral(Ctx, static_cast(fusionVars.curL2LVarAccSizeY/2)); } else { SYOld = createIntegerLiteral(Ctx, 0); } - SY = createBinaryOperator(Ctx, fusionVars.exprL2LIdYShift, SYOld, BO_Add, Ctx.IntTy); + SY = createBinaryOperator(Ctx, createPptVarRefExpr(fusionVars.exprL2LIdYShift), SYOld, BO_Add, Ctx.IntTy); } else { if (acc->getSizeX() > 1) { if (Kernel->isFusible() && compilerOptions.allowMisAlignedAccess()) { @@ -1992,7 +2013,16 @@ Expr *ASTTranslate::VisitCXXOperatorCallExprTranslate(CXXOperatorCallExpr *E) { if (use_shared) { result = accessMemShared(DRE, TX, SY); } else { - result = 
accessMem(LHS, acc, mem_acc); + if (fusionVars.exprInputAccess != nullptr) { + if (!fusionVars.bInputAccessProduce) { + result = createPptVarRefExpr(fusionVars.exprInputAccess); + } else { + Expr* genMemAccess = accessMem(LHS, acc, mem_acc); + result = createCompoundAssignOperator(Ctx, createPptVarRefExpr(fusionVars.exprInputAccess), genMemAccess, BinaryOperatorKind::BO_Assign, genMemAccess->getType()); + } + } else { + result = accessMem(LHS, acc, mem_acc); + } } if (Kernel->isFusible() && KernelClass->getKernelType() == LocalOperator) { @@ -2079,7 +2109,7 @@ Expr *ASTTranslate::VisitCXXOperatorCallExprTranslate(CXXOperatorCallExpr *E) { if (Kernel->isFusible() && KernelClass->getKernelType() == LocalOperator) { fusionVars.bL2LInsertBeforeSmem = true; - if (fusionVars.bReplaceExprInput) return fusionVars.exprInput; + if (fusionVars.bReplaceExprInput) return createPptVarRefExpr(fusionVars.exprInput); } } @@ -2087,7 +2117,7 @@ Expr *ASTTranslate::VisitCXXOperatorCallExprTranslate(CXXOperatorCallExpr *E) { if (Kernel->isFusible() && fusionVars.bP2LReplaceExprInputIdx) { ArraySubscriptExpr *tempASE = dyn_cast(result); - tempASE->setRHS(fusionVars.exprP2LInputIdx); + tempASE->setRHS(createPptVarRefExpr(fusionVars.exprP2LInputIdx)); } return result; @@ -2179,7 +2209,7 @@ Expr *ASTTranslate::VisitCXXMemberCallExprTranslate(CXXMemberCallExpr *E) { setExprProps(E, result); if (Kernel->isFusible() && fusionVars.bReplaceExprOutput) { - return fusionVars.exprOutput; + return createPptVarRefExpr(fusionVars.exprOutput); } return result; } @@ -2421,48 +2451,66 @@ Expr *ASTTranslate::BinningTranslator::translateCXXMemberCallExpr(CXXMemberCallE void ASTTranslate::setFusionP2PSrcOperator(VarDecl *VD) { fusionVars.bReplaceExprOutput = true; - fusionVars.exprOutput = createDeclRefExpr(Ctx, VD); + fusionVars.exprOutput = CloneVarDecl(VD); +} + +void ASTTranslate::setFusionNP2PSrcOperator(VarDecl *inVD, VarDecl *outVD, bool produce) { + fusionVars.bReplaceExprOutput = true; + 
fusionVars.exprOutput = CloneVarDecl(outVD); + fusionVars.exprInputAccess = CloneVarDecl(inVD); + fusionVars.bInputAccessProduce = produce; } void ASTTranslate::setFusionP2PDestOperator(VarDecl *VD) { fusionVars.bReplaceExprInput = true; - fusionVars.exprInput = createDeclRefExpr(Ctx, VD); + fusionVars.exprInput = CloneVarDecl(VD); +} + +void ASTTranslate::setFusionNP2PDestOperator(const std::map& imgVarDeclMap) { + fusionVars.bReplaceExprInput = true; + fusionVars.multipleInputs = true; + + for (auto it = imgVarDeclMap.begin(); it != imgVarDeclMap.end(); ++it) { + HipaccImage* img = it->first; + VarDecl* VD = it->second; + fusionVars.exprInputs[img] = CloneVarDecl(VD); + } } void ASTTranslate::setFusionP2PIntermOperator(VarDecl *VDIn, VarDecl *VDOut) { fusionVars.bReplaceExprInput = true; - fusionVars.exprInput = createDeclRefExpr(Ctx, VDIn); + fusionVars.exprInput = CloneVarDecl(VDIn); fusionVars.bReplaceExprOutput = true; - fusionVars.exprOutput = createDeclRefExpr(Ctx, VDOut); + fusionVars.exprOutput = CloneVarDecl(VDOut); } void ASTTranslate::setFusionL2PDestOperator(VarDecl *VD, VarDecl *VDSharedImg, std::string nam) { fusionVars.bReplaceExprInput = true; - fusionVars.exprInput = createDeclRefExpr(Ctx, VD); + fusionVars.exprInput = CloneVarDecl(VD); fusionVars.exprSharedImgReg = createDeclRefExpr(Ctx, VDSharedImg); fusionVars.exprSharedImgName = nam; } void ASTTranslate::setFusionL2PIntermOperator(VarDecl *VDIn, VarDecl *VDOut, VarDecl *VDSharedImg, std::string nam) { fusionVars.bReplaceExprInput = true; - fusionVars.exprInput = createDeclRefExpr(Ctx, VDIn); + fusionVars.exprInput = CloneVarDecl(VDIn); fusionVars.bReplaceExprOutput = true; - fusionVars.exprOutput = createDeclRefExpr(Ctx, VDOut); + fusionVars.exprOutput = CloneVarDecl(VDOut); fusionVars.exprSharedImgReg = createDeclRefExpr(Ctx, VDSharedImg); fusionVars.exprSharedImgName = nam; } void ASTTranslate::setFusionP2LSrcOperator(VarDecl *VDReg, VarDecl *VDIdx) { fusionVars.bP2LReplaceExprInputIdx = 
true; - fusionVars.exprP2LInputIdx = createDeclRefExpr(Ctx, VDIdx); + fusionVars.exprP2LInputIdx = CloneVarDecl(VDIdx); fusionVars.bReplaceExprOutput = true; - fusionVars.exprOutput = createDeclRefExpr(Ctx, VDReg); + fusionVars.exprOutput = CloneVarDecl(VDReg); } void ASTTranslate::setFusionP2LDestOperator(VarDecl *VDReg, VarDecl *VDIdx, Stmt *S) { fusionVars.bP2LReplaceInputExprs = true; - fusionVars.exprP2LInputIdx = createDeclRefExpr(Ctx, VDIdx); - fusionVars.exprOutput = createDeclRefExpr(Ctx, VDReg); + fusionVars.exprP2LInputIdx = CloneVarDecl(VDIdx); + fusionVars.exprOutput = CloneVarDecl(VDReg); fusionVars.stmtP2LProducerBody = S; } @@ -2473,9 +2521,9 @@ void ASTTranslate::setFusionL2LSrcOperator(VarDecl *VDRegOut, VarDecl *VDIdX, fusionVars.curL2LVarAccSizeX = sz; fusionVars.curL2LVarAccSizeY = sz; fusionVars.bReplaceExprOutput = true; - fusionVars.exprOutput = createDeclRefExpr(Ctx, VDRegOut); - fusionVars.exprL2LIdXShift = createDeclRefExpr(Ctx, VDIdX); - fusionVars.exprL2LIdYShift = createDeclRefExpr(Ctx, VDIdY); + fusionVars.exprOutput = CloneVarDecl(VDRegOut); + fusionVars.exprL2LIdXShift = CloneVarDecl(VDIdX); + fusionVars.exprL2LIdYShift = CloneVarDecl(VDIdY); } void ASTTranslate::setFusionL2LEndSrcOperator(std::queue stmtsLocal, @@ -2484,8 +2532,8 @@ void ASTTranslate::setFusionL2LEndSrcOperator(std::queue stmtsLocal, fusionVars.curL2LVarAccSizeX = sz; fusionVars.curL2LVarAccSizeY = sz; fusionVars.stmtsL2LProducerKernel = stmtsLocal; - fusionVars.exprL2LIdXShift = createDeclRefExpr(Ctx, VDIdX); - fusionVars.exprL2LIdYShift = createDeclRefExpr(Ctx, VDIdY); + fusionVars.exprL2LIdXShift = CloneVarDecl(VDIdX); + fusionVars.exprL2LIdYShift = CloneVarDecl(VDIdY); fusionVars.bL2LReplaceBody = true; } @@ -2499,15 +2547,16 @@ void ASTTranslate::setFusionL2LDestOperator(std::queue stmtsLocal, fusionVars.bL2LRecordBody = true; fusionVars.bL2LRecordBorder = true; fusionVars.bReplaceExprInput = true; - fusionVars.exprInput = createDeclRefExpr(Ctx, VDRegIn); - 
fusionVars.exprL2LIdXShift = createDeclRefExpr(Ctx, VDIdX); - fusionVars.exprL2LIdYShift = createDeclRefExpr(Ctx, VDIdY); + fusionVars.exprInput = CloneVarDecl(VDRegIn); + fusionVars.exprL2LIdXShift = CloneVarDecl(VDIdX); + fusionVars.exprL2LIdYShift = CloneVarDecl(VDIdY); } std::queue ASTTranslate::getFusionLocalKernelBody() { return fusionVars.stmtsL2LKernel; } +// TODO This likely needs to be adapted in order to support PPT + fusion Stmt *ASTTranslate::getFusionSharedInputStmt(VarDecl *VDIn) { BinaryOperator *bOShared; for (auto param : kernelDecl->parameters()) { diff --git a/lib/AST/BorderHandling.cpp b/lib/AST/BorderHandling.cpp index 0d8e4a65..f0f44109 100644 --- a/lib/AST/BorderHandling.cpp +++ b/lib/AST/BorderHandling.cpp @@ -178,8 +178,8 @@ Expr *ASTTranslate::addBorderHandling(DeclRefExpr *LHS, Expr *local_offset_x, idx_x = addLocalOffset(idx_x, local_offset_x); idx_y = addLocalOffset(idx_y, local_offset_y); if (bRecordStmtsForKernelFusion) { - idx_x_fusion = addLocalOffset(idx_x_fusion, fusionVars.exprL2LIdXShift); - idx_y_fusion = addLocalOffset(idx_y_fusion, fusionVars.exprL2LIdYShift); + idx_x_fusion = addLocalOffset(idx_x_fusion, createPptVarRefExpr(fusionVars.exprL2LIdXShift)); + idx_y_fusion = addLocalOffset(idx_y_fusion, createPptVarRefExpr(fusionVars.exprL2LIdYShift)); } // step 1: remove is_offset and add interpolation & boundary handling @@ -391,10 +391,10 @@ Expr *ASTTranslate::addBorderHandling(DeclRefExpr *LHS, Expr *local_offset_x, if (bRecordStmtsForKernelFusion) { Expr *end_offset_x, *end_offset_y; - end_offset_x = createBinaryOperator(Ctx, fusionVars.exprL2LIdXShift, + end_offset_x = createBinaryOperator(Ctx, createPptVarRefExpr(fusionVars.exprL2LIdXShift), createBinaryOperator(Ctx, idx_x, tileVars.global_id_x, BO_Sub, Ctx.IntTy), BO_Assign, Ctx.IntTy); - end_offset_y = createBinaryOperator(Ctx, fusionVars.exprL2LIdYShift, + end_offset_y = createBinaryOperator(Ctx, createPptVarRefExpr(fusionVars.exprL2LIdYShift), 
createBinaryOperator(Ctx, idx_y, gidYRef, BO_Sub, Ctx.IntTy), BO_Assign, Ctx.IntTy); (fusionVars.stmtsL2LBorder).push_back(end_offset_x); diff --git a/lib/AST/MemoryAccess.cpp b/lib/AST/MemoryAccess.cpp index 7bb6a20e..dc55a4ba 100644 --- a/lib/AST/MemoryAccess.cpp +++ b/lib/AST/MemoryAccess.cpp @@ -703,12 +703,12 @@ void ASTTranslate::stageLineToSharedMemory(ParmVarDecl *PVD, if (Kernel->isFusible() && fusionVars.bP2LReplaceInputExprs) { // extract and set global id ArraySubscriptExpr *tempASE = dyn_cast(RHS); - stageBody.push_back(createBinaryOperator(Ctx, fusionVars.exprP2LInputIdx, + stageBody.push_back(createBinaryOperator(Ctx, createPptVarRefExpr(fusionVars.exprP2LInputIdx), tempASE->getIdx(), BO_Assign, Acc->getImage()->getType())); // insert the producer body stageBody.push_back(fusionVars.stmtP2LProducerBody); // replace the input - stageBody.push_back(createBinaryOperator(Ctx, LHS, fusionVars.exprOutput, + stageBody.push_back(createBinaryOperator(Ctx, LHS, createPptVarRefExpr(fusionVars.exprOutput), BO_Assign, Acc->getImage()->getType())); } else { stageBody.push_back(createBinaryOperator(Ctx, LHS, RHS, BO_Assign, diff --git a/lib/Analysis/HostDataDeps.cpp b/lib/Analysis/HostDataDeps.cpp index 3627aaac..a47e0800 100644 --- a/lib/Analysis/HostDataDeps.cpp +++ b/lib/Analysis/HostDataDeps.cpp @@ -618,6 +618,171 @@ void HostDataDeps::addKernel( kernelMap_[KVD] = kernel; } +FusiblePartitionBlock::FusiblePartitionBlock(PatternType patternType, HostDataDeps::partitionBlock& inBlock) { + for (auto* inPart : inBlock) { + Part part; + for (HostDataDeps::Process* inProcess : *inPart) { + auto kernelName = inProcess->getKernel()->getName(); + part.push_back({ + kernelName + }); + kernelNames.insert(kernelName); + } + parts.push_back(part); + } + + Pattern pat; + + switch (patternType) { + default: + hipacc_require(false, "Invalid pattern type."); + exit(1); // This will never be reached + case PatternType::Linear: + pat = Pattern::Linear; + break; + case 
PatternType::Parallel: + // TODO + std::list producers; + HostDataDeps::Process* consumer = nullptr; + for (auto* inPart : inBlock) { + hipacc_require(inPart->size() == 1 || inPart->size() == 2, "Invalid block part length."); + + if (inPart->size() == 2) { + HostDataDeps::Process* innerConsumer = *std::next(inPart->begin()); + if (consumer == nullptr) { + consumer = innerConsumer; + } else { + hipacc_require(innerConsumer == consumer, "In parallel patterns, all parts must have the same consumer."); + } + } + + producers.push_back(inPart->front()); + } + + hipacc_require(consumer != nullptr, "Patterns with no consumers are not allowed."); + hipacc_require(producers.size() > 1, "In parallel patterns, more than one producer must exist."); + + auto consumerKT = consumer->getKernel()->getKernelClass()->getKernelType(); + hipacc_require(consumerKT == PointOperator || consumerKT == LocalOperator, + "In parallel patterns, only local or point operators are supported"); + + auto firstProducerKT = producers.front()->getKernel()->getKernelClass()->getKernelType(); + hipacc_require(firstProducerKT == PointOperator || firstProducerKT == LocalOperator, + "In parallel patterns, only local or point operators are supported"); + + if (consumerKT == PointOperator) { + pat = Pattern::NP2P; + } else { + pat = Pattern::NP2L; + } + + for (auto* producer : producers) { + auto producerKT = producer->getKernel()->getKernelClass()->getKernelType(); + hipacc_require(producerKT == PointOperator || producerKT == LocalOperator, + "In parallel patterns, only local or point operators are supported"); + + if (producerKT != firstProducerKT) { // Mixed producer kernel types + if (consumerKT == PointOperator) { + pat = Pattern::Mixed2P; + } else { + pat = Pattern::Mixed2L; + } + break; + } else if (producerKT == PointOperator) { // Only point producers + if (consumerKT == PointOperator) { + pat = Pattern::NP2P; + } else { + pat = Pattern::NP2L; + } + } else { // Only local producers + if (consumerKT == 
PointOperator) { + pat = Pattern::NL2P; + } else { + pat = Pattern::NL2L; + } + } + } + + break; + } + + pattern = pat; + + if (!isPatternFusible()) { + std::string patternStr = "unknown"; + switch (pattern) { + case Pattern::Linear: + patternStr = "linear"; + break; + case Pattern::NP2P: + patternStr = "parallel points-to-point"; + break; + case Pattern::NL2P: + patternStr = "parallel locals-to-point"; + break; + case Pattern::Mixed2P: + patternStr = "parallel mixed-locals/point-to-point"; + break; + case Pattern::NP2L: + patternStr = "parallel points-to-local"; + break; + case Pattern::NL2L: + patternStr = "parallel locals-to-local"; + break; + case Pattern::Mixed2L: + patternStr = "parallel mixed-locals/point-to-local"; + break; + } + + llvm::errs() << "[Kernel Fusion INFO] hint: Detected " + patternStr + " pattern, which is not yet supported. Skipped fusion for this pattern.\n"; + } +} + +const std::string& FusiblePartitionBlock::KernelInfo::getName() const { + return name; +} + +bool FusiblePartitionBlock::KernelInfo::operator < ( const FusiblePartitionBlock::KernelInfo& rhs ) const { + return name < rhs.name; +} + +FusiblePartitionBlock::PatternType FusiblePartitionBlock::getPatternType() const { + switch (pattern) { + case Pattern::Linear: + return PatternType::Linear; + case Pattern::NP2P: + case Pattern::NL2P: + case Pattern::Mixed2P: + case Pattern::NP2L: + case Pattern::NL2L: + case Pattern::Mixed2L: + return PatternType::Parallel; + } + + hipacc_require(false, "FusiblePartitionBlock has invalid pattern."); + exit(1); // This will never be reached +} + +FusiblePartitionBlock::Pattern FusiblePartitionBlock::getPattern() const { + return pattern; +} + +const std::vector& FusiblePartitionBlock::getParts() const { + return parts; +} + +bool FusiblePartitionBlock::hasKernelName(const std::string& name) const { + return kernelNames.find(name) != kernelNames.end(); +} + +bool FusiblePartitionBlock::hasKernel(const HipaccKernel* kernel) const { + std::string 
kernelName = kernel->getKernelClass()->getName() + kernel->getName(); + return hasKernelName(kernelName); +} + +bool FusiblePartitionBlock::operator < ( const FusiblePartitionBlock& rhs ) const { + return parts < rhs.parts; +} void HostDataDeps::addAccessor( ValueDecl *AVD, HipaccAccessor *acc, ValueDecl* IVD) { @@ -900,9 +1065,10 @@ std::map> HostDataDeps::getGraphNodeDepMap() } // detect simple linear producer-consumer data dependence -void HostDataDeps::fusibilityAnalysisLinear() { +void HostDataDeps::fusibilityAnalysisLinearAndParallel() { partitionBlock workingBlock; partitionBlock readyBlock; + for (auto pL : applicationGraph) { Process* producerP = pL->front(); KernelType KT = producerP->getKernel()->getKernelClass()->getKernelType(); @@ -911,6 +1077,7 @@ void HostDataDeps::fusibilityAnalysisLinear() { workingBlock.push_back(pL); } } + for (auto pL : workingBlock) { Process* consumerP = pL->back(); KernelType KT = consumerP->getKernel()->getKernelClass()->getKernelType(); @@ -954,6 +1121,73 @@ void HostDataDeps::fusibilityAnalysisLinear() { } } + + std::map readyMapParallel; + + for (auto pL : workingBlock) { + Process* consumerP = pL->back(); + + if (readyMapParallel.find(consumerP) != readyMapParallel.end()) { + // if respective chunk is already in map, ignore it + continue; + } + + partitionBlock* parallelBlock = new partitionBlock; + Space* lastParallelInSpace = nullptr; + + for (auto poL : applicationGraph) { + if (pL == poL) { + // Only consider distinct lists + continue; + } + + Process* innerProducerP = poL->front(); + Process* innerConsumerP = poL->back(); + + if ( + poL->size() == 2 && + innerConsumerP == consumerP + ) { + std::vector inSpaces = innerProducerP->getInSpaces(); + if (inSpaces.size() == 1) { + Space* inSpace = inSpaces.front(); + if (lastParallelInSpace == nullptr) { + lastParallelInSpace = inSpace; + } + if (lastParallelInSpace == inSpace) { + parallelBlock->push_back(poL); + } + } + } + } + + if (!parallelBlock->empty()) { + 
parallelBlock->push_back(pL); + bool isParallelyFusible = true; + for (Space* inSpace : consumerP->getInSpaces()) { + Process* srcP = inSpace->getSrcProcess(); + bool found = std::find_if( + parallelBlock->begin(), + parallelBlock->end(), + [srcP](std::list* e) { + return e->front() == srcP; + } + ) != parallelBlock->end(); + + // external input to consumer + if (!found) { + isParallelyFusible = false; + break; + } + } + if (isParallelyFusible) { + readyMapParallel[consumerP] = parallelBlock; + } + } + } + + // At this point, readyMapParallel contains all parallely fusible blocks (as values) + // group all fusible pairs into partition blocks std::set readySet; std::map LocalOpPBMap; @@ -1031,21 +1265,23 @@ void HostDataDeps::fusibilityAnalysisLinear() { if (!readySet.empty()) { llvm::errs() << "[Kernel Fusion INFO] fusible kernels from linear analysis:\n"; for (auto pB : readySet) { - partitionBlockNames PBNam; - llvm::errs() << " [ "; - for (auto pL : *pB) { - llvm::errs() << "{"; - std::list lNam; - for (auto p : *pL) { - std::string kname = p->getKernel()->getName(); - llvm::errs() << " --> " << kname; - lNam.push_back(kname); - } - llvm::errs() << "} "; - PBNam.push_back(lNam); - } - llvm::errs() << "] \n"; - fusibleSetNames.insert(PBNam); + fusiblePartitionBlocks.emplace(FusiblePartitionBlock::PatternType::Linear, *pB); + } + } + + // convert readyMapParallel to fusibleSetNamesParallel + if (!readyMapParallel.empty()) { + llvm::errs() << "[Kernel Fusion INFO] fusible kernels from parallel analysis:\n"; + for (auto it = readyMapParallel.begin(); it != readyMapParallel.end(); ++it) { + partitionBlock* pB = it->second; + + // Add consumer as destination block for completeness + Process* consumerP = it->first; + auto destList = new std::list; + destList->push_back(consumerP); + pB->push_back(destList); + + fusiblePartitionBlocks.emplace(FusiblePartitionBlock::PatternType::Parallel, *pB); } } } @@ -1120,7 +1356,7 @@ void HostDataDeps::fusibilityAnalysis() { 
PBNam.push_back(lNam); } llvm::errs() << "--------------------------------\n"; - fusibleSetNames.insert(PBNam); + fusiblePartitionBlocks.emplace(FusiblePartitionBlock::PatternType::Linear, PB); } } @@ -1432,28 +1668,18 @@ bool HostDataDeps::isDest(Process *P) { return s->getDstProcesses().empty(); } -std::set HostDataDeps::getFusibleSetNames() const { - return fusibleSetNames; +const std::set& HostDataDeps::getFusiblePartitionBlocks() const { + return fusiblePartitionBlocks; } bool HostDataDeps::isFusible(HipaccKernel *K) { - bool isFusible = false; - std::string fullName = K->getKernelClass()->getName() + K->getName(); + auto fusibleBlock = FusiblePartitionBlock::findForKernel(K, fusiblePartitionBlocks); - // Kernel name has no corresponding process or no execute() is called - if (!processMap_.count(fullName)) { - return isFusible; - } - // get Kernel Partition Block - for (auto PBN : fusibleSetNames) { - if (std::any_of(PBN.begin(), PBN.end(), [&](std::list lNam){ - return (std::find(lNam.begin(), lNam.end(), fullName) != lNam.end()) && - (lNam.size() > 1);})) { - isFusible = true; - break; - } + if (fusibleBlock == fusiblePartitionBlocks.end()) { + return false; } - return isFusible; + + return fusibleBlock->isPatternFusible(); } bool HostDataDeps::hasSharedIS(HipaccKernel *K) { diff --git a/samples-public/6_Test/Color_Curves/CMakeLists.txt b/samples-public/6_Test/Color_Curves/CMakeLists.txt new file mode 100644 index 00000000..00c0284d --- /dev/null +++ b/samples-public/6_Test/Color_Curves/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.14) + +get_filename_component(SAMPLE_NAME "${CMAKE_CURRENT_LIST_DIR}" NAME) + +project(${SAMPLE_NAME}) + +#add_hipacc_sample_dsl() +#add_hipacc_sample_cpu() +add_hipacc_sample_cuda(FUSION LOCAL PPT GRAPH) # kernel fusion is only supported for CUDA +#add_hipacc_sample_opencl(CPU GPU) diff --git a/samples-public/6_Test/Color_Curves/src/main.cpp b/samples-public/6_Test/Color_Curves/src/main.cpp new file mode 
100644 index 00000000..09dd4c2d --- /dev/null +++ b/samples-public/6_Test/Color_Curves/src/main.cpp @@ -0,0 +1,433 @@ +// +// Copyright (c) 2020, University of Erlangen-Nuremberg +// Copyright (c) 2012, University of Erlangen-Nuremberg +// Copyright (c) 2012, Siemens AG +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+
+#include "hipacc.hpp"
+
+// NOTE(review): the angle-bracket contents of this listing (include targets
+// and template arguments) were missing and have been reconstructed from how
+// each value is used. The three header names below are a best guess —
+// confirm against the repository.
+#include <iostream>
+#include <string>
+#include <hipacc_helper.hpp>
+
+#define WIDTH 512
+#define HEIGHT 512
+#define N_CONTROL_POINTS 4
+#define MIN_VALUE static_cast<uchar>(0)
+#define MAX_VALUE static_cast<uchar>(255)
+
+#define CHANNEL_R 0
+#define CHANNEL_G 1
+#define CHANNEL_B 2
+
+using namespace hipacc;
+using namespace hipacc::math;
+
+// Kernel description in Hipacc
+//
+// Point operator that selects one channel of the RGBA input pixel (x, y or z
+// for CHANNEL_R/G/B, w for any other value) and maps it through a piecewise
+// cubic curve defined by four equally spaced control values cp_y1..cp_y4.
+class ColorCurves : public Kernel<uchar> {
+  private:
+    Accessor<uchar4> &in;
+    int channel;
+    uchar cp_y1, cp_y2, cp_y3, cp_y4;
+
+  public:
+    ColorCurves(IterationSpace<uchar> &iter, Accessor<uchar4> &acc, int channel, uchar cp_y1, uchar cp_y2, uchar cp_y3, uchar cp_y4)
+          : Kernel(iter), in(acc), channel(channel), cp_y1(cp_y1), cp_y2(cp_y2), cp_y3(cp_y3), cp_y4(cp_y4) {
+        add_accessor(&in);
+    }
+
+    void kernel() {
+        uchar4 interm_pixel = in();
+        float x = static_cast<float>(interm_pixel.w);
+        if (channel == CHANNEL_R) {
+            x = static_cast<float>(interm_pixel.x);
+        } else if (channel == CHANNEL_G) {
+            x = static_cast<float>(interm_pixel.y);
+        } else if (channel == CHANNEL_B) {
+            x = static_cast<float>(interm_pixel.z);
+        }
+
+        // distance between two neighbouring control points on the x axis
+        const float h = (static_cast<float>(MAX_VALUE) - static_cast<float>(MIN_VALUE)) / (static_cast<float>(N_CONTROL_POINTS) - 1.0);
+
+        const float y1 = static_cast<float>(cp_y1);
+        const float y2 = static_cast<float>(cp_y2);
+        const float y3 = static_cast<float>(cp_y3);
+        const float y4 = static_cast<float>(cp_y4);
+
+        // NOTE(review): the usual natural-spline right-hand side is
+        // 6/h^2 * (y[i-1] - 2*y[i] + y[i+1]); the y1 and y4 terms are absent
+        // here. Each segment still hits its control values and the CPU
+        // reference uses the same formula — confirm whether this is intended.
+        const float a = (6.0/(h*h)) * (-2.0 * y2 + y3);
+        const float b = (6.0/(h*h)) * (y2 - 2.0 * y3);
+
+        // second derivatives at the control points (zero at both ends)
+        const float m1 = 0.0;
+        const float m2 = (4.0 * a - b) / 15.0;
+        const float m3 = (-a + 4.0 * b) / 15.0;
+        const float m4 = 0.0;
+
+        // per-segment polynomial coefficients: ai*x^3 + bi*x^2 + ci*x + di
+        const float a1 = (m2-m1) / (6.0*h);
+        const float a2 = (m3-m2) / (6.0*h);
+        const float a3 = (m4-m3) / (6.0*h);
+        const float a4 = 0.0;
+
+        const float b1 = m1 / 2.0;
+        const float b2 = m2 / 2.0;
+        const float b3 = m3 / 2.0;
+        const float b4 = m4 / 2.0;
+
+        const float c1 = (y2 - y1) / h - ((m2 + 2*m1) / 6.0) * h;
+        const float c2 = (y3 - y2) / h - ((m3 + 2*m2) / 6.0) * h;
+        const float c3 = (y4 - y3) / h - ((m4 + 2*m3) / 6.0) * h;
+        const float c4 = 0.0;
+
+        const float d1 = y1;
+        const float d2 = y2;
+        const float d3 = y3;
+        const float d4 = y4;
+
+        float ai = a1;
+        float bi = b1;
+        float ci = c1;
+        float di = d1;
+
+        // pick the segment containing x and make x relative to its start
+        if (x < h) {
+            // nothing to do here
+        } else if (x < 2 * h) {
+            ai = a2;
+            bi = b2;
+            ci = c2;
+            di = d2;
+            x -= h;
+        } else if (x < 3 * h) {
+            ai = a3;
+            bi = b3;
+            ci = c3;
+            di = d3;
+            x -= 2 * h;
+        } else {
+            ai = a4;
+            bi = b4;
+            ci = c4;
+            di = d4;
+            x -= 3 * h;
+        }
+
+        const float x3 = x*x*x;
+        const float x2 = x*x;
+
+        output() = static_cast<uchar>(ai*x3 + bi*x2 + ci*x + di);
+    }
+};
+
+// Kernel description in Hipacc
+//
+// Point operator that packs three single-channel inputs into one RGBA pixel
+// whose w component is MAX_VALUE.
+class CombineChannels : public Kernel<uchar4> {
+  private:
+    Accessor<uchar>& in_r;
+    Accessor<uchar>& in_g;
+    Accessor<uchar>& in_b;
+
+  public:
+    CombineChannels(IterationSpace<uchar4>& iter, Accessor<uchar>& acc_r, Accessor<uchar>& acc_g, Accessor<uchar>& acc_b)
+          : Kernel(iter), in_r(acc_r), in_g(acc_g), in_b(acc_b) {
+        add_accessor(&in_r);
+        add_accessor(&in_g);
+        add_accessor(&in_b);
+    }
+
+    void kernel() {
+        uchar pixel_r = in_r();
+        uchar pixel_g = in_g();
+        uchar pixel_b = in_b();
+
+        uchar4 out;
+        out.x = pixel_r;
+        out.y = pixel_g;
+        out.z = pixel_b;
+        out.w = MAX_VALUE;
+
+        output() = out;
+    }
+};
+
+// forward declaration of reference implementation
+void kernel_fusion(uchar4 *in, uchar4 *out, int width, int height);
+
+// Splits both RGBA images into four planar channel buffers and runs
+// compare_results() once per channel.
+void compare_color(uchar4* output, uchar4* ref_out, int width, int height) {
+    uchar *cmp_output_r = new uchar[width*height];
+    uchar *cmp_output_g = new uchar[width*height];
+    uchar *cmp_output_b = new uchar[width*height];
+    uchar *cmp_output_a = new uchar[width*height];
+    uchar *cmp_ref_out_r = new uchar[width*height];
+    uchar *cmp_ref_out_g = new uchar[width*height];
+    uchar *cmp_ref_out_b = new uchar[width*height];
+    uchar *cmp_ref_out_a = new uchar[width*height];
+
+    for (int i = 0; i < width*height; ++i) {
+        cmp_output_r[i] = output[i].x;
+        cmp_output_g[i] = output[i].y;
+        cmp_output_b[i] = output[i].z;
+        cmp_output_a[i] = output[i].w;
+        cmp_ref_out_r[i] = ref_out[i].x;
+        cmp_ref_out_g[i] = ref_out[i].y;
+        cmp_ref_out_b[i] = ref_out[i].z;
+        cmp_ref_out_a[i] = ref_out[i].w;
+    }
+
+    compare_results(cmp_output_r, cmp_ref_out_r, width, height);
+    compare_results(cmp_output_g, cmp_ref_out_g, width, height);
+    compare_results(cmp_output_b, cmp_ref_out_b, width, height);
+    compare_results(cmp_output_a, cmp_ref_out_a, width, height);
+
+    delete[] cmp_output_r;
+    delete[] cmp_output_g;
+    delete[] cmp_output_b;
+    delete[] cmp_output_a;
+    delete[] cmp_ref_out_r;
+    delete[] cmp_ref_out_g;
+    delete[] cmp_ref_out_b;
+    delete[] cmp_ref_out_a;
+}
+
+// Debug helper: prints the index and both RGBA values of every pixel at
+// which img1 and img2 differ.
+void diff_color(uchar4* img1, uchar4* img2, int width, int height) {
+    for (int i = 0; i < width*height; ++i) {
+        uchar4 c1 = img1[i];
+        uchar4 c2 = img2[i];
+
+        const bool eq_r = c1.x == c2.x;
+        const bool eq_g = c1.y == c2.y;
+        const bool eq_b = c1.z == c2.z;
+        const bool eq_a = c1.w == c2.w;
+
+        const bool eq_all = eq_r && eq_g && eq_b && eq_a;
+
+        // report mismatches only; identical pixels are not part of a diff
+        if (!eq_all) {
+            // components are cast to int so they print as numbers
+            // (presumably uchar is unsigned char, which streams as a character)
+            std::cout << "[" << i << "] ";
+            std::cout << "(" << static_cast<int>(c1.x) << "," << static_cast<int>(c1.y) << "," << static_cast<int>(c1.z) << "," << static_cast<int>(c1.w) << ")";
+            std::cout << " <=> ";
+            std::cout << "(" << static_cast<int>(c2.x) << "," << static_cast<int>(c2.y) << "," << static_cast<int>(c2.z) << "," << static_cast<int>(c2.w) << ")";
+            std::cout << std::endl;
+        }
+    }
+}
+
+/*************************************************************************
+ * Main function                                                         *
+ *************************************************************************/
+// Usage: [width [height]] — a single argument sets both dimensions.
+HIPACC_CODEGEN int main(int argc, const char **argv) {
+    int width_arg = WIDTH;
+    int height_arg = HEIGHT;
+
+    if(argc >= 2) {
+        width_arg = std::stoi(argv[1]);
+        height_arg = width_arg;
+    }
+
+    if(argc >= 3) {
+        height_arg = std::stoi(argv[2]);
+    }
+
+    const int width = width_arg;
+    const int height = height_arg;
+
+    // host memory for image of width x height pixels, random
+    uchar4 *input = (uchar4*)load_data<uchar>(width, height, 4);
+    uchar4 *ref_out = new uchar4[width*height];
+
+    std::cout << "Testing Hipacc kernel fusion ..." << std::endl;
+
+    //************************************************************************//
+
+    // input and output image of width x height pixels
+    Image<uchar4> in(width, height, input);
+    Image<uchar4> out(width, height);
+
+    Accessor<uchar4> acc_in(in);
+
+    // ****
+    // * Red channel
+    // ****
+    Image<uchar> channel_r(width, height);
+    IterationSpace<uchar> iter_channel_r(channel_r);
+    Accessor<uchar> acc_channel_r(channel_r);
+
+    ColorCurves curves_channel_r(iter_channel_r, acc_in, CHANNEL_R, 10, 50, 230, 20);
+
+    // ****
+    // * Green channel
+    // ****
+    Image<uchar> channel_g(width, height);
+    IterationSpace<uchar> iter_channel_g(channel_g);
+    Accessor<uchar> acc_channel_g(channel_g);
+
+    ColorCurves curves_channel_g(iter_channel_g, acc_in, CHANNEL_G, 10, 50, 230, 20);
+
+    // ****
+    // * Blue channel
+    // ****
+    Image<uchar> channel_b(width, height);
+    IterationSpace<uchar> iter_channel_b(channel_b);
+    Accessor<uchar> acc_channel_b(channel_b);
+
+    ColorCurves curves_channel_b(iter_channel_b, acc_in, CHANNEL_B, 10, 50, 230, 20);
+
+    // ****
+    // * Combining operator
+    // ****
+    IterationSpace<uchar4> iter_out(out);
+
+    CombineChannels combine(iter_out, acc_channel_r, acc_channel_g, acc_channel_b);
+
+    // execution after all decls
+    curves_channel_r.execute();
+    curves_channel_g.execute();
+    curves_channel_b.execute();
+    combine.execute();
+
+    // get pointer to result data
+    uchar4 *output = out.data();
+
+    //************************************************************************//
+    std::cout << "Calculating reference ..." << std::endl;
+    kernel_fusion(input, ref_out, width, height);
+
+    compare_color(output, ref_out, width, height);
+    //diff_color(output, ref_out, width, height);
+
+    // free memory
+    delete[] input;
+    delete[] ref_out;
+
+    return EXIT_SUCCESS;
+}
+
+// kernel fusion reference
+//
+// CPU counterpart of ColorCurves::kernel(): applies the same arithmetic to
+// every pixel of `in` and stores the selected, remapped channel in `out`.
+void color_curves_kernel(uchar4 *in, uchar *out, int width, int height, int channel, uchar cp_y1, uchar cp_y2, uchar cp_y3, uchar cp_y4) {
+    for (int p = 0; p < width*height; ++p) {
+        uchar4 interm_pixel = in[p];
+        float x = static_cast<float>(interm_pixel.w);
+        if (channel == CHANNEL_R) {
+            x = static_cast<float>(interm_pixel.x);
+        } else if (channel == CHANNEL_G) {
+            x = static_cast<float>(interm_pixel.y);
+        } else if (channel == CHANNEL_B) {
+            x = static_cast<float>(interm_pixel.z);
+        }
+
+        const float h = (static_cast<float>(MAX_VALUE) - static_cast<float>(MIN_VALUE)) / (static_cast<float>(N_CONTROL_POINTS) - 1.0);
+
+        const float y1 = static_cast<float>(cp_y1);
+        const float y2 = static_cast<float>(cp_y2);
+        const float y3 = static_cast<float>(cp_y3);
+        const float y4 = static_cast<float>(cp_y4);
+
+        // must stay textually in sync with ColorCurves::kernel()
+        const float a = (6.0/(h*h)) * (-2.0 * y2 + y3);
+        const float b = (6.0/(h*h)) * (y2 - 2.0 * y3);
+
+        const float m1 = 0.0;
+        const float m2 = (4.0 * a - b) / 15.0;
+        const float m3 = (-a + 4.0 * b) / 15.0;
+        const float m4 = 0.0;
+
+        const float a1 = (m2-m1) / (6.0*h);
+        const float a2 = (m3-m2) / (6.0*h);
+        const float a3 = (m4-m3) / (6.0*h);
+        const float a4 = 0.0;
+
+        const float b1 = m1 / 2.0;
+        const float b2 = m2 / 2.0;
+        const float b3 = m3 / 2.0;
+        const float b4 = m4 / 2.0;
+
+        const float c1 = (y2 - y1) / h - ((m2 + 2*m1) / 6.0) * h;
+        const float c2 = (y3 - y2) / h - ((m3 + 2*m2) / 6.0) * h;
+        const float c3 = (y4 - y3) / h - ((m4 + 2*m3) / 6.0) * h;
+        const float c4 = 0.0;
+
+        const float d1 = y1;
+        const float d2 = y2;
+        const float d3 = y3;
+        const float d4 = y4;
+
+        float ai = a1;
+        float bi = b1;
+        float ci = c1;
+        float di = d1;
+
+        if (x < h) {
+            // nothing to do here
+        } else if (x < 2 * h) {
+            ai = a2;
+            bi = b2;
+            ci = c2;
+            di = d2;
+            x -= h;
+        } else if (x < 3 * h) {
+            ai = a3;
+            bi = b3;
+            ci = c3;
+            di = d3;
+            x -= 2 * h;
+        } else {
+            ai = a4;
+            bi = b4;
+            ci = c4;
+            di = d4;
+            x -= 3 * h;
+        }
+
+        const float x3 = x*x*x;
+        const float x2 = x*x;
+
+        out[p] = static_cast<uchar>(ai*x3 + bi*x2 + ci*x + di);
+    }
+}
+
+// CPU counterpart of CombineChannels::kernel().
+void combine_channels_kernel(uchar* in_r, uchar* in_g, uchar* in_b, uchar4* out, int width, int height) {
+    for (int p = 0; p < width*height; ++p) {
+        uchar pixel_r = in_r[p];
+        uchar pixel_g = in_g[p];
+        uchar pixel_b = in_b[p];
+
+        uchar4 pixel_out;
+        pixel_out.x = pixel_r;
+        pixel_out.y = pixel_g;
+        pixel_out.z = pixel_b;
+        pixel_out.w = MAX_VALUE;
+
+        out[p] = pixel_out;
+    }
+}
+
+// Reference pipeline: three per-channel curve passes into temporary buffers,
+// then one combining pass into `out`.
+void kernel_fusion(uchar4 *in, uchar4 *out, int width, int height) {
+    uchar *ref_buf_r = new uchar[width*height];
+    uchar *ref_buf_g = new uchar[width*height];
+    uchar *ref_buf_b = new uchar[width*height];
+
+    color_curves_kernel(in, ref_buf_r, width, height, CHANNEL_R, 10, 50, 230, 20);
+    color_curves_kernel(in, ref_buf_g, width, height, CHANNEL_G, 10, 50, 230, 20);
+    color_curves_kernel(in, ref_buf_b, width, height, CHANNEL_B, 10, 50, 230, 20);
+    combine_channels_kernel(ref_buf_r, ref_buf_g, ref_buf_b, out, width, height);
+
+    delete[] ref_buf_r;
+    delete[] ref_buf_g;
+    delete[] ref_buf_b;
+}
+
diff --git a/samples-public/6_Test/Color_Curves_Int/CMakeLists.txt b/samples-public/6_Test/Color_Curves_Int/CMakeLists.txt
new file mode 100644
index 00000000..00c0284d
--- /dev/null
+++ b/samples-public/6_Test/Color_Curves_Int/CMakeLists.txt
@@ -0,0 +1,10 @@
+cmake_minimum_required(VERSION 3.14)
+
+get_filename_component(SAMPLE_NAME "${CMAKE_CURRENT_LIST_DIR}" NAME)
+
+project(${SAMPLE_NAME})
+
+#add_hipacc_sample_dsl()
+#add_hipacc_sample_cpu()
+add_hipacc_sample_cuda(FUSION LOCAL PPT GRAPH) # kernel fusion is only supported for CUDA
+#add_hipacc_sample_opencl(CPU GPU)
diff --git a/samples-public/6_Test/Color_Curves_Int/src/main.cpp b/samples-public/6_Test/Color_Curves_Int/src/main.cpp
new file mode 100644
index
00000000..5a4e63dd --- /dev/null +++ b/samples-public/6_Test/Color_Curves_Int/src/main.cpp @@ -0,0 +1,433 @@ +// +// Copyright (c) 2020, University of Erlangen-Nuremberg +// Copyright (c) 2012, University of Erlangen-Nuremberg +// Copyright (c) 2012, Siemens AG +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+
+#include "hipacc.hpp"
+
+// NOTE(review): the angle-bracket contents of this listing (include targets
+// and template arguments) were missing and have been reconstructed from how
+// each value is used. The three header names below are a best guess —
+// confirm against the repository.
+#include <iostream>
+#include <string>
+#include <hipacc_helper.hpp>
+
+#define WIDTH 512
+#define HEIGHT 512
+#define N_CONTROL_POINTS 4
+#define MIN_VALUE static_cast<uchar>(0)
+#define MAX_VALUE static_cast<uchar>(255)
+
+#define CHANNEL_R 0
+#define CHANNEL_G 1
+#define CHANNEL_B 2
+
+using namespace hipacc;
+using namespace hipacc::math;
+
+// Kernel description in Hipacc
+//
+// Integer-arithmetic variant of the colour-curve point operator: selects one
+// channel of the RGBA input pixel (x, y or z for CHANNEL_R/G/B, w for any
+// other value) and maps it through a piecewise polynomial defined by the
+// four control values cp_y1..cp_y4.
+class ColorCurves : public Kernel<uchar> {
+  private:
+    Accessor<uchar4> &in;
+    int channel;
+    uchar cp_y1, cp_y2, cp_y3, cp_y4;
+
+  public:
+    ColorCurves(IterationSpace<uchar> &iter, Accessor<uchar4> &acc, int channel, uchar cp_y1, uchar cp_y2, uchar cp_y3, uchar cp_y4)
+          : Kernel(iter), in(acc), channel(channel), cp_y1(cp_y1), cp_y2(cp_y2), cp_y3(cp_y3), cp_y4(cp_y4) {
+        add_accessor(&in);
+    }
+
+    void kernel() {
+        uchar4 interm_pixel = in();
+        int x = static_cast<int>(interm_pixel.w);
+        if (channel == CHANNEL_R) {
+            x = static_cast<int>(interm_pixel.x);
+        } else if (channel == CHANNEL_G) {
+            x = static_cast<int>(interm_pixel.y);
+        } else if (channel == CHANNEL_B) {
+            x = static_cast<int>(interm_pixel.z);
+        }
+
+        // distance between two neighbouring control points: (255 - 0) / 3 == 85
+        const int h = (static_cast<int>(MAX_VALUE) - static_cast<int>(MIN_VALUE)) / (static_cast<int>(N_CONTROL_POINTS) - 1);
+
+        const int y1 = static_cast<int>(cp_y1);
+        const int y2 = static_cast<int>(cp_y2);
+        const int y3 = static_cast<int>(cp_y3);
+        const int y4 = static_cast<int>(cp_y4);
+
+        // NOTE(review): with h == 85 the integer quotient 6/(h*h) is 0, so a,
+        // b and every m/ai/bi term below vanish and each segment reduces to a
+        // straight line with a truncated integer slope. The CPU reference
+        // does the same, so results still match; presumably acceptable for an
+        // integer-only fusion test — confirm.
+        const int a = (6/(h*h)) * (-2 * y2 + y3);
+        const int b = (6/(h*h)) * (y2 - 2 * y3);
+
+        const int m1 = 0;
+        const int m2 = (4 * a - b) / 15;
+        const int m3 = (-a + 4 * b) / 15;
+        const int m4 = 0;
+
+        // per-segment polynomial coefficients: ai*x^3 + bi*x^2 + ci*x + di
+        const int a1 = (m2-m1) / (6*h);
+        const int a2 = (m3-m2) / (6*h);
+        const int a3 = (m4-m3) / (6*h);
+        const int a4 = 0;
+
+        const int b1 = m1 / 2;
+        const int b2 = m2 / 2;
+        const int b3 = m3 / 2;
+        const int b4 = m4 / 2;
+
+        const int c1 = (y2 - y1) / h - ((m2 + 2*m1) / 6) * h;
+        const int c2 = (y3 - y2) / h - ((m3 + 2*m2) / 6) * h;
+        const int c3 = (y4 - y3) / h - ((m4 + 2*m3) / 6) * h;
+        const int c4 = 0;
+
+        const int d1 = y1;
+        const int d2 = y2;
+        const int d3 = y3;
+        const int d4 = y4;
+
+        int ai = a1;
+        int bi = b1;
+        int ci = c1;
+        int di = d1;
+
+        // pick the segment containing x and make x relative to its start
+        if (x < h) {
+            // nothing to do here
+        } else if (x < 2 * h) {
+            ai = a2;
+            bi = b2;
+            ci = c2;
+            di = d2;
+            x -= h;
+        } else if (x < 3 * h) {
+            ai = a3;
+            bi = b3;
+            ci = c3;
+            di = d3;
+            x -= 2 * h;
+        } else {
+            ai = a4;
+            bi = b4;
+            ci = c4;
+            di = d4;
+            x -= 3 * h;
+        }
+
+        const int x3 = x*x*x;
+        const int x2 = x*x;
+
+        output() = static_cast<uchar>(ai*x3 + bi*x2 + ci*x + di);
+    }
+};
+
+// Kernel description in Hipacc
+//
+// Point operator that packs three single-channel inputs into one RGBA pixel
+// whose w component is MAX_VALUE.
+class CombineChannels : public Kernel<uchar4> {
+  private:
+    Accessor<uchar>& in_r;
+    Accessor<uchar>& in_g;
+    Accessor<uchar>& in_b;
+
+  public:
+    CombineChannels(IterationSpace<uchar4>& iter, Accessor<uchar>& acc_r, Accessor<uchar>& acc_g, Accessor<uchar>& acc_b)
+          : Kernel(iter), in_r(acc_r), in_g(acc_g), in_b(acc_b) {
+        add_accessor(&in_r);
+        add_accessor(&in_g);
+        add_accessor(&in_b);
+    }
+
+    void kernel() {
+        uchar pixel_r = in_r();
+        uchar pixel_g = in_g();
+        uchar pixel_b = in_b();
+
+        uchar4 out;
+        out.x = pixel_r;
+        out.y = pixel_g;
+        out.z = pixel_b;
+        out.w = MAX_VALUE;
+
+        output() = out;
+    }
+};
+
+// forward declaration of reference implementation
+void kernel_fusion(uchar4 *in, uchar4 *out, int width, int height);
+
+// Splits both RGBA images into four planar channel buffers and runs
+// compare_results() once per channel.
+void compare_color(uchar4* output, uchar4* ref_out, int width, int height) {
+    uchar *cmp_output_r = new uchar[width*height];
+    uchar *cmp_output_g = new uchar[width*height];
+    uchar *cmp_output_b = new uchar[width*height];
+    uchar *cmp_output_a = new uchar[width*height];
+    uchar *cmp_ref_out_r = new uchar[width*height];
+    uchar *cmp_ref_out_g = new uchar[width*height];
+    uchar *cmp_ref_out_b = new uchar[width*height];
+    uchar *cmp_ref_out_a = new uchar[width*height];
+
+    for (int i = 0; i < width*height; ++i) {
+        cmp_output_r[i] = output[i].x;
+        cmp_output_g[i] = output[i].y;
+        cmp_output_b[i] = output[i].z;
+        cmp_output_a[i] = output[i].w;
+        cmp_ref_out_r[i] = ref_out[i].x;
+        cmp_ref_out_g[i] = ref_out[i].y;
+        cmp_ref_out_b[i] = ref_out[i].z;
+        cmp_ref_out_a[i] = ref_out[i].w;
+    }
+
+    compare_results(cmp_output_r, cmp_ref_out_r, width, height);
+    compare_results(cmp_output_g, cmp_ref_out_g, width, height);
+    compare_results(cmp_output_b, cmp_ref_out_b, width, height);
+    compare_results(cmp_output_a, cmp_ref_out_a, width, height);
+
+    delete[] cmp_output_r;
+    delete[] cmp_output_g;
+    delete[] cmp_output_b;
+    delete[] cmp_output_a;
+    delete[] cmp_ref_out_r;
+    delete[] cmp_ref_out_g;
+    delete[] cmp_ref_out_b;
+    delete[] cmp_ref_out_a;
+}
+
+// Debug helper: prints the index and both RGBA values of every pixel at
+// which img1 and img2 differ.
+void diff_color(uchar4* img1, uchar4* img2, int width, int height) {
+    for (int i = 0; i < width*height; ++i) {
+        uchar4 c1 = img1[i];
+        uchar4 c2 = img2[i];
+
+        const bool eq_r = c1.x == c2.x;
+        const bool eq_g = c1.y == c2.y;
+        const bool eq_b = c1.z == c2.z;
+        const bool eq_a = c1.w == c2.w;
+
+        const bool eq_all = eq_r && eq_g && eq_b && eq_a;
+
+        // report mismatches only; identical pixels are not part of a diff
+        if (!eq_all) {
+            // components are cast to int so they print as numbers
+            // (presumably uchar is unsigned char, which streams as a character)
+            std::cout << "[" << i << "] ";
+            std::cout << "(" << static_cast<int>(c1.x) << "," << static_cast<int>(c1.y) << "," << static_cast<int>(c1.z) << "," << static_cast<int>(c1.w) << ")";
+            std::cout << " <=> ";
+            std::cout << "(" << static_cast<int>(c2.x) << "," << static_cast<int>(c2.y) << "," << static_cast<int>(c2.z) << "," << static_cast<int>(c2.w) << ")";
+            std::cout << std::endl;
+        }
+    }
+}
+
+/*************************************************************************
+ * Main function                                                         *
+ *************************************************************************/
+// Usage: [width [height]] — a single argument sets both dimensions.
+HIPACC_CODEGEN int main(int argc, const char **argv) {
+    int width_arg = WIDTH;
+    int height_arg = HEIGHT;
+
+    if(argc >= 2) {
+        width_arg = std::stoi(argv[1]);
+        height_arg = width_arg;
+    }
+
+    if(argc >= 3) {
+        height_arg = std::stoi(argv[2]);
+    }
+
+    const int width = width_arg;
+    const int height = height_arg;
+
+    // host memory for image of width x height pixels, random
+    uchar4 *input = (uchar4*)load_data<uchar>(width, height, 4);
+    uchar4 *ref_out = new uchar4[width*height];
+
+    std::cout << "Testing Hipacc kernel fusion ..." << std::endl;
+
+    //************************************************************************//
+
+    // input and output image of width x height pixels
+    Image<uchar4> in(width, height, input);
+    Image<uchar4> out(width, height);
+
+    Accessor<uchar4> acc_in(in);
+
+    // ****
+    // * Red channel
+    // ****
+    Image<uchar> channel_r(width, height);
+    IterationSpace<uchar> iter_channel_r(channel_r);
+    Accessor<uchar> acc_channel_r(channel_r);
+
+    ColorCurves curves_channel_r(iter_channel_r, acc_in, CHANNEL_R, 10, 50, 230, 20);
+
+    // ****
+    // * Green channel
+    // ****
+    Image<uchar> channel_g(width, height);
+    IterationSpace<uchar> iter_channel_g(channel_g);
+    Accessor<uchar> acc_channel_g(channel_g);
+
+    ColorCurves curves_channel_g(iter_channel_g, acc_in, CHANNEL_G, 10, 50, 230, 20);
+
+    // ****
+    // * Blue channel
+    // ****
+    Image<uchar> channel_b(width, height);
+    IterationSpace<uchar> iter_channel_b(channel_b);
+    Accessor<uchar> acc_channel_b(channel_b);
+
+    ColorCurves curves_channel_b(iter_channel_b, acc_in, CHANNEL_B, 10, 50, 230, 20);
+
+    // ****
+    // * Combining operator
+    // ****
+    IterationSpace<uchar4> iter_out(out);
+
+    CombineChannels combine(iter_out, acc_channel_r, acc_channel_g, acc_channel_b);
+
+    // execution after all decls
+    curves_channel_r.execute();
+    curves_channel_g.execute();
+    curves_channel_b.execute();
+    combine.execute();
+
+    // get pointer to result data
+    uchar4 *output = out.data();
+
+    //************************************************************************//
+    std::cout << "Calculating reference ..." << std::endl;
+    kernel_fusion(input, ref_out, width, height);
+
+    compare_color(output, ref_out, width, height);
+    //diff_color(output, ref_out, width, height);
+
+    // free memory
+    delete[] input;
+    delete[] ref_out;
+
+    return EXIT_SUCCESS;
+}
+
+// kernel fusion reference
+//
+// CPU counterpart of ColorCurves::kernel(): applies the same integer
+// arithmetic to every pixel of `in` and stores the result in `out`.
+void color_curves_kernel(uchar4 *in, uchar *out, int width, int height, int channel, uchar cp_y1, uchar cp_y2, uchar cp_y3, uchar cp_y4) {
+    for (int p = 0; p < width*height; ++p) {
+        uchar4 interm_pixel = in[p];
+        int x = static_cast<int>(interm_pixel.w);
+        if (channel == CHANNEL_R) {
+            x = static_cast<int>(interm_pixel.x);
+        } else if (channel == CHANNEL_G) {
+            x = static_cast<int>(interm_pixel.y);
+        } else if (channel == CHANNEL_B) {
+            x = static_cast<int>(interm_pixel.z);
+        }
+
+        const int h = (static_cast<int>(MAX_VALUE) - static_cast<int>(MIN_VALUE)) / (static_cast<int>(N_CONTROL_POINTS) - 1);
+
+        const int y1 = static_cast<int>(cp_y1);
+        const int y2 = static_cast<int>(cp_y2);
+        const int y3 = static_cast<int>(cp_y3);
+        const int y4 = static_cast<int>(cp_y4);
+
+        // must stay textually in sync with ColorCurves::kernel()
+        const int a = (6/(h*h)) * (-2 * y2 + y3);
+        const int b = (6/(h*h)) * (y2 - 2 * y3);
+
+        const int m1 = 0;
+        const int m2 = (4 * a - b) / 15;
+        const int m3 = (-a + 4 * b) / 15;
+        const int m4 = 0;
+
+        const int a1 = (m2-m1) / (6*h);
+        const int a2 = (m3-m2) / (6*h);
+        const int a3 = (m4-m3) / (6*h);
+        const int a4 = 0;
+
+        const int b1 = m1 / 2;
+        const int b2 = m2 / 2;
+        const int b3 = m3 / 2;
+        const int b4 = m4 / 2;
+
+        const int c1 = (y2 - y1) / h - ((m2 + 2*m1) / 6) * h;
+        const int c2 = (y3 - y2) / h - ((m3 + 2*m2) / 6) * h;
+        const int c3 = (y4 - y3) / h - ((m4 + 2*m3) / 6) * h;
+        const int c4 = 0;
+
+        const int d1 = y1;
+        const int d2 = y2;
+        const int d3 = y3;
+        const int d4 = y4;
+
+        int ai = a1;
+        int bi = b1;
+        int ci = c1;
+        int di = d1;
+
+        if (x < h) {
+            // nothing to do here
+        } else if (x < 2 * h) {
+            ai = a2;
+            bi = b2;
+            ci = c2;
+            di = d2;
+            x -= h;
+        } else if (x < 3 * h) {
+            ai = a3;
+            bi = b3;
+            ci = c3;
+            di = d3;
+            x -= 2 * h;
+        } else {
+            ai = a4;
+            bi = b4;
+            ci = c4;
+            di = d4;
+            x -= 3 * h;
+        }
+
+        const int x3 = x*x*x;
+        const int x2 = x*x;
+
+        out[p] = static_cast<uchar>(ai*x3 + bi*x2 + ci*x + di);
+    }
+}
+
+// CPU counterpart of CombineChannels::kernel().
+void combine_channels_kernel(uchar* in_r, uchar* in_g, uchar* in_b, uchar4* out, int width, int height) {
+    for (int p = 0; p < width*height; ++p) {
+        uchar pixel_r = in_r[p];
+        uchar pixel_g = in_g[p];
+        uchar pixel_b = in_b[p];
+
+        uchar4 pixel_out;
+        pixel_out.x = pixel_r;
+        pixel_out.y = pixel_g;
+        pixel_out.z = pixel_b;
+        pixel_out.w = MAX_VALUE;
+
+        out[p] = pixel_out;
+    }
+}
+
+// Reference pipeline: three per-channel curve passes into temporary buffers,
+// then one combining pass into `out`.
+void kernel_fusion(uchar4 *in, uchar4 *out, int width, int height) {
+    uchar *ref_buf_r = new uchar[width*height];
+    uchar *ref_buf_g = new uchar[width*height];
+    uchar *ref_buf_b = new uchar[width*height];
+
+    color_curves_kernel(in, ref_buf_r, width, height, CHANNEL_R, 10, 50, 230, 20);
+    color_curves_kernel(in, ref_buf_g, width, height, CHANNEL_G, 10, 50, 230, 20);
+    color_curves_kernel(in, ref_buf_b, width, height, CHANNEL_B, 10, 50, 230, 20);
+    combine_channels_kernel(ref_buf_r, ref_buf_g, ref_buf_b, out, width, height);
+
+    delete[] ref_buf_r;
+    delete[] ref_buf_g;
+    delete[] ref_buf_b;
+}
+
diff --git a/samples-public/6_Test/Kernel_Fusion_3P2P/CMakeLists.txt b/samples-public/6_Test/Kernel_Fusion_3P2P/CMakeLists.txt
new file mode 100644
index 00000000..453b7a37
--- /dev/null
+++ b/samples-public/6_Test/Kernel_Fusion_3P2P/CMakeLists.txt
@@ -0,0 +1,10 @@
+cmake_minimum_required(VERSION 3.14)
+
+get_filename_component(SAMPLE_NAME "${CMAKE_CURRENT_LIST_DIR}" NAME)
+
+project(${SAMPLE_NAME})
+
+#add_hipacc_sample_dsl()
+#add_hipacc_sample_cpu()
+add_hipacc_sample_cuda(FUSION LOCAL PPT PPTN FUSEDPPTN) # kernel fusion is only supported for CUDA
+#add_hipacc_sample_opencl(CPU GPU)
diff --git a/samples-public/6_Test/Kernel_Fusion_3P2P/src/main.cpp b/samples-public/6_Test/Kernel_Fusion_3P2P/src/main.cpp
new file mode 100644
index 00000000..2f68baf3
--- /dev/null
+++ b/samples-public/6_Test/Kernel_Fusion_3P2P/src/main.cpp
@@ -0,0 +1,198
@@ +// +// Copyright (c) 2020, University of Erlangen-Nuremberg +// Copyright (c) 2012, University of Erlangen-Nuremberg +// Copyright (c) 2012, Siemens AG +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+
+#include "hipacc.hpp"
+
+// NOTE(review): the angle-bracket contents of this listing (include targets
+// and template arguments) were missing and have been reconstructed from how
+// each value is used. The two header names below are a best guess — confirm
+// against the repository.
+#include <iostream>
+#include <hipacc_helper.hpp>
+
+#define WIDTH 512
+#define HEIGHT 512
+#define TYPE uchar
+
+using namespace hipacc;
+using namespace hipacc::math;
+
+// Kernel description in Hipacc
+//
+// Point operator: writes the input pixel plus 3.
+class PointOperatorExample : public Kernel<TYPE> {
+  private:
+    Accessor<TYPE> &in;
+
+  public:
+    PointOperatorExample(IterationSpace<TYPE> &iter, Accessor<TYPE> &acc)
+          : Kernel(iter), in(acc) {
+        add_accessor(&in);
+    }
+
+    void kernel() {
+        TYPE interm_pixel = in();
+        interm_pixel += 3;
+        output() = interm_pixel;
+    }
+};
+
+// Point operator with three inputs: writes the sum of the three input
+// pixels, converted back to TYPE on assignment.
+class OutOperatorExample : public Kernel<TYPE> {
+  private:
+    Accessor<TYPE> &in1;
+    Accessor<TYPE> &in2;
+    Accessor<TYPE> &in3;
+
+  public:
+    OutOperatorExample(
+        IterationSpace<TYPE> &iter,
+        Accessor<TYPE> &acc1,
+        Accessor<TYPE> &acc2,
+        Accessor<TYPE> &acc3)
+          : Kernel(iter), in1(acc1), in2(acc2), in3(acc3) {
+        add_accessor(&in1);
+        add_accessor(&in2);
+        add_accessor(&in3);
+    }
+
+    void kernel() {
+        TYPE interm_pixel1 = in1();
+        TYPE interm_pixel2 = in2();
+        TYPE interm_pixel3 = in3();
+        output() = interm_pixel1 + interm_pixel2 + interm_pixel3;
+    }
+};
+
+// forward declaration of reference implementation
+void kernel_fusion(TYPE *in, TYPE *out, int width, int height);
+
+/*************************************************************************
+ * Main function                                                         *
+ *************************************************************************/
+// Usage: [width [height]] — a single argument sets both dimensions.
+HIPACC_CODEGEN int main(int argc, const char **argv) {
+    int width_arg = WIDTH;
+    int height_arg = HEIGHT;
+
+    if(argc >= 2) {
+        width_arg = std::stoi(argv[1]);
+        height_arg = width_arg;
+    }
+
+    if(argc >= 3) {
+        height_arg = std::stoi(argv[2]);
+    }
+
+    const int width = width_arg;
+    const int height = height_arg;
+
+    // host memory for image of width x height pixels, random
+    TYPE *input = (TYPE*)load_data<TYPE>(width, height);
+    TYPE *ref_out = new TYPE[width*height];
+
+    std::cout << "Testing Hipacc kernel fusion ..." << std::endl;
+
+    //************************************************************************//
+
+    // input and output image of width x height pixels
+    Image<TYPE> in(width, height, input);
+    Image<TYPE> out(width, height);
+
+    // test fusion of three parallel point operators (all reading `in`)
+    // into one point operator that consumes their results
+    Accessor<TYPE> acc0(in);
+
+    Image<TYPE> buf0(width, height);
+    IterationSpace<TYPE> iter0(buf0);
+    PointOperatorExample pointOp0(iter0, acc0);
+
+    Image<TYPE> buf1(width, height);
+    IterationSpace<TYPE> iter1(buf1);
+    PointOperatorExample pointOp1(iter1, acc0);
+
+    Image<TYPE> buf2(width, height);
+    IterationSpace<TYPE> iter2(buf2);
+    PointOperatorExample pointOp2(iter2, acc0);
+
+    Accessor<TYPE> acc1(buf0);
+    Accessor<TYPE> acc2(buf1);
+    Accessor<TYPE> acc3(buf2);
+    IterationSpace<TYPE> iter3(out);
+    OutOperatorExample outOp(iter3, acc1, acc2, acc3);
+
+    // execution after all decls
+    pointOp0.execute();
+    pointOp1.execute();
+    pointOp2.execute();
+    outOp.execute();
+
+    // get pointer to result data
+    TYPE *output = out.data();
+
+    //************************************************************************//
+    std::cout << "Calculating reference ..." << std::endl;
+    kernel_fusion(input, ref_out, width, height);
+    compare_results(output, ref_out, width, height);
+
+    // free memory
+    delete[] input;
+    delete[] ref_out;
+
+    return EXIT_SUCCESS;
+}
+
+// kernel fusion reference
+//
+// CPU counterpart of PointOperatorExample::kernel().
+void point_kernel(TYPE *in, TYPE *out, int width, int height) {
+    for (int p = 0; p < width*height; ++p) {
+        TYPE interm_pixel = in[p];
+        interm_pixel += 3;
+        out[p] = interm_pixel;
+    }
+}
+
+// CPU counterpart of OutOperatorExample::kernel().
+void out_kernel(TYPE *in1, TYPE *in2, TYPE *in3, TYPE *out, int width, int height) {
+    for (int p = 0; p < width*height; ++p) {
+        TYPE interm_pixel1 = in1[p];
+        TYPE interm_pixel2 = in2[p];
+        TYPE interm_pixel3 = in3[p];
+        out[p] = interm_pixel1 + interm_pixel2 + interm_pixel3;
+    }
+}
+
+// Reference pipeline: three independent point passes over `in`, then one
+// combining pass into `out`.
+void kernel_fusion(TYPE *in, TYPE *out, int width, int height) {
+    TYPE *ref_buf0 = new TYPE[width*height];
+    TYPE *ref_buf1 = new TYPE[width*height];
+    TYPE *ref_buf2 = new TYPE[width*height];
+
+    // left-most
+    point_kernel(in, ref_buf0, width, height);
+
+    // center operator
+    point_kernel(in, ref_buf1, width, height);
+
+    // right-most operator
+    point_kernel(in, ref_buf2, width, height);
+
+    // out operator
+    out_kernel(ref_buf0, ref_buf1, ref_buf2, out, width, height);
+
+    delete[] ref_buf0;
+    delete[] ref_buf1;
+    delete[] ref_buf2;
+}
diff --git a/samples-public/6_Test/Kernel_Fusion_3P2P_Heavy/CMakeLists.txt b/samples-public/6_Test/Kernel_Fusion_3P2P_Heavy/CMakeLists.txt
new file mode 100644
index 00000000..a33eaad4
--- /dev/null
+++ b/samples-public/6_Test/Kernel_Fusion_3P2P_Heavy/CMakeLists.txt
@@ -0,0 +1,10 @@
+cmake_minimum_required(VERSION 3.14)
+
+get_filename_component(SAMPLE_NAME "${CMAKE_CURRENT_LIST_DIR}" NAME)
+
+project(${SAMPLE_NAME})
+
+#add_hipacc_sample_dsl()
+#add_hipacc_sample_cpu()
+add_hipacc_sample_cuda(FUSION LOCAL PPT) # kernel fusion is only supported for CUDA
+#add_hipacc_sample_opencl(CPU GPU)
diff --git a/samples-public/6_Test/Kernel_Fusion_3P2P_Heavy/src/main.cpp b/samples-public/6_Test/Kernel_Fusion_3P2P_Heavy/src/main.cpp
new file
mode 100644 index 00000000..45ae6f1d --- /dev/null +++ b/samples-public/6_Test/Kernel_Fusion_3P2P_Heavy/src/main.cpp @@ -0,0 +1,204 @@ +// +// Copyright (c) 2020, University of Erlangen-Nuremberg +// Copyright (c) 2012, University of Erlangen-Nuremberg +// Copyright (c) 2012, Siemens AG +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// + +#include "hipacc.hpp" + +#include +#include + +#define WIDTH 512 +#define HEIGHT 512 +#define TYPE uchar +#define N_ITER 1024 + +using namespace hipacc; +using namespace hipacc::math; + +// Kernel description in Hipacc +class PointOperatorExample : public Kernel { + private: + Accessor ∈ + int n_iter; + + public: + PointOperatorExample(IterationSpace &iter, Accessor &acc, int n_iter) + : Kernel(iter), in(acc), n_iter(n_iter) { + add_accessor(&in); + } + + void kernel() { + TYPE interm_pixel = in(); + for(int i = 0; i < n_iter; ++i) { + interm_pixel += 3; + } + output() = interm_pixel; + } +}; + +class OutOperatorExample : public Kernel { + private: + Accessor &in1; + Accessor &in2; + Accessor &in3; + + public: + OutOperatorExample( + IterationSpace &iter, + Accessor &acc1, + Accessor &acc2, + Accessor &acc3) + : Kernel(iter), in1(acc1), in2(acc2), in3(acc3) { + add_accessor(&in1); + add_accessor(&in2); + add_accessor(&in3); + } + + void kernel() { + TYPE interm_pixel1 = in1(); + TYPE interm_pixel2 = in2(); + TYPE interm_pixel3 = in3(); + output() = interm_pixel1 + interm_pixel2 + interm_pixel3; + } +}; + +// forward declaration of reference implementation +void kernel_fusion(TYPE *in, TYPE *out, int width, int height); + +/************************************************************************* + * Main function * + *************************************************************************/ +HIPACC_CODEGEN int main(int argc, const char **argv) { + int width_arg = WIDTH; + int height_arg = HEIGHT; + + if(argc >= 2) { + width_arg = std::stoi(argv[1]); + height_arg = width_arg; + } + + if(argc >= 3) { + height_arg = std::stoi(argv[2]); + } + + const int width = width_arg; + const int height = height_arg; + + // host memory for image of width x height pixels, random + TYPE *input = (TYPE*)load_data(width, height); + TYPE *ref_out = new TYPE[width*height]; + + std::cout << "Testing Hipacc kernel fusion ..." 
<< std::endl; + + //************************************************************************// + + // input and output image of width x height pixels + Image in(width, height, input); + Image out(width, height); + + // test parallel point-point to point kernel fusion + Accessor acc0(in); + + Image buf0(width, height); + IterationSpace iter0(buf0); + PointOperatorExample pointOp0(iter0, acc0, N_ITER); + + Image buf1(width, height); + IterationSpace iter1(buf1); + PointOperatorExample pointOp1(iter1, acc0, N_ITER); + + Image buf2(width, height); + IterationSpace iter2(buf2); + PointOperatorExample pointOp2(iter2, acc0, N_ITER); + + Accessor acc1(buf0); + Accessor acc2(buf1); + Accessor acc3(buf2); + IterationSpace iter3(out); + OutOperatorExample outOp(iter3, acc1, acc2, acc3); + + // execution after all decls + pointOp0.execute(); + pointOp1.execute(); + pointOp2.execute(); + outOp.execute(); + + // get pointer to result data + TYPE *output = out.data(); + + //************************************************************************// + std::cout << "Calculating reference ..." 
<< std::endl; + kernel_fusion(input, ref_out, width, height); + compare_results(output, ref_out, width, height); + + // free memory + delete[] input; + delete[] ref_out; + + return EXIT_SUCCESS; +} + +// kernel fusion reference +void point_kernel(TYPE *in, TYPE *out, int width, int height) { + for (int p = 0; p < width*height; ++p) { + TYPE interm_pixel = in[p]; + for(int i = 0; i < N_ITER; ++i) { + interm_pixel += 3; + } + out[p] = interm_pixel; + } +} + +void out_kernel(TYPE *in1, TYPE *in2, TYPE *in3, TYPE *out, int width, int height) { + for (int p = 0; p < width*height; ++p) { + TYPE interm_pixel1 = in1[p]; + TYPE interm_pixel2 = in2[p]; + TYPE interm_pixel3 = in3[p]; + out[p] = interm_pixel1 + interm_pixel2 + interm_pixel3; + } +} + +void kernel_fusion(TYPE *in, TYPE *out, int width, int height) { + TYPE *ref_buf0 = new TYPE[width*height]; + TYPE *ref_buf1 = new TYPE[width*height]; + TYPE *ref_buf2 = new TYPE[width*height]; + + // left-most + point_kernel(in, ref_buf0, width, height); + + // center operator + point_kernel(in, ref_buf1, width, height); + + // right-most operator + point_kernel(in, ref_buf2, width, height); + + // out operator + out_kernel(ref_buf0, ref_buf1, ref_buf2, out, width, height); + + delete[] ref_buf0; + delete[] ref_buf1; + delete[] ref_buf2; +} diff --git a/samples-public/6_Test/Kernel_Fusion_4P2P/CMakeLists.txt b/samples-public/6_Test/Kernel_Fusion_4P2P/CMakeLists.txt new file mode 100644 index 00000000..453b7a37 --- /dev/null +++ b/samples-public/6_Test/Kernel_Fusion_4P2P/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.14) + +get_filename_component(SAMPLE_NAME "${CMAKE_CURRENT_LIST_DIR}" NAME) + +project(${SAMPLE_NAME}) + +#add_hipacc_sample_dsl() +#add_hipacc_sample_cpu() +add_hipacc_sample_cuda(FUSION LOCAL PPT PPTN FUSEDPPTN) # kernel fusion is only supported for CUDA +#add_hipacc_sample_opencl(CPU GPU) diff --git a/samples-public/6_Test/Kernel_Fusion_4P2P/src/main.cpp 
b/samples-public/6_Test/Kernel_Fusion_4P2P/src/main.cpp new file mode 100644 index 00000000..11cf38d5 --- /dev/null +++ b/samples-public/6_Test/Kernel_Fusion_4P2P/src/main.cpp @@ -0,0 +1,214 @@ +// +// Copyright (c) 2020, University of Erlangen-Nuremberg +// Copyright (c) 2012, University of Erlangen-Nuremberg +// Copyright (c) 2012, Siemens AG +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// + +#include "hipacc.hpp" + +#include +#include + +#define WIDTH 512 +#define HEIGHT 512 +#define TYPE uchar + +using namespace hipacc; +using namespace hipacc::math; + +// Kernel description in Hipacc +class PointOperatorExample : public Kernel { + private: + Accessor &in; + + public: + PointOperatorExample(IterationSpace &iter, Accessor &acc) + : Kernel(iter), in(acc) { + add_accessor(&in); + } + + void kernel() { + TYPE interm_pixel = in(); + interm_pixel += 3; + output() = interm_pixel; + } +}; + +class OutOperatorExample : public Kernel { + private: + Accessor &in1; + Accessor &in2; + Accessor &in3; + Accessor &in4; + + public: + OutOperatorExample( + IterationSpace &iter, + Accessor &acc1, + Accessor &acc2, + Accessor &acc3, + Accessor &acc4) + : Kernel(iter), in1(acc1), in2(acc2), in3(acc3), in4(acc4) { + add_accessor(&in1); + add_accessor(&in2); + add_accessor(&in3); + add_accessor(&in4); + } + + void kernel() { + TYPE interm_pixel1 = in1(); + TYPE interm_pixel2 = in2(); + TYPE interm_pixel3 = in3(); + TYPE interm_pixel4 = in4(); + output() = interm_pixel1 + interm_pixel2 + interm_pixel3 + interm_pixel4; + } +}; + +// forward declaration of reference implementation +void kernel_fusion(TYPE *in, TYPE *out, int width, int height); + +/************************************************************************* + * Main function * + *************************************************************************/ +HIPACC_CODEGEN int main(int argc, const char **argv) { + int width_arg = WIDTH; + int height_arg = HEIGHT; + + if(argc >= 2) { + width_arg = std::stoi(argv[1]); + height_arg = width_arg; + } + + if(argc >= 3) { + height_arg = std::stoi(argv[2]); + } + + const int width = width_arg; + const int height = height_arg; + + // host memory for image of width x height pixels, random + TYPE *input = (TYPE*)load_data(width, height); + TYPE *ref_out = new TYPE[width*height]; + + std::cout << "Testing Hipacc kernel fusion ..." 
<< std::endl; + + //************************************************************************// + + // input and output image of width x height pixels + Image in(width, height, input); + Image out(width, height); + + // test parallel point-point to point kernel fusion + Accessor acc0(in); + + Image buf0(width, height); + IterationSpace iter0(buf0); + PointOperatorExample pointOp0(iter0, acc0); + + Image buf1(width, height); + IterationSpace iter1(buf1); + PointOperatorExample pointOp1(iter1, acc0); + + Image buf2(width, height); + IterationSpace iter2(buf2); + PointOperatorExample pointOp2(iter2, acc0); + + Image buf3(width, height); + IterationSpace iter3(buf3); + PointOperatorExample pointOp3(iter3, acc0); + + Accessor acc1(buf0); + Accessor acc2(buf1); + Accessor acc3(buf2); + Accessor acc4(buf3); + IterationSpace iter4(out); + OutOperatorExample outOp(iter4, acc1, acc2, acc3, acc4); + + // execution after all decls + pointOp0.execute(); + pointOp1.execute(); + pointOp2.execute(); + pointOp3.execute(); + outOp.execute(); + + // get pointer to result data + TYPE *output = out.data(); + + //************************************************************************// + std::cout << "Calculating reference ..." 
<< std::endl; + kernel_fusion(input, ref_out, width, height); + compare_results(output, ref_out, width, height); + + // free memory + delete[] input; + delete[] ref_out; + + return EXIT_SUCCESS; +} + +// kernel fusion reference +void point_kernel(TYPE *in, TYPE *out, int width, int height) { + for (int p = 0; p < width*height; ++p) { + TYPE interm_pixel = in[p]; + interm_pixel += 3; + out[p] = interm_pixel; + } +} + +void out_kernel(TYPE *in1, TYPE *in2, TYPE *in3, TYPE *in4, TYPE *out, int width, int height) { + for (int p = 0; p < width*height; ++p) { + TYPE interm_pixel1 = in1[p]; + TYPE interm_pixel2 = in2[p]; + TYPE interm_pixel3 = in3[p]; + TYPE interm_pixel4 = in4[p]; + out[p] = interm_pixel1 + interm_pixel2 + interm_pixel3 + interm_pixel4; + } +} + +void kernel_fusion(TYPE *in, TYPE *out, int width, int height) { + TYPE *ref_buf0 = new TYPE[width*height]; + TYPE *ref_buf1 = new TYPE[width*height]; + TYPE *ref_buf2 = new TYPE[width*height]; + TYPE *ref_buf3 = new TYPE[width*height]; + + // left-most + point_kernel(in, ref_buf0, width, height); + + // center-left operator + point_kernel(in, ref_buf1, width, height); + + // center-right operator + point_kernel(in, ref_buf2, width, height); + + // right-most operator + point_kernel(in, ref_buf3, width, height); + + // out operator + out_kernel(ref_buf0, ref_buf1, ref_buf2, ref_buf3, out, width, height); + + delete[] ref_buf0; + delete[] ref_buf1; + delete[] ref_buf2; + delete[] ref_buf3; +} diff --git a/samples-public/6_Test/Kernel_Fusion_4P2P_Heavy/CMakeLists.txt b/samples-public/6_Test/Kernel_Fusion_4P2P_Heavy/CMakeLists.txt new file mode 100644 index 00000000..a33eaad4 --- /dev/null +++ b/samples-public/6_Test/Kernel_Fusion_4P2P_Heavy/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.14) + +get_filename_component(SAMPLE_NAME "${CMAKE_CURRENT_LIST_DIR}" NAME) + +project(${SAMPLE_NAME}) + +#add_hipacc_sample_dsl() +#add_hipacc_sample_cpu() +add_hipacc_sample_cuda(FUSION LOCAL PPT) # kernel 
fusion is only supported for CUDA +#add_hipacc_sample_opencl(CPU GPU) diff --git a/samples-public/6_Test/Kernel_Fusion_4P2P_Heavy/src/main.cpp b/samples-public/6_Test/Kernel_Fusion_4P2P_Heavy/src/main.cpp new file mode 100644 index 00000000..bb455ef3 --- /dev/null +++ b/samples-public/6_Test/Kernel_Fusion_4P2P_Heavy/src/main.cpp @@ -0,0 +1,220 @@ +// +// Copyright (c) 2020, University of Erlangen-Nuremberg +// Copyright (c) 2012, University of Erlangen-Nuremberg +// Copyright (c) 2012, Siemens AG +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// + +#include "hipacc.hpp" + +#include +#include + +#define WIDTH 512 +#define HEIGHT 512 +#define TYPE uchar +#define N_ITER 1024 + +using namespace hipacc; +using namespace hipacc::math; + +// Kernel description in Hipacc +class PointOperatorExample : public Kernel { + private: + Accessor &in; + int n_iter; + + public: + PointOperatorExample(IterationSpace &iter, Accessor &acc, int n_iter) + : Kernel(iter), in(acc), n_iter(n_iter) { + add_accessor(&in); + } + + void kernel() { + TYPE interm_pixel = in(); + for(int i = 0; i < n_iter; ++i) { + interm_pixel += 3; + } + output() = interm_pixel; + } +}; + +class OutOperatorExample : public Kernel { + private: + Accessor &in1; + Accessor &in2; + Accessor &in3; + Accessor &in4; + + public: + OutOperatorExample( + IterationSpace &iter, + Accessor &acc1, + Accessor &acc2, + Accessor &acc3, + Accessor &acc4) + : Kernel(iter), in1(acc1), in2(acc2), in3(acc3), in4(acc4) { + add_accessor(&in1); + add_accessor(&in2); + add_accessor(&in3); + add_accessor(&in4); + } + + void kernel() { + TYPE interm_pixel1 = in1(); + TYPE interm_pixel2 = in2(); + TYPE interm_pixel3 = in3(); + TYPE interm_pixel4 = in4(); + output() = interm_pixel1 + interm_pixel2 + interm_pixel3 + interm_pixel4; + } +}; + +// forward declaration of reference implementation +void kernel_fusion(TYPE *in, TYPE *out, int width, int height); + +/************************************************************************* + * Main function * + *************************************************************************/ +HIPACC_CODEGEN int main(int argc, const char **argv) { + int width_arg = WIDTH; + int height_arg = HEIGHT; + + if(argc >= 2) { + width_arg = std::stoi(argv[1]); + height_arg = width_arg; + } + + if(argc >= 3) { + height_arg = std::stoi(argv[2]); + } + + const int width = width_arg; + const int height = height_arg; + + // host memory for image of width x height pixels, random + TYPE *input = (TYPE*)load_data(width, height); + TYPE *ref_out = new 
TYPE[width*height]; + + std::cout << "Testing Hipacc kernel fusion ..." << std::endl; + + //************************************************************************// + + // input and output image of width x height pixels + Image in(width, height, input); + Image out(width, height); + + // test parallel point-point to point kernel fusion + Accessor acc0(in); + + Image buf0(width, height); + IterationSpace iter0(buf0); + PointOperatorExample pointOp0(iter0, acc0, N_ITER); + + Image buf1(width, height); + IterationSpace iter1(buf1); + PointOperatorExample pointOp1(iter1, acc0, N_ITER); + + Image buf2(width, height); + IterationSpace iter2(buf2); + PointOperatorExample pointOp2(iter2, acc0, N_ITER); + + Image buf3(width, height); + IterationSpace iter3(buf3); + PointOperatorExample pointOp3(iter3, acc0, N_ITER); + + Accessor acc1(buf0); + Accessor acc2(buf1); + Accessor acc3(buf2); + Accessor acc4(buf3); + IterationSpace iter4(out); + OutOperatorExample outOp(iter4, acc1, acc2, acc3, acc4); + + // execution after all decls + pointOp0.execute(); + pointOp1.execute(); + pointOp2.execute(); + pointOp3.execute(); + outOp.execute(); + + // get pointer to result data + TYPE *output = out.data(); + + //************************************************************************// + std::cout << "Calculating reference ..." 
<< std::endl; + kernel_fusion(input, ref_out, width, height); + compare_results(output, ref_out, width, height); + + // free memory + delete[] input; + delete[] ref_out; + + return EXIT_SUCCESS; +} + +// kernel fusion reference +void point_kernel(TYPE *in, TYPE *out, int width, int height) { + for (int p = 0; p < width*height; ++p) { + TYPE interm_pixel = in[p]; + for(int i = 0; i < N_ITER; ++i) { + interm_pixel += 3; + } + out[p] = interm_pixel; + } +} + +void out_kernel(TYPE *in1, TYPE *in2, TYPE *in3, TYPE *in4, TYPE *out, int width, int height) { + for (int p = 0; p < width*height; ++p) { + TYPE interm_pixel1 = in1[p]; + TYPE interm_pixel2 = in2[p]; + TYPE interm_pixel3 = in3[p]; + TYPE interm_pixel4 = in4[p]; + out[p] = interm_pixel1 + interm_pixel2 + interm_pixel3 + interm_pixel4; + } +} + +void kernel_fusion(TYPE *in, TYPE *out, int width, int height) { + TYPE *ref_buf0 = new TYPE[width*height]; + TYPE *ref_buf1 = new TYPE[width*height]; + TYPE *ref_buf2 = new TYPE[width*height]; + TYPE *ref_buf3 = new TYPE[width*height]; + + // left-most + point_kernel(in, ref_buf0, width, height); + + // center-left operator + point_kernel(in, ref_buf1, width, height); + + // center-right operator + point_kernel(in, ref_buf2, width, height); + + // right-most operator + point_kernel(in, ref_buf3, width, height); + + // out operator + out_kernel(ref_buf0, ref_buf1, ref_buf2, ref_buf3, out, width, height); + + delete[] ref_buf0; + delete[] ref_buf1; + delete[] ref_buf2; + delete[] ref_buf3; +} diff --git a/samples-public/6_Test/Kernel_Fusion_LL2L/CMakeLists.txt b/samples-public/6_Test/Kernel_Fusion_LL2L/CMakeLists.txt new file mode 100644 index 00000000..a33eaad4 --- /dev/null +++ b/samples-public/6_Test/Kernel_Fusion_LL2L/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.14) + +get_filename_component(SAMPLE_NAME "${CMAKE_CURRENT_LIST_DIR}" NAME) + +project(${SAMPLE_NAME}) + +#add_hipacc_sample_dsl() +#add_hipacc_sample_cpu() +add_hipacc_sample_cuda(FUSION 
LOCAL PPT) # kernel fusion is only supported for CUDA +#add_hipacc_sample_opencl(CPU GPU) diff --git a/samples-public/6_Test/Kernel_Fusion_LL2L/src/main.cpp b/samples-public/6_Test/Kernel_Fusion_LL2L/src/main.cpp new file mode 100644 index 00000000..492ea6e9 --- /dev/null +++ b/samples-public/6_Test/Kernel_Fusion_LL2L/src/main.cpp @@ -0,0 +1,261 @@ +// +// Copyright (c) 2020, University of Erlangen-Nuremberg +// Copyright (c) 2012, University of Erlangen-Nuremberg +// Copyright (c) 2012, Siemens AG +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// + +#include "hipacc.hpp" + +#include +#include +#include + +#define WIDTH 512 +#define HEIGHT 512 +#define SIZE_X 5 +#define SIZE_Y 5 +#define TYPE uchar + +using namespace hipacc; +using namespace hipacc::math; + +// Kernel description in Hipacc +class LocalOperatorExample : public Kernel { + private: + Accessor &Input; + Mask &mask; + + public: + LocalOperatorExample(IterationSpace &IS, Accessor &Input, + Mask &mask) + : Kernel(IS), Input(Input), mask(mask) { + add_accessor(&Input); + } + + void kernel() { + output() = (TYPE)(convolve(mask, Reduce::SUM, [&] () -> float { + return mask() * Input(mask); + }) + 0.5f); + } +}; + +class PointOperatorExample : public Kernel { + private: + Accessor &in; + + public: + PointOperatorExample(IterationSpace &iter, Accessor &acc) + : Kernel(iter), in(acc) { + add_accessor(&in); + } + + void kernel() { + TYPE interm_pixel = in(); + interm_pixel += 3; + output() = interm_pixel; + } +}; + +class LocalOutOperatorExample : public Kernel { + private: + Accessor &in1; + Accessor &in2; + Mask &mask; + + public: + LocalOutOperatorExample( + IterationSpace &iter, + Accessor &acc1, + Accessor &acc2, + Mask &mask) + : Kernel(iter), in1(acc1), in2(acc2), mask(mask) { + add_accessor(&in1); + add_accessor(&in2); + } + + void kernel() { + output() = (TYPE)(convolve(mask, Reduce::SUM, [&] () -> float { + return (mask() * in1(mask)) + (mask() * in2(mask)); + }) + 0.5f); + } +}; + +// forward declaration of reference implementation +void kernel_fusion(TYPE *in, TYPE *out, float *filter, + int size_x, int size_y, int width, int height); + +/************************************************************************* + * Main function * + *************************************************************************/ +HIPACC_CODEGEN int main(int argc, const char **argv) { + int width_arg = WIDTH; + int height_arg = HEIGHT; + + if(argc >= 2) { + width_arg = std::stoi(argv[1]); + height_arg = width_arg; + } + + if(argc >= 3) { + height_arg = 
std::stoi(argv[2]); + } + + const int width = width_arg; + const int height = height_arg; + const int size_x = SIZE_X; + const int size_y = SIZE_Y; + const int offset_x = size_x >> 1; + const int offset_y = size_y >> 1; + + // convolution filter mask + const float coef[SIZE_Y][SIZE_X] = { + { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f }, + { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f }, + { 0.026151f, 0.090339f, 0.136565f, 0.090339f, 0.026151f }, + { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f }, + { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f } + }; + + // host memory for image of width x height pixels, random + TYPE *input = (TYPE*)load_data(width, height); + TYPE *ref_out = new TYPE[width*height]; + + std::cout << "Testing Hipacc kernel fusion ..." << std::endl; + + //************************************************************************// + + // input and output image of width x height pixels + Image in(width, height, input); + Image out(width, height); + + // test parallel local-local to local kernel fusion + + Mask mask0(coef); + BoundaryCondition bound0(in, mask0, Boundary::CLAMP); + Accessor acc0(bound0); + Image buf0(width, height); + IterationSpace iter0(buf0); + LocalOperatorExample localOp0(iter0, acc0, mask0); + + Mask mask1(coef); + BoundaryCondition bound1(in, mask1, Boundary::CLAMP); + Accessor acc1(bound1); + Image buf1(width, height); + IterationSpace iter1(buf1); + LocalOperatorExample localOp1(iter1, acc1, mask1); + + Mask mask2(coef); + BoundaryCondition bound2(buf0, mask2, Boundary::CLAMP); + Accessor acc2(bound2); + BoundaryCondition bound3(buf1, mask2, Boundary::CLAMP); + Accessor acc3(bound3); + IterationSpace iter2(out); + LocalOutOperatorExample localOutOp(iter2, acc2, acc3, mask2); + + // execution after all decls + localOp0.execute(); + localOp1.execute(); + localOutOp.execute(); + + // get pointer to result data + TYPE *output = out.data(); + + 
//************************************************************************// + std::cout << "Calculating reference ..." << std::endl; + kernel_fusion(input, ref_out, (float*)coef, size_x, size_y, width, height); + compare_results(output, ref_out, width, height, offset_x*2, offset_y*2); + + // free memory + delete[] input; + delete[] ref_out; + + return EXIT_SUCCESS; +} + +// kernel fusion reference +void local_kernel(TYPE *in, TYPE *out, float *filter, + int size_x, int size_y, int width, int height) { + int anchor_x = size_x >> 1; + int anchor_y = size_y >> 1; + int upper_x = width - anchor_x; + int upper_y = height - anchor_y; + + for (int y=anchor_y; y> 1; + int anchor_y = size_y >> 1; + int upper_x = width - anchor_x; + int upper_y = height - anchor_y; + + for (int y=anchor_y; y +#include + +#define WIDTH 512 +#define HEIGHT 512 +#define SIZE_X 5 +#define SIZE_Y 5 +#define TYPE uchar + +using namespace hipacc; +using namespace hipacc::math; + +// Kernel description in Hipacc +class LocalOperatorExample : public Kernel { + private: + Accessor &Input; + Mask &mask; + + public: + LocalOperatorExample(IterationSpace &IS, Accessor &Input, + Mask &mask) + : Kernel(IS), Input(Input), mask(mask) { + add_accessor(&Input); + } + + void kernel() { + output() = (TYPE)(convolve(mask, Reduce::SUM, [&] () -> float { + return mask() * Input(mask); + }) + 0.5f); + } +}; + +class OutOperatorExample : public Kernel { + private: + Accessor &in1; + Accessor &in2; + + public: + OutOperatorExample( + IterationSpace &iter, + Accessor &acc1, + Accessor &acc2) + : Kernel(iter), in1(acc1), in2(acc2) { + add_accessor(&in1); + add_accessor(&in2); + } + + void kernel() { + TYPE interm_pixel1 = in1(); + TYPE interm_pixel2 = in2(); + output() = interm_pixel1 + interm_pixel2; + } +}; + +// forward declaration of reference implementation +void kernel_fusion(TYPE *in, TYPE *out, float *filter, + int size_x, int size_y, int width, int height); + 
+/************************************************************************* + * Main function * + *************************************************************************/ +HIPACC_CODEGEN int main(int argc, const char **argv) { + int width_arg = WIDTH; + int height_arg = HEIGHT; + + if(argc >= 2) { + width_arg = std::stoi(argv[1]); + height_arg = width_arg; + } + + if(argc >= 3) { + height_arg = std::stoi(argv[2]); + } + + const int width = width_arg; + const int height = height_arg; + const int size_x = SIZE_X; + const int size_y = SIZE_Y; + const int offset_x = size_x >> 1; + const int offset_y = size_y >> 1; + + // convolution filter mask + const float coef[SIZE_Y][SIZE_X] = { + { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f }, + { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f }, + { 0.026151f, 0.090339f, 0.136565f, 0.090339f, 0.026151f }, + { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f }, + { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f } + }; + + // host memory for image of width x height pixels, random + TYPE *input = (TYPE*)load_data(width, height); + TYPE *ref_out = new TYPE[width*height]; + + std::cout << "Testing Hipacc kernel fusion ..." 
<< std::endl; + + //************************************************************************// + + // input and output image of width x height pixels + Image in(width, height, input); + Image out(width, height); + + // test parallel local-local to point kernel fusion + Mask mask0(coef); + BoundaryCondition bound0(in, mask0, Boundary::CLAMP); + Accessor acc0(bound0); + Image buf0(width, height); + IterationSpace iter0(buf0); + LocalOperatorExample localOp0(iter0, acc0, mask0); + + Mask mask1(coef); + BoundaryCondition bound1(in, mask1, Boundary::CLAMP); + Accessor acc1(bound1); + Image buf1(width, height); + IterationSpace iter1(buf1); + LocalOperatorExample localOp1(iter1, acc1, mask1); + + Accessor acc2(buf0); + Accessor acc3(buf1); + IterationSpace iter2(out); + OutOperatorExample outOp(iter2, acc2, acc3); + + // execution after all decls + localOp0.execute(); + localOp1.execute(); + outOp.execute(); + + // get pointer to result data + TYPE *output = out.data(); + + //************************************************************************// + std::cout << "Calculating reference ..." 
<< std::endl; + kernel_fusion(input, ref_out, (float*)coef, size_x, size_y, width, height); + compare_results(output, ref_out, width, height, offset_x, offset_y); + + // free memory + delete[] input; + delete[] ref_out; + + return EXIT_SUCCESS; +} + +// kernel fusion reference +void local_kernel(TYPE *in, TYPE *out, float *filter, + int size_x, int size_y, int width, int height) { + int anchor_x = size_x >> 1; + int anchor_y = size_y >> 1; + int upper_x = width - anchor_x; + int upper_y = height - anchor_y; + + for (int y=anchor_y; y +#include +#include + +#define WIDTH 512 +#define HEIGHT 512 +#define SIZE_X 5 +#define SIZE_Y 5 +#define TYPE uchar + +using namespace hipacc; +using namespace hipacc::math; + +// Kernel description in Hipacc +class LocalOperatorExample : public Kernel { + private: + Accessor &Input; + Mask &mask; + + public: + LocalOperatorExample(IterationSpace &IS, Accessor &Input, + Mask &mask) + : Kernel(IS), Input(Input), mask(mask) { + add_accessor(&Input); + } + + void kernel() { + output() = (TYPE)(convolve(mask, Reduce::SUM, [&] () -> float { + return mask() * Input(mask); + }) + 0.5f); + } +}; + +class PointOperatorExample : public Kernel { + private: + Accessor &in; + + public: + PointOperatorExample(IterationSpace &iter, Accessor &acc) + : Kernel(iter), in(acc) { + add_accessor(&in); + } + + void kernel() { + TYPE interm_pixel = in(); + interm_pixel += 3; + output() = interm_pixel; + } +}; + +class LocalOutOperatorExample : public Kernel { + private: + Accessor &in1; + Accessor &in2; + Mask &mask; + + public: + LocalOutOperatorExample( + IterationSpace &iter, + Accessor &acc1, + Accessor &acc2, + Mask &mask) + : Kernel(iter), in1(acc1), in2(acc2), mask(mask) { + add_accessor(&in1); + add_accessor(&in2); + } + + void kernel() { + output() = (TYPE)(convolve(mask, Reduce::SUM, [&] () -> float { + return (mask() * in1(mask)) + (mask() * in2(mask)); + }) + 0.5f); + } +}; + +// forward declaration of reference implementation +void 
kernel_fusion(TYPE *in, TYPE *out, float *filter, + int size_x, int size_y, int width, int height); + +/************************************************************************* + * Main function * + *************************************************************************/ +HIPACC_CODEGEN int main(int argc, const char **argv) { + int width_arg = WIDTH; + int height_arg = HEIGHT; + + if(argc >= 2) { + width_arg = std::stoi(argv[1]); + height_arg = width_arg; + } + + if(argc >= 3) { + height_arg = std::stoi(argv[2]); + } + + const int width = width_arg; + const int height = height_arg; + const int size_x = SIZE_X; + const int size_y = SIZE_Y; + const int offset_x = size_x >> 1; + const int offset_y = size_y >> 1; + + // convolution filter mask + const float coef[SIZE_Y][SIZE_X] = { + { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f }, + { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f }, + { 0.026151f, 0.090339f, 0.136565f, 0.090339f, 0.026151f }, + { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f }, + { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f } + }; + + // host memory for image of width x height pixels, random + TYPE *input = (TYPE*)load_data(width, height); + TYPE *ref_out = new TYPE[width*height]; + + std::cout << "Testing Hipacc kernel fusion ..." 
<< std::endl; + + //************************************************************************// + + // input and output image of width x height pixels + Image in(width, height, input); + Image out(width, height); + + // test local-point to local kernel fusion + + Mask mask0(coef); + BoundaryCondition bound0(in, mask0, Boundary::CLAMP); + Accessor acc0(bound0); + Image buf0(width, height); + IterationSpace iter0(buf0); + LocalOperatorExample localOp(iter0, acc0, mask0); + + Accessor acc1(in); + Image buf1(width, height); + IterationSpace iter1(buf1); + PointOperatorExample pointOp(iter1, acc1); + + Mask mask1(coef); + BoundaryCondition bound1(buf0, mask1, Boundary::CLAMP); + Accessor acc2(bound1); + BoundaryCondition bound2(buf1, mask1, Boundary::CLAMP); + Accessor acc3(bound2); + IterationSpace iter2(out); + LocalOutOperatorExample localOutOp(iter2, acc2, acc3, mask1); + + // execution after all decls + localOp.execute(); + pointOp.execute(); + localOutOp.execute(); + + // get pointer to result data + TYPE *output = out.data(); + + //************************************************************************// + std::cout << "Calculating reference ..." 
<< std::endl; + kernel_fusion(input, ref_out, (float*)coef, size_x, size_y, width, height); + compare_results(output, ref_out, width, height, offset_x*2, offset_y*2); + + // free memory + delete[] input; + delete[] ref_out; + + return EXIT_SUCCESS; +} + +// kernel fusion reference +void local_kernel(TYPE *in, TYPE *out, float *filter, + int size_x, int size_y, int width, int height) { + int anchor_x = size_x >> 1; + int anchor_y = size_y >> 1; + int upper_x = width - anchor_x; + int upper_y = height - anchor_y; + + for (int y=anchor_y; y> 1; + int anchor_y = size_y >> 1; + int upper_x = width - anchor_x; + int upper_y = height - anchor_y; + + for (int y=anchor_y; y +#include + +#define WIDTH 512 +#define HEIGHT 512 +#define SIZE_X 5 +#define SIZE_Y 5 +#define TYPE uchar + +using namespace hipacc; +using namespace hipacc::math; + +// Kernel description in Hipacc +class LocalOperatorExample : public Kernel { + private: + Accessor &Input; + Mask &mask; + + public: + LocalOperatorExample(IterationSpace &IS, Accessor &Input, + Mask &mask) + : Kernel(IS), Input(Input), mask(mask) { + add_accessor(&Input); + } + + void kernel() { + output() = (TYPE)(convolve(mask, Reduce::SUM, [&] () -> float { + return mask() * Input(mask); + }) + 0.5f); + } +}; + +class PointOperatorExample : public Kernel { + private: + Accessor &in; + + public: + PointOperatorExample( + IterationSpace &iter, + Accessor &acc) + : Kernel(iter), in(acc) { + add_accessor(&in); + } + + void kernel() { + TYPE interm_pixel = in(); + interm_pixel += 3; + output() = interm_pixel; + } +}; + +class OutOperatorExample : public Kernel { + private: + Accessor &in1; + Accessor &in2; + + public: + OutOperatorExample( + IterationSpace &iter, + Accessor &acc1, + Accessor &acc2) + : Kernel(iter), in1(acc1), in2(acc2) { + add_accessor(&in1); + add_accessor(&in2); + } + + void kernel() { + TYPE interm_pixel1 = in1(); + TYPE interm_pixel2 = in2(); + output() = interm_pixel1 + interm_pixel2; + } +}; + +// forward declaration 
of reference implementation +void kernel_fusion(TYPE *in, TYPE *out, float *filter, + int size_x, int size_y, int width, int height); + +/************************************************************************* + * Main function * + *************************************************************************/ +HIPACC_CODEGEN int main(int argc, const char **argv) { + int width_arg = WIDTH; + int height_arg = HEIGHT; + + if(argc >= 2) { + width_arg = std::stoi(argv[1]); + height_arg = width_arg; + } + + if(argc >= 3) { + height_arg = std::stoi(argv[2]); + } + + const int width = width_arg; + const int height = height_arg; + const int size_x = SIZE_X; + const int size_y = SIZE_Y; + const int offset_x = size_x >> 1; + const int offset_y = size_y >> 1; + + // convolution filter mask + const float coef[SIZE_Y][SIZE_X] = { + { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f }, + { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f }, + { 0.026151f, 0.090339f, 0.136565f, 0.090339f, 0.026151f }, + { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f }, + { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f } + }; + + // host memory for image of width x height pixels, random + TYPE *input = (TYPE*)load_data(width, height); + TYPE *ref_out = new TYPE[width*height]; + + std::cout << "Testing Hipacc kernel fusion ..." 
<< std::endl; + + //************************************************************************// + + // input and output image of width x height pixels + Image in(width, height, input); + Image out(width, height); + + // test parallel local-point to point kernel fusion + Mask mask0(coef); + BoundaryCondition bound0(in, mask0, Boundary::CLAMP); + Accessor acc0(bound0); + Image buf0(width, height); + IterationSpace iter0(buf0); + LocalOperatorExample localOp(iter0, acc0, mask0); + + Accessor acc1(in); + Image buf1(width, height); + IterationSpace iter1(buf1); + PointOperatorExample pointOp(iter1, acc1); + + Accessor acc2(buf0); + Accessor acc3(buf1); + IterationSpace iter2(out); + OutOperatorExample outOp(iter2, acc2, acc3); + + // execution after all decls + localOp.execute(); + pointOp.execute(); + outOp.execute(); + + // get pointer to result data + TYPE *output = out.data(); + + //************************************************************************// + std::cout << "Calculating reference ..." 
<< std::endl; + kernel_fusion(input, ref_out, (float*)coef, size_x, size_y, width, height); + compare_results(output, ref_out, width, height, offset_x, offset_y); + + // free memory + delete[] input; + delete[] ref_out; + + return EXIT_SUCCESS; +} + +// kernel fusion reference +void local_kernel(TYPE *in, TYPE *out, float *filter, + int size_x, int size_y, int width, int height) { + int anchor_x = size_x >> 1; + int anchor_y = size_y >> 1; + int upper_x = width - anchor_x; + int upper_y = height - anchor_y; + + for (int y=anchor_y; y= 2) { + width_arg = std::stoi(argv[1]); + height_arg = width_arg; + } + + if(argc >= 3) { + height_arg = std::stoi(argv[2]); + } + + const int width = width_arg; + const int height = height_arg; // host memory for image of width x height pixels, random TYPE *input = (TYPE*)load_data(width, height); @@ -86,18 +98,18 @@ HIPACC_CODEGEN int main(int argc, const char **argv) { Accessor acc1(buf0); Image buf1(width, height); - IterationSpace iter1(buf1); + IterationSpace iter1(out); PointOperatorExample pointOp1(iter1, acc1); - Accessor acc2(buf1); - Image buf2(width, height); - IterationSpace iter2(out); - PointOperatorExample pointOp2(iter2, acc2); + //Accessor acc2(buf1); + //Image buf2(width, height); + //IterationSpace iter2(out); + //PointOperatorExample pointOp2(iter2, acc2); // execution after all decls pointOp0.execute(); pointOp1.execute(); - pointOp2.execute(); + //pointOp2.execute(); // get pointer to result data TYPE *output = out.data(); @@ -127,8 +139,8 @@ void kernel_fusion(TYPE *in, TYPE *out, int width, int height) { TYPE *ref_buf0 = new TYPE[width*height]; TYPE *ref_buf1 = new TYPE[width*height]; point_kernel(in, ref_buf0, width, height); - point_kernel(ref_buf0, ref_buf1, width, height); - point_kernel(ref_buf1, out, width, height); + point_kernel(ref_buf0, out, width, height); + //point_kernel(ref_buf1, out, width, height); delete[] ref_buf0; delete[] ref_buf1; } diff --git 
a/samples-public/6_Test/Kernel_Fusion_P2P_Heavy/CMakeLists.txt b/samples-public/6_Test/Kernel_Fusion_P2P_Heavy/CMakeLists.txt new file mode 100644 index 00000000..a33eaad4 --- /dev/null +++ b/samples-public/6_Test/Kernel_Fusion_P2P_Heavy/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.14) + +get_filename_component(SAMPLE_NAME "${CMAKE_CURRENT_LIST_DIR}" NAME) + +project(${SAMPLE_NAME}) + +#add_hipacc_sample_dsl() +#add_hipacc_sample_cpu() +add_hipacc_sample_cuda(FUSION LOCAL PPT) # kernel fusion is only supported for CUDA +#add_hipacc_sample_opencl(CPU GPU) diff --git a/samples-public/6_Test/Kernel_Fusion_P2P_Heavy/src/main.cpp b/samples-public/6_Test/Kernel_Fusion_P2P_Heavy/src/main.cpp new file mode 100644 index 00000000..dcdbeb65 --- /dev/null +++ b/samples-public/6_Test/Kernel_Fusion_P2P_Heavy/src/main.cpp @@ -0,0 +1,139 @@ +// +// Copyright (c) 2020, University of Erlangen-Nuremberg +// Copyright (c) 2012, University of Erlangen-Nuremberg +// Copyright (c) 2012, Siemens AG +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#include "hipacc.hpp" + +#include +#include + +#define WIDTH 512 +#define HEIGHT 512 +#define TYPE uchar +#define N_ITER 1024 + +using namespace hipacc; +using namespace hipacc::math; + +// Kernel description in Hipacc +class PointOperatorExample : public Kernel { + private: + Accessor ∈ + + public: + PointOperatorExample(IterationSpace &iter, Accessor &acc) + : Kernel(iter), in(acc) { + add_accessor(&in); + } + + void kernel() { + TYPE interm_pixel = in(); + for(int i = 0; i < N_ITER; ++i) { + interm_pixel += 3; + } + output() = interm_pixel; + } +}; + +// forward declaration of reference implementation +void kernel_fusion(TYPE *in, TYPE *out, int width, int height); + +/************************************************************************* + * Main function * + *************************************************************************/ +HIPACC_CODEGEN int main(int argc, const char **argv) { + const int width = WIDTH; + const int height = HEIGHT; + + // host memory for image of width x height pixels, random + TYPE *input = (TYPE*)load_data(width, height); + TYPE *ref_out = new TYPE[width*height]; + + std::cout << "Testing Hipacc kernel fusion ..." << std::endl; + + //************************************************************************// + + // input and output image of width x height pixels + Image in(width, height, input); + Image out(width, height); + + // test point to point kernel fusion + // e.g., p -> p -> ... 
-> p + Accessor acc0(in); + Image buf0(width, height); + IterationSpace iter0(buf0); + PointOperatorExample pointOp0(iter0, acc0); + + Accessor acc1(buf0); + Image buf1(width, height); + IterationSpace iter1(buf1); + PointOperatorExample pointOp1(iter1, acc1); + + Accessor acc2(buf1); + Image buf2(width, height); + IterationSpace iter2(out); + PointOperatorExample pointOp2(iter2, acc2); + + // execution after all decls + pointOp0.execute(); + pointOp1.execute(); + pointOp2.execute(); + + // get pointer to result data + TYPE *output = out.data(); + + //************************************************************************// + std::cout << "Calculating reference ..." << std::endl; + kernel_fusion(input, ref_out, width, height); + compare_results(output, ref_out, width, height); + + // free memory + delete[] input; + delete[] ref_out; + + return EXIT_SUCCESS; +} + +// kernel fusion reference +void point_kernel(TYPE *in, TYPE *out, int width, int height) { + for (int p = 0; p < width*height; ++p) { + TYPE interm_pixel = in[p]; + for(int i = 0; i < N_ITER; ++i) { + interm_pixel += 3; + } + out[p] = interm_pixel; + } +} + +void kernel_fusion(TYPE *in, TYPE *out, int width, int height) { + TYPE *ref_buf0 = new TYPE[width*height]; + TYPE *ref_buf1 = new TYPE[width*height]; + point_kernel(in, ref_buf0, width, height); + point_kernel(ref_buf0, ref_buf1, width, height); + point_kernel(ref_buf1, out, width, height); + delete[] ref_buf0; + delete[] ref_buf1; +} diff --git a/samples-public/6_Test/Kernel_Fusion_PP2L/CMakeLists.txt b/samples-public/6_Test/Kernel_Fusion_PP2L/CMakeLists.txt new file mode 100644 index 00000000..a33eaad4 --- /dev/null +++ b/samples-public/6_Test/Kernel_Fusion_PP2L/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.14) + +get_filename_component(SAMPLE_NAME "${CMAKE_CURRENT_LIST_DIR}" NAME) + +project(${SAMPLE_NAME}) + +#add_hipacc_sample_dsl() +#add_hipacc_sample_cpu() +add_hipacc_sample_cuda(FUSION LOCAL PPT) # kernel fusion is only 
supported for CUDA +#add_hipacc_sample_opencl(CPU GPU) diff --git a/samples-public/6_Test/Kernel_Fusion_PP2L/src/main.cpp b/samples-public/6_Test/Kernel_Fusion_PP2L/src/main.cpp new file mode 100644 index 00000000..5cd52f77 --- /dev/null +++ b/samples-public/6_Test/Kernel_Fusion_PP2L/src/main.cpp @@ -0,0 +1,204 @@ +// +// Copyright (c) 2020, University of Erlangen-Nuremberg +// Copyright (c) 2012, University of Erlangen-Nuremberg +// Copyright (c) 2012, Siemens AG +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// + +#include "hipacc.hpp" + +#include +#include + +#define WIDTH 512 +#define HEIGHT 512 +#define SIZE_X 5 +#define SIZE_Y 5 +#define TYPE uchar + +using namespace hipacc; +using namespace hipacc::math; + +// Kernel description in Hipacc +class PointOperatorExample : public Kernel { + private: + Accessor ∈ + + public: + PointOperatorExample(IterationSpace &iter, Accessor &acc) + : Kernel(iter), in(acc) { + add_accessor(&in); + } + + void kernel() { + TYPE interm_pixel = in(); + interm_pixel += 3; + output() = interm_pixel; + } +}; + +class LocalOutOperatorExample : public Kernel { + private: + Accessor &in1; + Accessor &in2; + Mask &mask; + + public: + LocalOutOperatorExample( + IterationSpace &iter, + Accessor &acc1, + Accessor &acc2, + Mask &mask) + : Kernel(iter), in1(acc1), in2(acc2), mask(mask) { + add_accessor(&in1); + add_accessor(&in2); + } + + void kernel() { + output() = (TYPE)(convolve(mask, Reduce::SUM, [&] () -> float { + return (mask() * in1(mask)) + (mask() * in2(mask)); + }) + 0.5f); + } +}; + +// forward declaration of reference implementation +void kernel_fusion(TYPE *in, TYPE *out, float *filter, + int size_x, int size_y, int width, int height); + +/************************************************************************* + * Main function * + *************************************************************************/ +HIPACC_CODEGEN int main(int argc, const char **argv) { + const int width = WIDTH; + const int height = HEIGHT; + const int size_x = SIZE_X; + const int size_y = SIZE_Y; + const int offset_x = size_x >> 1; + const int offset_y = size_y >> 1; + + // convolution filter mask + const float coef[SIZE_Y][SIZE_X] = { + { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f }, + { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f }, + { 0.026151f, 0.090339f, 0.136565f, 0.090339f, 0.026151f }, + { 0.017300f, 0.059761f, 0.090339f, 0.059761f, 0.017300f }, + { 0.005008f, 0.017300f, 0.026151f, 0.017300f, 0.005008f } + }; + + // host 
memory for image of width x height pixels, random + TYPE *input = (TYPE*)load_data(width, height); + TYPE *ref_out = new TYPE[width*height]; + + std::cout << "Testing Hipacc kernel fusion ..." << std::endl; + + //************************************************************************// + + // input and output image of width x height pixels + Image in(width, height, input); + Image out(width, height); + + // test point-point to local kernel fusion + + Accessor acc0(in); + Image buf0(width, height); + IterationSpace iter0(buf0); + PointOperatorExample pointOp0(iter0, acc0); + + Accessor acc1(in); + Image buf1(width, height); + IterationSpace iter1(buf1); + PointOperatorExample pointOp1(iter1, acc1); + + Mask mask(coef); + BoundaryCondition bound0(buf0, mask, Boundary::CLAMP); + Accessor acc2(bound0); + BoundaryCondition bound1(buf1, mask, Boundary::CLAMP); + Accessor acc3(bound1); + IterationSpace iter2(out); + LocalOutOperatorExample localOutOp(iter2, acc2, acc3, mask); + + // execution after all decls + pointOp0.execute(); + pointOp1.execute(); + localOutOp.execute(); + + // get pointer to result data + TYPE *output = out.data(); + + //************************************************************************// + std::cout << "Calculating reference ..." 
<< std::endl; + kernel_fusion(input, ref_out, (float*)coef, size_x, size_y, width, height); + compare_results(output, ref_out, width, height, offset_x, offset_y); + + // free memory + delete[] input; + delete[] ref_out; + + return EXIT_SUCCESS; +} + +// kernel fusion reference +void point_kernel(TYPE *in, TYPE *out, int width, int height) { + for (int p = 0; p < width*height; ++p) { + TYPE interm_pixel = in[p]; + interm_pixel += 3; + out[p] = interm_pixel; + } +} + +void local_out_kernel(TYPE *in1, TYPE *in2, TYPE *out, float *filter, + int size_x, int size_y, int width, int height) { + int anchor_x = size_x >> 1; + int anchor_y = size_y >> 1; + int upper_x = width - anchor_x; + int upper_y = height - anchor_y; + + for (int y=anchor_y; y +#include + +#define WIDTH 512 +#define HEIGHT 512 +#define TYPE uchar + +using namespace hipacc; +using namespace hipacc::math; + +// Kernel description in Hipacc +class PointOperatorExample : public Kernel { + private: + Accessor ∈ + + public: + PointOperatorExample(IterationSpace &iter, Accessor &acc) + : Kernel(iter), in(acc) { + add_accessor(&in); + } + + void kernel() { + TYPE interm_pixel = in(); + interm_pixel += 3; + output() = interm_pixel; + } +}; + +class OutOperatorExample : public Kernel { + private: + Accessor &in1; + Accessor &in2; + + public: + OutOperatorExample(IterationSpace &iter, Accessor &acc1, Accessor &acc2) + : Kernel(iter), in1(acc1), in2(acc2) { + add_accessor(&in1); + add_accessor(&in2); + } + + void kernel() { + TYPE interm_pixel1 = in1(); + TYPE interm_pixel2 = in2(); + output() = interm_pixel1 + interm_pixel2; + } +}; + +// forward declaration of reference implementation +void kernel_fusion(TYPE *in, TYPE *out, int width, int height); + +/************************************************************************* + * Main function * + *************************************************************************/ +HIPACC_CODEGEN int main(int argc, const char **argv) { + int width_arg = WIDTH; + int 
height_arg = HEIGHT; + + if(argc >= 2) { + width_arg = std::stoi(argv[1]); + height_arg = width_arg; + } + + if(argc >= 3) { + height_arg = std::stoi(argv[2]); + } + + const int width = width_arg; + const int height = height_arg; + + // host memory for image of width x height pixels, random + TYPE *input = (TYPE*)load_data(width, height); + TYPE *ref_out = new TYPE[width*height]; + + std::cout << "Testing Hipacc kernel fusion ..." << std::endl; + + //************************************************************************// + + // input and output image of width x height pixels + Image in(width, height, input); + Image out(width, height); + + // test parallelpoint-point to point kernel fusion + Accessor acc0(in); + + Image buf0(width, height); + IterationSpace iter0(buf0); + PointOperatorExample pointOp0(iter0, acc0); + + Image buf1(width, height); + IterationSpace iter1(buf1); + PointOperatorExample pointOp1(iter1, acc0); + + Accessor acc1(buf0); + Accessor acc2(buf1); + IterationSpace iter2(out); + OutOperatorExample outOp(iter2, acc1, acc2); + + // execution after all decls + pointOp0.execute(); + pointOp1.execute(); + outOp.execute(); + + // get pointer to result data + TYPE *output = out.data(); + + //************************************************************************// + std::cout << "Calculating reference ..." 
<< std::endl; + kernel_fusion(input, ref_out, width, height); + compare_results(output, ref_out, width, height); + + // free memory + delete[] input; + delete[] ref_out; + + return EXIT_SUCCESS; +} + +// kernel fusion reference +void point_kernel(TYPE *in, TYPE *out, int width, int height) { + for (int p = 0; p < width*height; ++p) { + TYPE interm_pixel = in[p]; + interm_pixel += 3; + out[p] = interm_pixel; + } +} + +void out_kernel(TYPE *in1, TYPE *in2, TYPE *out, int width, int height) { + for (int p = 0; p < width*height; ++p) { + TYPE interm_pixel1 = in1[p]; + TYPE interm_pixel2 = in2[p]; + out[p] = interm_pixel1 + interm_pixel2; + } +} + +void kernel_fusion(TYPE *in, TYPE *out, int width, int height) { + TYPE *ref_buf0 = new TYPE[width*height]; + TYPE *ref_buf1 = new TYPE[width*height]; + + // left operator + point_kernel(in, ref_buf0, width, height); + + // right operator + point_kernel(in, ref_buf1, width, height); + + // out operator + out_kernel(ref_buf0, ref_buf1, out, width, height); + + delete[] ref_buf0; + delete[] ref_buf1; +} diff --git a/samples-public/6_Test/Kernel_Fusion_PP2P_Heavy/CMakeLists.txt b/samples-public/6_Test/Kernel_Fusion_PP2P_Heavy/CMakeLists.txt new file mode 100644 index 00000000..a33eaad4 --- /dev/null +++ b/samples-public/6_Test/Kernel_Fusion_PP2P_Heavy/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.14) + +get_filename_component(SAMPLE_NAME "${CMAKE_CURRENT_LIST_DIR}" NAME) + +project(${SAMPLE_NAME}) + +#add_hipacc_sample_dsl() +#add_hipacc_sample_cpu() +add_hipacc_sample_cuda(FUSION LOCAL PPT) # kernel fusion is only supported for CUDA +#add_hipacc_sample_opencl(CPU GPU) diff --git a/samples-public/6_Test/Kernel_Fusion_PP2P_Heavy/src/main.cpp b/samples-public/6_Test/Kernel_Fusion_PP2P_Heavy/src/main.cpp new file mode 100644 index 00000000..418e2fdd --- /dev/null +++ b/samples-public/6_Test/Kernel_Fusion_PP2P_Heavy/src/main.cpp @@ -0,0 +1,185 @@ +// +// Copyright (c) 2020, University of Erlangen-Nuremberg +// 
Copyright (c) 2012, University of Erlangen-Nuremberg +// Copyright (c) 2012, Siemens AG +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// + +#include "hipacc.hpp" + +#include +#include + +#define WIDTH 512 +#define HEIGHT 512 +#define TYPE uchar +#define N_ITER 1024 + +using namespace hipacc; +using namespace hipacc::math; + +// Kernel description in Hipacc +class PointOperatorExample : public Kernel { + private: + Accessor ∈ + int n_iter; + + public: + PointOperatorExample(IterationSpace &iter, Accessor &acc, int n_iter) + : Kernel(iter), in(acc), n_iter(n_iter) { + add_accessor(&in); + } + + void kernel() { + TYPE interm_pixel = in(); + for(int i = 0; i < n_iter; ++i) { + interm_pixel += 3; + } + output() = interm_pixel; + } +}; + +class OutOperatorExample : public Kernel { + private: + Accessor &in1; + Accessor &in2; + + public: + OutOperatorExample(IterationSpace &iter, Accessor &acc1, Accessor &acc2) + : Kernel(iter), in1(acc1), in2(acc2) { + add_accessor(&in1); + add_accessor(&in2); + } + + void kernel() { + TYPE interm_pixel1 = in1(); + TYPE interm_pixel2 = in2(); + output() = interm_pixel1 + interm_pixel2; + } +}; + +// forward declaration of reference implementation +void kernel_fusion(TYPE *in, TYPE *out, int width, int height); + +/************************************************************************* + * Main function * + *************************************************************************/ +HIPACC_CODEGEN int main(int argc, const char **argv) { + int width_arg = WIDTH; + int height_arg = HEIGHT; + + if(argc >= 2) { + width_arg = std::stoi(argv[1]); + height_arg = width_arg; + } + + if(argc >= 3) { + height_arg = std::stoi(argv[2]); + } + + const int width = width_arg; + const int height = height_arg; + + // host memory for image of width x height pixels, random + TYPE *input = (TYPE*)load_data(width, height); + TYPE *ref_out = new TYPE[width*height]; + + std::cout << "Testing Hipacc kernel fusion ..." 
<< std::endl; + + //************************************************************************// + + // input and output image of width x height pixels + Image in(width, height, input); + Image out(width, height); + + // test parallelpoint-point to point kernel fusion + Accessor acc0(in); + + Image buf0(width, height); + IterationSpace iter0(buf0); + PointOperatorExample pointOp0(iter0, acc0, N_ITER); + + Image buf1(width, height); + IterationSpace iter1(buf1); + PointOperatorExample pointOp1(iter1, acc0, N_ITER); + + Accessor acc1(buf0); + Accessor acc2(buf1); + IterationSpace iter2(out); + OutOperatorExample outOp(iter2, acc1, acc2); + + // execution after all decls + pointOp0.execute(); + pointOp1.execute(); + outOp.execute(); + + // get pointer to result data + TYPE *output = out.data(); + + //************************************************************************// + std::cout << "Calculating reference ..." << std::endl; + kernel_fusion(input, ref_out, width, height); + compare_results(output, ref_out, width, height); + + // free memory + delete[] input; + delete[] ref_out; + + return EXIT_SUCCESS; +} + +// kernel fusion reference +void point_kernel(TYPE *in, TYPE *out, int width, int height) { + for (int p = 0; p < width*height; ++p) { + TYPE interm_pixel = in[p]; + for(int i = 0; i < N_ITER; ++i) { + interm_pixel += 3; + } + out[p] = interm_pixel; + } +} + +void out_kernel(TYPE *in1, TYPE *in2, TYPE *out, int width, int height) { + for (int p = 0; p < width*height; ++p) { + TYPE interm_pixel1 = in1[p]; + TYPE interm_pixel2 = in2[p]; + out[p] = interm_pixel1 + interm_pixel2; + } +} + +void kernel_fusion(TYPE *in, TYPE *out, int width, int height) { + TYPE *ref_buf0 = new TYPE[width*height]; + TYPE *ref_buf1 = new TYPE[width*height]; + + // left operator + point_kernel(in, ref_buf0, width, height); + + // right operator + point_kernel(in, ref_buf1, width, height); + + // out operator + out_kernel(ref_buf0, ref_buf1, out, width, height); + + delete[] ref_buf0; + 
delete[] ref_buf1; +} diff --git a/samples-public/CMakeLists.txt.in b/samples-public/CMakeLists.txt.in index 6c2c4161..25ef2d7e 100644 --- a/samples-public/CMakeLists.txt.in +++ b/samples-public/CMakeLists.txt.in @@ -27,6 +27,7 @@ option(HIPACC_SAMPLE_CUDA_FUSION "Enable kernel fusion for CUDA samples" ON) option(HIPACC_SAMPLE_CUDA_LOCAL "Enable local memory use for CUDA samples" ON) option(HIPACC_SAMPLE_CUDA_PPT "Enable thread coarsening for CUDA samples" ON) option(HIPACC_SAMPLE_CUDA_GRAPH "Enable thread coarsening for CUDA samples" ON) +option(HIPACC_SAMPLE_CUDA_FUSEDPPT "Enable thread coarsening for CUDA samples, combined with kernel fusion" ON) if(OpenCL_FOUND) option(HIPACC_SAMPLE_OPENCL_CPU "Enable CPU targets for OpenCL samples" ON) @@ -186,7 +187,7 @@ endmacro() macro(add_hipacc_sample_cuda) - set(options HIDE FUSION LOCAL PPT GRAPH) + set(options HIDE FUSION LOCAL PPT PPTN GRAPH FUSEDPPT FUSEDPPTN) set(oneValueArgs) set(multiValueArgs) @@ -224,6 +225,18 @@ macro(add_hipacc_sample_cuda) endif() endif() + if(ARG_CUDA_PPTN) + if(HIPACC_SAMPLE_CUDA_PPT) + list(APPEND _CUDA_TARGET_CONFIG PPT2) + list(APPEND _CUDA_TARGET_CONFIG PPT4) + list(APPEND _CUDA_TARGET_CONFIG PPT8) + list(APPEND _CUDA_TARGET_CONFIG PPT16) + list(APPEND _CUDA_TARGET_CONFIG PPT32) + else() + message(STATUS "Skip sample ${SAMPLE_CATEGORY}/${SAMPLE_NAME}_Cuda_PptN (CUDA thread coarsening disabled)") + endif() + endif() + if(ARG_CUDA_GRAPH) if(HIPACC_SAMPLE_CUDA_GRAPH) list(APPEND _CUDA_TARGET_CONFIG GRAPH) @@ -232,6 +245,26 @@ macro(add_hipacc_sample_cuda) endif() endif() + if(ARG_CUDA_FUSEDPPT) + if(HIPACC_SAMPLE_CUDA_FUSEDPPT) + list(APPEND _CUDA_TARGET_CONFIG FUSEDPPT) + else() + message(STATUS "Skip sample ${SAMPLE_CATEGORY}/${SAMPLE_NAME}_Cuda_FusedPpt (CUDA thread coarsening disabled)") + endif() + endif() + + if(ARG_CUDA_FUSEDPPTN) + if(HIPACC_SAMPLE_CUDA_FUSEDPPT) + list(APPEND _CUDA_TARGET_CONFIG FUSEDPPT2) + list(APPEND _CUDA_TARGET_CONFIG FUSEDPPT4) + list(APPEND 
_CUDA_TARGET_CONFIG FUSEDPPT8) + list(APPEND _CUDA_TARGET_CONFIG FUSEDPPT16) + list(APPEND _CUDA_TARGET_CONFIG FUSEDPPT32) + else() + message(STATUS "Skip sample ${SAMPLE_CATEGORY}/${SAMPLE_NAME}_Cuda_FusedPptN (CUDA thread coarsening disabled)") + endif() + endif() + check_language(CUDA) if(CMAKE_CUDA_COMPILER) diff --git a/samples-public/common/config/cuda-fusedppt.conf b/samples-public/common/config/cuda-fusedppt.conf new file mode 100644 index 00000000..658a3941 --- /dev/null +++ b/samples-public/common/config/cuda-fusedppt.conf @@ -0,0 +1,8 @@ +-target Kepler-30 +-use-config 128x1 +-reduce-config 16x16 +-use-textures off +-use-local off +-vectorize off +-pixels-per-thread 4 +-fuse-kernel on diff --git a/samples-public/common/config/cuda-fusedppt16.conf b/samples-public/common/config/cuda-fusedppt16.conf new file mode 100644 index 00000000..c0a7d017 --- /dev/null +++ b/samples-public/common/config/cuda-fusedppt16.conf @@ -0,0 +1,8 @@ +-target Kepler-30 +-use-config 128x1 +-reduce-config 16x16 +-use-textures off +-use-local off +-vectorize off +-pixels-per-thread 16 +-fuse-kernel on diff --git a/samples-public/common/config/cuda-fusedppt2.conf b/samples-public/common/config/cuda-fusedppt2.conf new file mode 100644 index 00000000..a27c8fab --- /dev/null +++ b/samples-public/common/config/cuda-fusedppt2.conf @@ -0,0 +1,8 @@ +-target Kepler-30 +-use-config 128x1 +-reduce-config 16x16 +-use-textures off +-use-local off +-vectorize off +-pixels-per-thread 2 +-fuse-kernel on diff --git a/samples-public/common/config/cuda-fusedppt32.conf b/samples-public/common/config/cuda-fusedppt32.conf new file mode 100644 index 00000000..0a28d608 --- /dev/null +++ b/samples-public/common/config/cuda-fusedppt32.conf @@ -0,0 +1,8 @@ +-target Kepler-30 +-use-config 128x1 +-reduce-config 16x16 +-use-textures off +-use-local off +-vectorize off +-pixels-per-thread 32 +-fuse-kernel on diff --git a/samples-public/common/config/cuda-fusedppt4.conf 
b/samples-public/common/config/cuda-fusedppt4.conf new file mode 100644 index 00000000..658a3941 --- /dev/null +++ b/samples-public/common/config/cuda-fusedppt4.conf @@ -0,0 +1,8 @@ +-target Kepler-30 +-use-config 128x1 +-reduce-config 16x16 +-use-textures off +-use-local off +-vectorize off +-pixels-per-thread 4 +-fuse-kernel on diff --git a/samples-public/common/config/cuda-fusedppt8.conf b/samples-public/common/config/cuda-fusedppt8.conf new file mode 100644 index 00000000..af589b3f --- /dev/null +++ b/samples-public/common/config/cuda-fusedppt8.conf @@ -0,0 +1,8 @@ +-target Kepler-30 +-use-config 128x1 +-reduce-config 16x16 +-use-textures off +-use-local off +-vectorize off +-pixels-per-thread 8 +-fuse-kernel on diff --git a/samples-public/common/config/cuda-ppt16.conf b/samples-public/common/config/cuda-ppt16.conf new file mode 100644 index 00000000..660b3d29 --- /dev/null +++ b/samples-public/common/config/cuda-ppt16.conf @@ -0,0 +1,8 @@ +-target Kepler-30 +-use-config 128x1 +-reduce-config 16x16 +-use-textures off +-use-local off +-vectorize off +-pixels-per-thread 16 +-fuse-kernel off diff --git a/samples-public/common/config/cuda-ppt2.conf b/samples-public/common/config/cuda-ppt2.conf new file mode 100644 index 00000000..a9204a53 --- /dev/null +++ b/samples-public/common/config/cuda-ppt2.conf @@ -0,0 +1,8 @@ +-target Kepler-30 +-use-config 128x1 +-reduce-config 16x16 +-use-textures off +-use-local off +-vectorize off +-pixels-per-thread 2 +-fuse-kernel off diff --git a/samples-public/common/config/cuda-ppt32.conf b/samples-public/common/config/cuda-ppt32.conf new file mode 100644 index 00000000..96c5596a --- /dev/null +++ b/samples-public/common/config/cuda-ppt32.conf @@ -0,0 +1,8 @@ +-target Kepler-30 +-use-config 128x1 +-reduce-config 16x16 +-use-textures off +-use-local off +-vectorize off +-pixels-per-thread 32 +-fuse-kernel off diff --git a/samples-public/common/config/cuda-ppt4.conf b/samples-public/common/config/cuda-ppt4.conf new file mode 100644 
index 00000000..bc96a919 --- /dev/null +++ b/samples-public/common/config/cuda-ppt4.conf @@ -0,0 +1,8 @@ +-target Kepler-30 +-use-config 128x1 +-reduce-config 16x16 +-use-textures off +-use-local off +-vectorize off +-pixels-per-thread 4 +-fuse-kernel off diff --git a/samples-public/common/config/cuda-ppt8.conf b/samples-public/common/config/cuda-ppt8.conf new file mode 100644 index 00000000..fc06d8a3 --- /dev/null +++ b/samples-public/common/config/cuda-ppt8.conf @@ -0,0 +1,8 @@ +-target Kepler-30 +-use-config 128x1 +-reduce-config 16x16 +-use-textures off +-use-local off +-vectorize off +-pixels-per-thread 8 +-fuse-kernel off