Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
290e298
Add PP2P parallel fusion example
Nov 12, 2020
ba89fce
Fix indent
dezajno Nov 12, 2020
1b1c9af
Remove unnecessary buf2 and fix further indent inconsistencies
dezajno Nov 12, 2020
b321602
Add parallel 3P2P fusion example
dezajno Nov 12, 2020
1219e86
Add parallel 4P2P fusion example
dezajno Nov 12, 2020
205c676
Move *P2P one level up
dezajno Nov 13, 2020
54b15b9
Add parallel LL2P fusion example
dezajno Nov 13, 2020
fbe97b9
Add parallel LP2P fusion example
dezajno Nov 13, 2020
7759550
Add parallel PP2L fusion example
dezajno Nov 13, 2020
4d7f4b5
Fix commit in LP2P
dezajno Nov 13, 2020
4a1e237
Add parallel LP2L fusion example
dezajno Nov 14, 2020
4d6de57
Add parallel LL2L fusion example
dezajno Nov 14, 2020
36d2adc
Add P2P_Heavy fusion example
dezajno Nov 14, 2020
84129c6
Add PP2P_Heavy fusion example
dezajno Nov 14, 2020
db564b5
Add 3P2P_Heavy fusion example
dezajno Nov 14, 2020
7de1004
Add 4P2P_Heavy fusion example
dezajno Nov 14, 2020
e661406
Implement detection of fusible parallel pattern
dezajno Nov 24, 2020
8ed814f
Add getter for fusibleSetNamesParallel
dezajno Nov 24, 2020
952b56b
Merge remote-tracking branch 'origin/siemens-dev' into parallel-kerne…
dezajno Nov 24, 2020
471d27b
Remove dumping of the application graph
dezajno Nov 24, 2020
3ff0886
Add helper function to convert partitionBlock to partitionBlockNames
dezajno Nov 25, 2020
0e8590b
Adapt isFusible to consider parallel fusible blocks as well
dezajno Nov 25, 2020
4ab8f86
Prepare kernel location map for parallel fusion
dezajno Nov 25, 2020
67da6c2
Fix windows CI error
dezajno Nov 25, 2020
4f68c90
Fix isFusible()
dezajno Nov 25, 2020
63674c0
Also insert locations for parallel fusible blocks
dezajno Nov 25, 2020
c2514f9
Distinguish linear and parallel fusion
dezajno Nov 25, 2020
a81888f
Prepare implementation of parallel fusion
dezajno Nov 25, 2020
316310d
Activate working parallel kernel fusion
dezajno Nov 25, 2020
c43a4a0
Fix reading from multiple inputs
dezajno Nov 25, 2020
0eec2af
Prepare implementation of input caching for parallel fusion
dezajno Nov 26, 2020
9dee815
Implement input caching for parallel kernel fusion
dezajno Nov 26, 2020
ca838f0
Add FusiblePartitionBlock type
dezajno Nov 28, 2020
3b0c489
Implement FusiblePartitionBlock
dezajno Nov 28, 2020
ad8f344
Add getters for FusiblePartitionBlock
dezajno Nov 28, 2020
2de995f
Fill and query fusiblePartitionBlocks
dezajno Nov 28, 2020
1fbba96
Use fusiblePartitionBlocks in ASTFuse
dezajno Nov 28, 2020
52c77f4
Detect all parallel xx2x patterns
dezajno Nov 28, 2020
3d21cc1
Print hints for non-supported patterns
dezajno Nov 28, 2020
ea4d1f4
Rename fusibilityAnalysisLinear to fusibilityAnalysisLinearAndParallel
dezajno Nov 28, 2020
0ec7508
Fix parallel kernel fusion
dezajno Dec 8, 2020
e7055cd
Allow to specify input size for Kernel_Fusion examples
dezajno Dec 8, 2020
5fdf2e8
Make loop boundary dynamic to prevent compiler optimizations
dezajno Dec 8, 2020
95c98a1
Add color curves sample
May 27, 2021
8f71fff
Fix fusion + PPT for nP2P
Jun 1, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 52 additions & 15 deletions include/hipacc/AST/ASTFuse.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,21 @@ class ASTFuse {
SubListPosition Local2PointLoc = Undefined;
SubListPosition Point2LocalLoc = Undefined;
SubListPosition Local2LocalLoc = Undefined;
FusiblePartitionBlock::PatternType patternType;

// Creates the tag set with the given pattern type stored in patternType.
FusionTypeTags(FusiblePartitionBlock::PatternType patternType)
    : patternType(patternType)
{
}
};

/// Addresses one kernel list: first the block, then the list inside it.
struct KernelListLocation {
  /// Position of the block within a set of partitionBlockNames.
  unsigned blockLocation;

  /// Position of the respective kernel list within a partitionBlockNames.
  unsigned listLocation;
};

std::map<HipaccKernel *, FusionTypeTags *> FusibleKernelSubListPosMap;
std::map<std::string, std::tuple<unsigned, unsigned>> FusibleKernelBlockLocation;
std::set<std::vector<std::list<std::string>>> fusibleSetNames;
std::map<std::string, KernelListLocation> FusibleKernelBlockLocation;
std::vector<std::list<HipaccKernel*> *> fusibleKernelSet;

// member functions
Expand All @@ -128,12 +139,32 @@ class ASTFuse {
FunctionDecl *createFusedKernelDecl(std::list<HipaccKernel *> *l);
void insertPrologFusedKernel();
void insertEpilogFusedKernel();
void createReg4FusionVarDecl(QualType QT);
void createIdx4FusionVarDecl();
void createGidVarDecl();
void createReg4FusionVarDecl(QualType QT, unsigned int ppt);
void createIdx4FusionVarDecl(unsigned int ppt);
void createGidVarDecl(unsigned int ppt);
void markKernelPositionSublist(std::list<HipaccKernel *> *l);
void recomputeMemorySizeLocalFusion(std::list<HipaccKernel *> *l);

/**
 * Look up the fusible partition block that the kernels of a list belong to.
 *
 * Each kernel is searched with FusiblePartitionBlock::findForKernel. Once a
 * block has been found, every following kernel must resolve to that same
 * block; the checks are enforced through hipacc_require.
 *
 * @param l  Kernel list to look up; must not be empty.
 * @return   Reference to the matching element of the set returned by
 *           dataDeps->getFusiblePartitionBlocks().
 */
const FusiblePartitionBlock& getPartitionBlockFor(std::list<HipaccKernel *> *l) {
  hipacc_require((!l->empty()), "There is no fusion type for empty lists.");

  // Bind by reference: the getter returns a const reference, and plain `auto`
  // would copy the set into a local. The reference returned below would then
  // point into that local copy and dangle as soon as this function returns.
  const auto& fusibleBlocks = dataDeps->getFusiblePartitionBlocks();
  auto block = fusibleBlocks.end();

  for (auto k : *l) {
    // Iterator to the block containing k, or end() if no block contains it.
    auto innerBlock = FusiblePartitionBlock::findForKernel(k, fusibleBlocks);
    if (block != fusibleBlocks.end()) {
      hipacc_require((block == innerBlock), "The given kernel list contains kernels of distinct partition blocks.");
    } else {
      // No block recorded yet: remember the one of this kernel (may be end()).
      block = innerBlock;
    }
  }

  hipacc_require(block != fusibleBlocks.end(), "The given kernel list did not correspond to a partition block.");
  return *block;
}

public:
ASTFuse(ASTContext& Ctx, DiagnosticsEngine &Diags, hipacc::Builtin::Context &builtins,
CompilerOptions &options, PrintingPolicy Policy, HostDataDeps *dataDeps) :
Expand All @@ -150,24 +181,30 @@ class ASTFuse {
fusionRegVarCount(0),
fusionIdxVarCount(0)
{
fusibleSetNames = dataDeps->getFusibleSetNames();
unsigned nFusibleKernelBlockLocations = 0;
for (const auto& fusibleBlock : dataDeps->getFusiblePartitionBlocks()) { // block level
if (!fusibleBlock.isPatternFusible()) {
continue;
}

// unpack fusible kernel info, one kernel per PB
// TODO, merge parallel kernels
unsigned PBlockID;
PBlockID = 0;
for (auto PBN : fusibleSetNames) { // block level
unsigned KernelVecID = 0;
for (auto sL : PBN) { // vector level
auto pos = std::make_tuple(PBlockID, KernelVecID);
auto nam = sL.front();
for (const auto& part : fusibleBlock.getParts()) { // vector level
KernelListLocation pos = {
nFusibleKernelBlockLocations,
KernelVecID
};

auto nam = part.front().getName();
bool locExists = FusibleKernelBlockLocation.find(nam) != FusibleKernelBlockLocation.end();
hipacc_require(!locExists, "Kernel lists cannot be added twice");

FusibleKernelBlockLocation[nam] = pos;
KernelVecID++;
}
// create a list for each partition block
std::list<HipaccKernel*> *list = new std::list<HipaccKernel*>;
fusibleKernelSet.push_back(list);
PBlockID++;
nFusibleKernelBlockLocations++;
}
}

Expand Down
27 changes: 21 additions & 6 deletions include/hipacc/AST/ASTTranslate.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,20 +197,24 @@ class ASTTranslate : public StmtVisitor<ASTTranslate, Stmt *> {
class KernelFusionVars {
public:
bool bSkipGidDecl;
Expr *exprOutput;
VarDecl *exprOutput;
bool bReplaceExprOutput;
Expr *exprInput;
bool multipleInputs;
std::map<HipaccImage*, VarDecl*> exprInputs;
VarDecl *exprInput;
VarDecl *exprInputAccess;
bool bInputAccessProduce;
bool bReplaceExprInput;
bool bP2LReplaceExprInputIdx;
Expr *exprP2LInputIdx;
VarDecl *exprP2LInputIdx;
bool bP2LReplaceInputExprs;
Stmt *stmtP2LProducerBody;
Expr *exprSharedImgReg;
std::string exprSharedImgName;
bool bL2LInsertKernelBody;
bool bL2LInsertBeforeSmem;
Expr *exprL2LIdXShift;
Expr *exprL2LIdYShift;
VarDecl *exprL2LIdXShift;
VarDecl *exprL2LIdYShift;
int curL2LIdXShift;
int curL2LIdYShift;
bool bL2LRecordBorder;
Expand All @@ -230,7 +234,11 @@ class ASTTranslate : public StmtVisitor<ASTTranslate, Stmt *> {
bSkipGidDecl(true),
exprOutput(nullptr),
bReplaceExprOutput(false),
multipleInputs(false),
exprInputs(),
exprInput(nullptr),
exprInputAccess(nullptr),
bInputAccessProduce(false),
bReplaceExprInput(false),
bP2LReplaceExprInputIdx(false),
exprP2LInputIdx(nullptr),
Expand All @@ -255,6 +263,8 @@ class ASTTranslate : public StmtVisitor<ASTTranslate, Stmt *> {
};
KernelFusionVars fusionVars;

size_t currentPptIndex;


template<class T> T *Clone(T *S) {
if (S==nullptr)
Expand All @@ -280,6 +290,8 @@ class ASTTranslate : public StmtVisitor<ASTTranslate, Stmt *> {
}
}

Expr* createPptVarRefExpr(VarDecl *VD) const;

VarDecl *CloneVarDecl(VarDecl *VD);
VarDecl *CloneParmVarDecl(ParmVarDecl *PVD);
VarDecl *CloneDeclTex(ParmVarDecl *D, std::string prefix);
Expand Down Expand Up @@ -458,7 +470,8 @@ class ASTTranslate : public StmtVisitor<ASTTranslate, Stmt *> {
tileVars(),
lidYRef(nullptr),
gidYRef(nullptr),
fusionVars(kernel) {
fusionVars(kernel),
currentPptIndex(0) {
// get 'hipacc' namespace context for lookups
auto hipacc_ident = &Ctx.Idents.get("hipacc");
for (auto *decl : Ctx.getTranslationUnitDecl()->lookup(hipacc_ident))
Expand Down Expand Up @@ -519,7 +532,9 @@ class ASTTranslate : public StmtVisitor<ASTTranslate, Stmt *> {
// Kernel Fusion getters and setters
// Stores b in the bSkipGidDecl flag of fusionVars.
void setFusionSkipGidDecl(bool b) {
  fusionVars.bSkipGidDecl = b;
}
void setFusionP2PSrcOperator(VarDecl *VD);
void setFusionNP2PSrcOperator(VarDecl *inVD, VarDecl *outVD, bool produce);
void setFusionP2PDestOperator(VarDecl *VD);
void setFusionNP2PDestOperator(const std::map<HipaccImage*, VarDecl*>& imgVarDeclMap);
void setFusionP2PIntermOperator(VarDecl *VDIn, VarDecl *VDOut);
void setFusionL2PDestOperator(VarDecl *VD, VarDecl *VDSharedImg, std::string nam);
void setFusionL2PIntermOperator(VarDecl *VDIn, VarDecl *VDOut, VarDecl *VDSharedImg, std::string nam);
Expand Down
130 changes: 124 additions & 6 deletions include/hipacc/Analysis/HostDataDeps.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
#include <algorithm>
#include <iostream>
#include <sstream>
#include <unordered_set>

//#define PRINT_DEBUG

Expand Down Expand Up @@ -118,10 +119,11 @@ class DependencyTracker : public StmtVisitor<DependencyTracker> {
}
};


class FusiblePartitionBlock;

class HostDataDeps : public ManagedAnalysis {
friend class DependencyTracker;
friend class FusiblePartitionBlock;

private:
static const bool DEBUG;
Expand Down Expand Up @@ -166,7 +168,7 @@ class HostDataDeps : public ManagedAnalysis {
using partitionBlock = std::vector<std::list<Process*> *>;
partitionBlock applicationGraph;
using partitionBlockNames = std::vector<std::list<std::string>>;
std::set<partitionBlockNames> fusibleSetNames;
std::set<FusiblePartitionBlock> fusiblePartitionBlocks;
using edgeWeight = std::map<std::pair<Process *, Process *>, unsigned>;
edgeWeight edgeWeightMap_;

Expand Down Expand Up @@ -485,10 +487,44 @@ class HostDataDeps : public ManagedAnalysis {
std::string getMemcpyNodeName(std::string imgDst, std::string imgSrc, std::string direction);
std::string getKernelNodeName(std::string kernelName);

// Produces the kernel-name counterpart of a partitionBlock: one list of kernel
// names per process list, in the same order. While converting, the block is
// also written to llvm::errs() as " [ { --> name ...} ... ] ".
static partitionBlockNames convertToNames(const partitionBlock* pB) {
  partitionBlockNames blockNames;

  llvm::errs() << " [ ";
  for (auto processList : *pB) {
    std::list<std::string> kernelNames;

    llvm::errs() << "{";
    for (auto process : *processList) {
      const std::string kernelName = process->getKernel()->getName();
      llvm::errs() << " --> " << kernelName;
      kernelNames.push_back(kernelName);
    }
    llvm::errs() << "} ";

    blockNames.push_back(kernelNames);
  }
  llvm::errs() << "] \n";

  return blockNames;
}

/**
 * Check whether a kernel name occurs in a multi-kernel list of any block.
 *
 * A list only counts as a match if it holds more than one name; a name that
 * appears solely in single-element lists yields false.
 *
 * @param haystack  Set of name blocks to search.
 * @param needle    Kernel name to look for.
 * @return          true if some list with more than one entry contains needle.
 */
static bool partitionBlockNamesContains(
    const std::set<partitionBlockNames>& haystack,
    const std::string& needle
) {
  // The list is taken by const reference so that no list is copied per probe.
  // The size test comes first because it is cheaper than the linear search;
  // both operands are side-effect free, so the result is unaffected.
  const auto isInMultiKernelList = [&needle](const std::list<std::string>& names) {
    return names.size() > 1 &&
           std::find(names.begin(), names.end(), needle) != names.end();
  };

  for (const auto& block : haystack) {
    if (std::any_of(block.begin(), block.end(), isInMultiKernelList)) {
      return true;
    }
  }
  return false;
}

// kernel fusion analysis
void computeGraphWeight();
void fusibilityAnalysis();
void fusibilityAnalysisLinear();
void fusibilityAnalysisLinearAndParallel();
void minCutGlobal(partitionBlock PB, partitionBlock &PBRet0, partitionBlock &PBRet1);
unsigned minCutPhase(partitionBlock &PB, edgeWeight &curEdgeWeightMap,
std::pair<Process *, Process *> &ST);
Expand All @@ -499,15 +535,14 @@ class HostDataDeps : public ManagedAnalysis {
std::string getSharedISName(HipaccKernel *K);
bool isSrc(Process *P);
bool isDest(Process *P);
std::set<partitionBlockNames> getFusibleSetNames() const;
const std::set<FusiblePartitionBlock>& getFusiblePartitionBlocks() const;
std::string getGraphMemcpyNodeName(std::string dst, std::string src, std::string dir);
std::string getGraphKernelNodeName(std::string kernelName);
std::set<std::string> getGraphMemcpyNodeDepOn(std::string dst, std::string src, std::string dir);
std::set<std::string> getGraphKernelNodeDepOn(std::string kernelName);
std::map<std::string, std::set<std::string>> getGraphNodeDepMap() const;
std::vector<std::string> getOutputImageNames();


static HostDataDeps *parse(ASTContext &Context,
PrintingPolicy &Policy,
AnalysisDeclContext &analysisContext,
Expand All @@ -522,7 +557,7 @@ class HostDataDeps : public ManagedAnalysis {
DependencyTracker DT(Context, Policy, analysisContext, compilerClasses, dataDeps);
dataDeps.generateSchedule();
if (dataDeps.compilerOptions->fuseKernels()) {
dataDeps.fusibilityAnalysisLinear();
dataDeps.fusibilityAnalysisLinearAndParallel();
}
if (dataDeps.compilerOptions->useGraph()) {
dataDeps.buildGraphDependency();
Expand All @@ -532,6 +567,89 @@ class HostDataDeps : public ManagedAnalysis {
}
};

/**
 * A partition block annotated with the fusion pattern assigned to it.
 *
 * Holds the pattern, the parts of the block (each a vector of KernelInfo) and
 * a set of kernel names. Instances are kept in a std::set, hence operator<.
 */
class FusiblePartitionBlock {
  public:
    // Forward declaration uses the same class-key as the definition below;
    // mixing `class` and `struct` triggers clang's -Wmismatched-tags and
    // MSVC's C4099.
    struct KernelInfo;
    using Part = std::vector<KernelInfo>;

    /** Coarse classification of a block. */
    enum class PatternType {
      Linear,
      Parallel
    };

    /** Fine-grained classification of a block. */
    enum class Pattern {
      // Linear patterns
      Linear,

      // Parallel patterns

      // Parallel points to point
      NP2P,
      // Parallel locals to point
      NL2P,
      // Parallel mixed locals/points to point
      Mixed2P,
      // Parallel points to local
      NP2L,
      // Parallel locals to local
      NL2L,
      // Parallel mixed locals/points to local
      Mixed2L
    };

    /** Per-kernel record stored in a Part; currently only the kernel name. */
    struct KernelInfo {
      std::string name;

      const std::string& getName() const;

      // Ordering so that KernelInfo can be compared (defined out of line).
      bool operator < ( const KernelInfo& rhs ) const;
    };

  private:
    Pattern pattern;
    std::vector<Part> parts;
    std::unordered_set<std::string> kernelNames;

  public:
    /** Construct from a pattern type and a partition block (defined out of line). */
    FusiblePartitionBlock(PatternType patternType, HostDataDeps::partitionBlock& inBlock);

    /**
     * Find the first block in fusibleBlocks for which hasKernel(kernel) holds.
     *
     * @param kernel         Kernel to search for.
     * @param fusibleBlocks  Set of blocks to search.
     * @return               Iterator to the matching block, or
     *                       fusibleBlocks.end() if there is none.
     */
    static std::set<FusiblePartitionBlock>::iterator findForKernel(
        const HipaccKernel* kernel,
        const std::set<FusiblePartitionBlock>& fusibleBlocks
    ) {
      return std::find_if(
          fusibleBlocks.begin(),
          fusibleBlocks.end(),
          [&](const FusiblePartitionBlock& block) {
            return block.hasKernel(kernel);
          }
      );
    }

    /**
     * Check whether the pattern of this block is fusible.
     *
     * @return true for the Linear and NP2P patterns, false for all others.
     */
    bool isPatternFusible() const {
      // Return true if the pattern is fusible by the current ASTFuse tool, false otherwise.

      switch (pattern) {
        case FusiblePartitionBlock::Pattern::Linear:
        case FusiblePartitionBlock::Pattern::NP2P:
          return true;
        default:
          return false;
      }
    }

    // Accessors and membership queries (defined out of line).
    PatternType getPatternType() const;
    Pattern getPattern() const;
    const std::vector<Part>& getParts() const;
    bool hasKernelName(const std::string& name) const;
    bool hasKernel(const HipaccKernel* kernel) const;

    // Ordering required for storage in std::set (defined out of line).
    bool operator < ( const FusiblePartitionBlock& rhs ) const;
};

}
}

Expand Down
Loading