Skip to content

Commit 00311cf

Browse files
authored
[BOLT] Impute missing trace fall-through (#145258)
1 parent d5def01 commit 00311cf

File tree

3 files changed

+130
-19
lines changed

3 files changed

+130
-19
lines changed

bolt/include/bolt/Core/MCPlusBuilder.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,17 @@ class MCPlusBuilder {
430430
return Analysis->isIndirectBranch(Inst);
431431
}
432432

433+
/// Returns true if the instruction unconditionally transfers the control to
434+
/// another program point, interrupting sequential code execution, e.g. by a
435+
/// call, return, or unconditional jump. This explicitly leaves out
436+
/// conditional branches as they may not be taken, but does allow transferring
437+
/// the control to the next instruction (zero-displacement jump/call).
438+
bool isUnconditionalControlTransfer(const MCInst &Inst) const {
439+
const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
440+
// barrier captures returns and unconditional branches
441+
return Desc.isBarrier() || Desc.isCall();
442+
}
443+
433444
/// Returns true if the instruction is memory indirect call or jump
434445
virtual bool isBranchOnMem(const MCInst &Inst) const {
435446
llvm_unreachable("not implemented");

bolt/include/bolt/Profile/DataAggregator.h

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ class DataAggregator : public DataReader {
137137
std::vector<std::pair<Trace, TakenBranchInfo>> Traces;
138138
/// Pre-populated addresses of returns, coming from pre-aggregated data or
139139
/// disassembly. Used to disambiguate call-continuation fall-throughs.
140-
std::unordered_set<uint64_t> Returns;
140+
std::unordered_map<uint64_t, bool> Returns;
141141
std::unordered_map<uint64_t, uint64_t> BasicSamples;
142142
std::vector<PerfMemSample> MemSamples;
143143

@@ -498,6 +498,10 @@ class DataAggregator : public DataReader {
498498
/// If \p FileBuildID has no match, then issue an error and exit.
499499
void processFileBuildID(StringRef FileBuildID);
500500

501+
/// Infer missing fall-throughs for branch-only traces (LBR top-of-stack
502+
/// entries).
503+
void imputeFallThroughs();
504+
501505
/// Debugging dump methods
502506
void dump() const;
503507
void dump(const PerfBranchSample &Sample) const;
@@ -509,6 +513,43 @@ class DataAggregator : public DataReader {
509513
void printBasicSamplesDiagnostics(uint64_t OutOfRangeSamples) const;
510514
void printBranchStacksDiagnostics(uint64_t IgnoredSamples) const;
511515

516+
/// Get instruction at \p Addr either from containing binary function or
517+
/// disassemble in-place, and invoke \p Callback on resulting MCInst.
518+
/// Returns the result of the callback or nullopt.
519+
template <typename T>
520+
std::optional<T>
521+
testInstructionAt(const uint64_t Addr,
522+
std::function<T(const MCInst &)> Callback) const {
523+
BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
524+
if (!Func)
525+
return std::nullopt;
526+
const uint64_t Offset = Addr - Func->getAddress();
527+
if (Func->hasInstructions()) {
528+
if (auto *MI = Func->getInstructionAtOffset(Offset))
529+
return Callback(*MI);
530+
} else {
531+
if (auto MI = Func->disassembleInstructionAtOffset(Offset))
532+
return Callback(*MI);
533+
}
534+
return std::nullopt;
535+
}
536+
537+
/// Apply \p Callback to the instruction at \p Addr, and memoize the result
538+
/// in a \p Map.
539+
template <typename T>
540+
std::optional<T> testAndSet(const uint64_t Addr,
541+
std::function<T(const MCInst &)> Callback,
542+
std::unordered_map<uint64_t, T> &Map) {
543+
auto It = Map.find(Addr);
544+
if (It != Map.end())
545+
return It->second;
546+
if (std::optional<T> Res = testInstructionAt<T>(Addr, Callback)) {
547+
Map.emplace(Addr, *Res);
548+
return *Res;
549+
}
550+
return std::nullopt;
551+
}
552+
512553
public:
513554
/// If perf.data was collected without build ids, the buildid-list may contain
514555
/// incomplete entries. Return true if the buffer containing

bolt/lib/Profile/DataAggregator.cpp

Lines changed: 77 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,11 @@ FilterPID("pid",
7777
cl::Optional,
7878
cl::cat(AggregatorCategory));
7979

80+
// When enabled, imputeFallThroughs() synthesizes the missing fall-through
// segment of branch-only traces (checked in preprocessProfile()).
static cl::opt<bool> ImputeTraceFallthrough(
81+
"impute-trace-fall-through",
82+
cl::desc("impute missing fall-throughs for branch-only traces"),
83+
cl::Optional, cl::cat(AggregatorCategory));
84+
8085
static cl::opt<bool>
8186
IgnoreBuildID("ignore-build-id",
8287
cl::desc("continue even if build-ids in input binary and perf.data mismatch"),
@@ -513,6 +518,69 @@ void DataAggregator::parsePerfData(BinaryContext &BC) {
513518
deleteTempFiles();
514519
}
515520

521+
void DataAggregator::imputeFallThroughs() {
522+
if (Traces.empty())
523+
return;
524+
525+
std::pair PrevBranch(Trace::EXTERNAL, Trace::EXTERNAL);
526+
uint64_t AggregateCount = 0;
527+
uint64_t AggregateFallthroughSize = 0;
528+
uint64_t InferredTraces = 0;
529+
530+
// Helper map with whether the instruction is a call/ret/unconditional branch
531+
std::unordered_map<uint64_t, bool> IsUncondCTMap;
532+
auto checkUnconditionalControlTransfer = [&](const uint64_t Addr) {
533+
auto isUncondCT = [&](const MCInst &MI) -> bool {
534+
return BC->MIB->isUnconditionalControlTransfer(MI);
535+
};
536+
return testAndSet<bool>(Addr, isUncondCT, IsUncondCTMap).value_or(true);
537+
};
538+
539+
// Traces are sorted by their component addresses (Branch, From, To).
540+
// assert(is_sorted(Traces));
541+
542+
// Traces corresponding to the top-of-stack branch entry with a missing
543+
// fall-through have BR_ONLY(-1ULL/UINT64_MAX) in To field, meaning that for
544+
// fixed values of Branch and From branch-only traces are stored after all
545+
// traces with valid fall-through.
546+
//
547+
// Group traces by (Branch, From) and compute weighted average fall-through
548+
// length for the top-of-stack trace (closing the group) by accumulating the
549+
// fall-through lengths of traces with valid fall-throughs earlier in the
550+
// group.
551+
for (auto &[Trace, Info] : Traces) {
552+
// Skip fall-throughs in external code.
553+
if (Trace.From == Trace::EXTERNAL)
554+
continue;
555+
std::pair CurrentBranch(Trace.Branch, Trace.From);
556+
// BR_ONLY must be the last trace in the group
557+
if (Trace.To == Trace::BR_ONLY) {
558+
// If the group is not empty, use aggregate values, otherwise 0-length
559+
// for unconditional jumps (call/ret/uncond branch) or 1-length for others
560+
uint64_t InferredBytes =
561+
PrevBranch == CurrentBranch
562+
? AggregateFallthroughSize / AggregateCount
563+
: !checkUnconditionalControlTransfer(Trace.From);
564+
Trace.To = Trace.From + InferredBytes;
565+
LLVM_DEBUG(dbgs() << "imputed " << Trace << " (" << InferredBytes
566+
<< " bytes)\n");
567+
++InferredTraces;
568+
} else {
569+
// Trace with a valid fall-through
570+
// New group: reset aggregates.
571+
if (CurrentBranch != PrevBranch)
572+
AggregateCount = AggregateFallthroughSize = 0;
573+
// Only use valid fall-through lengths
574+
if (Trace.To != Trace::EXTERNAL)
575+
AggregateFallthroughSize += (Trace.To - Trace.From) * Info.TakenCount;
576+
AggregateCount += Info.TakenCount;
577+
}
578+
PrevBranch = CurrentBranch;
579+
}
580+
if (opts::Verbosity >= 1)
581+
outs() << "BOLT-INFO: imputed " << InferredTraces << " traces\n";
582+
}
583+
516584
Error DataAggregator::preprocessProfile(BinaryContext &BC) {
517585
this->BC = &BC;
518586

@@ -525,6 +593,9 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
525593
// Sort parsed traces for faster processing.
526594
llvm::sort(Traces, llvm::less_first());
527595

596+
if (opts::ImputeTraceFallthrough)
597+
imputeFallThroughs();
598+
528599
if (opts::HeatmapMode) {
529600
if (std::error_code EC = printLBRHeatMap())
530601
return errorCodeToError(EC);
@@ -726,22 +797,10 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
726797
}
727798

728799
bool DataAggregator::checkReturn(uint64_t Addr) {
729-
auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
730-
if (llvm::is_contained(Returns, Addr))
731-
return true;
732-
733-
BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
734-
if (!Func)
735-
return false;
736-
737-
const uint64_t Offset = Addr - Func->getAddress();
738-
if (Func->hasInstructions()
739-
? isReturn(Func->getInstructionAtOffset(Offset))
740-
: isReturn(Func->disassembleInstructionAtOffset(Offset))) {
741-
Returns.emplace(Addr);
742-
return true;
743-
}
744-
return false;
800+
auto isReturn = [&](const MCInst &MI) -> bool {
801+
return BC->MIB->isReturn(MI);
802+
};
803+
return testAndSet<bool>(Addr, isReturn, Returns).value_or(false);
745804
}
746805

747806
bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
@@ -1331,7 +1390,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
13311390
if (!Addr[0]->Offset)
13321391
Addr[0]->Offset = Trace::FT_EXTERNAL_RETURN;
13331392
else
1334-
Returns.emplace(Addr[0]->Offset);
1393+
Returns.emplace(Addr[0]->Offset, true);
13351394
}
13361395

13371396
/// Record a trace.
@@ -1592,7 +1651,7 @@ void DataAggregator::processBranchEvents() {
15921651
NamedRegionTimer T("processBranch", "Processing branch events",
15931652
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
15941653

1595-
Returns.emplace(Trace::FT_EXTERNAL_RETURN);
1654+
Returns.emplace(Trace::FT_EXTERNAL_RETURN, true);
15961655
for (const auto &[Trace, Info] : Traces) {
15971656
bool IsReturn = checkReturn(Trace.Branch);
15981657
// Ignore returns.

0 commit comments

Comments
 (0)