Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/llvm/llvm-project.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/bolt
diff options
context:
space:
mode:
authorRafael Auler <rafaelauler@fb.com>2022-05-21 05:43:07 +0300
committerRafael Auler <rafaelauler@fb.com>2022-07-12 03:30:00 +0300
commit3332904ad6fad848a77c46d0432e8b55a10c1079 (patch)
treec2efe8064b96de8c1759976b95219bf389f1398d /bolt
parent3508ced6ea239d17b2f4a79a91b8398d0d1ac514 (diff)
[BOLT] Increase coverage of shrink wrapping [3/5]
Add the option to run -equalize-bb-counts before shrink wrapping to avoid unnecessarily optimizing some CFGs where profile is inaccurate but we can prove two blocks have the same frequency. Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D126113
Diffstat (limited to 'bolt')
-rw-r--r--bolt/include/bolt/Passes/MCF.h7
-rw-r--r--bolt/include/bolt/Utils/CommandLineOpts.h1
-rw-r--r--bolt/lib/Passes/FrameOptimizer.cpp1
-rw-r--r--bolt/lib/Passes/MCF.cpp25
-rw-r--r--bolt/lib/Passes/ShrinkWrapping.cpp5
-rw-r--r--bolt/lib/Utils/CommandLineOpts.cpp6
-rw-r--r--bolt/test/X86/shrinkwrapping-do-not-pessimize.s58
7 files changed, 87 insertions, 16 deletions
diff --git a/bolt/include/bolt/Passes/MCF.h b/bolt/include/bolt/Passes/MCF.h
index 95ca64870356..feac7f88ac11 100644
--- a/bolt/include/bolt/Passes/MCF.h
+++ b/bolt/include/bolt/Passes/MCF.h
@@ -13,6 +13,7 @@ namespace llvm {
namespace bolt {
class BinaryFunction;
+class DataflowInfoManager;
enum MCFCostFunction : char {
MCF_DISABLE = 0,
@@ -22,6 +23,12 @@ enum MCFCostFunction : char {
MCF_BLAMEFTS
};
+/// Implement the idea in "SamplePGO - The Power of Profile Guided Optimizations
+/// without the Usability Burden" by Diego Novillo to make basic block counts
+/// equal if we show that A dominates B, B post-dominates A and they are in the
+/// same loop and same loop nesting level.
+void equalizeBBCounts(DataflowInfoManager &Info, BinaryFunction &BF);
+
/// Fill edge counts based on the basic block count. Used in nonLBR mode when
/// we only have bb count.
void estimateEdgeCounts(BinaryFunction &BF);
diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h
index 08321168f0e0..9216048854e4 100644
--- a/bolt/include/bolt/Utils/CommandLineOpts.h
+++ b/bolt/include/bolt/Utils/CommandLineOpts.h
@@ -35,6 +35,7 @@ extern llvm::cl::opt<bool> AggregateOnly;
extern llvm::cl::opt<unsigned> BucketsPerLine;
extern llvm::cl::opt<bool> DiffOnly;
extern llvm::cl::opt<bool> EnableBAT;
+extern llvm::cl::opt<bool> EqualizeBBCounts;
extern llvm::cl::opt<bool> RemoveSymtab;
extern llvm::cl::opt<unsigned> ExecutionCountThreshold;
extern llvm::cl::opt<unsigned> HeatmapBlock;
diff --git a/bolt/lib/Passes/FrameOptimizer.cpp b/bolt/lib/Passes/FrameOptimizer.cpp
index 8e07834faab4..9b9ab518d21a 100644
--- a/bolt/lib/Passes/FrameOptimizer.cpp
+++ b/bolt/lib/Passes/FrameOptimizer.cpp
@@ -17,6 +17,7 @@
#include "bolt/Passes/ShrinkWrapping.h"
#include "bolt/Passes/StackAvailableExpressions.h"
#include "bolt/Passes/StackReachingUses.h"
+#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/Support/Timer.h"
#include <deque>
#include <unordered_map>
diff --git a/bolt/lib/Passes/MCF.cpp b/bolt/lib/Passes/MCF.cpp
index 4ba737da41cf..e92202aea4ba 100644
--- a/bolt/lib/Passes/MCF.cpp
+++ b/bolt/lib/Passes/MCF.cpp
@@ -13,6 +13,7 @@
#include "bolt/Passes/MCF.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Passes/DataflowInfoManager.h"
+#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/CommandLine.h"
#include <algorithm>
@@ -33,15 +34,8 @@ extern cl::opt<bool> TimeOpts;
static cl::opt<bool> IterativeGuess(
"iterative-guess",
cl::desc("in non-LBR mode, guess edge counts using iterative technique"),
-
cl::Hidden, cl::cat(BoltOptCategory));
-static cl::opt<bool>
- EqualizeBBCounts("equalize-bb-counts",
- cl::desc("in non-LBR mode, use same count for BBs "
- "that should have equivalent count"),
- cl::Hidden, cl::cat(BoltOptCategory));
-
static cl::opt<bool> UseRArcs(
"mcf-use-rarcs",
cl::desc("in MCF, consider the possibility of cancelling flow to balance "
@@ -370,12 +364,12 @@ createLoopNestLevelMap(BinaryFunction &BF) {
return LoopNestLevel;
}
-/// Implement the idea in "SamplePGO - The Power of Profile Guided Optimizations
-/// without the Usability Burden" by Diego Novillo to make basic block counts
-/// equal if we show that A dominates B, B post-dominates A and they are in the
-/// same loop and same loop nesting level.
-void equalizeBBCounts(BinaryFunction &BF) {
- auto Info = DataflowInfoManager(BF, nullptr, nullptr);
+} // end anonymous namespace
+
+void equalizeBBCounts(DataflowInfoManager &Info, BinaryFunction &BF) {
+ if (BF.begin() == BF.end())
+ return;
+
DominatorAnalysis<false> &DA = Info.getDominatorAnalysis();
DominatorAnalysis<true> &PDA = Info.getPostDominatorAnalysis();
auto &InsnToBB = Info.getInsnToBBMap();
@@ -448,8 +442,6 @@ void equalizeBBCounts(BinaryFunction &BF) {
}
}
-} // end anonymous namespace
-
void estimateEdgeCounts(BinaryFunction &BF) {
EdgeWeightMap PredEdgeWeights;
EdgeWeightMap SuccEdgeWeights;
@@ -459,7 +451,8 @@ void estimateEdgeCounts(BinaryFunction &BF) {
}
if (opts::EqualizeBBCounts) {
LLVM_DEBUG(BF.print(dbgs(), "before equalize BB counts", true));
- equalizeBBCounts(BF);
+ auto Info = DataflowInfoManager(BF, nullptr, nullptr);
+ equalizeBBCounts(Info, BF);
LLVM_DEBUG(BF.print(dbgs(), "after equalize BB counts", true));
}
if (opts::IterativeGuess)
diff --git a/bolt/lib/Passes/ShrinkWrapping.cpp b/bolt/lib/Passes/ShrinkWrapping.cpp
index d58c6b6c37cd..94771b3e504a 100644
--- a/bolt/lib/Passes/ShrinkWrapping.cpp
+++ b/bolt/lib/Passes/ShrinkWrapping.cpp
@@ -13,6 +13,8 @@
#include "bolt/Passes/ShrinkWrapping.h"
#include "bolt/Core/MCPlus.h"
#include "bolt/Passes/DataflowInfoManager.h"
+#include "bolt/Passes/MCF.h"
+#include "bolt/Utils/CommandLineOpts.h"
#include <numeric>
#include <stack>
@@ -1982,6 +1984,9 @@ bool ShrinkWrapping::perform(bool HotOnly) {
if (HotOnly && (BF.getKnownExecutionCount() < BC.getHotThreshold()))
return false;
+ if (opts::EqualizeBBCounts)
+ equalizeBBCounts(Info, BF);
+
if (BF.checkForAmbiguousJumpTables()) {
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ambiguous JTs in " << BF.getPrintName()
<< ".\n");
diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp
index dde84e4f000b..fbaaf16019fc 100644
--- a/bolt/lib/Utils/CommandLineOpts.cpp
+++ b/bolt/lib/Utils/CommandLineOpts.cpp
@@ -73,6 +73,12 @@ EnableBAT("enable-bat",
cl::ZeroOrMore,
cl::cat(BoltCategory));
+cl::opt<bool> EqualizeBBCounts(
+ "equalize-bb-counts",
+ cl::desc("use same count for BBs that should have equivalent count (used "
+ "in non-LBR and shrink wrapping)"),
+ cl::ZeroOrMore, cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));
+
cl::opt<bool> RemoveSymtab("remove-symtab", cl::desc("Remove .symtab section"),
cl::cat(BoltCategory));
diff --git a/bolt/test/X86/shrinkwrapping-do-not-pessimize.s b/bolt/test/X86/shrinkwrapping-do-not-pessimize.s
new file mode 100644
index 000000000000..a57131131423
--- /dev/null
+++ b/bolt/test/X86/shrinkwrapping-do-not-pessimize.s
@@ -0,0 +1,58 @@
+# This checks that shrink wrapping does not pessimize a CFG pattern where two
+# blocks can be proved to have the same execution count but, because of profile
+# inaccuricies, we could move saves into the second block. We can prove two
+# blocks have the same frequency when B post-dominate A and A dominates B and
+# are at the same loop nesting level. This would be a pessimization because
+# shrink wrapping is unlikely to be able to cleanly move PUSH instructions,
+# inserting additional store instructions.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
+# RUN: %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
+# RUN: llvm-bolt -relocs %t.exe -o %t.out -data %t.fdata \
+# RUN: -frame-opt=all -equalize-bb-counts | FileCheck %s
+
+# Here we create a CFG pattern with two blocks A and B belonging to the same
+# equivalency class as defined by dominance relations and having in theory
+# the same frequency. But we tweak edge counts from profile to make block A
+# hotter than block B.
+ .globl _start
+ .type _start, %function
+_start:
+ .cfi_startproc
+# Hot prologue
+# FDATA: 0 [unknown] 0 1 _start 0 0 10
+ push %rbp
+ mov %rsp, %rbp
+ push %rbx
+ push %r14
+ subq $0x20, %rsp
+b: je end_if_1
+# FDATA: 1 _start #b# 1 _start #end_if_1# 0 1
+if_false:
+ movq rel(%rip), %rdi # Add this to create a relocation and run bolt w/ relocs
+c: jmp end_if_1
+# Reduce frequency from 9 to 1 to simulate an inaccurate profile
+# FDATA: 1 _start #c# 1 _start #end_if_1# 0 1
+end_if_1:
+ # first uses of R14 and RBX appear at this point, possible move point for SW
+ mov %r14, %rdi
+ mov %rbx, %rdi
+ leaq -0x20(%rbp), %r14
+ movq -0x20(%rbp), %rdi
+ addq $0x20, %rsp
+ pop %r14
+ pop %rbx
+ pop %rbp
+ ret
+ .cfi_endproc
+ .size _start, .-_start
+
+ .data
+rel: .quad end_if_1
+
+# CHECK: BOLT-INFO: Shrink wrapping moved 0 spills inserting load/stores and 0 spills inserting push/pops