Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/llvm/llvm-project.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/polly
diff options
context:
space:
mode:
authorMichael Kruse <llvm-project@meinersbur.de>2021-10-09 03:49:40 +0300
committerMichael Kruse <llvm-project@meinersbur.de>2021-10-09 04:33:30 +0300
commit64489255be4903dc8683aff8dade315461a0a397 (patch)
tree32d1bffabc36bf5a8f162cea62568cc7638e2ea7 /polly
parent20a0c482e030df701fb2c94683c57c73d2e2e94c (diff)
[Polly] Add greedy fusion algorithm.
When the option -polly-loopfusion-greedy is set, the ScheduleOptimizer tries to aggressively fuse any band it can and does not violate any dependences. As part if the implementation, the functionalty for copying a band into an new schedule was extracted out of the ScheduleTreeRewriter.
Diffstat (limited to 'polly')
-rw-r--r--polly/docs/ReleaseNotes.rst12
-rw-r--r--polly/include/polly/ScheduleTreeTransform.h8
-rw-r--r--polly/include/polly/Support/GICHelper.h6
-rw-r--r--polly/lib/Support/GICHelper.cpp21
-rw-r--r--polly/lib/Transform/ScheduleOptimizer.cpp12
-rw-r--r--polly/lib/Transform/ScheduleTreeTransform.cpp536
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll78
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll90
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll88
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll69
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll88
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll66
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll54
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll51
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll57
15 files changed, 1216 insertions, 20 deletions
diff --git a/polly/docs/ReleaseNotes.rst b/polly/docs/ReleaseNotes.rst
index 1dbbc11f62cb..3d550e21d690 100644
--- a/polly/docs/ReleaseNotes.rst
+++ b/polly/docs/ReleaseNotes.rst
@@ -11,3 +11,15 @@ In Polly 14 the following important changes have been incorporated.
branch.
- Change ...
+
+ * The command line option -polly-opt-fusion has been removed. What the
+ flag does was frequently misunderstood and is rarely useful. However,
+ the functionality is still accessible using
+```
+ -polly-isl-arg=--no-schedule-serialize-sccs
+```
+
+ * The command line option -polly-loopfusion-greedy has been added.
+ This will agressively try to fuse any loop regardless of
+ profitability. The is what users might have expected what
+ -polly-opt-fusion=max would do.
diff --git a/polly/include/polly/ScheduleTreeTransform.h b/polly/include/polly/ScheduleTreeTransform.h
index e8685313c83c..993245cfce48 100644
--- a/polly/include/polly/ScheduleTreeTransform.h
+++ b/polly/include/polly/ScheduleTreeTransform.h
@@ -240,6 +240,14 @@ isl::schedule_node applyRegisterTiling(isl::schedule_node Node,
llvm::ArrayRef<int> TileSizes,
int DefaultTileSize);
+/// Apply greedy fusion. That is, fuse any loop that is possible to be fused
+/// top-down.
+///
+/// @param Sched Sched tree to fuse all the loops in.
+/// @param Deps Validity constraints that must be preserved.
+isl::schedule applyGreedyFusion(isl::schedule Sched,
+ const isl::union_map &Deps);
+
} // namespace polly
#endif // POLLY_SCHEDULETREETRANSFORM_H
diff --git a/polly/include/polly/Support/GICHelper.h b/polly/include/polly/Support/GICHelper.h
index 5c6a3256c1c3..5a106d7ae0d8 100644
--- a/polly/include/polly/Support/GICHelper.h
+++ b/polly/include/polly/Support/GICHelper.h
@@ -231,6 +231,12 @@ ISL_DUMP_OBJECT(val)
ISL_DUMP_OBJECT(val_list)
//@}
+/// Emit the equivaltent of the isl_*_dump output into a raw_ostream.
+/// @{
+void dumpIslObj(const isl::schedule_node &Node, llvm::raw_ostream &OS);
+void dumpIslObj(__isl_keep isl_schedule_node *node, llvm::raw_ostream &OS);
+/// @}
+
inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
__isl_keep isl_union_map *Map) {
OS << polly::stringFromIslObj(Map, "null");
diff --git a/polly/lib/Support/GICHelper.cpp b/polly/lib/Support/GICHelper.cpp
index 409dbf4766dd..359b17050231 100644
--- a/polly/lib/Support/GICHelper.cpp
+++ b/polly/lib/Support/GICHelper.cpp
@@ -238,4 +238,25 @@ void neverCalled() {
polly::dumpIslObj(isl::val());
polly::dumpIslObj(isl::val_list());
}
+
+void polly::dumpIslObj(__isl_keep isl_schedule_node *node, raw_ostream &OS) {
+ if (!node)
+ return;
+
+ isl_ctx *ctx = isl_schedule_node_get_ctx(node);
+ isl_printer *p = isl_printer_to_str(ctx);
+ p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
+ p = isl_printer_print_schedule_node(p, node);
+
+ char *char_str = isl_printer_get_str(p);
+ OS << char_str;
+
+ free(char_str);
+ isl_printer_free(p);
+}
+
+void polly::dumpIslObj(const isl::schedule_node &Node, raw_ostream &OS) {
+ dumpIslObj(Node.get(), OS);
+}
+
#endif
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index 932f69ef7da6..02d468577387 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -97,6 +97,11 @@ static cl::opt<std::string>
cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));
+static cl::opt<bool>
+ GreedyFusion("polly-loopfusion-greedy",
+ cl::desc("Aggressively try to fuse everything"), cl::Hidden,
+ cl::ZeroOrMore, cl::cat(PollyCategory));
+
static cl::opt<std::string> OuterCoincidence(
"polly-opt-outer-coincidence",
cl::desc("Try to construct schedules where the outer member of each band "
@@ -835,6 +840,13 @@ static bool runIslScheduleOptimizer(
if (Schedule.is_null())
return false;
+ if (GreedyFusion) {
+ isl::union_map Validity = D.getDependences(
+ Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW);
+ Schedule = applyGreedyFusion(Schedule, Validity);
+ assert(!Schedule.is_null());
+ }
+
// Apply post-rescheduling optimizations (if enabled) and/or prevectorization.
const OptimizerAdditionalInfoTy OAI = {
TTI, const_cast<Dependences *>(&D),
diff --git a/polly/lib/Transform/ScheduleTreeTransform.cpp b/polly/lib/Transform/ScheduleTreeTransform.cpp
index ce4e6ae9614d..08b377c707ac 100644
--- a/polly/lib/Transform/ScheduleTreeTransform.cpp
+++ b/polly/lib/Transform/ScheduleTreeTransform.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "polly/ScheduleTreeTransform.h"
+#include "polly/Support/GICHelper.h"
#include "polly/Support/ISLTools.h"
#include "polly/Support/ScopHelper.h"
#include "llvm/ADT/ArrayRef.h"
@@ -20,10 +21,103 @@
#include "llvm/IR/Metadata.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
+#define DEBUG_TYPE "polly-opt-isl"
+
using namespace polly;
using namespace llvm;
namespace {
+
+/// Copy the band member attributes (coincidence, loop type, isolate ast loop
+/// type) from one band to another.
+static isl::schedule_node_band
+applyBandMemberAttributes(isl::schedule_node_band Target, int TargetIdx,
+ const isl::schedule_node_band &Source,
+ int SourceIdx) {
+ bool Coincident = Source.member_get_coincident(SourceIdx).release();
+ Target = Target.member_set_coincident(TargetIdx, Coincident);
+
+ isl_ast_loop_type LoopType =
+ isl_schedule_node_band_member_get_ast_loop_type(Source.get(), SourceIdx);
+ Target = isl::manage(isl_schedule_node_band_member_set_ast_loop_type(
+ Target.release(), TargetIdx, LoopType))
+ .as<isl::schedule_node_band>();
+
+ isl_ast_loop_type IsolateType =
+ isl_schedule_node_band_member_get_isolate_ast_loop_type(Source.get(),
+ SourceIdx);
+ Target = isl::manage(isl_schedule_node_band_member_set_isolate_ast_loop_type(
+ Target.release(), TargetIdx, IsolateType))
+ .as<isl::schedule_node_band>();
+
+ return Target;
+}
+
+/// Create a new band by copying members from another @p Band. @p IncludeCb
+/// decides which band indices are copied to the result.
+template <typename CbTy>
+static isl::schedule rebuildBand(isl::schedule_node_band OldBand,
+ isl::schedule Body, CbTy IncludeCb) {
+ int NumBandDims = OldBand.n_member().release();
+
+ bool ExcludeAny = false;
+ bool IncludeAny = false;
+ for (auto OldIdx : seq<int>(0, NumBandDims)) {
+ if (IncludeCb(OldIdx))
+ IncludeAny = true;
+ else
+ ExcludeAny = true;
+ }
+
+ // Instead of creating a zero-member band, don't create a band at all.
+ if (!IncludeAny)
+ return Body;
+
+ isl::multi_union_pw_aff PartialSched = OldBand.get_partial_schedule();
+ isl::multi_union_pw_aff NewPartialSched;
+ if (ExcludeAny) {
+ // Select the included partial scatter functions.
+ isl::union_pw_aff_list List = PartialSched.list();
+ int NewIdx = 0;
+ for (auto OldIdx : seq<int>(0, NumBandDims)) {
+ if (IncludeCb(OldIdx))
+ NewIdx += 1;
+ else
+ List = List.drop(NewIdx, 1);
+ }
+ isl::space ParamSpace = PartialSched.get_space().params();
+ isl::space NewScatterSpace = ParamSpace.add_unnamed_tuple(NewIdx);
+ NewPartialSched = isl::multi_union_pw_aff(NewScatterSpace, List);
+ } else {
+ // Just reuse original scatter function of copying all of them.
+ NewPartialSched = PartialSched;
+ }
+
+ // Create the new band node.
+ isl::schedule_node_band NewBand =
+ Body.insert_partial_schedule(NewPartialSched)
+ .get_root()
+ .child(0)
+ .as<isl::schedule_node_band>();
+
+ // If OldBand was permutable, so is the new one, even if some dimensions are
+ // missing.
+ bool IsPermutable = OldBand.permutable().release();
+ NewBand = NewBand.set_permutable(IsPermutable);
+
+ // Reapply member attributes.
+ int NewIdx = 0;
+ for (auto OldIdx : seq<int>(0, NumBandDims)) {
+ if (!IncludeCb(OldIdx))
+ continue;
+ NewBand =
+ applyBandMemberAttributes(std::move(NewBand), NewIdx, OldBand, OldIdx);
+ NewIdx += 1;
+ }
+
+ return NewBand.get_schedule();
+}
+
/// Recursively visit all nodes of a schedule tree while allowing changes.
///
/// The visit methods return an isl::schedule_node that is used to continue
@@ -75,23 +169,9 @@ struct ScheduleTreeRewriter
}
isl::schedule visitBand(isl::schedule_node_band Band, Args... args) {
- isl::multi_union_pw_aff PartialSched =
- isl::manage(isl_schedule_node_band_get_partial_schedule(Band.get()));
isl::schedule NewChild =
getDerived().visit(Band.child(0), std::forward<Args>(args)...);
- isl::schedule_node NewNode =
- NewChild.insert_partial_schedule(PartialSched).get_root().child(0);
-
- // Reapply permutability and coincidence attributes.
- NewNode = isl::manage(isl_schedule_node_band_set_permutable(
- NewNode.release(), isl_schedule_node_band_get_permutable(Band.get())));
- unsigned BandDims = isl_schedule_node_band_n_member(Band.get());
- for (unsigned i = 0; i < BandDims; i += 1)
- NewNode = isl::manage(isl_schedule_node_band_member_set_coincident(
- NewNode.release(), i,
- isl_schedule_node_band_member_get_coincident(Band.get(), i)));
-
- return NewNode.get_schedule();
+ return rebuildBand(Band, NewChild, [](int) { return true; });
}
isl::schedule visitSequence(isl::schedule_node_sequence Sequence,
@@ -159,6 +239,10 @@ struct ScheduleTreeRewriter
}
};
+/// Rewrite the schedule tree without any changes. Useful to copy a subtree into
+/// a new schedule, discarding everything but.
+struct IdentityRewriter : public ScheduleTreeRewriter<IdentityRewriter> {};
+
/// Rewrite a schedule tree to an equivalent one without extension nodes.
///
/// Each visit method takes two additional arguments:
@@ -265,11 +349,9 @@ struct ExtensionNodeRewriter
NewNode = isl::manage(isl_schedule_node_band_set_permutable(
NewNode.release(),
isl_schedule_node_band_get_permutable(OldNode.get())));
- for (unsigned i = 0; i < BandDims; i += 1) {
- NewNode = isl::manage(isl_schedule_node_band_member_set_coincident(
- NewNode.release(), i,
- isl_schedule_node_band_member_get_coincident(OldNode.get(), i)));
- }
+ for (unsigned i = 0; i < BandDims; i += 1)
+ NewNode = applyBandMemberAttributes(NewNode.as<isl::schedule_node_band>(),
+ i, OldNode, i);
return NewNode.get_schedule();
}
@@ -504,6 +586,404 @@ static isl::set addExtentConstraints(isl::set Set, int VectorWidth) {
ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, -1);
return Set.add_constraint(ExtConstr);
}
+
+/// Collapse perfectly nested bands into a single band.
+class BandCollapseRewriter : public ScheduleTreeRewriter<BandCollapseRewriter> {
+private:
+ using BaseTy = ScheduleTreeRewriter<BandCollapseRewriter>;
+ BaseTy &getBase() { return *this; }
+ const BaseTy &getBase() const { return *this; }
+
+public:
+ isl::schedule visitBand(isl::schedule_node_band RootBand) {
+ isl::schedule_node_band Band = RootBand;
+ isl::ctx Ctx = Band.ctx();
+
+ // Do not merge permutable band to avoid loosing the permutability property.
+ // Cannot collapse even two permutable loops, they might be permutable
+ // individually, but not necassarily accross.
+ if (Band.n_member().release() > 1 && Band.permutable())
+ return getBase().visitBand(Band);
+
+ // Find collapsable bands.
+ SmallVector<isl::schedule_node_band> Nest;
+ int NumTotalLoops = 0;
+ isl::schedule_node Body;
+ while (true) {
+ Nest.push_back(Band);
+ NumTotalLoops += Band.n_member().release();
+ Body = Band.first_child();
+ if (!Body.isa<isl::schedule_node_band>())
+ break;
+ Band = Body.as<isl::schedule_node_band>();
+
+ // Do not include next band if it is permutable to not lose its
+ // permutability property.
+ if (Band.n_member().release() > 1 && Band.permutable())
+ break;
+ }
+
+ // Nothing to collapse, preserve permutability.
+ if (Nest.size() <= 1)
+ return getBase().visitBand(Band);
+
+ LLVM_DEBUG({
+ dbgs() << "Found loops to collapse between\n";
+ dumpIslObj(RootBand, dbgs());
+ dbgs() << "and\n";
+ dumpIslObj(Body, dbgs());
+ dbgs() << "\n";
+ });
+
+ isl::schedule NewBody = visit(Body);
+
+ // Collect partial schedules from all members.
+ isl::union_pw_aff_list PartScheds{Ctx, NumTotalLoops};
+ for (isl::schedule_node_band Band : Nest) {
+ int NumLoops = Band.n_member().release();
+ isl::multi_union_pw_aff BandScheds = Band.get_partial_schedule();
+ for (auto j : seq<int>(0, NumLoops))
+ PartScheds = PartScheds.add(BandScheds.at(j));
+ }
+ isl::space ScatterSpace = isl::space(Ctx, 0, NumTotalLoops);
+ isl::multi_union_pw_aff PartSchedsMulti{ScatterSpace, PartScheds};
+
+ isl::schedule_node_band CollapsedBand =
+ NewBody.insert_partial_schedule(PartSchedsMulti)
+ .get_root()
+ .first_child()
+ .as<isl::schedule_node_band>();
+
+ // Copy over loop attributes form original bands.
+ int LoopIdx = 0;
+ for (isl::schedule_node_band Band : Nest) {
+ int NumLoops = Band.n_member().release();
+ for (int i : seq<int>(0, NumLoops)) {
+ CollapsedBand = applyBandMemberAttributes(std::move(CollapsedBand),
+ LoopIdx, Band, i);
+ LoopIdx += 1;
+ }
+ }
+ assert(LoopIdx == NumTotalLoops &&
+ "Expect the same number of loops to add up again");
+
+ return CollapsedBand.get_schedule();
+ }
+};
+
+static isl::schedule collapseBands(isl::schedule Sched) {
+ LLVM_DEBUG(dbgs() << "Collapse bands in schedule\n");
+ BandCollapseRewriter Rewriter;
+ return Rewriter.visit(Sched);
+}
+
+/// Collect sequentially executed bands (or anything else), even if nested in a
+/// mark or other nodes whose child is executed just once. If we can
+/// successfully fuse the bands, we allow them to be removed.
+static void collectPotentiallyFusableBands(
+ isl::schedule_node Node,
+ SmallVectorImpl<std::pair<isl::schedule_node, isl::schedule_node>>
+ &ScheduleBands,
+ const isl::schedule_node &DirectChild) {
+ switch (isl_schedule_node_get_type(Node.get())) {
+ case isl_schedule_node_sequence:
+ case isl_schedule_node_set:
+ case isl_schedule_node_mark:
+ case isl_schedule_node_domain:
+ case isl_schedule_node_filter:
+ if (Node.has_children()) {
+ isl::schedule_node C = Node.first_child();
+ while (true) {
+ collectPotentiallyFusableBands(C, ScheduleBands, DirectChild);
+ if (!C.has_next_sibling())
+ break;
+ C = C.next_sibling();
+ }
+ }
+ break;
+
+ default:
+ // Something that does not execute suquentially (e.g. a band)
+ ScheduleBands.push_back({Node, DirectChild});
+ break;
+ }
+}
+
+/// Remove dependencies that are resolved by @p PartSched. That is, remove
+/// everything that we already know is executed in-order.
+static isl::union_map remainingDepsFromPartialSchedule(isl::union_map PartSched,
+ isl::union_map Deps) {
+ int NumDims = getNumScatterDims(PartSched);
+ auto ParamSpace = PartSched.get_space().params();
+
+ // { Scatter[] }
+ isl::space ScatterSpace =
+ ParamSpace.set_from_params().add_dims(isl::dim::set, NumDims);
+
+ // { Scatter[] -> Domain[] }
+ isl::union_map PartSchedRev = PartSched.reverse();
+
+ // { Scatter[] -> Scatter[] }
+ isl::map MaybeBefore = isl::map::lex_le(ScatterSpace);
+
+ // { Domain[] -> Domain[] }
+ isl::union_map DomMaybeBefore =
+ MaybeBefore.apply_domain(PartSchedRev).apply_range(PartSchedRev);
+
+ // { Domain[] -> Domain[] }
+ isl::union_map ChildRemainingDeps = Deps.intersect(DomMaybeBefore);
+
+ return ChildRemainingDeps;
+}
+
+/// Remove dependencies that are resolved by executing them in the order
+/// specified by @p Domains;
+static isl::union_map remainigDepsFromSequence(ArrayRef<isl::union_set> Domains,
+ isl::union_map Deps) {
+ isl::ctx Ctx = Deps.ctx();
+ isl::space ParamSpace = Deps.get_space().params();
+
+ // Create a partial schedule mapping to constants that reflect the execution
+ // order.
+ isl::union_map PartialSchedules = isl::union_map::empty(Ctx);
+ for (auto P : enumerate(Domains)) {
+ isl::val ExecTime = isl::val(Ctx, P.index());
+ isl::union_pw_aff DomSched{P.value(), ExecTime};
+ PartialSchedules = PartialSchedules.unite(DomSched.as_union_map());
+ }
+
+ return remainingDepsFromPartialSchedule(PartialSchedules, Deps);
+}
+
+/// Determine whether the outermost loop of to bands can be fused while
+/// respecting validity dependencies.
+static bool canFuseOutermost(const isl::schedule_node_band &LHS,
+ const isl::schedule_node_band &RHS,
+ const isl::union_map &Deps) {
+ // { LHSDomain[] -> Scatter[] }
+ isl::union_map LHSPartSched =
+ LHS.get_partial_schedule().get_at(0).as_union_map();
+
+ // { Domain[] -> Scatter[] }
+ isl::union_map RHSPartSched =
+ RHS.get_partial_schedule().get_at(0).as_union_map();
+
+ // Dependencies that are already resolved because LHS executes before RHS, but
+ // will not be anymore after fusion. { DefDomain[] -> UseDomain[] }
+ isl::union_map OrderedBySequence =
+ Deps.intersect_domain(LHSPartSched.domain())
+ .intersect_range(RHSPartSched.domain());
+
+ isl::space ParamSpace = OrderedBySequence.get_space().params();
+ isl::space NewScatterSpace = ParamSpace.add_unnamed_tuple(1);
+
+ // { Scatter[] -> Scatter[] }
+ isl::map After = isl::map::lex_gt(NewScatterSpace);
+
+ // After fusion, instances with smaller (or equal, which means they will be
+ // executed in the same iteration, but the LHS instance is still sequenced
+ // before RHS) scatter value will still be executed before. This are the
+ // orderings where this is not necessarily the case.
+ // { LHSDomain[] -> RHSDomain[] }
+ isl::union_map MightBeAfterDoms = After.apply_domain(LHSPartSched.reverse())
+ .apply_range(RHSPartSched.reverse());
+
+ // Dependencies that are not resolved by the new execution order.
+ isl::union_map WithBefore = OrderedBySequence.intersect(MightBeAfterDoms);
+
+ return WithBefore.is_empty();
+}
+
+/// Fuse @p LHS and @p RHS if possible while preserving validity dependenvies.
+static isl::schedule tryGreedyFuse(isl::schedule_node_band LHS,
+ isl::schedule_node_band RHS,
+ const isl::union_map &Deps) {
+ if (!canFuseOutermost(LHS, RHS, Deps))
+ return {};
+
+ LLVM_DEBUG({
+ dbgs() << "Found loops for greedy fusion:\n";
+ dumpIslObj(LHS, dbgs());
+ dbgs() << "and\n";
+ dumpIslObj(RHS, dbgs());
+ dbgs() << "\n";
+ });
+
+ // The partial schedule of the bands outermost loop that we need to combine
+ // for the fusion.
+ isl::union_pw_aff LHSPartOuterSched = LHS.get_partial_schedule().get_at(0);
+ isl::union_pw_aff RHSPartOuterSched = RHS.get_partial_schedule().get_at(0);
+
+ // Isolate band bodies as roots of their own schedule trees.
+ IdentityRewriter Rewriter;
+ isl::schedule LHSBody = Rewriter.visit(LHS.first_child());
+ isl::schedule RHSBody = Rewriter.visit(RHS.first_child());
+
+ // Reconstruct the non-outermost (not going to be fused) loops from both
+ // bands.
+ // TODO: Maybe it is possibly to transfer the 'permutability' property from
+ // LHS+RHS. At minimum we need merge multiple band members at once, otherwise
+ // permutability has no meaning.
+ isl::schedule LHSNewBody =
+ rebuildBand(LHS, LHSBody, [](int i) { return i > 0; });
+ isl::schedule RHSNewBody =
+ rebuildBand(RHS, RHSBody, [](int i) { return i > 0; });
+
+ // The loop body of the fused loop.
+ isl::schedule NewCommonBody = LHSNewBody.sequence(RHSNewBody);
+
+ // Combine the partial schedules of both loops to a new one. Instances with
+ // the same scatter value are put together.
+ isl::union_map NewCommonPartialSched =
+ LHSPartOuterSched.as_union_map().unite(RHSPartOuterSched.as_union_map());
+ isl::schedule NewCommonSchedule = NewCommonBody.insert_partial_schedule(
+ NewCommonPartialSched.as_multi_union_pw_aff());
+
+ return NewCommonSchedule;
+}
+
+static isl::schedule tryGreedyFuse(isl::schedule_node LHS,
+ isl::schedule_node RHS,
+ const isl::union_map &Deps) {
+ // TODO: Non-bands could be interpreted as a band with just as single
+ // iteration. However, this is only useful if both ends of a fused loop were
+ // originally loops themselves.
+ if (!LHS.isa<isl::schedule_node_band>())
+ return {};
+ if (!RHS.isa<isl::schedule_node_band>())
+ return {};
+
+ return tryGreedyFuse(LHS.as<isl::schedule_node_band>(),
+ RHS.as<isl::schedule_node_band>(), Deps);
+}
+
+/// Fuse all fusable loop top-down in a schedule tree.
+///
+/// The isl::union_map parameters is the set of validity dependencies that have
+/// not been resolved/carried by a parent schedule node.
+class GreedyFusionRewriter
+ : public ScheduleTreeRewriter<GreedyFusionRewriter, isl::union_map> {
+private:
+ using BaseTy = ScheduleTreeRewriter<GreedyFusionRewriter, isl::union_map>;
+ BaseTy &getBase() { return *this; }
+ const BaseTy &getBase() const { return *this; }
+
+public:
+ /// Is set to true if anything has been fused.
+ bool AnyChange = false;
+
+ isl::schedule visitBand(isl::schedule_node_band Band, isl::union_map Deps) {
+ int NumLoops = Band.n_member().release();
+
+ // { Domain[] -> Scatter[] }
+ isl::union_map PartSched =
+ isl::union_map::from(Band.get_partial_schedule());
+ assert(getNumScatterDims(PartSched) == NumLoops);
+ isl::space ParamSpace = PartSched.get_space().params();
+
+ // { Scatter[] -> Domain[] }
+ isl::union_map PartSchedRev = PartSched.reverse();
+
+ // Possible within the same iteration. Dependencies with smaller scatter
+ // value are carried by this loop and therefore have been resolved by the
+ // in-order execution if the loop iteration. A dependency with small scatter
+ // value would be a dependency violation that we assume did not happen. {
+ // Domain[] -> Domain[] }
+ isl::union_map Unsequenced = PartSchedRev.apply_domain(PartSchedRev);
+
+ // Actual dependencies within the same iteration.
+ // { DefDomain[] -> UseDomain[] }
+ isl::union_map RemDeps = Deps.intersect(Unsequenced);
+
+ return getBase().visitBand(Band, RemDeps);
+ }
+
+ isl::schedule visitSequence(isl::schedule_node_sequence Sequence,
+ isl::union_map Deps) {
+ int NumChildren = isl_schedule_node_n_children(Sequence.get());
+
+ // List of fusion candidates. The first element is the fusion candidate, the
+ // second is candidate's ancestor that is the sequence's direct child. It is
+ // preferable to use the direct child if not if its non-direct children is
+ // fused to preserve its structure such as mark nodes.
+ SmallVector<std::pair<isl::schedule_node, isl::schedule_node>> Bands;
+ for (auto i : seq<int>(0, NumChildren)) {
+ isl::schedule_node Child = Sequence.child(i);
+ collectPotentiallyFusableBands(Child, Bands, Child);
+ }
+
+ // Direct children that had at least one of its decendants fused.
+ SmallDenseSet<isl_schedule_node *, 4> ChangedDirectChildren;
+
+ // Fuse neigboring bands until reaching the end of candidates.
+ int i = 0;
+ while (i + 1 < (int)Bands.size()) {
+ isl::schedule Fused =
+ tryGreedyFuse(Bands[i].first, Bands[i + 1].first, Deps);
+ if (Fused.is_null()) {
+ // Cannot merge this node with the next; look at next pair.
+ i += 1;
+ continue;
+ }
+
+ // Mark the direct children as (partially) fused.
+ if (!Bands[i].second.is_null())
+ ChangedDirectChildren.insert(Bands[i].second.get());
+ if (!Bands[i + 1].second.is_null())
+ ChangedDirectChildren.insert(Bands[i + 1].second.get());
+
+ // Collapse the neigbros to a single new candidate that could be fused
+ // with the next candidate.
+ Bands[i] = {Fused.get_root(), {}};
+ Bands.erase(Bands.begin() + i + 1);
+
+ AnyChange = true;
+ }
+
+ // By construction equal if done with collectPotentiallyFusableBands's
+ // output.
+ SmallVector<isl::union_set> SubDomains;
+ SubDomains.reserve(NumChildren);
+ for (int i = 0; i < NumChildren; i += 1)
+ SubDomains.push_back(Sequence.child(i).domain());
+ auto SubRemainingDeps = remainigDepsFromSequence(SubDomains, Deps);
+
+ // We may iterate over direct children multiple times, be sure to add each
+ // at most once.
+ SmallDenseSet<isl_schedule_node *, 4> AlreadyAdded;
+
+ isl::schedule Result;
+ for (auto &P : Bands) {
+ isl::schedule_node MaybeFused = P.first;
+ isl::schedule_node DirectChild = P.second;
+
+ // If not modified, use the direct child.
+ if (!DirectChild.is_null() &&
+ !ChangedDirectChildren.count(DirectChild.get())) {
+ if (AlreadyAdded.count(DirectChild.get()))
+ continue;
+ AlreadyAdded.insert(DirectChild.get());
+ MaybeFused = DirectChild;
+ } else {
+ assert(AnyChange &&
+ "Need changed flag for be consistent with actual change");
+ }
+
+ // Top-down recursion: If the outermost loop has been fused, their nested
+ // bands might be fusable now as well.
+ isl::schedule InnerFused = visit(MaybeFused, SubRemainingDeps);
+
+ // Reconstruct the sequence, with some of the children fused.
+ if (Result.is_null())
+ Result = InnerFused;
+ else
+ Result = Result.sequence(InnerFused);
+ }
+
+ return Result;
+ }
+};
+
} // namespace
bool polly::isBandMark(const isl::schedule_node &Node) {
@@ -774,3 +1254,19 @@ isl::schedule polly::applyMaxFission(isl::schedule_node BandToFission) {
return Fissioned.get_schedule();
}
+
+isl::schedule polly::applyGreedyFusion(isl::schedule Sched,
+ const isl::union_map &Deps) {
+ LLVM_DEBUG(dbgs() << "Greedy loop fusion\n");
+
+ GreedyFusionRewriter Rewriter;
+ isl::schedule Result = Rewriter.visit(Sched, Deps);
+ if (!Rewriter.AnyChange) {
+ LLVM_DEBUG(dbgs() << "Found nothing to fuse\n");
+ return Sched;
+ }
+
+ // GreedyFusionRewriter due to working loop-by-loop, bands with multiple loops
+ // may have been split into multiple bands.
+ return collapseBands(Result);
+}
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
new file mode 100644
index 000000000000..807abe3d4fe1
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
@@ -0,0 +1,78 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+
+define void @func(i32 %n, [1024 x double]* noalias nonnull %A, [1024 x double]* noalias nonnull %B) {
+entry:
+ br label %outer.for1
+
+outer.for1:
+ %k1 = phi i32 [0, %entry], [%k1.inc, %outer.inc1]
+ %k1.cmp = icmp slt i32 %k1, %n
+ br i1 %k1.cmp, label %for1, label %outer.exit1
+
+ for1:
+ %j1 = phi i32 [0, %outer.for1], [%j1.inc, %inc1]
+ %j1.cmp = icmp slt i32 %j1, %n
+ br i1 %j1.cmp, label %body1, label %exit1
+
+ body1:
+ %arrayidx1 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i32 %k1, i32 %j1
+ store double 21.0, double* %arrayidx1
+ br label %inc1
+
+ inc1:
+ %j1.inc = add nuw nsw i32 %j1, 1
+ br label %for1
+
+ exit1:
+ br label %outer.inc1
+
+outer.inc1:
+ %k1.inc = add nuw nsw i32 %k1, 1
+ br label %outer.for1
+
+outer.exit1:
+ br label %outer.for2
+
+outer.for2:
+ %k2 = phi i32 [0, %outer.exit1], [%k2.inc, %outer.inc2]
+ %k2.cmp = icmp slt i32 %k2, %n
+ br i1 %k2.cmp, label %for2, label %outer.exit2
+
+ for2:
+ %j2 = phi i32 [0, %outer.for2], [%j2.inc, %inc2]
+ %j2.cmp = icmp slt i32 %j2, %n
+ br i1 %j2.cmp, label %body2, label %exit2
+
+ body2:
+ %arrayidx2 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i32 %k2, i32 %j2
+ store double 42.0, double* %arrayidx2
+ br label %inc2
+
+ inc2:
+ %j2.inc = add nuw nsw i32 %j2, 1
+ br label %for2
+
+ exit2:
+ br label %outer.inc2
+
+outer.inc2:
+ %k2.inc = add nuw nsw i32 %k2, 1
+ br label %outer.for2
+
+outer.exit2:
+ br label %return
+
+return:
+ ret void
+}
+
+
+; CHECK: Calculated schedule:
+; CHECK-NEXT: domain: "[n] -> { Stmt_body2[i0, i1] : 0 <= i0 < n and 0 <= i1 < n; Stmt_body1[i0, i1] : 0 <= i0 < n and 0 <= i1 < n }"
+; CHECK-NEXT: child:
+; CHECK-NEXT: schedule: "[n] -> [{ Stmt_body2[i0, i1] -> [(i0)]; Stmt_body1[i0, i1] -> [(i0)] }, { Stmt_body2[i0, i1] -> [(i1)]; Stmt_body1[i0, i1] -> [(i1)] }]"
+; CHECK-NEXT: child:
+; CHECK-NEXT: sequence:
+; CHECK-NEXT: - filter: "[n] -> { Stmt_body1[i0, i1] }"
+; CHECK-NEXT: - filter: "[n] -> { Stmt_body2[i0, i1] }"
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
new file mode 100644
index 000000000000..601d38188cf6
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
@@ -0,0 +1,90 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,OPT
+
+define void @func(i32 %n, double* noalias nonnull %A, double* noalias nonnull %B, i32 %k) {
+entry:
+ br label %for1
+
+
+for1:
+ %j1 = phi i32 [0, %entry], [%j1.inc, %inc1]
+ %j1.cmp = icmp slt i32 %j1, %n
+ br i1 %j1.cmp, label %body1, label %exit1
+
+ body1:
+ %idx1 = add i32 %j1, %k
+ %arrayidx1 = getelementptr inbounds double, double* %B, i32 %idx1
+ store double 21.0, double* %arrayidx1
+ br label %inc1
+
+inc1:
+ %j1.inc = add nuw nsw i32 %j1, 1
+ br label %for1, !llvm.loop !1
+
+exit1:
+ br label %for2
+
+
+for2:
+ %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2]
+ %j2.cmp = icmp slt i32 %j2, %n
+ br i1 %j2.cmp, label %body2, label %exit2
+
+ body2:
+ %arrayidx2 = getelementptr inbounds double, double* %B, i32 %j2
+ store double 42.0, double* %arrayidx2
+ br label %inc2
+
+inc2:
+ %j2.inc = add nuw nsw i32 %j2, 1
+ br label %for2
+
+exit2:
+ br label %for3
+
+
+for3:
+ %j3 = phi i32 [0, %exit2], [%j3.inc, %inc3]
+ %j3.cmp = icmp slt i32 %j3, %n
+ br i1 %j3.cmp, label %body3, label %exit3
+
+ body3:
+ %arrayidx3 = getelementptr inbounds double, double* %A, i32 %j3
+ store double 84.0, double* %arrayidx3
+ br label %inc3
+
+inc3:
+ %j3.inc = add nuw nsw i32 %j3, 1
+ br label %for3
+
+exit3:
+ br label %return
+
+
+return:
+ ret void
+}
+
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.id", !"Hello World!"}
+
+
+; CHECK: Calculated schedule:
+; CHECK-NEXT: domain: "[n, k] -> { Stmt_body2[i0] : 0 <= i0 < n; Stmt_body1[i0] : 0 <= i0 < n; Stmt_body3[i0] : 0 <= i0 < n }"
+; CHECK-NEXT: child:
+; CHECK-NEXT: sequence:
+; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body1[i0] }"
+; CHECK-NEXT: child:
+; RAW-NEXT: mark: "Loop with Metadata"
+; RAW-NEXT: child:
+; CHECK-NEXT: schedule: "[n, k] -> [{ Stmt_body1[i0] -> [(i0)] }]"
+; OPT-NEXT: permutable: 1
+; OPT-NEXT: coincident: [ 1 ]
+; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body2[i0]; Stmt_body3[i0] }"
+; CHECK-NEXT: child:
+; CHECK-NEXT: schedule: "[n, k] -> [{ Stmt_body2[i0] -> [(i0)]; Stmt_body3[i0] -> [(i0)] }]"
+; CHECK-NEXT: child:
+; CHECK-NEXT: sequence:
+; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body2[i0] }"
+; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body3[i0] }"
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
new file mode 100644
index 000000000000..eeb20e610a2e
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
@@ -0,0 +1,88 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK
+
+define void @func(i32 %n, double* noalias nonnull %A, double* noalias nonnull %B, i32 %k) {
+entry:
+ br label %for1
+
+
+for1:
+ %j1 = phi i32 [0, %entry], [%j1.inc, %inc1]
+ %j1.cmp = icmp slt i32 %j1, %n
+ br i1 %j1.cmp, label %body1, label %exit1
+
+ body1:
+ %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1
+ store double 21.0, double* %arrayidx1
+ br label %inc1
+
+inc1:
+ %j1.inc = add nuw nsw i32 %j1, 1
+ br label %for1
+
+exit1:
+ br label %for2
+
+
+for2:
+ %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2]
+ %j2.cmp = icmp slt i32 %j2, %n
+ br i1 %j2.cmp, label %body2, label %exit2
+
+ body2:
+ %arrayidx2 = getelementptr inbounds double, double* %B, i32 %j1
+ store double 42.0, double* %arrayidx2
+ br label %inc2
+
+inc2:
+ %j2.inc = add nuw nsw i32 %j2, 1
+ br label %for2
+
+exit2:
+ br label %for3
+
+
+for3:
+ %j3 = phi i32 [0, %exit2], [%j3.inc, %inc3]
+ %j3.cmp = icmp slt i32 %j3, %n
+ br i1 %j3.cmp, label %body3, label %exit3
+
+ body3:
+ %idx3 = add i32 %j3, %k
+ %arrayidx3 = getelementptr inbounds double, double* %B, i32 %idx3
+ store double 84.0, double* %arrayidx3
+ br label %inc3
+
+inc3:
+ %j3.inc = add nuw nsw i32 %j3, 1
+ br label %for3, !llvm.loop !1
+
+exit3:
+ br label %return
+
+
+return:
+ ret void
+}
+
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.id", !"Hello World!"}
+
+
+; CHECK: Calculated schedule:
+; CHECK-NEXT: domain: "[n, k] -> { Stmt_body2[i0] : 0 <= i0 < n; Stmt_body1[i0] : 0 <= i0 < n; Stmt_body3[i0] : 0 <= i0 < n }"
+; CHECK-NEXT: child:
+; CHECK-NEXT: sequence:
+; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body2[i0]; Stmt_body1[i0] }"
+; CHECK-NEXT: child:
+; CHECK-NEXT: schedule: "[n, k] -> [{ Stmt_body2[i0] -> [(i0)]; Stmt_body1[i0] -> [(i0)] }]"
+; CHECK-NEXT: child:
+; CHECK-NEXT: sequence:
+; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body1[i0] }"
+; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body2[i0] }"
+; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body3[i0] }"
+; CHECK-NEXT: child:
+; RAW-NEXT: mark: "Loop with Metadata"
+; RAW-NEXT: child:
+; CHECK-NEXT: schedule: "[n, k] -> [{ Stmt_body3[i0] -> [(i0)] }]"
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
new file mode 100644
index 000000000000..387c77c86098
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
@@ -0,0 +1,69 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,OPT
+
+define void @func(i32 %n, double* noalias nonnull %A) {
+entry:
+ br label %outer.for
+
+outer.for:
+ %k = phi i32 [0, %entry], [%k.inc, %outer.inc]
+ %k.cmp = icmp slt i32 %k, %n
+ br i1 %k.cmp, label %for1, label %outer.exit
+
+ for1:
+ %j1 = phi i32 [0, %outer.for], [%j1.inc, %inc1]
+ %j1.cmp = icmp slt i32 %j1, %n
+ br i1 %j1.cmp, label %body1, label %exit1
+
+ body1:
+ %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1
+ store double 21.0, double* %arrayidx1
+ br label %inc1
+
+ inc1:
+ %j1.inc = add nuw nsw i32 %j1, 1
+ br label %for1
+
+ exit1:
+ br label %for2
+
+ for2:
+ %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2]
+ %j2.cmp = icmp slt i32 %j2, %n
+ br i1 %j2.cmp, label %body2, label %exit2
+
+ body2:
+ %arrayidx2 = getelementptr inbounds double, double* %A, i32 %j2
+ store double 42.0, double* %arrayidx2
+ br label %inc2
+
+ inc2:
+ %j2.inc = add nuw nsw i32 %j2, 1
+ br label %for2
+
+ exit2:
+ br label %outer.inc
+
+outer.inc:
+ %k.inc = add nuw nsw i32 %k, 1
+ br label %outer.for
+
+outer.exit:
+ br label %return
+
+return:
+ ret void
+}
+
+
+; CHECK: Calculated schedule:
+; CHECK-NEXT: domain: "[n] -> { Stmt_body2[i0, i1] : 0 <= i0 < n and 0 <= i1 < n; Stmt_body1[i0, i1] : 0 <= i0 < n and 0 <= i1 < n }"
+; CHECK-NEXT: child:
+; RAW-NEXT: schedule: "[n] -> [{ Stmt_body2[i0, i1] -> [(i0)]; Stmt_body1[i0, i1] -> [(i0)] }, { Stmt_body2[i0, i1] -> [(i1)]; Stmt_body1[i0, i1] -> [(i1)] }]"
+; OPT-NEXT: schedule: "[n] -> [{ Stmt_body2[i0, i1] -> [(i1)]; Stmt_body1[i0, i1] -> [(i1)] }, { Stmt_body2[i0, i1] -> [(i0)]; Stmt_body1[i0, i1] -> [(i0)] }]"
+; OPT-NEXT: permutable: 1
+; OPT-NEXT: coincident: [ 1, 0 ]
+; CHECK-NEXT: child:
+; CHECK-NEXT: sequence:
+; CHECK-NEXT: - filter: "[n] -> { Stmt_body1[i0, i1] }"
+; CHECK-NEXT: - filter: "[n] -> { Stmt_body2[i0, i1] }"
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
new file mode 100644
index 000000000000..00eb041f9e28
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
@@ -0,0 +1,88 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s --check-prefixes=CHECK
+
+define void @func(i32 %n, double* noalias nonnull %A, double* noalias nonnull %B, i32 %k) {
+entry:
+ br label %for1
+
+
+for1:
+ %j1 = phi i32 [0, %entry], [%j1.inc, %inc1]
+ %j1.cmp = icmp slt i32 %j1, %n
+ br i1 %j1.cmp, label %body1, label %exit1
+
+ body1:
+ %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1
+ store double 21.0, double* %arrayidx1
+ br label %inc1
+
+inc1:
+ %j1.inc = add nuw nsw i32 %j1, 1
+ br label %for1
+
+exit1:
+ br label %for2
+
+
+for2:
+ %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2]
+ %j2.cmp = icmp slt i32 %j2, %n
+ br i1 %j2.cmp, label %body2, label %exit2
+
+ body2:
+ %arrayidx2 = getelementptr inbounds double, double* %B, i32 %j2
+ store double 42.0, double* %arrayidx2
+ br label %inc2
+
+inc2:
+ %j2.inc = add nuw nsw i32 %j2, 1
+ br label %for2
+
+exit2:
+ br label %for3
+
+
+for3:
+ %j3 = phi i32 [0, %exit2], [%j3.inc, %inc3]
+ %j3.cmp = icmp slt i32 %j3, %n
+ br i1 %j3.cmp, label %body3, label %exit3
+
+ body3:
+ %idx3 = add i32 %j3, %k
+ %arrayidx3 = getelementptr inbounds double, double* %B, i32 %idx3
+ store double 84.0, double* %arrayidx3
+ br label %inc3
+
+inc3:
+ %j3.inc = add nuw nsw i32 %j3, 1
+ br label %for3, !llvm.loop !1
+
+exit3:
+ br label %return
+
+
+return:
+ ret void
+}
+
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.id", !"Hello World!"}
+
+
+; CHECK: Calculated schedule:
+; CHECK-NEXT: domain: "[n, k] -> { Stmt_body2[i0] : 0 <= i0 < n; Stmt_body1[i0] : 0 <= i0 < n; Stmt_body3[i0] : 0 <= i0 < n }"
+; CHECK-NEXT: child:
+; CHECK-NEXT: sequence:
+; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body2[i0]; Stmt_body1[i0] }"
+; CHECK-NEXT: child:
+; CHECK-NEXT: schedule: "[n, k] -> [{ Stmt_body2[i0] -> [(i0)]; Stmt_body1[i0] -> [(i0)] }]"
+; CHECK-NEXT: child:
+; CHECK-NEXT: sequence:
+; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body1[i0] }"
+; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body2[i0] }"
+; CHECK-NEXT: - filter: "[n, k] -> { Stmt_body3[i0] }"
+; CHECK-NEXT: child:
+; RAW-NEXT: mark: "Loop with Metadata"
+; RAW-NEXT: child:
+; CHECK-NEXT: schedule: "[n, k] -> [{ Stmt_body3[i0] -> [(i0)] }]"
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
new file mode 100644
index 000000000000..ada3934dec4d
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
@@ -0,0 +1,66 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+
+define void @func(i32 %n, [1024 x double]* noalias nonnull %A) {
+entry:
+ br label %outer.for
+
+outer.for:
+ %k = phi i32 [0, %entry], [%k.inc, %outer.inc]
+ %k.cmp = icmp slt i32 %k, %n
+ br i1 %k.cmp, label %for1, label %outer.exit
+
+ for1:
+ %j1 = phi i32 [0, %outer.for], [%j1.inc, %inc1]
+ %j1.cmp = icmp slt i32 %j1, %n
+ br i1 %j1.cmp, label %body1, label %exit1
+
+ body1:
+ %arrayidx1 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i32 %k, i32 %j1
+ store double 21.0, double* %arrayidx1
+ br label %inc1
+
+ inc1:
+ %j1.inc = add nuw nsw i32 %j1, 1
+ br label %for1
+
+ exit1:
+ br label %for2
+
+ for2:
+ %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2]
+ %j2.cmp = icmp slt i32 %j2, %n
+ br i1 %j2.cmp, label %body2, label %exit2
+
+ body2:
+ %arrayidx2 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i32 %k, i32 %j2
+ store double 42.0, double* %arrayidx2
+ br label %inc2
+
+ inc2:
+ %j2.inc = add nuw nsw i32 %j2, 1
+ br label %for2
+
+ exit2:
+ br label %outer.inc
+
+outer.inc:
+ %k.inc = add nuw nsw i32 %k, 1
+ br label %outer.for
+
+outer.exit:
+ br label %return
+
+return:
+ ret void
+}
+
+
+; CHECK: Calculated schedule:
+; CHECK-NEXT: domain: "[n] -> { Stmt_body2[i0, i1] : 0 <= i0 < n and 0 <= i1 < n; Stmt_body1[i0, i1] : 0 <= i0 < n and 0 <= i1 < n }"
+; CHECK-NEXT: child:
+; CHECK-NEXT: schedule: "[n] -> [{ Stmt_body2[i0, i1] -> [(i0)]; Stmt_body1[i0, i1] -> [(i0)] }, { Stmt_body2[i0, i1] -> [(i1)]; Stmt_body1[i0, i1] -> [(i1)] }]"
+; CHECK-NEXT: child:
+; CHECK-NEXT: sequence:
+; CHECK-NEXT: - filter: "[n] -> { Stmt_body1[i0, i1] }"
+; CHECK-NEXT: - filter: "[n] -> { Stmt_body2[i0, i1] }"
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
new file mode 100644
index 000000000000..ba083ac0b5fa
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
@@ -0,0 +1,54 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+
+define void @func(i32 %n, double* noalias nonnull %A) {
+entry:
+ br label %for1
+
+for1:
+ %j1 = phi i32 [0, %entry], [%j1.inc, %inc1]
+ %j1.cmp = icmp slt i32 %j1, %n
+ br i1 %j1.cmp, label %body1, label %exit1
+
+ body1:
+ %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1
+ store double 21.0, double* %arrayidx1
+ br label %inc1
+
+inc1:
+ %j1.inc = add nuw nsw i32 %j1, 1
+ br label %for1
+
+exit1:
+ br label %for2
+
+for2:
+ %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2]
+ %j2.cmp = icmp slt i32 %j2, %n
+ br i1 %j2.cmp, label %body2, label %exit2
+
+ body2:
+ %arrayidx2 = getelementptr inbounds double, double* %A, i32 %j2
+ store double 42.0, double* %arrayidx2
+ br label %inc2
+
+inc2:
+ %j2.inc = add nuw nsw i32 %j2, 1
+ br label %for2
+
+exit2:
+ br label %return
+
+return:
+ ret void
+}
+
+
+; CHECK: Calculated schedule:
+; CHECK-NEXT: domain: "[n] -> { Stmt_body2[i0] : 0 <= i0 < n; Stmt_body1[i0] : 0 <= i0 < n }"
+; CHECK-NEXT: child:
+; CHECK-NEXT: schedule: "[n] -> [{ Stmt_body2[i0] -> [(i0)]; Stmt_body1[i0] -> [(i0)] }]"
+; CHECK-NEXT: child:
+; CHECK-NEXT: sequence:
+; CHECK-NEXT: - filter: "[n] -> { Stmt_body1[i0] }"
+; CHECK-NEXT: - filter: "[n] -> { Stmt_body2[i0] }"
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
new file mode 100644
index 000000000000..1509aa995917
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
@@ -0,0 +1,51 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+
+; This could theoretically be fused by adjusting the offset of the second loop by %k (instead of relying on schedule dimensions).
+
+define void @func(i32 %n, double* noalias nonnull %A, i32 %k) {
+entry:
+ br label %for1
+
+for1:
+ %j1 = phi i32 [0, %entry], [%j1.inc, %inc1]
+ %j1.cmp = icmp slt i32 %j1, %n
+ br i1 %j1.cmp, label %body1, label %exit1
+
+ body1:
+ %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1
+ store double 21.0, double* %arrayidx1
+ br label %inc1
+
+inc1:
+ %j1.inc = add nuw nsw i32 %j1, 1
+ br label %for1
+
+exit1:
+ br label %for2
+
+for2:
+ %j2 = phi i32 [0, %exit1], [%j2.inc, %inc2]
+ %j2.cmp = icmp slt i32 %j2, %n
+ br i1 %j2.cmp, label %body2, label %exit2
+
+ body2:
+ %idx2 = add i32 %j2, %k
+ %arrayidx2 = getelementptr inbounds double, double* %A, i32 %idx2
+ store double 42.0, double* %arrayidx2
+ br label %inc2
+
+inc2:
+ %j2.inc = add nuw nsw i32 %j2, 1
+ br label %for2
+
+exit2:
+ br label %return
+
+return:
+ ret void
+}
+
+
+; CHECK: Calculated schedule:
+; CHECK-NEXT: n/a
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
new file mode 100644
index 000000000000..e52b89beef0b
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
@@ -0,0 +1,57 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-opt-isl -analyze < %s | FileCheck %s
+
+define void @func(i32 %n, double* noalias nonnull %A, double* noalias nonnull %B, i32 %k) {
+entry:
+ br label %for1
+
+
+for1:
+ %j1 = phi i32 [0, %entry], [%j1.inc, %inc1]
+ %j1.cmp = icmp slt i32 %j1, %n
+ br i1 %j1.cmp, label %body1, label %exit1
+
+ body1:
+ %idx1 = add i32 %j1, %k
+ %arrayidx1 = getelementptr inbounds double, double* %A, i32 %j1
+ store double 21.0, double* %arrayidx1
+ br label %inc1
+
+inc1:
+ %j1.inc = add nuw nsw i32 %j1, 1
+ br label %for1
+
+exit1:
+ br label %middle2
+
+
+middle2:
+ store double 52.0, double* %A
+ br label %for3
+
+
+for3:
+ %j3 = phi i32 [0, %middle2], [%j3.inc, %inc3]
+ %j3.cmp = icmp slt i32 %j3, %n
+ br i1 %j3.cmp, label %body3, label %exit3
+
+ body3:
+ %arrayidx3 = getelementptr inbounds double, double* %B, i32 %j3
+ store double 84.0, double* %arrayidx3
+ br label %inc3
+
+inc3:
+ %j3.inc = add nuw nsw i32 %j3, 1
+ br label %for3
+
+exit3:
+ br label %return
+
+
+return:
+ ret void
+}
+
+
+; CHECK: Calculated schedule:
+; CHECK-NEXT: n/a