Skip to content

Commit

Permalink
[AIEX] Basic heuristics for scheduling loops with LCDs
Browse files Browse the repository at this point in the history
  • Loading branch information
gbossu committed Oct 8, 2024
1 parent eb6b621 commit 99d8277
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 11 deletions.
1 change: 1 addition & 0 deletions llvm/lib/Target/AIE/AIEInterBlockScheduling.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ class InterBlockEdges {
/// Retrieve the SUnit that represents MI's instance before the
/// boundary, null if not found.
const SUnit *getPreBoundaryNode(MachineInstr *MI) const;

/// Check whether SU represents an instruction after the boundary
bool isPostBoundaryNode(SUnit *SU) const;
};
Expand Down
51 changes: 51 additions & 0 deletions llvm/lib/Target/AIE/AIEMachineScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ static cl::opt<bool>
InterBlockAlignment("aie-interblock-alignment", cl::init(true),
cl::desc("Allow for alignment of successor blocks"));

// Boolean command-line option "aie-loop-sched-heuristics", enabled by default.
// The candidate-picking code checks this flag before applying the extra
// loop-carried-dependency heuristic to a loop region.
static cl::opt<bool> UseLoopHeuristics(
"aie-loop-sched-heuristics", cl::init(true),
cl::desc("Use special picking heuristics when scheduling a loop region"));

namespace {
// A sentinel value to represent an unknown SUnit.
const constexpr unsigned UnknownSUNum = ~0;
Expand Down Expand Up @@ -694,6 +698,30 @@ void AIEPostRASchedStrategy::handleRegionConflicts(
}
}

/// The earliest use of this instruction in the next iteration.
/// Note that we reason with "bottom-up" cycle, so a larger cycle means it's
/// used earlier in topological order. If the SU has no loop-carried dependency,
/// this will be MAX_INT.
int getEarliestLoopCarriedUse(const SUnit &SU,
const InterBlockEdges &LoopEdges) {
const SUnit *SUInCurrentIteration =
LoopEdges.getPreBoundaryNode(SU.getInstr());
assert(SUInCurrentIteration);
assert(SUInCurrentIteration->getHeight() >= SU.getHeight());

// Look at loop-carried dependencies to see how early the instruction will be
// needed in the next iteration.
int EarliestCycle = std::numeric_limits<int>::max();
for (const SDep &Succ : SUInCurrentIteration->Succs) {
if (!LoopEdges.isPostBoundaryNode(Succ.getSUnit()))
continue;

EarliestCycle = std::min(EarliestCycle, int(Succ.getSUnit()->getHeight()));
}

return EarliestCycle;
}

/// Apply a set of heuristics to a new candidate for PostRA scheduling.
///
/// \param Cand provides the policy and current best candidate.
Expand Down Expand Up @@ -739,6 +767,29 @@ bool AIEPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
return TryCand.Reason != NoCand;
}

// Special heuristics for loops.
// Note that they aren't used for the first fixpoint iteration: this is
// currently a workaround because we want a very optimistic schedule in that
// first iteration. That is because it decides the slot assignments for
// multi-slot instructions. This rule can probably be deleted once the
// loop-aware scheduler knows how to reassign those.
const BlockState &BS = getInterBlock().getBlockState(CurMBB);
if (UseLoopHeuristics && BS.Kind == AIE::BlockType::Loop &&
BS.getRegions().size() == 1 && BS.FixPoint.NumIters > 0) {
const InterBlockEdges &LoopEdges = BS.getBoundaryEdges();

// For instructions with equal dependence chains, prioritize scheduling
// instructions that are used later in the next iteration. The point is
// to teach our heuristics a tiny bit about LCDs.
if (tryLess(getEarliestLoopCarriedUse(*TryCand.SU, LoopEdges) +
TryCand.SU->BotReadyCycle,
getEarliestLoopCarriedUse(*Cand.SU, LoopEdges) +
Cand.SU->BotReadyCycle,
TryCand, Cand, BotPathReduce)) {
return TryCand.Reason != NoCand;
}
}

// Prefer the instruction whose dependent chain is estimated to
// finish executing later. This can help reducing the overall height
// of the region.
Expand Down
24 changes: 13 additions & 11 deletions llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll
Original file line number Diff line number Diff line change
Expand Up @@ -124,19 +124,21 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
; ASM-NEXT: .p2align 4
; ASM-NEXT: .LBB0_3: // %for.body
; ASM-NEXT: // =>This Inner Loop Header: Depth=1
; ASM-NEXT: vlda.ups.s32.d8 cm2, s1, [p1], m1; nopb ; nopxm ; nops
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0
; ASM-NEXT: vlda.ups.s32.d8 cm5, s1, [p1], m1; vst.srs.d8.s32 cm7, s0, [p3], #32; vadd cm3, cm4, cm3, r0
; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1; vadd cm5, cm6, cm5, r0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; add r1, r1, #-4; vadd cm7, cm1, cm0, r0
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; jnz r1, #.LBB0_3
; ASM-NEXT: vlda.3d.ups.s32.d8 cm4, s1, [p2], d0 // Delay Slot 5
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; vst.srs.d8.s32 cm8, s0, [p3], #32 // Delay Slot 4
; ASM-NEXT: nopb ; vlda.ups.s32.d8 cm2, s1, [p1], m1; nops ; nopxm ; nopv
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; nopx ; vadd cm5, cm6, cm5, r0
; ASM-NEXT: vlda.ups.s32.d8 cm5, s1, [p1], m1
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; add r1, r1, #-4
; ASM-NEXT: vst.srs.d8.s32 cm7, s0, [p3], #32; jnz r1, #.LBB0_3; vadd cm3, cm4, cm3, r0
; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1; vst.srs.d8.s32 cm8, s0, [p3], #32 // Delay Slot 5
; ASM-NEXT: vlda.3d.ups.s32.d8 cm4, s1, [p2], d0 // Delay Slot 4
; ASM-NEXT: vst.srs.d8.s32 cm5, s0, [p3], #32 // Delay Slot 3
; ASM-NEXT: nop // Delay Slot 2
; ASM-NEXT: vst.srs.d8.s32 cm3, s0, [p3], #32; vadd cm8, cm6, cm2, r0 // Delay Slot 1
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; vadd cm7, cm1, cm0, r0 // Delay Slot 2
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; vst.srs.d8.s32 cm3, s0, [p3], #32; vadd cm8, cm6, cm2, r0 // Delay Slot 1
; ASM-NEXT: // %bb.4:
; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
; ASM-NEXT: nopa ; nopxm
; ASM-NEXT: nop
; ASM-NEXT: nop
; ASM-NEXT: nop
; ASM-NEXT: .p2align 4
; ASM-NEXT: .LBB0_5:
; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; vadd cm5, cm6, cm5, r0
Expand Down

0 comments on commit 99d8277

Please sign in to comment.