Skip to content

Commit

Permalink
[AIE] Missed opportunity for postinc ld/st
Browse files Browse the repository at this point in the history
Dominance checks don't apply to phi nodes.  Ultimately we want to know whether
we can move the two instructions together without breaking a dependence.

Outline a complicated condition

Also some clang-tidy issues.
  • Loading branch information
Martien de Jong authored and martien-de-jong committed Sep 27, 2024
1 parent 8a225ed commit fc44e76
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 74 deletions.
94 changes: 52 additions & 42 deletions llvm/lib/Target/AIE/AIECombinerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,20 @@ findConstantOffsetsToMove(MachineInstr &PtrAdd, MachineInstr &PtrAddInsertLoc,
return GConstsToMove;
}

// Check that MI is after First and not after Last
bool isBetween(MachineInstr &MI, MachineInstr &First, MachineInstr &Last,
CombinerHelper &Helper) {
assert(First.getParent() == Last.getParent());
// If it's in another block, it can't be between
if (MI.getParent() != First.getParent()) {
return false;
}

// We want First < MI && MI <= Last :=: !(MI <= First) && (MI <= Last)
// and we have dominates(A, B) :=: A <= B
return !Helper.dominates(MI, First) && Helper.dominates(MI, Last);
}

MachineInstr *findPostIncMatch(MachineInstr &MemI, MachineRegisterInfo &MRI,
CombinerHelper &Helper,
AIELoadStoreCombineMatchData &MatchData,
Expand All @@ -222,51 +236,47 @@ MachineInstr *findPostIncMatch(MachineInstr &MemI, MachineRegisterInfo &MRI,
return nullptr;

Register Addr = MemI.getOperand(1).getReg();
for (auto &AddrUse : MRI.use_nodbg_instructions(Addr)) {
std::optional<unsigned> CombinedOpcode = TII.getCombinedPostIncOpcode(
MemI, AddrUse,
MRI.getType(MemI.getOperand(0).getReg()).getSizeInBits());
if (!CombinedOpcode || isTriviallyDead(AddrUse, MRI))
for (auto &PtrInc : MRI.use_nodbg_instructions(Addr)) {
if (MemI.getParent() != PtrInc.getParent())
continue;
if (MemI.getParent() != AddrUse.getParent())
std::optional<unsigned> CombinedOpcode = TII.getCombinedPostIncOpcode(
MemI, PtrInc, MRI.getType(MemI.getOperand(0).getReg()).getSizeInBits());
if (!CombinedOpcode || isTriviallyDead(PtrInc, MRI))
continue;
// Find the closest location to the memory operation where the ptr_add can
// be moved to.
MachineInstr &PtrAddInsertLoc = *findEarliestInsertPoint(
AddrUse, /*NoMoveBeforeInstr=*/MemI,
PtrInc, /*NoMoveBeforeInstr=*/MemI,
/*NoMoveBeforeLastUse=*/true, MRI, Helper, TII);
// If the PtrAdd is defined before MemI, we need to make sure that there is
// no use before def error if we combine the PtrAdd into the MemI. We need
// to check that all uses of the new pointer defined by the PtrAdd are after
// MemI. Note that MemI must also not use the new pointer.
if (Helper.dominates(AddrUse, MemI)) {
bool MemOpDominatesAddrUses = all_of(
MRI.use_nodbg_instructions(AddrUse.getOperand(0).getReg()),
[&](MachineInstr &PtrAddUse) {
return &MemI != &PtrAddUse && Helper.dominates(MemI, PtrAddUse);
});
MatchData = {&AddrUse, *CombinedOpcode, &MemI,
// If the PtrInc is defined before MemI, we need to make sure that there is
// no use before def error if we combine the PtrInc into the MemI. We check
// that none of the uses is between the PtrInc and the MemOp.
if (Helper.dominates(PtrInc, MemI)) {
if (any_of(MRI.use_nodbg_instructions(PtrInc.getOperand(0).getReg()),
[&](MachineInstr &PtrIncUse) {
return isBetween(PtrIncUse, PtrInc, MemI, Helper);
})) {
continue;
}
MatchData = {&PtrInc, *CombinedOpcode, &MemI,
/*ExtraInstrsToMove=*/{},
/*RemoveInstr=*/true};
if (!MemOpDominatesAddrUses)
continue;
}
// The offset of the PtrAdd might be defined after MemI, in this case we
// want to verify if it would be possible to insert the combined instruction
// at the PtrAdd instead of the location of MemI.
// Instruction with side effects are also blocking: Loads, stores, calls,
// instructions with side effects cannot be moved.
// TODO: try move other instructions that block us from combining
else if (canDelayMemOp(MemI, PtrAddInsertLoc, MRI)) {
// The offset of the PtrInc might be defined after MemI, in this case we
// want to verify if it would be possible to insert the combined
// instruction at the PtrInc instead of the location of MemI. Instruction
// with side effects are also blocking: Loads, stores, calls, instructions
// with side effects cannot be moved.
// TODO: try move other instructions that block us from combining
} else if (canDelayMemOp(MemI, PtrAddInsertLoc, MRI)) {
// If Definition of the offset is a G_CONSTANT we have to move that
// instruction up
MatchData = {
&AddrUse, *CombinedOpcode, &PtrAddInsertLoc,
&PtrInc, *CombinedOpcode, &PtrAddInsertLoc,
/*ExtraInstrsToMove=*/
findConstantOffsetsToMove(AddrUse, PtrAddInsertLoc, MRI, Helper),
findConstantOffsetsToMove(PtrInc, PtrAddInsertLoc, MRI, Helper),
/*RemoveInstr=*/true};
} else {
LLVM_DEBUG(dbgs() << " Ignoring candidate " << AddrUse);
LLVM_DEBUG(dbgs() << " Ignoring candidate " << PtrInc);
continue;
}
// Only combine postIncs if we know that the original pointer is not used
Expand All @@ -276,9 +286,9 @@ MachineInstr *findPostIncMatch(MachineInstr &MemI, MachineRegisterInfo &MRI,
// a combine does not dominate the insertion point but can never follow the
// insertion point, e.g. being in a sibling BB.
bool AddrUsesDominatesInsertPoint = checkRegUsesDominate(
Addr, *MatchData.CombinedInsertPoint, AddrUse, MRI, Helper, TII);
Addr, *MatchData.CombinedInsertPoint, PtrInc, MRI, Helper, TII);
if (EnableGreedyAddressCombine || AddrUsesDominatesInsertPoint)
return &AddrUse;
return &PtrInc;
}
return nullptr;
}
Expand All @@ -303,24 +313,24 @@ void llvm::applyLdStInc(MachineInstr &MI, MachineRegisterInfo &MRI,
// Debug Loc: Debug Loc of LOAD STORE: MI
B.setDebugLoc(MI.getDebugLoc());
auto NewInstr = B.buildInstr(MatchData.CombinedInstrOpcode);
for (auto Instr : MatchData.ExtraInstrsToMove) {
for (auto *Instr : MatchData.ExtraInstrsToMove) {
Instr->moveBefore(NewInstr);
}
if (MI.mayLoad())
NewInstr.addDef(MI.getOperand(0).getReg() /* Loaded value */);
if (MatchData.RemoveInstr)
// If we remove the instr it is because we have defs that would otherwise
// be redefined. We have to add these defs into the new instruction.
for (auto def : MatchData.Instr->defs())
if (def.isReg())
NewInstr.addDef(def.getReg());
for (auto Def : MatchData.Instr->defs())
if (Def.isReg())
NewInstr.addDef(Def.getReg());
if (MI.getOpcode() == TargetOpcode::G_STORE)
NewInstr.addUse(MI.getOperand(0).getReg() /* Stored value */);
for (auto use : MatchData.Instr->uses())
if (use.isReg())
NewInstr.addUse(use.getReg());
for (auto mem : MI.memoperands())
NewInstr.addMemOperand(mem);
for (auto Use : MatchData.Instr->uses())
if (Use.isReg())
NewInstr.addUse(Use.getReg());
for (auto *Mem : MI.memoperands())
NewInstr.addMemOperand(Mem);

if (MatchData.RemoveInstr)
MatchData.Instr->removeFromParent();
Expand Down
38 changes: 38 additions & 0 deletions llvm/test/CodeGen/AIE/GlobalISel/combine-loads-stores.mir
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,44 @@ body: |
$p0 = COPY %4
...

---
name: load_to_postinc_ptradd_before_usedinphi
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: load_to_postinc_ptradd_before_usedinphi
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $p0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
; CHECK-NEXT: G_BR %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(p0) = G_PHI [[COPY]](p0), %bb.0, %2(p0), %bb.1
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[C]](s32)
; CHECK-NEXT: [[AIE_POSTINC_LOAD:%[0-9]+]]:_(s32), [[AIE_POSTINC_LOAD1:%[0-9]+]]:_(p0) = G_AIE_POSTINC_LOAD [[PHI]], [[TRUNC]](s20) :: (load (s32))
; CHECK-NEXT: $r0 = COPY [[AIE_POSTINC_LOAD]](s32)
; CHECK-NEXT: G_BR %bb.1
bb.0:
successors: %bb.1
liveins: $p0
%0:_(p0) = COPY $p0
G_BR %bb.1
bb.1:
successors: %bb.1
%1:_(p0) = G_PHI %0(p0), %bb.0, %4(p0), %bb.1
%2:_(s32) = G_CONSTANT i32 32
%3:_(s20) = G_TRUNC %2
%4:_(p0) = G_PTR_ADD %1, %3
%5:_(s32) = G_LOAD %1 :: (load (s32))
$r0 = COPY %5
G_BR %bb.1
...


# Our current combine code is not able to move the memory operation up. In this
# case we cannot just move the pointer add to the load and we therefore don't
# combine. This could be improved.
Expand Down
36 changes: 19 additions & 17 deletions llvm/test/CodeGen/AIE/aie2/schedule/swp/doloop-stage0.ll
Original file line number Diff line number Diff line change
Expand Up @@ -19,33 +19,35 @@ define dso_local i32 @dot(ptr nocapture readonly %a, ptr nocapture readonly %b,
; CHECK-LABEL: dot:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: nopa ; add r5, r1, #-1
; CHECK-NEXT: jz r5, #.LBB0_5
; CHECK-NEXT: lda r2, [p0, #0] // Delay Slot 5
; CHECK-NEXT: padda [p0], #2044 // Delay Slot 4
; CHECK-NEXT: lda r3, [p1, #0] // Delay Slot 3
; CHECK-NEXT: padda [p1], #2044 // Delay Slot 2
; CHECK-NEXT: mova r0, #0 // Delay Slot 1
; CHECK-NEXT: nopb ; nopa ; nops ; movxm m0, #2044; nopv
; CHECK-NEXT: lda r3, [p1], m0; add r5, r1, #-1; nopm
; CHECK-NEXT: lda r2, [p0], m0; jz r5, #.LBB0_5
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: movx r0, #0 // Delay Slot 1
; CHECK-NEXT: // %bb.1: // %do.body
; CHECK-NEXT: lda r4, [p1, #0]; nopb ; add r5, r5, #-1
; CHECK-NEXT: lda r1, [p0, #0]; jz r5, #.LBB0_4
; CHECK-NEXT: lda r4, [p1], m0; nopb ; add r5, r5, #-1; nopm
; CHECK-NEXT: lda r1, [p0], m0; jz r5, #.LBB0_4
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: padda [p0], #2044 // Delay Slot 2
; CHECK-NEXT: padda [p1], #2044 // Delay Slot 1
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: // %do.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: nopa ; nopb ; add r5, r5, #-1; nopm
; CHECK-NEXT: nopb ; nopa ; nops ; add r5, r5, #-1; nopm ; nopv
; CHECK-NEXT: jnz r5, #.LBB0_2
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: lda r1, [p0, #0] // Delay Slot 4
; CHECK-NEXT: padda [p0], #2044 // Delay Slot 3
; CHECK-NEXT: lda r4, [p1, #0]; and r6, r3, r2; mov r3, r4 // Delay Slot 2
; CHECK-NEXT: padda [p1], #2044; or r0, r6, r0; mov r2, r1 // Delay Slot 1
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: lda r1, [p0], m0; and r6, r3, r2; mov r2, r1 // Delay Slot 2
; CHECK-NEXT: lda r4, [p1], m0; or r0, r6, r0; mov r3, r4 // Delay Slot 1
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: nopa ; nopb ; nopx
; CHECK-NEXT: nopa ; nopx
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
Expand Down
32 changes: 17 additions & 15 deletions llvm/test/CodeGen/AIE/aie2/schedule/swp/stage0.ll
Original file line number Diff line number Diff line change
Expand Up @@ -25,35 +25,37 @@ define dso_local i32 @dot(ptr nocapture readonly %a, ptr nocapture readonly %b,
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: // %bb.1: // %for.body
; CHECK-NEXT: lda r2, [p0, #0]; nopx
; CHECK-NEXT: lda r3, [p1, #0]; add r5, r1, #-1
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: movxm m0, #2044
; CHECK-NEXT: lda r2, [p0], m0
; CHECK-NEXT: lda r3, [p1], m0; add r5, r1, #-1
; CHECK-NEXT: jz r5, #.LBB0_6
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: padda [p0], #2044 // Delay Slot 2
; CHECK-NEXT: padda [p1], #2044 // Delay Slot 1
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: // %bb.2: // %for.body
; CHECK-NEXT: lda r4, [p1, #0]; nopb ; add r5, r5, #-1
; CHECK-NEXT: lda r1, [p0, #0]; jz r5, #.LBB0_5
; CHECK-NEXT: lda r4, [p1], m0; nopb ; add r5, r5, #-1; nopm
; CHECK-NEXT: lda r1, [p0], m0; jz r5, #.LBB0_5
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: padda [p0], #2044 // Delay Slot 2
; CHECK-NEXT: padda [p1], #2044 // Delay Slot 1
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_3: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: nopa ; nopb ; add r5, r5, #-1; nopm
; CHECK-NEXT: nopb ; nopa ; nops ; add r5, r5, #-1; nopm ; nopv
; CHECK-NEXT: jnz r5, #.LBB0_3
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: lda r1, [p0, #0] // Delay Slot 4
; CHECK-NEXT: padda [p0], #2044 // Delay Slot 3
; CHECK-NEXT: lda r4, [p1, #0]; and r6, r3, r2; mov r3, r4 // Delay Slot 2
; CHECK-NEXT: padda [p1], #2044; or r0, r6, r0; mov r2, r1 // Delay Slot 1
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: lda r1, [p0], m0; and r6, r3, r2; mov r2, r1 // Delay Slot 2
; CHECK-NEXT: lda r4, [p1], m0; or r0, r6, r0; mov r3, r4 // Delay Slot 1
; CHECK-NEXT: // %bb.4:
; CHECK-NEXT: nopa ; nopb ; nopx
; CHECK-NEXT: nopa ; nopx
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
Expand Down

0 comments on commit fc44e76

Please sign in to comment.