| author | Stephen Hines <srhines@google.com> | 2014-07-21 00:45:20 -0700 |
|---|---|---|
| committer | Stephen Hines <srhines@google.com> | 2014-07-21 00:45:20 -0700 |
| commit | c6a4f5e819217e1e12c458aed8e7b122e23a3a58 (patch) | |
| tree | 81b7dd2bb4370a392f31d332a566c903b5744764 /lib/Target | |
| parent | 19c6fbb3e8aaf74093afa08013134b61fa08f245 (diff) | |
| download | llvm-c6a4f5e819217e1e12c458aed8e7b122e23a3a58.tar.gz | |
Update LLVM for rebase to r212749.
Includes a cherry-pick of:
r212948 - fixes a small issue with atomic calls
Change-Id: Ib97bd980b59f18142a69506400911a6009d9df18
Diffstat (limited to 'lib/Target')
388 files changed, 20335 insertions, 9870 deletions
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 1ad5ac8c6f38..e6a27c386b0e 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -60,6 +60,7 @@ def AArch64InstrInfo : InstrInfo; // AArch64 Processors supported. // include "AArch64SchedA53.td" +include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", @@ -89,7 +90,7 @@ def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, FeatureCRC]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; -def : ProcessorModel<"cortex-a57", NoSchedModel, [ProcA57]>; +def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp index 04906f6078f8..ab2c4b73e67c 100644 --- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp +++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -214,8 +214,8 @@ AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const { if (SExt->getType() != ConsideredSExtType) return false; - for (const Use &U : SExt->uses()) { - if (isa<GetElementPtrInst>(*U)) + for (const User *U : SExt->users()) { + if (isa<GetElementPtrInst>(U)) return true; } @@ -267,8 +267,7 @@ AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { } // Now try to get through the chain of definitions. - while (isa<Instruction>(SExt->getOperand(0))) { - Instruction *Inst = dyn_cast<Instruction>(SExt->getOperand(0)); + while (auto *Inst = dyn_cast<Instruction>(SExt->getOperand(0))) { DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n'); if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) { // We cannot get through something that is not an Instruction @@ -285,10 +284,10 @@ AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { // assertion on the type as all involved sext operation may have not // been moved yet. while (!Inst->use_empty()) { - Value::use_iterator UseIt = Inst->use_begin(); - Instruction *UseInst = dyn_cast<Instruction>(*UseIt); - assert(UseInst && "Use of sext is not an Instruction!"); - UseInst->setOperand(UseIt->getOperandNo(), SExt); + Use &U = *Inst->use_begin(); + Instruction *User = dyn_cast<Instruction>(U.getUser()); + assert(User && "User of sext is not an Instruction!"); + User->setOperand(U.getOperandNo(), SExt); } ToRemove.insert(Inst); SExt->setOperand(0, Inst->getOperand(0)); @@ -385,11 +384,11 @@ void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses, if (ToRemove.count(Inst)) continue; bool inserted = false; - for (auto Pt : CurPts) { + for (auto &Pt : CurPts) { if (DT.dominates(Inst, Pt)) { DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n" << *Inst << '\n'); - (Pt)->replaceAllUsesWith(Inst); + Pt->replaceAllUsesWith(Inst); ToRemove.insert(Pt); Pt = Inst; inserted = true; @@ -436,7 +435,7 @@ void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { bool insert = false; // #1. 
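The AArch64AddressTypePromotion hunks above (and the one that follows) migrate from `uses()` to `users()` iteration. A minimal sketch of the two idioms, assuming LLVM's `Value` API; `hasGEPUser` is a hypothetical helper, not from the patch:

```cpp
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical helper: does any user of V index memory through a GEP?
static bool hasGEPUser(const Value *V) {
  // New idiom from the patch: users() yields the using instructions
  // directly, so no dereference through a Use is needed.
  for (const User *U : V->users())
    if (isa<GetElementPtrInst>(U))
      return true;
  // uses() remains the right choice when the operand slot matters,
  // e.g. U.getOperandNo() in propagateSignExtension above.
  return false;
}
```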
- for (const Use &U : SExt->uses()) { + for (const User *U : SExt->users()) { const Instruction *Inst = dyn_cast<GetElementPtrInst>(U); if (Inst && Inst->getNumOperands() > 2) { DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index c3ee9bbb8179..cd94e244dc38 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -211,7 +211,7 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum, const MachineOperand &MO = MI->getOperand(OpNum); switch (MO.getType()) { default: - assert(0 && "<unknown operand type>"); + llvm_unreachable("<unknown operand type>"); case MachineOperand::MO_Register: { unsigned Reg = MO.getReg(); assert(TargetRegisterInfo::isPhysicalRegister(Reg)); diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp index 52094526727d..484e7e8ffce4 100644 --- a/lib/Target/AArch64/AArch64BranchRelaxation.cpp +++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -291,7 +291,7 @@ static bool isConditionalBranch(unsigned Opc) { static MachineBasicBlock *getDestBlock(MachineInstr *MI) { switch (MI->getOpcode()) { default: - assert(0 && "unexpected opcode!"); + llvm_unreachable("unexpected opcode!"); case AArch64::TBZW: case AArch64::TBNZW: case AArch64::TBZX: @@ -309,7 +309,7 @@ static MachineBasicBlock *getDestBlock(MachineInstr *MI) { static unsigned getOppositeConditionOpcode(unsigned Opc) { switch (Opc) { default: - assert(0 && "unexpected opcode!"); + llvm_unreachable("unexpected opcode!"); case AArch64::TBNZW: return AArch64::TBZW; case AArch64::TBNZX: return AArch64::TBZX; case AArch64::TBZW: return AArch64::TBNZW; @@ -325,7 +325,7 @@ static unsigned getOppositeConditionOpcode(unsigned Opc) { static unsigned getBranchDisplacementBits(unsigned Opc) { switch (Opc) { default: - assert(0 && "unexpected opcode!"); + llvm_unreachable("unexpected opcode!"); case AArch64::TBNZW: case AArch64::TBZW: case AArch64::TBNZX: diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index ded2e17c544e..8e8bd3d0bcd8 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -18,9 +18,6 @@ class CCIfAlign<string Align, CCAction A> : class CCIfBigEndian<CCAction A> : CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>; -class CCIfUnallocated<string Reg, CCAction A> : - CCIf<"!State.isAllocated(AArch64::" # Reg # ")", A>; - //===----------------------------------------------------------------------===// // ARM AAPCS64 Calling Convention //===----------------------------------------------------------------------===// @@ -45,7 +42,7 @@ def CC_AArch64_AAPCS : CallingConv<[ // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, // up to eight each of GPR and FPR. - CCIfType<[i1, i8, i16], CCIfUnallocated<"X7", CCPromoteToType<i32>>>, + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], [X0, X1, X2, X3, X4, X5, X6, X7]>>, // i128 is split to two i64s, we can't fit half to register X7. @@ -120,7 +117,7 @@ def CC_AArch64_DarwinPCS : CallingConv<[ // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, // up to eight each of GPR and FPR. 
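The `assert(0 && ...)` to `llvm_unreachable` hunks above standardize how impossible switch defaults are marked. A hedged sketch of the pattern; the enum and function here are hypothetical:

```cpp
#include "llvm/Support/ErrorHandling.h"

enum class Cond { EQ, NE };

static unsigned encode(Cond C) {
  switch (C) {
  case Cond::EQ: return 0;
  case Cond::NE: return 1;
  }
  // Unlike assert(0), llvm_unreachable documents intent, still marks the
  // path unreachable (and traps) in release builds, and silences
  // "control reaches end of non-void function" warnings.
  llvm_unreachable("unexpected condition code!");
}
```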
- CCIfType<[i1, i8, i16], CCIfUnallocated<"X7", CCPromoteToType<i32>>>, + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], [X0, X1, X2, X3, X4, X5, X6, X7]>>, // i128 is split to two i64s, we can't fit half to register X7. @@ -143,8 +140,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, // If more than will fit in registers, pass them on the stack instead. - CCIfType<[i1, i8], CCAssignToStack<1, 1>>, - CCIfType<[i16], CCAssignToStack<2, 2>>, + CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>, + CCIf<"ValVT == MVT::i16", CCAssignToStack<2, 2>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], CCAssignToStack<8, 8>>, @@ -172,12 +169,11 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ // 32bit quantity as undef. def CC_AArch64_WebKit_JS : CallingConv<[ // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0). - CCIfType<[i1, i8, i16], CCIfUnallocated<"X0", CCPromoteToType<i32>>>, + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>, CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>, // Pass the remaining arguments on the stack instead. - CCIfType<[i1, i8, i16], CCAssignToStack<4, 4>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, CCIfType<[i64, f64], CCAssignToStack<8, 8>> ]>; diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index c3b53692fb2a..2164d77b7900 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -240,21 +240,15 @@ unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) { } unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) { - // We can't handle thread-local variables quickly yet. Unfortunately we have - // to peer through any aliases to find out if that rule applies. - const GlobalValue *TLSGV = GV; - if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) - TLSGV = GA->getAliasee(); + // We can't handle thread-local variables quickly yet. + if (GV->isThreadLocal()) + return 0; // MachO still uses GOT for large code-model accesses, but ELF requires // movz/movk sequences, which FastISel doesn't handle yet. if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO()) return 0; - if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(TLSGV)) - if (GVar->isThreadLocal()) - return 0; - unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); EVT DestEVT = TLI.getValueType(GV->getType(), true); @@ -469,11 +463,18 @@ bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT, break; } - // FIXME: If this is a stack pointer and the offset needs to be simplified - // then put the alloca address into a register, set the base type back to - // register and continue. This should almost never happen. + // If this is a stack pointer and the offset needs to be simplified then put + // the alloca address into a register, set the base type back to register and + // continue. This should almost never happen.
if (needsLowering && Addr.getKind() == Address::FrameIndexBase) { - return false; + unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), + ResultReg) + .addFrameIndex(Addr.getFI()) + .addImm(0) + .addImm(0); + Addr.setKind(Address::RegBase); + Addr.setReg(ResultReg); } // Since the offset is too large for the load/store instruction get the @@ -1224,7 +1225,6 @@ bool AArch64FastISel::ProcessCallArgs( Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false); if (Arg == 0) return false; - ArgVT = DestVT; break; } case CCValAssign::AExt: @@ -1235,7 +1235,6 @@ bool AArch64FastISel::ProcessCallArgs( Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true); if (Arg == 0) return false; - ArgVT = DestVT; break; } default: @@ -1254,7 +1253,7 @@ bool AArch64FastISel::ProcessCallArgs( assert(VA.isMemLoc() && "Assuming store on stack."); // Need to store on the stack. - unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8; + unsigned ArgSize = (ArgVT.getSizeInBits() + 7) / 8; unsigned BEAlign = 0; if (ArgSize < 8 && !Subtarget->isLittleEndian()) @@ -1468,10 +1467,12 @@ bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, bool RV; unsigned ResultReg; RV = EmitLoad(VT, ResultReg, Src); - assert(RV == true && "Should be able to handle this load."); + if (!RV) + return false; + RV = EmitStore(VT, ResultReg, Dest); - assert(RV == true && "Should be able to handle this store."); - (void)RV; + if (!RV) + return false; int64_t Size = VT.getSizeInBits() / 8; Len -= Size; @@ -1749,6 +1750,17 @@ unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt) { assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?"); + + // FastISel does not have plumbing to deal with extensions where the SrcVT or + // DestVT are odd things, so test to make sure that they are both types we can + // handle (i1/i8/i16/i32 for SrcVT and i8/i16/i32/i64 for DestVT), otherwise + // bail out to SelectionDAG. + if (((DestVT != MVT::i8) && (DestVT != MVT::i16) && + (DestVT != MVT::i32) && (DestVT != MVT::i64)) || + ((SrcVT != MVT::i1) && (SrcVT != MVT::i8) && + (SrcVT != MVT::i16) && (SrcVT != MVT::i32))) + return 0; + unsigned Opc; unsigned Imm = 0; @@ -1895,6 +1907,7 @@ bool AArch64FastISel::SelectMul(const Instruction *I) { case MVT::i32: ZReg = AArch64::WZR; Opc = AArch64::MADDWrrr; + SrcVT = MVT::i32; break; case MVT::i64: ZReg = AArch64::XZR; diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index deb306a506dd..9c337172dd06 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -158,7 +158,7 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const AArch64InstrInfo *TII = TM.getInstrInfo(); + const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); DebugLoc DL = MBB.findDebugLoc(MBBI); // Add callee saved registers to move list. 
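In the ProcessCallArgs hunk above, the stack-slot size switches from the location type's size to the value type's size rounded up to whole bytes, so an i1 argument occupies one byte rather than zero. The rounding in isolation (plain C++; the widths are illustrative):

```cpp
#include <cassert>

// Round a size in bits up to whole bytes, as in the patch's
//   unsigned ArgSize = (ArgVT.getSizeInBits() + 7) / 8;
static unsigned bitsToBytes(unsigned Bits) { return (Bits + 7) / 8; }

int main() {
  assert(bitsToBytes(1) == 1);  // i1 still needs a byte on the stack
  assert(bitsToBytes(16) == 2); // i16
  assert(bitsToBytes(33) == 5); // odd widths round up
  return 0;
}
```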
@@ -204,8 +204,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock::iterator MBBI = MBB.begin(); const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); - const AArch64RegisterInfo *RegInfo = TM.getRegisterInfo(); - const AArch64InstrInfo *TII = TM.getInstrInfo(); + const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( + MF.getTarget().getRegisterInfo()); + const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 0e00d168003f..7686e6f18432 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -18,18 +18,11 @@ namespace llvm { -class AArch64Subtarget; -class AArch64TargetMachine; - class AArch64FrameLowering : public TargetFrameLowering { - const AArch64TargetMachine &TM; - public: - explicit AArch64FrameLowering(const AArch64TargetMachine &TM, - const AArch64Subtarget &STI) + explicit AArch64FrameLowering() : TargetFrameLowering(StackGrowsDown, 16, 0, 16, - false /*StackRealignable*/), - TM(TM) {} + false /*StackRealignable*/) {} void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 7007ffcce29b..3f49fabfb58e 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -153,9 +153,6 @@ public: SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node); - SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node); - SDNode *SelectBitfieldExtractOp(SDNode *N); SDNode *SelectBitfieldInsertOp(SDNode *N); @@ -596,8 +593,9 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, const GlobalValue *GV = GAN->getGlobal(); unsigned Alignment = GV->getAlignment(); const DataLayout *DL = TLI->getDataLayout(); - if (Alignment == 0 && !Subtarget->isTargetDarwin()) - Alignment = DL->getABITypeAlignment(GV->getType()->getElementType()); + Type *Ty = GV->getType()->getElementType(); + if (Alignment == 0 && Ty->isSized() && !Subtarget->isTargetDarwin()) + Alignment = DL->getABITypeAlignment(Ty); if (Alignment >= Size) return true; @@ -2111,7 +2109,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { .getVectorElementType() .getSizeInBits()) { default: - assert(0 && "Unexpected vector element type!"); + llvm_unreachable("Unexpected vector element type!"); case 64: SubReg = AArch64::dsub; break; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 80d6669cbf3d..28d0035a11ac 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -67,15 +67,15 @@ EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, //===----------------------------------------------------------------------===// // AArch64 Lowering public interface. 
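The SelectAddrModeIndexed hunk above guards the DataLayout query with `Ty->isSized()`, since asking for the ABI alignment of an unsized (e.g. opaque) type would assert. A hedged sketch of the guard, assuming LLVM's Type/DataLayout API of this era; the helper name is hypothetical:

```cpp
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

// Only consult the DataLayout when the type actually has a layout;
// otherwise keep whatever alignment was already known (possibly 0).
static unsigned abiAlignOrKnown(const llvm::DataLayout &DL, llvm::Type *Ty,
                                unsigned Known) {
  if (Known == 0 && Ty->isSized())
    return DL.getABITypeAlignment(Ty);
  return Known;
}
```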
//===----------------------------------------------------------------------===// -static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) { - if (TM.getSubtarget<AArch64Subtarget>().isTargetDarwin()) +static TargetLoweringObjectFile *createTLOF(const Triple &TT) { + if (TT.isOSBinFormatMachO()) return new AArch64_MachoTargetObjectFile(); return new AArch64_ELFTargetObjectFile(); } -AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) - : TargetLowering(TM, createTLOF(TM)) { +AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM) + : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) { Subtarget = &TM.getSubtarget<AArch64Subtarget>(); // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so @@ -627,7 +627,7 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const { unsigned AArch64TargetLowering::getMaximalGlobalOffset() const { // FIXME: On AArch64, this depends on the type. - // Basically, the addressable offsets are o to 4095 * Ty.getSizeInBytes(). + // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(). // and the offset has to be a multiple of the related size in bytes. return 4095; } @@ -823,8 +823,7 @@ AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, #ifndef NDEBUG MI->dump(); #endif - assert(0 && "Unexpected instruction for custom inserter!"); - break; + llvm_unreachable("Unexpected instruction for custom inserter!"); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); @@ -833,7 +832,6 @@ AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); } - llvm_unreachable("Unexpected instruction for custom inserter!"); } //===----------------------------------------------------------------------===// @@ -1273,7 +1271,7 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { bool ExtraOp = false; switch (Op.getOpcode()) { default: - assert(0 && "Invalid code"); + llvm_unreachable("Invalid code"); case ISD::ADDC: Opc = AArch64ISD::ADDS; break; @@ -1387,24 +1385,22 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { EVT InVT = Op.getOperand(0).getValueType(); EVT VT = Op.getValueType(); - // FP_TO_XINT conversion from the same type are legal. - if (VT.getSizeInBits() == InVT.getSizeInBits()) - return Op; - - if (InVT == MVT::v2f64 || InVT == MVT::v4f32) { + if (VT.getSizeInBits() < InVT.getSizeInBits()) { SDLoc dl(Op); SDValue Cv = DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), Op.getOperand(0)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); - } else if (InVT == MVT::v2f32) { + } + + if (VT.getSizeInBits() > InVT.getSizeInBits()) { SDLoc dl(Op); SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0)); return DAG.getNode(Op.getOpcode(), dl, VT, Ext); } // Type changing conversions are illegal. - return SDValue(); + return Op; } SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, @@ -1440,32 +1436,23 @@ static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { SDValue In = Op.getOperand(0); EVT InVT = In.getValueType(); - // v2i32 to v2f32 is legal. - if (VT == MVT::v2f32 && InVT == MVT::v2i32) - return Op; - - // This function only handles v2f64 outputs. - if (VT == MVT::v2f64) { - // Extend the input argument to a v2i64 that we can feed into the - // floating point conversion. Zero or sign extend based on whether - // we're doing a signed or unsigned float conversion. 
- unsigned Opc = - Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; - assert(Op.getNumOperands() == 1 && "FP conversions take one argument"); - SDValue Promoted = DAG.getNode(Opc, dl, MVT::v2i64, Op.getOperand(0)); - return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted); + if (VT.getSizeInBits() < InVT.getSizeInBits()) { + MVT CastVT = + MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), + InVT.getVectorNumElements()); + In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); + return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0)); } - // Scalarize v2i64 to v2f32 conversions. - std::vector<SDValue> BuildVectorOps; - for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { - SDValue Sclr = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, In, - DAG.getConstant(i, MVT::i64)); - Sclr = DAG.getNode(Op->getOpcode(), dl, MVT::f32, Sclr); - BuildVectorOps.push_back(Sclr); + if (VT.getSizeInBits() > InVT.getSizeInBits()) { + unsigned CastOpc = + Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + EVT CastVT = VT.changeVectorElementTypeToInteger(); + In = DAG.getNode(CastOpc, dl, CastVT, In); + return DAG.getNode(Op.getOpcode(), dl, VT, In); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, BuildVectorOps); + return Op; } SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, @@ -1516,7 +1503,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::Fast, RetTy, Callee, &Args, 0); + .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.first; @@ -1711,7 +1698,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments( InVals.push_back(FrameIdxN); continue; - } if (VA.isRegLoc()) { + } + + if (VA.isRegLoc()) { // Arguments stored in registers. EVT RegVT = VA.getLocVT(); @@ -1772,10 +1761,16 @@ SDValue AArch64TargetLowering::LowerFormalArguments( SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); SDValue ArgValue; + // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; + MVT MemVT = VA.getValVT(); + switch (VA.getLocInfo()) { default: break; + case CCValAssign::BCvt: + MemVT = VA.getLocVT(); + break; case CCValAssign::SExt: ExtType = ISD::SEXTLOAD; break; @@ -1787,10 +1782,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments( break; } - ArgValue = DAG.getExtLoad(ExtType, DL, VA.getValVT(), Chain, FIN, + ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, MachinePointerInfo::getFixedStack(FI), - VA.getLocVT(), - false, false, false, 0); + MemVT, false, false, false, nullptr); InVals.push_back(ArgValue); } @@ -2339,11 +2333,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already // promoted to a legal register type i32, we should truncate Arg back to // i1/i8/i16. 
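The LowerCall hunk continuing below truncates i1/i8/i16 arguments, which arrive promoted to i32, back to their value type before the stack store. On plain integers the truncation just keeps the low bits:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // An i8 argument arrives promoted to i32 ...
  uint32_t Promoted = 0xFFFFFF80u;
  // ... and is truncated back to the value type before a 1-byte store.
  uint8_t Stored = static_cast<uint8_t>(Promoted); // TRUNCATE i32 -> i8
  assert(Stored == 0x80);
  return 0;
}
```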
- if (Arg.getValueType().isSimple() && - Arg.getValueType().getSimpleVT() == MVT::i32 && - (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 || - VA.getLocVT() == MVT::i16)) - Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg); + if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || + VA.getValVT() == MVT::i16) + Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); @@ -4116,6 +4108,7 @@ static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { // shuffle in combination with VEXTs. SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); @@ -4164,35 +4157,47 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; int VEXTOffsets[2] = { 0, 0 }; + int OffsetMultipliers[2] = { 1, 1 }; // This loop extracts the usage patterns of the source vectors // and prepares appropriate SDValues for a shuffle if possible. for (unsigned i = 0; i < SourceVecs.size(); ++i) { - if (SourceVecs[i].getValueType() == VT) { + unsigned NumSrcElts = SourceVecs[i].getValueType().getVectorNumElements(); + SDValue CurSource = SourceVecs[i]; + if (SourceVecs[i].getValueType().getVectorElementType() != + VT.getVectorElementType()) { + // It may hit this case if SourceVecs[i] is AssertSext/AssertZext. + // Then bitcast it to the vector which holds asserted element type, + // and record the multiplier of element width between SourceVecs and + // Build_vector which is needed to extract the correct lanes later. + EVT CastVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + SourceVecs[i].getValueSizeInBits() / + VT.getVectorElementType().getSizeInBits()); + + CurSource = DAG.getNode(ISD::BITCAST, dl, CastVT, SourceVecs[i]); + OffsetMultipliers[i] = CastVT.getVectorNumElements() / NumSrcElts; + NumSrcElts *= OffsetMultipliers[i]; + MaxElts[i] *= OffsetMultipliers[i]; + MinElts[i] *= OffsetMultipliers[i]; + } + + if (CurSource.getValueType() == VT) { // No VEXT necessary - ShuffleSrcs[i] = SourceVecs[i]; + ShuffleSrcs[i] = CurSource; VEXTOffsets[i] = 0; continue; - } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { + } else if (NumSrcElts < NumElts) { // We can pad out the smaller vector for free, so if it's part of a // shuffle... - ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, SourceVecs[i], - DAG.getUNDEF(SourceVecs[i].getValueType())); + ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, CurSource, + DAG.getUNDEF(CurSource.getValueType())); continue; } - // Don't attempt to extract subvectors from BUILD_VECTOR sources - // that expand or trunc the original value. - // TODO: We can try to bitcast and ANY_EXTEND the result but - // we need to consider the cost of vector ANY_EXTEND, and the - // legality of all the types. - if (SourceVecs[i].getValueType().getVectorElementType() != - VT.getVectorElementType()) - return SDValue(); - // Since only 64-bit and 128-bit vectors are legal on ARM and // we've eliminated the other cases... 
- assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts && + assert(NumSrcElts == 2 * NumElts && "unexpected vector sizes in ReconstructShuffle"); if (MaxElts[i] - MinElts[i] >= NumElts) { @@ -4203,22 +4208,20 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, if (MinElts[i] >= NumElts) { // The extraction can just take the second half VEXTOffsets[i] = NumElts; - ShuffleSrcs[i] = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], - DAG.getIntPtrConstant(NumElts)); + ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, + DAG.getIntPtrConstant(NumElts)); } else if (MaxElts[i] < NumElts) { // The extraction can just take the first half VEXTOffsets[i] = 0; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], DAG.getIntPtrConstant(0)); + ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, + DAG.getIntPtrConstant(0)); } else { // An actual VEXT is needed VEXTOffsets[i] = MinElts[i]; - SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], DAG.getIntPtrConstant(0)); - SDValue VEXTSrc2 = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], - DAG.getIntPtrConstant(NumElts)); + SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, + DAG.getIntPtrConstant(0)); + SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, + DAG.getIntPtrConstant(NumElts)); unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1); ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2, DAG.getConstant(Imm, MVT::i32)); @@ -4238,9 +4241,10 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue(); if (ExtractVec == SourceVecs[0]) { - Mask.push_back(ExtractElt - VEXTOffsets[0]); + Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]); } else { - Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); + Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts - + VEXTOffsets[1]); } } @@ -5177,11 +5181,37 @@ FailedModImm: return Op; } +// Normalize the operands of BUILD_VECTOR. The value of constant operands will +// be truncated to fit element width. 
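`NormalizeBuildVector`, defined just below, truncates constant BUILD_VECTOR operands so they fit the element width before the immediate-matching code inspects them. The masking step on a plain integer, for illustration:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Cnst = 0x1FF;   // constant lane value, wider than the element
  unsigned EltBits = 8;    // v16i8-style element width
  uint64_t Low = Cnst & ((1ull << EltBits) - 1); // keep EltBits low bits
  assert(Low == 0xFF);
  return 0;
}
```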
+static SDValue NormalizeBuildVector(SDValue Op, + SelectionDAG &DAG) { + assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT EltTy= VT.getVectorElementType(); + + if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) + return Op; + + SmallVector<SDValue, 16> Ops; + for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) { + SDValue Lane = Op.getOperand(I); + if (Lane.getOpcode() == ISD::Constant) { + APInt LowBits(EltTy.getSizeInBits(), + cast<ConstantSDNode>(Lane)->getZExtValue()); + Lane = DAG.getConstant(LowBits.getZExtValue(), MVT::i32); + } + Ops.push_back(Lane); + } + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); +} + SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { - BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); SDLoc dl(Op); EVT VT = Op.getValueType(); + Op = NormalizeBuildVector(Op, DAG); + BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); APInt CnstBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); @@ -6047,18 +6077,14 @@ bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); - if (NumBits1 <= NumBits2) - return false; - return true; + return NumBits1 > NumBits2; } bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { - if (!VT1.isInteger() || !VT2.isInteger()) + if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); - if (NumBits1 <= NumBits2) - return false; - return true; + return NumBits1 > NumBits2; } // All 32-bit GPR operations implicitly zero the high-half of the corresponding @@ -6068,18 +6094,14 @@ bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); - if (NumBits1 == 32 && NumBits2 == 64) - return true; - return false; + return NumBits1 == 32 && NumBits2 == 64; } bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { - if (!VT1.isInteger() || !VT2.isInteger()) + if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); - if (NumBits1 == 32 && NumBits2 == 64) - return true; - return false; + return NumBits1 == 32 && NumBits2 == 64; } bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { @@ -6092,8 +6114,9 @@ bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. - return (VT1.isSimple() && VT1.isInteger() && VT2.isSimple() && - VT2.isInteger() && VT1.getSizeInBits() <= 32); + return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && + VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && + VT1.getSizeInBits() <= 32); } bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType, @@ -6346,23 +6369,45 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { APInt Value = C->getAPIntValue(); EVT VT = N->getValueType(0); - APInt VP1 = Value + 1; - if (VP1.isPowerOf2()) { - // Multiplying by one less than a power of two, replace with a shift - // and a subtract. 
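The performMulCombine rewrite that continues below extends the shift-based multiply lowering to negative constants. The four identities it relies on, checked on plain integers:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  int64_t x = 37;
  assert(x * 9  == (x << 3) + x);    // (mul x, 2^N + 1)  -> add(shl, x)
  assert(x * 7  == (x << 3) - x);    // (mul x, 2^N - 1)  -> sub(shl, x)
  assert(x * -9 == -((x << 3) + x)); // (mul x, -(2^N+1)) -> neg(add)
  assert(x * -7 == x - (x << 3));    // (mul x, -(2^N-1)) -> sub(x, shl)
  return 0;
}
```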
- SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VP1.logBase2(), MVT::i64)); - return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); - } - APInt VM1 = Value - 1; - if (VM1.isPowerOf2()) { - // Multiplying by one more than a power of two, replace with a shift - // and an add. - SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VM1.logBase2(), MVT::i64)); - return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); + if (Value.isNonNegative()) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + APInt VM1 = Value - 1; + if (VM1.isPowerOf2()) { + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VM1.logBase2(), MVT::i64)); + return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, + N->getOperand(0)); + } + // (mul x, 2^N - 1) => (sub (shl x, N), x) + APInt VP1 = Value + 1; + if (VP1.isPowerOf2()) { + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VP1.logBase2(), MVT::i64)); + return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, + N->getOperand(0)); + } + } else { + // (mul x, -(2^N + 1)) => - (add (shl x, N), x) + APInt VNM1 = -Value - 1; + if (VNM1.isPowerOf2()) { + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VNM1.logBase2(), MVT::i64)); + SDValue Add = + DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); + return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add); + } + // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) + APInt VNP1 = -Value + 1; + if (VNP1.isPowerOf2()) { + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VNP1.logBase2(), MVT::i64)); + return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0), + ShiftedVal); + } } } return SDValue(); @@ -6687,7 +6732,7 @@ static SDValue tryCombineFixedPointConvert(SDNode *N, else if (Vec.getValueType() == MVT::v2i64) VecResTy = MVT::v2f64; else - assert(0 && "unexpected vector type!"); + llvm_unreachable("unexpected vector type!"); SDValue Convert = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); @@ -7020,7 +7065,7 @@ static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), DAG.getConstant(-ShiftAmount, MVT::i32)); - else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount <= ElemBits) + else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), DAG.getConstant(ShiftAmount, MVT::i32)); @@ -7867,6 +7912,18 @@ bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { return Inst->getType()->getPrimitiveSizeInBits() <= 128; } +TargetLoweringBase::LegalizeTypeAction +AArch64TargetLowering::getPreferredVectorAction(EVT VT) const { + MVT SVT = VT.getSimpleVT(); + // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, + // v4i16, v2i32 instead of to promote. 
+ if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32 + || SVT == MVT::v1f32) + return TypeWidenVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index de16c4d9d4b8..cb0b9ef261dc 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -197,7 +197,7 @@ class AArch64TargetLowering : public TargetLowering { bool RequireStrictAlign; public: - explicit AArch64TargetLowering(AArch64TargetMachine &TM); + explicit AArch64TargetLowering(TargetMachine &TM); /// Selects the correct CCAssignFn for a the given CallingConvention /// value. @@ -324,6 +324,9 @@ public: bool shouldExpandAtomicInIR(Instruction *Inst) const override; + TargetLoweringBase::LegalizeTypeAction + getPreferredVectorAction(EVT VT) const override; + private: /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index d455d7e45e06..5007172f1539 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -448,13 +448,19 @@ def logical_imm64_XFORM : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(enc, MVT::i32); }]>; -def LogicalImm32Operand : AsmOperandClass { - let Name = "LogicalImm32"; - let DiagnosticType = "LogicalSecondSource"; -} -def LogicalImm64Operand : AsmOperandClass { - let Name = "LogicalImm64"; - let DiagnosticType = "LogicalSecondSource"; +let DiagnosticType = "LogicalSecondSource" in { + def LogicalImm32Operand : AsmOperandClass { + let Name = "LogicalImm32"; + } + def LogicalImm64Operand : AsmOperandClass { + let Name = "LogicalImm64"; + } + def LogicalImm32NotOperand : AsmOperandClass { + let Name = "LogicalImm32Not"; + } + def LogicalImm64NotOperand : AsmOperandClass { + let Name = "LogicalImm64Not"; + } } def logical_imm32 : Operand<i32>, PatLeaf<(imm), [{ return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 32); @@ -468,6 +474,12 @@ def logical_imm64 : Operand<i64>, PatLeaf<(imm), [{ let PrintMethod = "printLogicalImm64"; let ParserMatchClass = LogicalImm64Operand; } +def logical_imm32_not : Operand<i32> { + let ParserMatchClass = LogicalImm32NotOperand; +} +def logical_imm64_not : Operand<i64> { + let ParserMatchClass = LogicalImm64NotOperand; +} // imm0_65535 predicate - True if the immediate is in the range [0,65535]. def Imm0_65535Operand : AsmImmRange<0, 65535>; @@ -963,8 +975,14 @@ def ccode : Operand<i32> { let ParserMatchClass = CondCode; } def inv_ccode : Operand<i32> { + // AL and NV are invalid in the aliases which use inv_ccode let PrintMethod = "printInverseCondCode"; let ParserMatchClass = CondCode; + let MCOperandPredicate = [{ + return MCOp.isImm() && + MCOp.getImm() != AArch64CC::AL && + MCOp.getImm() != AArch64CC::NV; + }]; } // Conditional branch target. 19-bit immediate. 
The low two bits of the target @@ -1323,13 +1341,13 @@ class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype, multiclass MulAccum<bit isSub, string asm, SDNode AccNode> { def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm, [(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))]>, - Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> { + Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> { let Inst{31} = 0; } def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm, [(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))]>, - Sched<[WriteIM64, ReadIMA, ReadIM, ReadIM]> { + Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> { let Inst{31} = 1; } } @@ -1339,7 +1357,7 @@ class WideMulAccum<bit isSub, bits<3> opc, string asm, : BaseMulAccum<isSub, opc, GPR32, GPR64, asm, [(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul (ExtNode GPR32:$Rn), (ExtNode GPR32:$Rm))))]>, - Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> { + Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> { let Inst{31} = 1; } @@ -1738,6 +1756,10 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp> { WZR, GPR32:$src1, GPR32:$src2, 0), 5>; def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrs") XZR, GPR64:$src1, GPR64:$src2, 0), 5>; + def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrx") + WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>; + def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrx64") + XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>; // Register/register aliases with no shift when SP is not used. def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"), @@ -1925,22 +1947,32 @@ class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype> : InstAlias<asm#" $dst, $src1, $src2", (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>; -let AddedComplexity = 6 in -multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode> { +multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode, + string Alias> { + let AddedComplexity = 6 in def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic, [(set GPR32sp:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> { let Inst{31} = 0; let Inst{22} = 0; // 64-bit version has an additional bit of immediate. 
} + let AddedComplexity = 6 in def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic, [(set GPR64sp:$Rd, (OpNode GPR64:$Rn, logical_imm64:$imm))]> { let Inst{31} = 1; } + + def : InstAlias<Alias # " $Rd, $Rn, $imm", + (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn, + logical_imm32_not:$imm), 0>; + def : InstAlias<Alias # " $Rd, $Rn, $imm", + (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn, + logical_imm64_not:$imm), 0>; } -multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode> { +multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode, + string Alias> { let isCompare = 1, Defs = [NZCV] in { def Wri : BaseLogicalImm<opc, GPR32, GPR32, logical_imm32, mnemonic, [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> { @@ -1952,6 +1984,13 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode> { let Inst{31} = 1; } } // end Defs = [NZCV] + + def : InstAlias<Alias # " $Rd, $Rn, $imm", + (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn, + logical_imm32_not:$imm), 0>; + def : InstAlias<Alias # " $Rd, $Rn, $imm", + (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn, + logical_imm64_not:$imm), 0>; } class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode> diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index ff115c0bd5ed..ce85b2ceba95 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -35,8 +35,14 @@ AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { - const MCInstrDesc &Desc = MI->getDesc(); + const MachineBasicBlock &MBB = *MI->getParent(); + const MachineFunction *MF = MBB.getParent(); + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); + + if (MI->getOpcode() == AArch64::INLINEASM) + return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI); + const MCInstrDesc &Desc = MI->getDesc(); switch (Desc.getOpcode()) { default: // Anything not explicitly designated otherwise is a normal 4-byte insn.
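The LogicalImm(S) alias hunks above teach the assembler to accept `bic`/`orn`/`eon` (and `bics`) with immediates by encoding the bitwise NOT of the immediate into the underlying `and`/`orr`/`eor`. The algebra behind the aliases, spelled out on plain integers:

```cpp
#include <cassert>
#include <cstdint>

// Architectural semantics of the aliased mnemonics (register form).
static uint64_t bic(uint64_t n, uint64_t m) { return n & ~m; }
static uint64_t orn(uint64_t n, uint64_t m) { return n | ~m; }
static uint64_t eon(uint64_t n, uint64_t m) { return n ^ ~m; }

int main() {
  uint64_t x = 0x1234567890ABCDEFull, imm = 0xFF00FF00FF00FF00ull;
  // What the alias emits: the base instruction with the inverted imm.
  assert(bic(x, imm) == (x & ~imm)); // bic Rd, Rn, #imm == and Rd, Rn, #~imm
  assert(orn(x, imm) == (x | ~imm)); // orn Rd, Rn, #imm == orr Rd, Rn, #~imm
  assert(eon(x, imm) == (x ^ ~imm)); // eon Rd, Rn, #imm == eor Rd, Rn, #~imm
  return 0;
}
```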
@@ -1224,7 +1230,7 @@ void AArch64InstrInfo::copyPhysRegTuple( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef<unsigned> Indices) const { - assert(getSubTarget().hasNEON() && + assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); const TargetRegisterInfo *TRI = &getRegisterInfo(); uint16_t DestEncoding = TRI->getEncodingValue(DestReg); @@ -1385,7 +1391,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AArch64::FPR128RegClass.contains(DestReg) && AArch64::FPR128RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { + if(Subtarget.hasNEON()) { BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) .addReg(SrcReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -1406,7 +1412,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AArch64::FPR64RegClass.contains(DestReg) && AArch64::FPR64RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { + if(Subtarget.hasNEON()) { DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub, &AArch64::FPR128RegClass); SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub, @@ -1423,7 +1429,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AArch64::FPR32RegClass.contains(DestReg) && AArch64::FPR32RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { + if(Subtarget.hasNEON()) { DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub, &AArch64::FPR128RegClass); SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub, @@ -1440,7 +1446,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AArch64::FPR16RegClass.contains(DestReg) && AArch64::FPR16RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { + if(Subtarget.hasNEON()) { DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR128RegClass); SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, @@ -1461,7 +1467,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AArch64::FPR8RegClass.contains(DestReg) && AArch64::FPR8RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { + if(Subtarget.hasNEON()) { DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR128RegClass); SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, @@ -1577,39 +1583,39 @@ void AArch64InstrInfo::storeRegToStackSlot( if (AArch64::FPR128RegClass.hasSubClassEq(RC)) Opc = AArch64::STRQui; else if (AArch64::DDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && + assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Twov1d, Offset = false; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && + assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Threev1d, Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && + assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Fourv1d, Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && + assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Twov2d, Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && + assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Threev2d, Offset = false; } break; case 64: if 
(AArch64::QQQQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && + assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Fourv2d, Offset = false; } @@ -1675,39 +1681,39 @@ void AArch64InstrInfo::loadRegFromStackSlot( if (AArch64::FPR128RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRQui; else if (AArch64::DDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && + assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Twov1d, Offset = false; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && + assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Threev1d, Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && + assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Fourv1d, Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && + assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Twov2d, Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && + assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Threev2d, Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && + assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Fourv2d, Offset = false; } @@ -1726,7 +1732,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( void llvm::emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset, - const AArch64InstrInfo *TII, + const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool SetNZCV) { if (DestReg == SrcReg && Offset == 0) return; @@ -1835,7 +1841,7 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, *OutUnscaledOp = 0; switch (MI.getOpcode()) { default: - assert(0 && "unhandled opcode in rewriteAArch64FrameIndex"); + llvm_unreachable("unhandled opcode in rewriteAArch64FrameIndex"); // Vector spills/fills can't take an immediate offset. case AArch64::LD1Twov2d: case AArch64::LD1Threev2d: diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 90ce75f26d42..f70b82b7c2da 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -44,8 +44,6 @@ public: /// always be able to get register info as well (through this method). const AArch64RegisterInfo &getRegisterInfo() const { return RI; } - const AArch64Subtarget &getSubTarget() const { return Subtarget; } - unsigned GetInstSizeInBytes(const MachineInstr *MI) const; bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, @@ -168,7 +166,7 @@ private: /// if necessary, to be replaced by the scavenger at the end of PEI. 
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset, - const AArch64InstrInfo *TII, + const TargetInstrInfo *TII, MachineInstr::MIFlag = MachineInstr::NoFlags, bool SetNZCV = false); diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 9ad36e8740db..1211fba60c25 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -323,7 +323,7 @@ def : Pat<(AArch64LOADgot tconstpool:$addr), // System instructions. //===----------------------------------------------------------------------===// -def HINT : HintI<"hint">; +def HINT : HintI<"hint">; def : InstAlias<"nop", (HINT 0b000)>; def : InstAlias<"yield",(HINT 0b001)>; def : InstAlias<"wfe", (HINT 0b010)>; @@ -671,10 +671,10 @@ def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">; //===----------------------------------------------------------------------===// // (immediate) -defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag>; -defm AND : LogicalImm<0b00, "and", and>; -defm EOR : LogicalImm<0b10, "eor", xor>; -defm ORR : LogicalImm<0b01, "orr", or>; +defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag, "bics">; +defm AND : LogicalImm<0b00, "and", and, "bic">; +defm EOR : LogicalImm<0b10, "eor", xor, "eon">; +defm ORR : LogicalImm<0b01, "orr", or, "orn">; // FIXME: these aliases *are* canonical sometimes (when movz can't be // used). Actually, it seems to be working right now, but putting logical_immXX @@ -737,6 +737,10 @@ def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; defm CLS : OneOperandData<0b101, "cls">; defm CLZ : OneOperandData<0b100, "clz", ctlz>; defm RBIT : OneOperandData<0b000, "rbit">; + +def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>; +def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>; + def REV16Wr : OneWRegData<0b001, "rev16", UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>; def REV16Xr : OneXRegData<0b001, "rev16", null_frag>; @@ -2238,6 +2242,81 @@ def : Pat<(f32_to_f16 FPR32:$Rn), def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn), [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>; +// When converting from f16 coming directly from a load, make sure we +// load into the FPR16 registers rather than going through the GPRs. 
+// f16->f32 +def : Pat<(f32 (f16_to_f32 (i32 + (zextloadi16 (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend))))), + (FCVTSHr (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend))>; +def : Pat<(f32 (f16_to_f32 (i32 + (zextloadi16 (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend))))), + (FCVTSHr (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend))>; +def : Pat <(f32 (f16_to_f32 (i32 + (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (FCVTSHr (LDRHui GPR64sp:$Rn, uimm12s2:$offset))>; +def : Pat <(f32 (f16_to_f32 (i32 + (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), + (FCVTSHr (LDURHi GPR64sp:$Rn, simm9:$offset))>; + +// f16->f64 +def : Pat<(f64 (fextend (f32 (f16_to_f32 (i32 + (zextloadi16 (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend))))))), + (FCVTDHr (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend))>; +def : Pat<(f64 (fextend (f32 (f16_to_f32 (i32 + (zextloadi16 (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend))))))), + (FCVTDHr (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend))>; +def : Pat <(f64 (fextend (f32 (f16_to_f32 (i32 + (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))))), + (FCVTDHr (LDRHui GPR64sp:$Rn, uimm12s2:$offset))>; +def : Pat <(f64 (fextend (f32 (f16_to_f32 (i32 + (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))))), + (FCVTDHr (LDURHi GPR64sp:$Rn, simm9:$offset))>; + +// When converting to f16 going directly to a store, make sure we use the +// appropriate direct conversion instructions and store via the FPR16 +// registers rather than going through the GPRs. +let AddedComplexity = 10 in { +// f32->f16 +def : Pat< (truncstorei16 (assertzext (i32 (f32_to_f16 FPR32:$Rt))), + (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend)), + (STRHroW (FCVTHSr FPR32:$Rt), GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend)>; +def : Pat< (truncstorei16 (assertzext (i32 (f32_to_f16 FPR32:$Rt))), + (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend)), + (STRHroX (FCVTHSr FPR32:$Rt), GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend)>; +def : Pat <(truncstorei16 (assertzext (i32 (f32_to_f16 FPR32:$Rt))), + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), + (STRHui (FCVTHSr FPR32:$Rt), GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat <(truncstorei16 (assertzext (i32 (f32_to_f16 FPR32:$Rt))), + (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), + (STURHi (FCVTHSr FPR32:$Rt), GPR64sp:$Rn, simm9:$offset)>; +// f64->f16 +def : Pat< (truncstorei16 (assertzext (i32 (f32_to_f16 (f32 (fround FPR64:$Rt))))), + (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend)), + (STRHroW (FCVTHDr FPR64:$Rt), GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend)>; +def : Pat< (truncstorei16 (assertzext (i32 (f32_to_f16 (f32 (fround FPR64:$Rt))))), + (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend)), + (STRHroX (FCVTHDr FPR64:$Rt), GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend)>; +def : Pat <(truncstorei16 (assertzext (i32 (f32_to_f16 (f32 (fround FPR64:$Rt))))), + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), + (STRHui (FCVTHDr FPR64:$Rt), GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat <(truncstorei16 (assertzext (i32 (f32_to_f16 (f32 (fround FPR64:$Rt))))), + (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), + (STURHi (FCVTHDr FPR64:$Rt), GPR64sp:$Rn, simm9:$offset)>; +} + + //===----------------------------------------------------------------------===// // Floating point single operand instructions. 
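Among the earlier InstrInfo.td additions are patterns selecting the `rbit` instruction for the `int_aarch64_rbit` intrinsic. A portable reference bit-reversal for comparison (the hardware performs this in a single instruction):

```cpp
#include <cassert>
#include <cstdint>

static uint32_t rbit32(uint32_t x) {
  uint32_t r = 0;
  for (int i = 0; i < 32; ++i)
    r = (r << 1) | ((x >> i) & 1); // bit i of x becomes bit 31 - i of r
  return r;
}

int main() {
  assert(rbit32(0x00000001u) == 0x80000000u);
  assert(rbit32(rbit32(0xDEADBEEFu)) == 0xDEADBEEFu); // involution
  return 0;
}
```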
//===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index e7454be125bc..3df9c4f23aa1 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -40,14 +40,13 @@ STATISTIC(NumPreFolded, "Number of pre-index updates folded"); STATISTIC(NumUnscaledPairCreated, "Number of load/store from unscaled generated"); -static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", cl::init(20), - cl::Hidden); +static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", + cl::init(20), cl::Hidden); // Place holder while testing unscaled load/store combining -static cl::opt<bool> -EnableAArch64UnscaledMemOp("aarch64-unscaled-mem-op", cl::Hidden, - cl::desc("Allow AArch64 unscaled load/store combining"), - cl::init(true)); +static cl::opt<bool> EnableAArch64UnscaledMemOp( + "aarch64-unscaled-mem-op", cl::Hidden, + cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true)); namespace { struct AArch64LoadStoreOpt : public MachineFunctionPass { @@ -60,19 +59,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. // Return the matching instruction if one is found, else MBB->end(). - // If a matching instruction is found, mergeForward is set to true if the + // If a matching instruction is found, MergeForward is set to true if the // merge is to remove the first instruction and replace the second with // a pair-wise insn, and false if the reverse is true. MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, - bool &mergeForward, + bool &MergeForward, unsigned Limit); // Merge the two instructions indicated into a single pair-wise instruction. - // If mergeForward is true, erase the first instruction and fold its + // If MergeForward is true, erase the first instruction and fold its // operation into the second. If false, the reverse. Return the instruction // following the first instruction (which may change during processing). 
MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, bool mergeForward); + MachineBasicBlock::iterator Paired, bool MergeForward); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using @@ -142,7 +141,7 @@ static bool isUnscaledLdst(unsigned Opc) { int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { switch (MemMI->getOpcode()) { default: - llvm_unreachable("Opcode has has unknown size!"); + llvm_unreachable("Opcode has unknown size!"); case AArch64::STRSui: case AArch64::STURSi: return 4; @@ -217,16 +216,26 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { switch (Opc) { default: llvm_unreachable("Opcode has no pre-indexed equivalent!"); - case AArch64::STRSui: return AArch64::STRSpre; - case AArch64::STRDui: return AArch64::STRDpre; - case AArch64::STRQui: return AArch64::STRQpre; - case AArch64::STRWui: return AArch64::STRWpre; - case AArch64::STRXui: return AArch64::STRXpre; - case AArch64::LDRSui: return AArch64::LDRSpre; - case AArch64::LDRDui: return AArch64::LDRDpre; - case AArch64::LDRQui: return AArch64::LDRQpre; - case AArch64::LDRWui: return AArch64::LDRWpre; - case AArch64::LDRXui: return AArch64::LDRXpre; + case AArch64::STRSui: + return AArch64::STRSpre; + case AArch64::STRDui: + return AArch64::STRDpre; + case AArch64::STRQui: + return AArch64::STRQpre; + case AArch64::STRWui: + return AArch64::STRWpre; + case AArch64::STRXui: + return AArch64::STRXpre; + case AArch64::LDRSui: + return AArch64::LDRSpre; + case AArch64::LDRDui: + return AArch64::LDRDpre; + case AArch64::LDRQui: + return AArch64::LDRQpre; + case AArch64::LDRWui: + return AArch64::LDRWpre; + case AArch64::LDRXui: + return AArch64::LDRXpre; } } @@ -260,7 +269,7 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - bool mergeForward) { + bool MergeForward) { MachineBasicBlock::iterator NextI = I; ++NextI; // If NextI is the second of the two instructions to be merged, we need @@ -276,12 +285,12 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, unsigned NewOpc = getMatchingPairOpcode(I->getOpcode()); // Insert our new paired instruction after whichever of the paired - // instructions mergeForward indicates. - MachineBasicBlock::iterator InsertionPoint = mergeForward ? Paired : I; - // Also based on mergeForward is from where we copy the base register operand + // instructions MergeForward indicates. + MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; + // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. MachineOperand &BaseRegOp = - mergeForward ? Paired->getOperand(1) : I->getOperand(1); + MergeForward ? Paired->getOperand(1) : I->getOperand(1); // Which register is Rt and which is Rt2 depends on the offset order. MachineInstr *RtMI, *Rt2MI; @@ -355,8 +364,8 @@ static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { if (IsUnscaled) { // Convert the byte-offset used by unscaled into an "element" offset used // by the scaled pair load/store instructions. 
- int elemOffset = Offset / OffsetStride; - if (elemOffset > 63 || elemOffset < -64) + int ElemOffset = Offset / OffsetStride; + if (ElemOffset > 63 || ElemOffset < -64) return false; } return true; @@ -374,14 +383,14 @@ static int alignTo(int Num, int PowOf2) { /// be combined with the current instruction into a load/store pair. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - bool &mergeForward, unsigned Limit) { + bool &MergeForward, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; MachineInstr *FirstMI = I; ++MBBI; int Opc = FirstMI->getOpcode(); - bool mayLoad = FirstMI->mayLoad(); + bool MayLoad = FirstMI->mayLoad(); bool IsUnscaled = isUnscaledLdst(Opc); unsigned Reg = FirstMI->getOperand(0).getReg(); unsigned BaseReg = FirstMI->getOperand(1).getReg(); @@ -453,7 +462,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // If the destination register of the loads is the same register, bail // and keep looking. A load-pair instruction with both destination // registers the same is UNPREDICTABLE and will result in an exception. - if (mayLoad && Reg == MI->getOperand(0).getReg()) { + if (MayLoad && Reg == MI->getOperand(0).getReg()) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); continue; } @@ -462,7 +471,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // the two instructions, we can combine the second into the first. if (!ModifiedRegs[MI->getOperand(0).getReg()] && !UsedRegs[MI->getOperand(0).getReg()]) { - mergeForward = false; + MergeForward = false; return MBBI; } @@ -471,7 +480,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // second. if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] && !UsedRegs[FirstMI->getOperand(0).getReg()]) { - mergeForward = true; + MergeForward = true; return MBBI; } // Unable to combine these instructions due to interference in between. @@ -798,14 +807,14 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { break; } // Look ahead up to ScanLimit instructions for a pairable instruction. - bool mergeForward = false; + bool MergeForward = false; MachineBasicBlock::iterator Paired = - findMatchingInsn(MBBI, mergeForward, ScanLimit); + findMatchingInsn(MBBI, MergeForward, ScanLimit); if (Paired != E) { // Merge the loads into a pair. Keeping the iterator straight is a // pain, so we let the merge routine tell us what the next instruction // is after it's done mucking about. 
- MBBI = mergePairedInsns(MBBI, Paired, mergeForward); + MBBI = mergePairedInsns(MBBI, Paired, MergeForward); Modified = true; ++NumPairCreated; diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index ab6d37532a70..75a17b9dc999 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -51,7 +51,7 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO, AArch64II::MO_PAGEOFF) RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF; else - assert(0 && "Unexpected target flags with MO_GOT on GV operand"); + llvm_unreachable("Unexpected target flags with MO_GOT on GV operand"); } else if ((MO.getTargetFlags() & AArch64II::MO_TLS) != 0) { if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) RefKind = MCSymbolRefExpr::VK_TLVPPAGE; @@ -154,7 +154,7 @@ bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { switch (MO.getType()) { default: - assert(0 && "unknown operand type"); + llvm_unreachable("unknown operand type"); case MachineOperand::MO_Register: // Ignore all implicit register operands. if (MO.isImplicit()) diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td index 21c927f2385b..a30e4ad0b5d8 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -175,7 +175,7 @@ def GPR64all : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR, SP)>; // This is for indirect tail calls to store the address of the destination. def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X21, X22, X23, X24, X25, X26, - X27, X28)>; + X27, X28, FP, LR)>; // GPR register classes for post increment amount of vector load/store that // has alternate printing when Rm=31 and prints a constant immediate value diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td index 0c3949ecfc17..d709bee7b9eb 100644 --- a/lib/Target/AArch64/AArch64SchedA53.td +++ b/lib/Target/AArch64/AArch64SchedA53.td @@ -148,9 +148,9 @@ def : ReadAdvance<ReadVLD, 0>; // ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable // operands are needed one cycle later if and only if they are to be -// shifted. Otherwise, they too are needed two cycle later. This same +// shifted. Otherwise, they too are needed two cycles later. This same // ReadAdvance applies to Extended registers as well, even though there is -// a seperate SchedPredicate for them. +// a separate SchedPredicate for them. def : ReadAdvance<ReadI, 2, [WriteImm,WriteI, WriteISReg, WriteIEReg,WriteIS, WriteID32,WriteID64, diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td new file mode 100644 index 000000000000..8209f965cfd5 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedA57.td @@ -0,0 +1,304 @@ +//=- AArch64SchedA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for ARM Cortex-A57 to support +// instruction scheduling and other instruction cost heuristics. 
+// +//===----------------------------------------------------------------------===// + +def CortexA57Model : SchedMachineModel { + let IssueWidth = 8; // 3-way decode and 8-way issue + let MicroOpBufferSize = 128; // 128 micro-op re-order buffer + let LoadLatency = 4; // Optimistic load latency + let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Cortex-A57. +// Cortex-A57 has 8 pipelines, each with its own 8-entry queue where +// micro-ops wait for their operands and then issue out-of-order. + +def A57UnitB : ProcResource<1> { let BufferSize = 8; } // Type B micro-ops +def A57UnitI : ProcResource<2> { let BufferSize = 8; } // Type I micro-ops +def A57UnitM : ProcResource<1> { let BufferSize = 8; } // Type M micro-ops +def A57UnitL : ProcResource<1> { let BufferSize = 8; } // Type L micro-ops +def A57UnitS : ProcResource<1> { let BufferSize = 8; } // Type S micro-ops +def A57UnitX : ProcResource<1> { let BufferSize = 8; } // Type X micro-ops +def A57UnitW : ProcResource<1> { let BufferSize = 8; } // Type W micro-ops +let SchedModel = CortexA57Model in { + def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>; // Type V micro-ops +} + + +let SchedModel = CortexA57Model in { + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Cortex-A57. + +include "AArch64SchedA57WriteRes.td" + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latency for +// Cortex-A57. The Cortex-A57 types are directly associated with resources, so +// defining the aliases precludes the need for mapping them using WriteRes. The +// aliases are sufficient for creating a coarse, working model. As the model +// evolves, InstRWs will be used to override these SchedAliases.
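+// +// As an example of reading these mappings: the alias +// def : SchedAlias<WriteIM32, A57Write_3cyc_1M>; +// below models the generic 32-bit multiply class as a single micro-op on the +// M pipe with a latency of 3 cycles, while a 1V bundle may issue its micro-op +// to either the X or the W unit via the A57UnitV ProcResGroup defined above.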
+ +def : SchedAlias<WriteImm, A57Write_1cyc_1I>; +def : SchedAlias<WriteI, A57Write_1cyc_1I>; +def : SchedAlias<WriteISReg, A57Write_2cyc_1M>; +def : SchedAlias<WriteIEReg, A57Write_2cyc_1M>; +def : SchedAlias<WriteExtr, A57Write_1cyc_1I>; +def : SchedAlias<WriteIS, A57Write_1cyc_1I>; +def : SchedAlias<WriteID32, A57Write_19cyc_1M>; +def : SchedAlias<WriteID64, A57Write_35cyc_1M>; +def : SchedAlias<WriteIM32, A57Write_3cyc_1M>; +def : SchedAlias<WriteIM64, A57Write_5cyc_1M>; +def : SchedAlias<WriteBr, A57Write_1cyc_1B>; +def : SchedAlias<WriteBrReg, A57Write_1cyc_1B>; +def : SchedAlias<WriteLD, A57Write_4cyc_1L>; +def : SchedAlias<WriteST, A57Write_1cyc_1S>; +def : SchedAlias<WriteSTP, A57Write_1cyc_1S>; +def : SchedAlias<WriteAdr, A57Write_1cyc_1I>; +def : SchedAlias<WriteLDIdx, A57Write_4cyc_1I_1L>; +def : SchedAlias<WriteSTIdx, A57Write_1cyc_1I_1S>; +def : SchedAlias<WriteF, A57Write_3cyc_1V>; +def : SchedAlias<WriteFCmp, A57Write_3cyc_1V>; +def : SchedAlias<WriteFCvt, A57Write_5cyc_1V>; +def : SchedAlias<WriteFCopy, A57Write_3cyc_1V>; +def : SchedAlias<WriteFImm, A57Write_3cyc_1V>; +def : SchedAlias<WriteFMul, A57Write_5cyc_1V>; +def : SchedAlias<WriteFDiv, A57Write_18cyc_1X>; +def : SchedAlias<WriteV, A57Write_3cyc_1V>; +def : SchedAlias<WriteVLD, A57Write_5cyc_1L>; +def : SchedAlias<WriteVST, A57Write_1cyc_1S>; + +def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } + +def : WriteRes<WriteLDHi, []> { let Latency = 4; } + +// Forwarding logic is not [yet] explicitly modeled beyond what is captured +// in the latencies of the A57 Generic SchedWriteRes's. +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + + +//===----------------------------------------------------------------------===// +// Specialize the coarse model by associating instruction groups with the +// subtarget-defined types. As the model is refined, this will override most +// of the above SchedAlias mappings.
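+// +// A sketch of the override mechanism (illustrative only; this particular +// refinement is not part of the patch): an InstRW binds named instructions +// to a write-resource bundle and takes precedence over the SchedAlias for +// their scheduling class, e.g. +// def : InstRW<[A57Write_5cyc_1M], (instrs MADDWrrr)>; +// would re-cost just MADDWrrr without disturbing the WriteIM32 default.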
+ +// Miscellaneous +// ----------------------------------------------------------------------------- + +def : InstRW<[WriteI], (instrs COPY)>; + + +// Branch Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_1cyc_1B_1I], (instrs BL)>; +def : InstRW<[A57Write_2cyc_1B_1I], (instrs BLR)>; + + +// Divide and Multiply Instructions +// ----------------------------------------------------------------------------- + +// Multiply high +def : InstRW<[A57Write_6cyc_1M], (instrs SMULHrr, UMULHrr)>; + + +// Miscellaneous Data-Processing Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_1cyc_1I], (instrs EXTRWrri)>; +def : InstRW<[A57Write_3cyc_1I_1M], (instrs EXTRXrri)>; +def : InstRW<[A57Write_2cyc_1M], (instregex "BFM")>; + + +// Cryptography Extensions +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_3cyc_1W], (instregex "CRC32")>; + + +// Vector Load +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1i(8|16|32)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1i(8|16|32)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1i(64)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1i(64)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Rv(1d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Rv(1d)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_7cyc_3L], (instregex "LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_7cyc_3L, WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_8cyc_4L], (instregex "LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD2i(8|16)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2i(8|16)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD2i(32)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD2i(32)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2i(64)$")>; +def : 
InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2i(64)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2Rv(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2Rv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD2Rv(1d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD2Rv(1d)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s)$")>; +def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD2Twov(2d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD2Twov(2d)_POST$")>; + +def : InstRW<[A57Write_9cyc_1L_3V], (instregex "LD3i(8|16)$")>; +def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr], (instregex "LD3i(8|16)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD3i(32)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD3i(32)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD3i(64)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD3i(64)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD3Rv(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD3Rv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD3Rv(1d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD3Rv(1d)_POST$")>; +def : InstRW<[A57Write_9cyc_1L_3V], (instregex "LD3Rv(16b|8h|4s)$")>; +def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr], (instregex "LD3Rv(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD3Rv(2d)$")>; +def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD3Rv(2d)_POST$")>; + +def : InstRW<[A57Write_9cyc_2L_2V], (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_10cyc_3L_4V], (instregex "LD3Threev(16b|8h|4s)$")>; +def : InstRW<[A57Write_10cyc_3L_4V, WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_8cyc_4L], (instregex "LD3Threev(2d)$")>; +def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4i(8|16)$")>; +def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(8|16)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD4i(32)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD4i(32)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4i(64)$")>; +def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(64)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD4Rv(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD4Rv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD4Rv(1d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD4Rv(1d)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4Rv(16b|8h|4s)$")>; +def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4Rv(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_4V], (instregex "LD4Rv(2d)$")>; +def : InstRW<[A57Write_9cyc_2L_4V, WriteAdr], (instregex "LD4Rv(2d)_POST$")>; + +def : InstRW<[A57Write_9cyc_2L_2V], 
(instregex "LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_11cyc_4L_4V], (instregex "LD4Fourv(16b|8h|4s)$")>; +def : InstRW<[A57Write_11cyc_4L_4V, WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_8cyc_4L], (instregex "LD4Fourv(2d)$")>; +def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; + +// Vector Store +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_1cyc_1S], (instregex "ST1i(8|16|32)$")>; +def : InstRW<[A57Write_1cyc_1S, WriteAdr], (instregex "ST1i(8|16|32)_POST$")>; +def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST1i(64)$")>; +def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST1i(64)_POST$")>; + +def : InstRW<[A57Write_1cyc_1S], (instregex "ST1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_1cyc_1S, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_2cyc_2S], (instregex "ST1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_2cyc_2S], (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_4cyc_4S], (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_3cyc_3S], (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_3cyc_3S, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_6cyc_6S], (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_6cyc_6S, WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_4cyc_4S], (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_8cyc_8S], (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST2i(8|16|32)$")>; +def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST2i(8|16|32)_POST$")>; +def : InstRW<[A57Write_2cyc_2S], (instregex "ST2i(64)$")>; +def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST2i(64)_POST$")>; + +def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_4cyc_4S_2V], (instregex "ST2Twov(16b|8h|4s)$")>; +def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_4cyc_4S], (instregex "ST2Twov(2d)$")>; +def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST2Twov(2d)_POST$")>; + +def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST3i(8|16)$")>; +def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST3i(8|16)_POST$")>; +def : InstRW<[A57Write_3cyc_3S], (instregex "ST3i(32)$")>; +def : InstRW<[A57Write_3cyc_3S, WriteAdr], (instregex "ST3i(32)_POST$")>; +def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST3i(64)$")>; +def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST3i(64)_POST$")>; + +def : InstRW<[A57Write_3cyc_3S_2V], (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[A57Write_3cyc_3S_2V, WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_6cyc_6S_4V], (instregex "ST3Threev(16b|8h|4s)$")>; +def : 
InstRW<[A57Write_6cyc_6S_4V, WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_6cyc_6S], (instregex "ST3Threev(2d)$")>; +def : InstRW<[A57Write_6cyc_6S, WriteAdr], (instregex "ST3Threev(2d)_POST$")>; + +def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST4i(8|16)$")>; +def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST4i(8|16)_POST$")>; +def : InstRW<[A57Write_4cyc_4S], (instregex "ST4i(32)$")>; +def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST4i(32)_POST$")>; +def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST4i(64)$")>; +def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST4i(64)_POST$")>; + +def : InstRW<[A57Write_4cyc_4S_2V], (instregex "ST4Fourv(8b|4h|2s)$")>; +def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_8cyc_8S_4V], (instregex "ST4Fourv(16b|8h|4s)$")>; +def : InstRW<[A57Write_8cyc_8S_4V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_8cyc_8S], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; + +} // SchedModel = CortexA57Model diff --git a/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/lib/Target/AArch64/AArch64SchedA57WriteRes.td new file mode 100644 index 000000000000..a8f421bf38d2 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedA57WriteRes.td @@ -0,0 +1,512 @@ +//=- AArch64SchedA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Contains all of the Cortex-A57 specific SchedWriteRes types. The approach +// below is to define a generic SchedWriteRes for every combination of +// latency and microOps. The naming convention is to use a prefix, one field +// for latency, and one or more microOp count/type designators. +// Prefix: A57Write +// Latency: #cyc +// MicroOp Count/Types: #(B|I|M|L|S|X|W|V) +// +// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are +// 11 micro-ops to be issued down one I pipe, six S pipes and four V pipes.
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Define Generic 1 micro-op types + +def A57Write_5cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 5; } +def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; } +def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; } +def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; } +def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; } +def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18; } +def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19; } +def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; } +def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; } +def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; } +def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; } +def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32; } +def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35; } +def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 3; } +def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 3; } +def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; } +def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; } +def A57Write_4cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 4; } +def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; } +def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; } +def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; } + + +//===----------------------------------------------------------------------===// +// Define Generic 2 micro-op types + +def A57Write_64cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 64; + let NumMicroOps = 2; +} +def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI, + A57UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_7cyc_1V_1X : SchedWriteRes<[A57UnitV, + A57UnitX]> { + let Latency = 7; + let NumMicroOps = 2; +} +def A57Write_8cyc_1L_1V : SchedWriteRes<[A57UnitL, + A57UnitV]> { + let Latency = 8; + let NumMicroOps = 2; +} +def A57Write_9cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 2; +} +def A57Write_8cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 8; + let NumMicroOps = 2; +} +def A57Write_6cyc_2L : SchedWriteRes<[A57UnitL, A57UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_6cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI, + A57UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} +def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 5; + let NumMicroOps = 2; +} +def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 5; + let NumMicroOps = 2; +} +def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL, + A57UnitV]> { + let Latency = 10; + let NumMicroOps = 2; +} +def A57Write_10cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 10; + let NumMicroOps = 2; +} +def A57Write_1cyc_1B_1I : SchedWriteRes<[A57UnitB, + A57UnitI]> { + let Latency = 1; + let NumMicroOps = 2; +} +def 
A57Write_1cyc_1I_1S : SchedWriteRes<[A57UnitI, + A57UnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} +def A57Write_2cyc_1B_1I : SchedWriteRes<[A57UnitB, + A57UnitI]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_2cyc_2S : SchedWriteRes<[A57UnitS, A57UnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_36cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 36; + let NumMicroOps = 2; +} +def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI, + A57UnitM]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_3cyc_1I_1S : SchedWriteRes<[A57UnitI, + A57UnitS]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS, + A57UnitV]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_4cyc_1I_1L : SchedWriteRes<[A57UnitI, + A57UnitL]> { + let Latency = 4; + let NumMicroOps = 2; +} +def A57Write_4cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 4; + let NumMicroOps = 2; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 3 micro-op types + +def A57Write_10cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 10; + let NumMicroOps = 3; +} +def A57Write_2cyc_1I_2S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS]> { + let Latency = 2; + let NumMicroOps = 3; +} +def A57Write_3cyc_1I_1S_1V : SchedWriteRes<[A57UnitI, + A57UnitS, + A57UnitV]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_3cyc_1M_2S : SchedWriteRes<[A57UnitM, + A57UnitS, A57UnitS]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_3cyc_3S : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_3cyc_2S_1V : SchedWriteRes<[A57UnitS, A57UnitS, + A57UnitV]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_5cyc_1I_2L : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL]> { + let Latency = 5; + let NumMicroOps = 3; +} +def A57Write_6cyc_1I_2L : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL]> { + let Latency = 6; + let NumMicroOps = 3; +} +def A57Write_6cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 6; + let NumMicroOps = 3; +} +def A57Write_7cyc_3L : SchedWriteRes<[A57UnitL, A57UnitL, A57UnitL]> { + let Latency = 7; + let NumMicroOps = 3; +} +def A57Write_8cyc_1I_1L_1V : SchedWriteRes<[A57UnitI, + A57UnitL, + A57UnitV]> { + let Latency = 8; + let NumMicroOps = 3; +} +def A57Write_8cyc_1L_2V : SchedWriteRes<[A57UnitL, + A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 3; +} +def A57Write_8cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 3; +} +def A57Write_9cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 3; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 4 micro-op types + +def A57Write_2cyc_2I_2S : SchedWriteRes<[A57UnitI, A57UnitI, + A57UnitS, A57UnitS]> { + let Latency = 2; + let NumMicroOps = 4; +} +def A57Write_3cyc_2I_2S : SchedWriteRes<[A57UnitI, A57UnitI, + A57UnitS, A57UnitS]> { + let Latency = 3; + let NumMicroOps = 4; +} +def A57Write_3cyc_1I_3S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS]> { + let Latency = 3; + let NumMicroOps = 4; +} +def A57Write_3cyc_1I_2S_1V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, + A57UnitV]> { + let Latency = 3; + let 
NumMicroOps = 4; +} +def A57Write_4cyc_4S : SchedWriteRes<[A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 4; + let NumMicroOps = 4; +} +def A57Write_7cyc_1I_3L : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, A57UnitL]> { + let Latency = 7; + let NumMicroOps = 4; +} +def A57Write_5cyc_2I_2L : SchedWriteRes<[A57UnitI, A57UnitI, + A57UnitL, A57UnitL]> { + let Latency = 5; + let NumMicroOps = 4; +} +def A57Write_8cyc_1I_1L_2V : SchedWriteRes<[A57UnitI, + A57UnitL, + A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 4; +} +def A57Write_8cyc_4L : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitL, A57UnitL]> { + let Latency = 8; + let NumMicroOps = 4; +} +def A57Write_9cyc_2L_2V : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 4; +} +def A57Write_9cyc_1L_3V : SchedWriteRes<[A57UnitL, + A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 4; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 5 micro-op types + +def A57Write_3cyc_3S_2V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV]> { + let Latency = 3; + let NumMicroOps = 5; +} +def A57Write_8cyc_1I_4L : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitL, A57UnitL]> { + let Latency = 8; + let NumMicroOps = 5; +} +def A57Write_4cyc_1I_4S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 4; + let NumMicroOps = 5; +} +def A57Write_9cyc_1I_2L_2V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 5; +} +def A57Write_9cyc_1I_1L_3V : SchedWriteRes<[A57UnitI, + A57UnitL, + A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 5; +} +def A57Write_9cyc_2L_3V : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 5; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 6 micro-op types + +def A57Write_3cyc_1I_3S_2V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV]> { + let Latency = 3; + let NumMicroOps = 6; +} +def A57Write_4cyc_2I_4S : SchedWriteRes<[A57UnitI, A57UnitI, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 4; + let NumMicroOps = 6; +} +def A57Write_4cyc_4S_2V : SchedWriteRes<[A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitV, A57UnitV]> { + let Latency = 4; + let NumMicroOps = 6; +} +def A57Write_6cyc_6S : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS]> { + let Latency = 6; + let NumMicroOps = 6; +} +def A57Write_9cyc_1I_2L_3V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 6; +} +def A57Write_9cyc_1I_1L_4V : SchedWriteRes<[A57UnitI, + A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 6; +} +def A57Write_9cyc_2L_4V : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 6; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 7 micro-op types + +def A57Write_10cyc_3L_4V : SchedWriteRes<[A57UnitL, A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 10; + let NumMicroOps = 7; +} +def A57Write_4cyc_1I_4S_2V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitV, A57UnitV]> { 
+ let Latency = 4; + let NumMicroOps = 7; +} +def A57Write_6cyc_1I_6S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS]> { + let Latency = 6; + let NumMicroOps = 7; +} +def A57Write_9cyc_1I_2L_4V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 7; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 8 micro-op types + +def A57Write_10cyc_1I_3L_4V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 10; + let NumMicroOps = 8; +} +def A57Write_11cyc_4L_4V : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 11; + let NumMicroOps = 8; +} +def A57Write_8cyc_8S : SchedWriteRes<[A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 8; + let NumMicroOps = 8; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 9 micro-op types + +def A57Write_8cyc_1I_8S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 8; + let NumMicroOps = 9; +} +def A57Write_11cyc_1I_4L_4V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 11; + let NumMicroOps = 9; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 10 micro-op types + +def A57Write_6cyc_6S_4V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 6; + let NumMicroOps = 10; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 11 micro-op types + +def A57Write_6cyc_1I_6S_4V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 6; + let NumMicroOps = 11; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 12 micro-op types + +def A57Write_8cyc_8S_4V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 12; +} + +//===----------------------------------------------------------------------===// +// Define Generic 13 micro-op types + +def A57Write_8cyc_1I_8S_4V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 13; +} + diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 5c65b750ee55..1bf64fc4310b 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -16,9 +16,8 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-selectiondag-info" -AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const TargetMachine &TM) - : TargetSelectionDAGInfo(TM), - Subtarget(&TM.getSubtarget<AArch64Subtarget>()) {} +AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const DataLayout *DL) + : TargetSelectionDAGInfo(DL) {} 
AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {} @@ -30,7 +29,9 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size); const char *bzeroEntry = - (V && V->isNullValue()) ? Subtarget->getBZeroEntry() : nullptr; + (V && V->isNullValue()) + ? DAG.getTarget().getSubtarget<AArch64Subtarget>().getBZeroEntry() + : nullptr; // For small size (< 256), it is not beneficial to use bzero // instead of memset. if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { @@ -50,7 +51,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0) + DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), 0) .setDiscardResult(); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); return CallResult.second; diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h index 8381f9916a8c..1180eea6f4c1 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -19,12 +19,8 @@ namespace llvm { class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo { - /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can - /// make the right decision when generating code for different targets. - const AArch64Subtarget *Subtarget; - public: - explicit AArch64SelectionDAGInfo(const TargetMachine &TM); + explicit AArch64SelectionDAGInfo(const DataLayout *DL); ~AArch64SelectionDAGInfo(); SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index cd69994620da..bb0b72c585b8 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -30,21 +30,35 @@ static cl::opt<bool> EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if " "converter pass"), cl::init(true), cl::Hidden); -AArch64Subtarget::AArch64Subtarget(const std::string &TT, - const std::string &CPU, - const std::string &FS, bool LittleEndian) - : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), - HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU), - TargetTriple(TT), IsLittleEndian(LittleEndian) { +AArch64Subtarget & +AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) { // Determine default and user-specified characteristics if (CPUString.empty()) CPUString = "generic"; ParseSubtargetFeatures(CPUString, FS); + return *this; } +AArch64Subtarget::AArch64Subtarget(const std::string &TT, + const std::string &CPU, + const std::string &FS, TargetMachine &TM, + bool LittleEndian) + : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), + HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), + HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU), + TargetTriple(TT), + // This nested ternary is horrible, but DL needs to be properly + // initialized + // before TLInfo is constructed. + DL(isTargetMachO() + ? "e-m:o-i64:64-i128:128-n32:64-S128" + : (LittleEndian ? 
"e-m:e-i64:64-i128:128-n32:64-S128" + : "E-m:e-i64:64-i128:128-n32:64-S128")), + FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), + TSInfo(&DL), TLInfo(TM) {} + /// ClassifyGlobalReference - Find the target operand flags that describe /// how a global value should be referenced for the current subtarget. unsigned char diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 590ea0580ea7..52124f650c7a 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -14,8 +14,13 @@ #ifndef AArch64SUBTARGET_H #define AArch64SUBTARGET_H -#include "llvm/Target/TargetSubtargetInfo.h" +#include "AArch64InstrInfo.h" +#include "AArch64FrameLowering.h" +#include "AArch64ISelLowering.h" #include "AArch64RegisterInfo.h" +#include "AArch64SelectionDAGInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <string> #define GET_SUBTARGETINFO_HEADER @@ -49,15 +54,32 @@ protected: /// TargetTriple - What processor and OS we're targeting. Triple TargetTriple; - /// IsLittleEndian - Is the target little endian? - bool IsLittleEndian; + const DataLayout DL; + AArch64FrameLowering FrameLowering; + AArch64InstrInfo InstrInfo; + AArch64SelectionDAGInfo TSInfo; + AArch64TargetLowering TLInfo; +private: + /// initializeSubtargetDependencies - Initializes using CPUString and the + /// passed in feature string so that we can use initializer lists for + /// subtarget initialization. + AArch64Subtarget &initializeSubtargetDependencies(StringRef FS); public: /// This constructor initializes the data members to match that /// of the specified triple. AArch64Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool LittleEndian); - + const std::string &FS, TargetMachine &TM, bool LittleEndian); + + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + const AArch64FrameLowering *getFrameLowering() const { + return &FrameLowering; + } + const AArch64TargetLowering *getTargetLowering() const { + return &TLInfo; + } + const AArch64InstrInfo *getInstrInfo() const { return &InstrInfo; } + const DataLayout *getDataLayout() const { return &DL; } bool enableMachineScheduler() const override { return true; } bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } @@ -69,7 +91,7 @@ public: bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } - bool isLittleEndian() const { return IsLittleEndian; } + bool isLittleEndian() const { return DL.isLittleEndian(); } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 0b5dd2f067e7..f99b90b800f4 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -53,6 +53,12 @@ static cl::opt<bool> EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair" " optimization pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> +EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden, + cl::desc("Run SimplifyCFG after expanding atomic operations" + " to make use of cmpxchg flow-based information"), + cl::init(true)); + extern "C" void LLVMInitializeAArch64Target() { // Register the target. 
RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget); @@ -71,16 +77,7 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, CodeGenOpt::Level OL, bool LittleEndian) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, LittleEndian), - // This nested ternary is horrible, but DL needs to be properly - // initialized - // before TLInfo is constructed. - DL(Subtarget.isTargetMachO() - ? "e-m:o-i64:64-i128:128-n32:64-S128" - : (LittleEndian ? "e-m:e-i64:64-i128:128-n32:64-S128" - : "E-m:e-i64:64-i128:128-n32:64-S128")), - InstrInfo(Subtarget), TLInfo(*this), FrameLowering(*this, Subtarget), - TSInfo(*this) { + Subtarget(TT, CPU, FS, *this, LittleEndian) { initAsmInfo(); } @@ -113,6 +110,7 @@ public: return getTM<AArch64TargetMachine>(); } + void addIRPasses() override; bool addPreISel() override; bool addInstSelector() override; bool addILPOpts() override; @@ -135,6 +133,20 @@ TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { return new AArch64PassConfig(this, PM); } +void AArch64PassConfig::addIRPasses() { + // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg + // ourselves. + addPass(createAtomicExpandLoadLinkedPass(TM)); + + // Cmpxchg instructions are often used with a subsequent comparison to + // determine whether it succeeded. We can exploit existing control-flow in + // ldrex/strex loops to simplify this, but it needs tidying up. + if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) + addPass(createCFGSimplificationPass()); + + TargetPassConfig::addIRPasses(); +} + // Pass Pipeline Configuration bool AArch64PassConfig::addPreISel() { // Run promote constant before global merge, so that the promoted constants @@ -146,10 +158,6 @@ bool AArch64PassConfig::addPreISel() { if (TM->getOptLevel() != CodeGenOpt::None) addPass(createAArch64AddressTypePromotionPass()); - // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg - // ourselves. 
- addPass(createAtomicExpandLoadLinkedPass(TM)); - return false; } diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 079b19b23bb5..852cb3f8d2e9 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -15,13 +15,9 @@ #define AArch64TARGETMACHINE_H #include "AArch64InstrInfo.h" -#include "AArch64ISelLowering.h" #include "AArch64Subtarget.h" -#include "AArch64FrameLowering.h" -#include "AArch64SelectionDAGInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/MC/MCStreamer.h" namespace llvm { @@ -29,13 +25,6 @@ class AArch64TargetMachine : public LLVMTargetMachine { protected: AArch64Subtarget Subtarget; -private: - const DataLayout DL; - AArch64InstrInfo InstrInfo; - AArch64TargetLowering TLInfo; - AArch64FrameLowering FrameLowering; - AArch64SelectionDAGInfo TSInfo; - public: AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, @@ -46,18 +35,22 @@ public: return &Subtarget; } const AArch64TargetLowering *getTargetLowering() const override { - return &TLInfo; + return getSubtargetImpl()->getTargetLowering(); + } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); } - const DataLayout *getDataLayout() const override { return &DL; } const AArch64FrameLowering *getFrameLowering() const override { - return &FrameLowering; + return getSubtargetImpl()->getFrameLowering(); + } + const AArch64InstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); } - const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; } const AArch64RegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); + return &getInstrInfo()->getRegisterInfo(); } const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } // Pass Pipeline Configuration diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 33e482a53a46..1dac14b96af9 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -306,28 +306,64 @@ unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst, static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = { // LowerVectorINT_TO_FP: { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + + // Complex: to v2f32 + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, + + // Complex: to v4f32 + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, 
MVT::v4f32, MVT::v4i8, 3 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + + // Complex: to v2f64 + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + + // LowerVectorFP_TO_INT + { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 }, - { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 4 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 4 }, - { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 }, - { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 }, + + // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). + { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, + { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, + { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, + { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, + + // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 + { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, + { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, + { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, + { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, + + // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. + { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, + { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, + { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, + { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, + { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, }; int Idx = ConvertCostTableLookup<MVT>( diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 65b77c547dc9..c42d11e0f3d4 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -38,14 +38,19 @@ namespace { class AArch64Operand; class AArch64AsmParser : public MCTargetAsmParser { -public: - typedef SmallVectorImpl<MCParsedAsmOperand *> OperandVector; - private: StringRef Mnemonic; ///< Instruction mnemonic. MCSubtargetInfo &STI; MCAsmParser &Parser; + // Map of register aliases registered via the .req directive.
+ StringMap<std::pair<bool, unsigned> > RegisterReqs; + + AArch64TargetStreamer &getTargetStreamer() { + MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); + return static_cast<AArch64TargetStreamer &>(TS); + } + MCAsmParser &getParser() const { return Parser; } MCAsmLexer &getLexer() const { return Parser.getLexer(); } @@ -54,6 +59,7 @@ private: bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); AArch64CC::CondCode parseCondCodeString(StringRef Cond); bool parseCondCode(OperandVector &Operands, bool invertCondCode); + unsigned matchRegisterNameAlias(StringRef Name, bool isVector); int tryParseRegister(); int tryMatchVectorRegister(StringRef &Kind, bool expected); bool parseRegister(OperandVector &Operands); @@ -70,6 +76,10 @@ private: bool parseDirectiveTLSDescCall(SMLoc L); bool parseDirectiveLOH(StringRef LOH, SMLoc L); + bool parseDirectiveLtorg(SMLoc L); + + bool parseDirectiveReq(StringRef Name, SMLoc L); + bool parseDirectiveUnreq(SMLoc L); bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc); bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -108,6 +118,8 @@ public: const MCTargetOptions &Options) : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { MCAsmParserExtension::Initialize(_Parser); + if (Parser.getStreamer().getTargetStreamer() == nullptr) + new AArch64TargetStreamer(Parser.getStreamer()); // Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); @@ -117,7 +129,7 @@ public: SMLoc NameLoc, OperandVector &Operands) override; bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; bool ParseDirective(AsmToken DirectiveID) override; - unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; static bool classifySymbolRef(const MCExpr *Expr, @@ -240,10 +252,10 @@ private: // the add<>Operands() calls. 
MCContext &Ctx; +public: AArch64Operand(KindTy K, MCContext &_Ctx) : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {} -public: AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) { Kind = o.Kind; StartLoc = o.StartLoc; @@ -607,7 +619,11 @@ public: const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); if (!MCE) return false; - return AArch64_AM::isLogicalImmediate(MCE->getValue(), 32); + int64_t Val = MCE->getValue(); + if (Val >> 32 != 0 && Val >> 32 != ~0LL) + return false; + Val &= 0xFFFFFFFF; + return AArch64_AM::isLogicalImmediate(Val, 32); } bool isLogicalImm64() const { if (!isImm()) @@ -617,6 +633,23 @@ public: return false; return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64); } + bool isLogicalImm32Not() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = ~MCE->getValue() & 0xFFFFFFFF; + return AArch64_AM::isLogicalImmediate(Val, 32); + } + bool isLogicalImm64Not() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + return AArch64_AM::isLogicalImmediate(~MCE->getValue(), 64); + } bool isShiftedImm() const { return Kind == k_ShiftedImm; } bool isAddSubImm() const { if (!isShiftedImm() && !isImm()) @@ -1348,7 +1381,8 @@ public: assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); assert(MCE && "Invalid logical immediate operand!"); - uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 32); + uint64_t encoding = + AArch64_AM::encodeLogicalImmediate(MCE->getValue() & 0xFFFFFFFF, 32); Inst.addOperand(MCOperand::CreateImm(encoding)); } @@ -1360,6 +1394,22 @@ public: Inst.addOperand(MCOperand::CreateImm(encoding)); } + void addLogicalImm32NotOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + int64_t Val = ~MCE->getValue() & 0xFFFFFFFF; + uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, 32); + Inst.addOperand(MCOperand::CreateImm(encoding)); + } + + void addLogicalImm64NotOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + uint64_t encoding = + AArch64_AM::encodeLogicalImmediate(~MCE->getValue(), 64); + Inst.addOperand(MCOperand::CreateImm(encoding)); + } + void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); @@ -1523,9 +1573,9 @@ public: void print(raw_ostream &OS) const override; - static AArch64Operand *CreateToken(StringRef Str, bool IsSuffix, SMLoc S, - MCContext &Ctx) { - AArch64Operand *Op = new AArch64Operand(k_Token, Ctx); + static std::unique_ptr<AArch64Operand> + CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_Token, Ctx); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->Tok.IsSuffix = IsSuffix; @@ -1534,9 +1584,9 @@ public: return Op; } - static AArch64Operand *CreateReg(unsigned RegNum, bool isVector, SMLoc S, - SMLoc E, MCContext &Ctx) { - AArch64Operand *Op = new AArch64Operand(k_Register, Ctx); + static std::unique_ptr<AArch64Operand> + CreateReg(unsigned RegNum, bool isVector, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_Register, 
Ctx); Op->Reg.RegNum = RegNum; Op->Reg.isVector = isVector; Op->StartLoc = S; @@ -1544,10 +1594,10 @@ public: return Op; } - static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count, - unsigned NumElements, char ElementKind, - SMLoc S, SMLoc E, MCContext &Ctx) { - AArch64Operand *Op = new AArch64Operand(k_VectorList, Ctx); + static std::unique_ptr<AArch64Operand> + CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements, + char ElementKind, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_VectorList, Ctx); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.NumElements = NumElements; @@ -1557,28 +1607,29 @@ public: return Op; } - static AArch64Operand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, - MCContext &Ctx) { - AArch64Operand *Op = new AArch64Operand(k_VectorIndex, Ctx); + static std::unique_ptr<AArch64Operand> + CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_VectorIndex, Ctx); Op->VectorIndex.Val = Idx; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static AArch64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, - MCContext &Ctx) { - AArch64Operand *Op = new AArch64Operand(k_Immediate, Ctx); + static std::unique_ptr<AArch64Operand> CreateImm(const MCExpr *Val, SMLoc S, + SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_Immediate, Ctx); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static AArch64Operand *CreateShiftedImm(const MCExpr *Val, - unsigned ShiftAmount, SMLoc S, - SMLoc E, MCContext &Ctx) { - AArch64Operand *Op = new AArch64Operand(k_ShiftedImm, Ctx); + static std::unique_ptr<AArch64Operand> CreateShiftedImm(const MCExpr *Val, + unsigned ShiftAmount, + SMLoc S, SMLoc E, + MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_ShiftedImm, Ctx); Op->ShiftedImm.Val = Val; Op->ShiftedImm.ShiftAmount = ShiftAmount; Op->StartLoc = S; @@ -1586,34 +1637,36 @@ public: return Op; } - static AArch64Operand *CreateCondCode(AArch64CC::CondCode Code, SMLoc S, - SMLoc E, MCContext &Ctx) { - AArch64Operand *Op = new AArch64Operand(k_CondCode, Ctx); + static std::unique_ptr<AArch64Operand> + CreateCondCode(AArch64CC::CondCode Code, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_CondCode, Ctx); Op->CondCode.Code = Code; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static AArch64Operand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) { - AArch64Operand *Op = new AArch64Operand(k_FPImm, Ctx); + static std::unique_ptr<AArch64Operand> CreateFPImm(unsigned Val, SMLoc S, + MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_FPImm, Ctx); Op->FPImm.Val = Val; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static AArch64Operand *CreateBarrier(unsigned Val, SMLoc S, MCContext &Ctx) { - AArch64Operand *Op = new AArch64Operand(k_Barrier, Ctx); + static std::unique_ptr<AArch64Operand> CreateBarrier(unsigned Val, SMLoc S, + MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_Barrier, Ctx); Op->Barrier.Val = Val; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static AArch64Operand *CreateSysReg(StringRef Str, SMLoc S, - uint64_t FeatureBits, MCContext &Ctx) { - AArch64Operand *Op = new AArch64Operand(k_SysReg, Ctx); + static std::unique_ptr<AArch64Operand> + CreateSysReg(StringRef Str, SMLoc S, uint64_t FeatureBits, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_SysReg, Ctx); Op->SysReg.Data = Str.data(); Op->SysReg.Length =
Str.size(); Op->SysReg.FeatureBits = FeatureBits; @@ -1622,27 +1675,28 @@ public: return Op; } - static AArch64Operand *CreateSysCR(unsigned Val, SMLoc S, SMLoc E, - MCContext &Ctx) { - AArch64Operand *Op = new AArch64Operand(k_SysCR, Ctx); + static std::unique_ptr<AArch64Operand> CreateSysCR(unsigned Val, SMLoc S, + SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_SysCR, Ctx); Op->SysCRImm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static AArch64Operand *CreatePrefetch(unsigned Val, SMLoc S, MCContext &Ctx) { - AArch64Operand *Op = new AArch64Operand(k_Prefetch, Ctx); + static std::unique_ptr<AArch64Operand> CreatePrefetch(unsigned Val, SMLoc S, + MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_Prefetch, Ctx); Op->Prefetch.Val = Val; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static AArch64Operand *CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, - unsigned Val, bool HasExplicitAmount, - SMLoc S, SMLoc E, MCContext &Ctx) { - AArch64Operand *Op = new AArch64Operand(k_ShiftExtend, Ctx); + static std::unique_ptr<AArch64Operand> + CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val, + bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_ShiftExtend, Ctx); Op->ShiftExtend.Type = ShOp; Op->ShiftExtend.Amount = Val; Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount; @@ -1816,6 +1870,26 @@ bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, return (RegNo == (unsigned)-1); } +// Matches a register name or register alias previously defined by '.req' +unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name, + bool isVector) { + unsigned RegNum = isVector ? matchVectorRegName(Name) + : MatchRegisterName(Name); + + if (RegNum == 0) { + // Check for aliases registered via .req. Canonicalize to lower case. + // That's more consistent since register names are case insensitive, and + // it's how the original entry was passed in from MC/MCParser/AsmParser. + auto Entry = RegisterReqs.find(Name.lower()); + if (Entry == RegisterReqs.end()) + return 0; + // set RegNum if the match is the right kind of register + if (isVector == Entry->getValue().first) + RegNum = Entry->getValue().second; + } + return RegNum; +} + /// tryParseRegister - Try to parse a register name. The token must be an /// Identifier when called, and if it is a register name the token is eaten and /// the register is added to the operand list. @@ -1824,7 +1898,7 @@ int AArch64AsmParser::tryParseRegister() { assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); std::string lowerCase = Tok.getString().lower(); - unsigned RegNum = MatchRegisterName(lowerCase); + unsigned RegNum = matchRegisterNameAlias(lowerCase, false); // Also handle a few aliases of registers. if (RegNum == 0) RegNum = StringSwitch<unsigned>(lowerCase) @@ -1854,7 +1928,8 @@ int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) { // a '.'. size_t Start = 0, Next = Name.find('.'); StringRef Head = Name.slice(Start, Next); - unsigned RegNum = matchVectorRegName(Head); + unsigned RegNum = matchRegisterNameAlias(Head, true); + if (RegNum) { if (Next != StringRef::npos) { Kind = Name.slice(Next, StringRef::npos); @@ -2183,8 +2258,11 @@ bool AArch64AsmParser::parseCondCode(OperandVector &Operands, return TokError("invalid condition code"); Parser.Lex(); // Eat identifier token. 
- if (invertCondCode) + if (invertCondCode) { + if (CC == AArch64CC::AL || CC == AArch64CC::NV) + return TokError("condition codes AL and NV are invalid for this instruction"); CC = AArch64CC::getInvertedCondCode(AArch64CC::CondCode(CC)); + } Operands.push_back( AArch64Operand::CreateCondCode(CC, S, getLoc(), getContext())); @@ -2849,7 +2927,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) { if (!Tok.is(AsmToken::Identifier)) return MatchOperand_NoMatch; - unsigned RegNum = MatchRegisterName(Tok.getString().lower()); + unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), false); MCContext &Ctx = getContext(); const MCRegisterInfo *RI = Ctx.getRegisterInfo(); @@ -3000,6 +3078,43 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext())); return false; } + case AsmToken::Equal: { + SMLoc Loc = Parser.getTok().getLoc(); + if (Mnemonic != "ldr") // only parse for the ldr pseudo (e.g. ldr x0, =val) + return Error(Loc, "unexpected token in operand"); + Parser.Lex(); // Eat '=' + const MCExpr *SubExprVal; + if (getParser().parseExpression(SubExprVal)) + return true; + + MCContext &Ctx = getContext(); + E = SMLoc::getFromPointer(Loc.getPointer() - 1); + // If the op is an imm and can fit into a mov, then replace ldr with mov. + if (isa<MCConstantExpr>(SubExprVal) && Operands.size() >= 2 && + static_cast<AArch64Operand &>(*Operands[1]).isReg()) { + bool IsXReg = AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Operands[1]->getReg()); + uint64_t Imm = (cast<MCConstantExpr>(SubExprVal))->getValue(); + uint32_t ShiftAmt = 0, MaxShiftAmt = IsXReg ? 48 : 16; + while (Imm > 0xFFFF && countTrailingZeros(Imm) >= 16) { + ShiftAmt += 16; + Imm >>= 16; + } + if (ShiftAmt <= MaxShiftAmt && Imm <= 0xFFFF) { + Operands[0] = AArch64Operand::CreateToken("movz", false, Loc, Ctx); + Operands.push_back(AArch64Operand::CreateImm( + MCConstantExpr::Create(Imm, Ctx), S, E, Ctx)); + if (ShiftAmt) + Operands.push_back(AArch64Operand::CreateShiftExtend(AArch64_AM::LSL, + ShiftAmt, true, S, E, Ctx)); + return false; + } + } + // If it is a label or an imm that cannot fit in a movz, put it into the constant pool. + const MCExpr *CPLoc = getTargetStreamer().addConstantPoolEntry(SubExprVal); + Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx)); + return false; + } } } @@ -3029,6 +3144,15 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, .Case("bnv", "b.nv") .Default(Name); + // First check for the AArch64-specific .req directive. + if (Parser.getTok().is(AsmToken::Identifier) && + Parser.getTok().getIdentifier() == ".req") { + parseDirectiveReq(Name, NameLoc); + // We always return 'error' for this, as we're done with this + // statement and don't need to match the instruction. + return true; + } + // Create the leading tokens for the mnemonic, split by '.' characters.
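// For example, the alias "bne" was rewritten to "b.ne" above; the split below // produces the token "b" followed by the suffix token ".ne".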
size_t Start = 0, Next = Name.find('.'); StringRef Head = Name.slice(Start, Next); @@ -3443,8 +3567,7 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { case Match_MnemonicFail: return Error(Loc, "unrecognized instruction mnemonic"); default: - assert(0 && "unexpected error code!"); - return Error(Loc, "invalid instruction format"); + llvm_unreachable("unexpected error code!"); } } @@ -3456,23 +3579,23 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, unsigned &ErrorInfo, bool MatchingInlineAsm) { assert(!Operands.empty() && "Unexpected empty operand list!"); - AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[0]); - assert(Op->isToken() && "Leading operand should always be a mnemonic!"); + AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[0]); + assert(Op.isToken() && "Leading operand should always be a mnemonic!"); - StringRef Tok = Op->getToken(); + StringRef Tok = Op.getToken(); unsigned NumOperands = Operands.size(); if (NumOperands == 4 && Tok == "lsl") { - AArch64Operand *Op2 = static_cast<AArch64Operand *>(Operands[2]); - AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]); - if (Op2->isReg() && Op3->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm()); + AArch64Operand &Op2 = static_cast<AArch64Operand &>(*Operands[2]); + AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]); + if (Op2.isReg() && Op3.isImm()) { + const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm()); if (Op3CE) { uint64_t Op3Val = Op3CE->getValue(); uint64_t NewOp3Val = 0; uint64_t NewOp4Val = 0; if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains( - Op2->getReg())) { + Op2.getReg())) { NewOp3Val = (32 - Op3Val) & 0x1f; NewOp4Val = 31 - Op3Val; } else { @@ -3484,26 +3607,24 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext()); Operands[0] = AArch64Operand::CreateToken( - "ubfm", false, Op->getStartLoc(), getContext()); - Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3->getStartLoc(), - Op3->getEndLoc(), getContext()); + "ubfm", false, Op.getStartLoc(), getContext()); Operands.push_back(AArch64Operand::CreateImm( - NewOp4, Op3->getStartLoc(), Op3->getEndLoc(), getContext())); - delete Op3; - delete Op; + NewOp4, Op3.getStartLoc(), Op3.getEndLoc(), getContext())); + Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3.getStartLoc(), + Op3.getEndLoc(), getContext()); } } } else if (NumOperands == 5) { // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ -> SBFM, and // UBFIZ -> UBFM aliases.
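// For a 32-bit destination the rewrite computes immr = (32 - lsb) & 0x1f and // imms = width - 1; for example, "bfi w0, w1, #2, #3" is re-emitted as // "bfm w0, w1, #30, #2".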
if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") { - AArch64Operand *Op1 = static_cast<AArch64Operand *>(Operands[1]); - AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]); - AArch64Operand *Op4 = static_cast<AArch64Operand *>(Operands[4]); + AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]); + AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]); + AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]); - if (Op1->isReg() && Op3->isImm() && Op4->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm()); - const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm()); + if (Op1.isReg() && Op3.isImm() && Op4.isImm()) { + const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm()); + const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm()); if (Op3CE && Op4CE) { uint64_t Op3Val = Op3CE->getValue(); @@ -3511,21 +3632,21 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, uint64_t RegWidth = 0; if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( - Op1->getReg())) + Op1.getReg())) RegWidth = 64; else RegWidth = 32; if (Op3Val >= RegWidth) - return Error(Op3->getStartLoc(), + return Error(Op3.getStartLoc(), "expected integer in range [0, 31]"); if (Op4Val < 1 || Op4Val > RegWidth) - return Error(Op4->getStartLoc(), + return Error(Op4.getStartLoc(), "expected integer in range [1, 32]"); uint64_t NewOp3Val = 0; if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains( - Op1->getReg())) + Op1.getReg())) NewOp3Val = (32 - Op3Val) & 0x1f; else NewOp3Val = (64 - Op3Val) & 0x3f; @@ -3533,7 +3654,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, uint64_t NewOp4Val = Op4Val - 1; if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val) - return Error(Op4->getStartLoc(), + return Error(Op4.getStartLoc(), "requested insert overflows register"); const MCExpr *NewOp3 = @@ -3541,24 +3662,20 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext()); Operands[3] = AArch64Operand::CreateImm( - NewOp3, Op3->getStartLoc(), Op3->getEndLoc(), getContext()); + NewOp3, Op3.getStartLoc(), Op3.getEndLoc(), getContext()); Operands[4] = AArch64Operand::CreateImm( - NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext()); + NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext()); if (Tok == "bfi") Operands[0] = AArch64Operand::CreateToken( - "bfm", false, Op->getStartLoc(), getContext()); + "bfm", false, Op.getStartLoc(), getContext()); else if (Tok == "sbfiz") Operands[0] = AArch64Operand::CreateToken( - "sbfm", false, Op->getStartLoc(), getContext()); + "sbfm", false, Op.getStartLoc(), getContext()); else if (Tok == "ubfiz") Operands[0] = AArch64Operand::CreateToken( - "ubfm", false, Op->getStartLoc(), getContext()); + "ubfm", false, Op.getStartLoc(), getContext()); else llvm_unreachable("No valid mnemonic for alias?"); - - delete Op; - delete Op3; - delete Op4; } } @@ -3566,13 +3683,13 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // UBFX -> UBFM aliases. 
} else if (NumOperands == 5 && (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) { - AArch64Operand *Op1 = static_cast<AArch64Operand *>(Operands[1]); - AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]); - AArch64Operand *Op4 = static_cast<AArch64Operand *>(Operands[4]); + AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]); + AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]); + AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]); - if (Op1->isReg() && Op3->isImm() && Op4->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm()); - const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm()); + if (Op1.isReg() && Op3.isImm() && Op4.isImm()) { + const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm()); + const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm()); if (Op3CE && Op4CE) { uint64_t Op3Val = Op3CE->getValue(); @@ -3580,42 +3697,39 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, uint64_t RegWidth = 0; if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( - Op1->getReg())) + Op1.getReg())) RegWidth = 64; else RegWidth = 32; if (Op3Val >= RegWidth) - return Error(Op3->getStartLoc(), + return Error(Op3.getStartLoc(), "expected integer in range [0, 31]"); if (Op4Val < 1 || Op4Val > RegWidth) - return Error(Op4->getStartLoc(), + return Error(Op4.getStartLoc(), "expected integer in range [1, 32]"); uint64_t NewOp4Val = Op3Val + Op4Val - 1; if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val) - return Error(Op4->getStartLoc(), + return Error(Op4.getStartLoc(), "requested extract overflows register"); const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext()); Operands[4] = AArch64Operand::CreateImm( - NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext()); + NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext()); if (Tok == "bfxil") Operands[0] = AArch64Operand::CreateToken( - "bfm", false, Op->getStartLoc(), getContext()); + "bfm", false, Op.getStartLoc(), getContext()); else if (Tok == "sbfx") Operands[0] = AArch64Operand::CreateToken( - "sbfm", false, Op->getStartLoc(), getContext()); + "sbfm", false, Op.getStartLoc(), getContext()); else if (Tok == "ubfx") Operands[0] = AArch64Operand::CreateToken( - "ubfm", false, Op->getStartLoc(), getContext()); + "ubfm", false, Op.getStartLoc(), getContext()); else llvm_unreachable("No valid mnemonic for alias?"); - - delete Op; - delete Op4; } } } @@ -3626,63 +3740,58 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) { // The source register can be Wn here, but the matcher expects a // GPR64. Twiddle it here if necessary. 
- AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[2]); - if (Op->isReg()) { - unsigned Reg = getXRegFromWReg(Op->getReg()); - Operands[2] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete Op; + AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]); + if (Op.isReg()) { + unsigned Reg = getXRegFromWReg(Op.getReg()); + Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(), + Op.getEndLoc(), getContext()); } } // FIXME: Likewise for sxt[bh] with a Xd dst operand else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) { - AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]); - if (Op->isReg() && + AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]); + if (Op.isReg() && AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( - Op->getReg())) { + Op.getReg())) { // The source register can be Wn here, but the matcher expects a // GPR64. Twiddle it here if necessary. - AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[2]); - if (Op->isReg()) { - unsigned Reg = getXRegFromWReg(Op->getReg()); - Operands[2] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete Op; + AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]); + if (Op.isReg()) { + unsigned Reg = getXRegFromWReg(Op.getReg()); + Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(), + Op.getEndLoc(), getContext()); } } } // FIXME: Likewise for uxt[bh] with a Xd dst operand else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) { - AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]); - if (Op->isReg() && + AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]); + if (Op.isReg() && AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( - Op->getReg())) { + Op.getReg())) { // The source register can be Wn here, but the matcher expects a // GPR32. Twiddle it here if necessary. - AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]); - if (Op->isReg()) { - unsigned Reg = getWRegFromXReg(Op->getReg()); - Operands[1] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete Op; + AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]); + if (Op.isReg()) { + unsigned Reg = getWRegFromXReg(Op.getReg()); + Operands[1] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(), + Op.getEndLoc(), getContext()); } } } // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR. if (NumOperands == 3 && Tok == "fmov") { - AArch64Operand *RegOp = static_cast<AArch64Operand *>(Operands[1]); - AArch64Operand *ImmOp = static_cast<AArch64Operand *>(Operands[2]); - if (RegOp->isReg() && ImmOp->isFPImm() && - ImmOp->getFPImm() == (unsigned)-1) { + AArch64Operand &RegOp = static_cast<AArch64Operand &>(*Operands[1]); + AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]); + if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) { unsigned zreg = AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains( - RegOp->getReg()) + RegOp.getReg()) ? 
AArch64::WZR : AArch64::XZR; - Operands[2] = AArch64Operand::CreateReg(zreg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete ImmOp; + Operands[2] = AArch64Operand::CreateReg(zreg, false, Op.getStartLoc(), + Op.getEndLoc(), getContext()); } } @@ -3735,14 +3844,14 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); - ErrorLoc = ((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(); + ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } // If the match failed on a suffix token operand, tweak the diagnostic // accordingly. - if (((AArch64Operand *)Operands[ErrorInfo])->isToken() && - ((AArch64Operand *)Operands[ErrorInfo])->isTokenSuffix()) + if (((AArch64Operand &)*Operands[ErrorInfo]).isToken() && + ((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix()) MatchResult = Match_InvalidSuffix; return showMatchError(ErrorLoc, MatchResult); @@ -3794,9 +3903,11 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidLabel: case Match_MSR: case Match_MRS: { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); // Any time we get here, there's nothing fancy to do. Just get the // operand SMLoc and display the diagnostic. - SMLoc ErrorLoc = ((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(); + SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; return showMatchError(ErrorLoc, MatchResult); @@ -3819,6 +3930,10 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { return parseDirectiveWord(8, Loc); if (IDVal == ".tlsdesccall") return parseDirectiveTLSDescCall(Loc); + if (IDVal == ".ltorg" || IDVal == ".pool") + return parseDirectiveLtorg(Loc); + if (IDVal == ".unreq") + return parseDirectiveUnreq(DirectiveID.getLoc()); return parseDirectiveLOH(IDVal, Loc); } @@ -3920,6 +4035,66 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { return false; } +/// parseDirectiveLtorg +/// ::= .ltorg | .pool +bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) { + getTargetStreamer().emitCurrentConstantPool(); + return false; +} + +/// parseDirectiveReq +/// ::= name .req registername +bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { + Parser.Lex(); // Eat the '.req' token. + SMLoc SRegLoc = getLoc(); + unsigned RegNum = tryParseRegister(); + bool IsVector = false; + + if (RegNum == static_cast<unsigned>(-1)) { + StringRef Kind; + RegNum = tryMatchVectorRegister(Kind, false); + if (!Kind.empty()) { + Error(SRegLoc, "vector register without type specifier expected"); + return false; + } + IsVector = true; + } + + if (RegNum == static_cast<unsigned>(-1)) { + Parser.eatToEndOfStatement(); + Error(SRegLoc, "register name or alias expected"); + return false; + } + + // Shouldn't be anything else. 
+ if (Parser.getTok().isNot(AsmToken::EndOfStatement)) { + Error(Parser.getTok().getLoc(), "unexpected input in .req directive"); + Parser.eatToEndOfStatement(); + return false; + } + + Parser.Lex(); // Consume the EndOfStatement + + auto pair = std::make_pair(IsVector, RegNum); + if (RegisterReqs.GetOrCreateValue(Name, pair).getValue() != pair) + Warning(L, "ignoring redefinition of register alias '" + Name + "'"); + + return true; +} + +/// parseDirectiveUnreq +/// ::= .unreq registername +bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) { + if (Parser.getTok().isNot(AsmToken::Identifier)) { + Error(Parser.getTok().getLoc(), "unexpected input in .unreq directive"); + Parser.eatToEndOfStatement(); + return false; + } + RegisterReqs.erase(Parser.getTok().getIdentifier().lower()); + Parser.Lex(); // Eat the identifier. + return false; +} + bool AArch64AsmParser::classifySymbolRef(const MCExpr *Expr, AArch64MCExpr::VariantKind &ELFRefKind, @@ -3986,9 +4161,9 @@ extern "C" void LLVMInitializeAArch64AsmParser() { // Define this matcher function after the auto-generated include so we // have the match class enum definitions. -unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, +unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, unsigned Kind) { - AArch64Operand *Op = static_cast<AArch64Operand *>(AsmOp); + AArch64Operand &Op = static_cast<AArch64Operand &>(AsmOp); // If the kind is a token for a literal immediate, check if our asm // operand matches. This is for InstAliases which have a fixed-value // immediate in the syntax. @@ -4036,9 +4211,9 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, ExpectedVal = 8; break; } - if (!Op->isImm()) + if (!Op.isImm()) return Match_InvalidOperand; - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm()); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()); if (!CE) return Match_InvalidOperand; if (CE->getValue() == ExpectedVal) diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp index 24663684a3fd..2057c51346af 100644 --- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp @@ -37,8 +37,7 @@ getVariant(uint64_t LLVMDisassembler_VariantKind) { case LLVMDisassembler_VariantKind_ARM64_TLVP: case LLVMDisassembler_VariantKind_ARM64_TLVOFF: default: - assert(0 && "bad LLVMDisassembler_VariantKind"); - return MCSymbolRefExpr::VK_None; + llvm_unreachable("bad LLVMDisassembler_VariantKind"); } } diff --git a/lib/Target/AArch64/Disassembler/CMakeLists.txt b/lib/Target/AArch64/Disassembler/CMakeLists.txt index be4ccad6d1b9..d64c05b0adbc 100644 --- a/lib/Target/AArch64/Disassembler/CMakeLists.txt +++ b/lib/Target/AArch64/Disassembler/CMakeLists.txt @@ -4,11 +4,5 @@ add_llvm_library(LLVMAArch64Disassembler AArch64Disassembler.cpp AArch64ExternalSymbolizer.cpp ) -# workaround for hanging compilation on MSVC8, 9 and 10 -#if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) -#set_property( -# SOURCE ARMDisassembler.cpp -# PROPERTY COMPILE_FLAGS "/Od" -# ) -#endif() + add_dependencies(LLVMAArch64Disassembler AArch64CommonTableGen) diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index f484a5b1bdcc..8a21f0650cdc 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++
b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -918,7 +918,7 @@ void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, else O << getRegisterName(Reg); } else - assert(0 && "unknown operand kind in printPostIncOperand64"); + llvm_unreachable("unknown operand kind in printPostIncOperand64"); } void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo, @@ -1109,7 +1109,7 @@ static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) { while (Stride--) { switch (Reg) { default: - assert(0 && "Vector register expected!"); + llvm_unreachable("Vector register expected!"); case AArch64::Q0: Reg = AArch64::Q1; break; case AArch64::Q1: Reg = AArch64::Q2; break; case AArch64::Q2: Reg = AArch64::Q3; break; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index d8900d4fceb2..a917616f142d 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -86,7 +86,7 @@ public: static unsigned getFixupKindNumBytes(unsigned Kind) { switch (Kind) { default: - assert(0 && "Unknown fixup kind!"); + llvm_unreachable("Unknown fixup kind!"); case AArch64::fixup_aarch64_tlsdesc_call: return 0; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index dc4a8bf6c9a9..1763b40e2d48 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -96,4 +96,6 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) { ExceptionsType = ExceptionHandling::DwarfCFI; UseIntegratedAssembler = true; + + HasIdentDirective = true; } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 464a18cdbc04..f0513575edb4 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -218,13 +218,9 @@ AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, const MCSubtargetInfo &STI) const { if (MO.isReg()) return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); - else { - assert(MO.isImm() && "did not expect relocated expression"); - return static_cast<unsigned>(MO.getImm()); - } - assert(0 && "Unable to encode MCOperand!"); - return 0; + assert(MO.isImm() && "did not expect relocated expression"); + return static_cast<unsigned>(MO.getImm()); } template<unsigned FixupKind> uint32_t diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 85c3ec7a55f1..42a6787da48e 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -81,37 +81,8 @@ void AArch64MCExpr::PrintImpl(raw_ostream &OS) const { OS << *Expr; } -// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps -// that method should be made public? -// FIXME: really do above: now that two backends are using it. 
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { - switch (Value->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expr!"); - break; - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value); - AddValueSymbolsImpl(BE->getLHS(), Asm); - AddValueSymbolsImpl(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: - Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm); - break; - } -} - -void AArch64MCExpr::AddValueSymbols(MCAssembler *Asm) const { - AddValueSymbolsImpl(getSubExpr(), Asm); +void AArch64MCExpr::visitUsedExpr(MCStreamer &Streamer) const { + Streamer.visitUsedExpr(*getSubExpr()); } const MCSection *AArch64MCExpr::FindAssociatedSection() const { diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index e869ed0a26a4..5422f9d7067e 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -147,7 +147,7 @@ public: void PrintImpl(raw_ostream &OS) const override; - void AddValueSymbols(MCAssembler *) const override; + void visitUsedExpr(MCStreamer &Streamer) const override; const MCSection *FindAssociatedSection() const override; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 5c86189a6ef5..ba9536676e12 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -75,7 +75,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( Log2Size = llvm::Log2_32(4); switch (Sym->getKind()) { default: - assert(0 && "Unexpected symbol reference variant kind!"); + llvm_unreachable("Unexpected symbol reference variant kind!"); case MCSymbolRefExpr::VK_PAGEOFF: RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12); return true; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp new file mode 100644 index 000000000000..f9aeb35b649e --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -0,0 +1,40 @@ +//===-- AArch64TargetStreamer.cpp - AArch64TargetStreamer class -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64TargetStreamer class. +// +//===----------------------------------------------------------------------===// +#include "llvm/ADT/MapVector.h" +#include "llvm/MC/ConstantPools.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCStreamer.h" + +using namespace llvm; + +// +// AArch64TargetStreamer Implementation +// +AArch64TargetStreamer::AArch64TargetStreamer(MCStreamer &S) + : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {} + +AArch64TargetStreamer::~AArch64TargetStreamer() {} + +// The constant pool handling is shared by all AArch64TargetStreamer +// implementations.
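+// For example, an "ldr x0, =expr" pseudo-instruction in the AsmParser calls +// addConstantPoolEntry() and loads from the returned label; the pool is then +// flushed by the ".ltorg"/".pool" directives (emitCurrentConstantPool) or at +// the end of assembly (finish).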
+const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr) { + return ConstantPools->addEntry(Streamer, Expr); +} + +void AArch64TargetStreamer::emitCurrentConstantPool() { + ConstantPools->emitForCurrentSection(Streamer); +} + +// finish() - write out any non-empty assembler constant pools. +void AArch64TargetStreamer::finish() { ConstantPools->emitAll(Streamer); } diff --git a/lib/Target/AArch64/MCTargetDesc/Android.mk b/lib/Target/AArch64/MCTargetDesc/Android.mk index e9d2323f39bb..a23c0e52567d 100644 --- a/lib/Target/AArch64/MCTargetDesc/Android.mk +++ b/lib/Target/AArch64/MCTargetDesc/Android.mk @@ -14,7 +14,8 @@ aarch64_mc_desc_SRC_FILES := \ AArch64MCAsmInfo.cpp \ AArch64MCCodeEmitter.cpp \ AArch64MCExpr.cpp \ - AArch64MCTargetDesc.cpp + AArch64MCTargetDesc.cpp \ + AArch64TargetStreamer.cpp # For the host # ===================================================== diff --git a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt index 7d5bced17a6a..6d8be5e63fbb 100644 --- a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt @@ -7,6 +7,7 @@ add_llvm_library(LLVMAArch64Desc AArch64MCExpr.cpp AArch64MCTargetDesc.cpp AArch64MachObjectWriter.cpp + AArch64TargetStreamer.cpp ) add_dependencies(LLVMAArch64Desc AArch64CommonTableGen) diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 9e4c389cc2e9..9d2ce21c9626 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -233,23 +233,9 @@ inline static const char *getCondCodeName(CondCode Code) { } inline static CondCode getInvertedCondCode(CondCode Code) { - switch (Code) { - default: llvm_unreachable("Unknown condition code"); - case EQ: return NE; - case NE: return EQ; - case HS: return LO; - case LO: return HS; - case MI: return PL; - case PL: return MI; - case VS: return VC; - case VC: return VS; - case HI: return LS; - case LS: return HI; - case GE: return LT; - case LT: return GE; - case GT: return LE; - case LE: return GT; - } + // To reverse a condition, only the low bit needs to be inverted: the + // encoding pairs each condition with its inverse (e.g. EQ = 0b0000, + // NE = 0b0001). + + return static_cast<CondCode>(static_cast<unsigned>(Code) ^ 0x1); } /// Given a condition code, return NZCV flags that would satisfy that condition. diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp index 94faf6f60db3..92eaf9e1c9b3 100644 --- a/lib/Target/ARM/A15SDOptimizer.cpp +++ b/lib/Target/ARM/A15SDOptimizer.cpp @@ -321,8 +321,7 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg()); } - assert(0 && "Unhandled update pattern!"); - return 0; + llvm_unreachable("Unhandled update pattern!"); } // Return true if this MachineInstr inserts a scalar (SPR) value into diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 55e9fe5f5c57..28d2610c3903 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -82,7 +82,8 @@ void ARMAsmPrinter::EmitXXStructor(const Constant *CV) { const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts()); assert(GV && "C++ constructor pointer was not a GlobalValue!"); - const MCExpr *E = MCSymbolRefExpr::Create(getSymbol(GV), + const MCExpr *E = MCSymbolRefExpr::Create(GetARMGVSymbol(GV, + ARMII::MO_NO_FLAG), (Subtarget->isTargetELF() ?
MCSymbolRefExpr::VK_ARM_TARGET1 : MCSymbolRefExpr::VK_None), @@ -164,7 +165,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, else if ((Modifier && strcmp(Modifier, "hi16") == 0) || (TF & ARMII::MO_HI16)) O << ":upper16:"; - O << *getSymbol(GV); + O << *GetARMGVSymbol(GV, TF); printOffset(MO.getOffset(), O); if (TF == ARMII::MO_PLT) @@ -730,6 +731,32 @@ void ARMAsmPrinter::emitAttributes() { if (Subtarget->hasDivideInARMMode() && !Subtarget->hasV8Ops()) ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt); + if (MMI) { + if (const Module *SourceModule = MMI->getModule()) { + // ABI_PCS_wchar_t to indicate wchar_t width + // FIXME: There is no way to emit value 0 (wchar_t prohibited). + if (auto WCharWidthValue = cast_or_null<ConstantInt>( + SourceModule->getModuleFlag("wchar_size"))) { + int WCharWidth = WCharWidthValue->getZExtValue(); + assert((WCharWidth == 2 || WCharWidth == 4) && + "wchar_t width must be 2 or 4 bytes"); + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_wchar_t, WCharWidth); + } + + // ABI_enum_size to indicate enum width + // FIXME: There is no way to emit value 0 (enums prohibited) or value 3 + // (all enums contain a value needing 32 bits to encode). + if (auto EnumWidthValue = cast_or_null<ConstantInt>( + SourceModule->getModuleFlag("min_enum_size"))) { + int EnumWidth = EnumWidthValue->getZExtValue(); + assert((EnumWidth == 1 || EnumWidth == 4) && + "Minimum enum width must be 1 or 4 bytes"); + int EnumBuildAttr = EnumWidth == 1 ? 1 : 2; + ATS.emitAttribute(ARMBuildAttrs::ABI_enum_size, EnumBuildAttr); + } + } + } + if (Subtarget->hasTrustZone() && Subtarget->hasVirtualization()) ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, ARMBuildAttrs::AllowTZVirtualization); @@ -768,23 +795,41 @@ getModifierVariantKind(ARMCP::ARMCPModifier Modifier) { MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, unsigned char TargetFlags) { - bool isIndirect = Subtarget->isTargetMachO() && - (TargetFlags & ARMII::MO_NONLAZY) && - Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel()); - if (!isIndirect) - return getSymbol(GV); + if (Subtarget->isTargetMachO()) { + bool IsIndirect = (TargetFlags & ARMII::MO_NONLAZY) && + Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel()); + + if (!IsIndirect) + return getSymbol(GV); - // FIXME: Remove this when Darwin transition to @GOT like syntax. - MCSymbol *MCSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); - MachineModuleInfoMachO &MMIMachO = - MMI->getObjFileInfo<MachineModuleInfoMachO>(); - MachineModuleInfoImpl::StubValueTy &StubSym = - GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym) : - MMIMachO.getGVStubEntry(MCSym); - if (!StubSym.getPointer()) - StubSym = MachineModuleInfoImpl:: - StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); - return MCSym; + // FIXME: Remove this when Darwin transition to @GOT like syntax. + MCSymbol *MCSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoMachO &MMIMachO = + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + MachineModuleInfoImpl::StubValueTy &StubSym = + GV->hasHiddenVisibility() ? 
MMIMachO.getHiddenGVStubEntry(MCSym) + : MMIMachO.getGVStubEntry(MCSym); + if (!StubSym.getPointer()) + StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV), + !GV->hasInternalLinkage()); + return MCSym; + } else if (Subtarget->isTargetCOFF()) { + assert(Subtarget->isTargetWindows() && + "Windows is the only supported COFF target"); + + bool IsIndirect = (TargetFlags & ARMII::MO_DLLIMPORT); + if (!IsIndirect) + return getSymbol(GV); + + SmallString<128> Name; + Name = "__imp_"; + getNameWithPrefix(Name, GV); + + return OutContext.GetOrCreateSymbol(Name); + } else if (Subtarget->isTargetELF()) { + return getSymbol(GV); + } + llvm_unreachable("unexpected target"); } void ARMAsmPrinter:: @@ -928,7 +973,7 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) { for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { MachineBasicBlock *MBB = JTBBs[i]; const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::Create(MBB->getSymbol(), - OutContext); + OutContext); // If this isn't a TBB or TBH, the entries are direct branch instructions. if (OffsetWidth == 4) { EmitToStreamer(OutStreamer, MCInstBuilder(ARM::t2B) @@ -1225,8 +1270,10 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // Add 's' bit operand (always reg0 for this) .addReg(0)); - const GlobalValue *GV = MI->getOperand(0).getGlobal(); - MCSymbol *GVSym = getSymbol(GV); + const MachineOperand &Op = MI->getOperand(0); + const GlobalValue *GV = Op.getGlobal(); + const unsigned TF = Op.getTargetFlags(); + MCSymbol *GVSym = GetARMGVSymbol(GV, TF); const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext); EmitToStreamer(OutStreamer, MCInstBuilder(ARM::Bcc) .addExpr(GVSymExpr) diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index bc266e88b2df..0288db91dcbf 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -102,14 +103,15 @@ ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI) // Use a ScoreboardHazardRecognizer for prepass ARM scheduling. TargetInstrImpl // currently defaults to no prepass hazard recognizer. -ScheduleHazardRecognizer *ARMBaseInstrInfo:: -CreateTargetHazardRecognizer(const TargetMachine *TM, - const ScheduleDAG *DAG) const { +ScheduleHazardRecognizer * +ARMBaseInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, + const ScheduleDAG *DAG) const { if (usePreRAHazardRecognizer()) { - const InstrItineraryData *II = TM->getInstrItineraryData(); + const InstrItineraryData *II = + &static_cast<const ARMSubtarget *>(STI)->getInstrItineraryData(); return new ScoreboardHazardRecognizer(II, DAG, "pre-RA-sched"); } - return TargetInstrInfo::CreateTargetHazardRecognizer(TM, DAG); + return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG); } ScheduleHazardRecognizer *ARMBaseInstrInfo:: @@ -1885,7 +1887,8 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, unsigned NumBytes) { // This optimisation potentially adds lots of load and store // micro-operations; it's really only a benefit to code-size.
- if (!Subtarget.isMinSize()) + if (!MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::MinSize)) return false; // If only one register is pushed/popped, LLVM can use an LDR/STR @@ -4358,6 +4361,29 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, MI->addRegisterKilled(DReg, TRI, true); } +void ARMBaseInstrInfo::getUnconditionalBranch( + MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const { + if (Subtarget.isThumb()) + Branch.setOpcode(ARM::tB); + else if (Subtarget.isThumb2()) + Branch.setOpcode(ARM::t2B); + else + Branch.setOpcode(ARM::Bcc); + + Branch.addOperand(MCOperand::CreateExpr(BranchTarget)); + Branch.addOperand(MCOperand::CreateImm(ARMCC::AL)); + Branch.addOperand(MCOperand::CreateReg(0)); +} + +void ARMBaseInstrInfo::getTrap(MCInst &MI) const { + if (Subtarget.isThumb()) + MI.setOpcode(ARM::tTRAP); + else if (Subtarget.useNaClTrap()) + MI.setOpcode(ARM::TRAPNaCl); + else + MI.setOpcode(ARM::TRAP); +} + bool ARMBaseInstrInfo::hasNOP() const { return (Subtarget.getFeatureBits() & ARM::HasV6T2Ops) != 0; } diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 4b3e74023ac9..b8d675806b25 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -50,7 +50,7 @@ public: const ARMSubtarget &getSubtarget() const { return Subtarget; } ScheduleHazardRecognizer * - CreateTargetHazardRecognizer(const TargetMachine *TM, + CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, const ScheduleDAG *DAG) const override; ScheduleHazardRecognizer * @@ -229,6 +229,13 @@ public: const TargetRegisterInfo*) const override; void breakPartialRegDependency(MachineBasicBlock::iterator, unsigned, const TargetRegisterInfo *TRI) const override; + + void + getUnconditionalBranch(MCInst &Branch, + const MCSymbolRefExpr *BranchTarget) const override; + + void getTrap(MCInst &MI) const override; + /// Get the number of addresses by LDM or VLDM or zero for unknown. unsigned getNumLDMAddresses(const MachineInstr *MI) const; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index a2eee9ff3048..cdd91c7a7036 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -45,9 +45,12 @@ using namespace llvm; ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMSubtarget &sti) : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), STI(sti), BasePtr(ARM::R6) { - if (STI.isTargetMachO()) - FramePtr = ARM::R7; - else if (STI.isTargetWindows()) + if (STI.isTargetMachO()) { + if (STI.isTargetDarwin() || STI.isThumb1Only()) + FramePtr = ARM::R7; + else + FramePtr = ARM::R11; + } else if (STI.isTargetWindows()) FramePtr = ARM::R11; else // ARM EABI FramePtr = STI.isThumb() ? 
ARM::R7 : ARM::R11; diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index 2fd7eddd8748..5fb6ebfeaae1 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -15,6 +15,7 @@ #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMConstantPoolValue.h" +#include "ARMMachineFunctionInfo.h" #include "ARMRelocations.h" #include "ARMSubtarget.h" #include "ARMTargetMachine.h" diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 6045738e2e34..51d3dbb5bd8e 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -927,10 +927,16 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, } case ARM::tTPsoft: case ARM::TPsoft: { - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(Opcode == ARM::tTPsoft ? ARM::tBL : ARM::BL)) - .addExternalSymbol("__aeabi_read_tp", 0); + MachineInstrBuilder MIB; + if (Opcode == ARM::tTPsoft) + MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::tBL)) + .addImm((unsigned)ARMCC::AL).addReg(0) + .addExternalSymbol("__aeabi_read_tp", 0); + else + MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::BL)) + .addExternalSymbol("__aeabi_read_tp", 0); MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); TransferImpOps(MI, MIB, MIB); diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 6f8fb1a3521c..e2d90cd9306c 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -590,7 +590,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { // Use movw+movt when possible; it avoids constant pool entries. // Non-darwin targets only support static movt relocations in FastISel. - if (Subtarget->useMovt() && + if (Subtarget->useMovt(*FuncInfo.MF) && (Subtarget->isTargetMachO() || RelocM == Reloc::Static)) { unsigned Opc; unsigned char TF = 0; diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 0caf4bfd77a5..a67b3600c4fe 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -39,6 +39,10 @@ static MachineBasicBlock::iterator skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI, unsigned NumAlignedDPRCS2Regs); +ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti) + : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4), + STI(sti) {} + /// hasFP - Return true if the specified function should have a dedicated frame /// pointer register. This is true if the function has variable-sized allocas /// or if frame pointer elimination is disabled.
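/// For example: a function that calls alloca() with a runtime-determined size, /// or any function built with frame-pointer elimination disabled (e.g. via /// -fno-omit-frame-pointer).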
@@ -220,7 +224,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { case ARM::R10: case ARM::R11: case ARM::R12: - if (STI.isTargetMachO()) { + if (STI.isTargetDarwin()) { GPRCS2Size += 4; break; } @@ -380,7 +384,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { case ARM::R10: case ARM::R11: case ARM::R12: - if (STI.isTargetMachO()) + if (STI.isTargetDarwin()) break; // fallthrough case ARM::R0: @@ -445,7 +449,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { case ARM::R10: case ARM::R11: case ARM::R12: - if (STI.isTargetMachO()) { + if (STI.isTargetDarwin()) { unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); unsigned Offset = MFI->getObjectOffset(FI); unsigned CFIIndex = MMI.addFrameInst( @@ -810,7 +814,7 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, unsigned LastReg = 0; for (; i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); - if (!(Func)(Reg, STI.isTargetMachO())) continue; + if (!(Func)(Reg, STI.isTargetDarwin())) continue; // D-registers in the aligned area DPRCS2 are NOT spilled here. if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs) @@ -888,7 +892,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, bool DeleteRet = false; for (; i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); - if (!(Func)(Reg, STI.isTargetMachO())) continue; + if (!(Func)(Reg, STI.isTargetDarwin())) continue; // The aligned reloads from area DPRCS2 are not inserted here. if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs) @@ -1438,7 +1442,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, if (Spilled) { NumGPRSpills++; - if (!STI.isTargetMachO()) { + if (!STI.isTargetDarwin()) { if (Reg == ARM::LR) LRSpilled = true; CS1Spilled = true; @@ -1460,7 +1464,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, break; } } else { - if (!STI.isTargetMachO()) { + if (!STI.isTargetDarwin()) { UnspilledCS1GPRs.push_back(Reg); continue; } diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h index 981d3209710e..709afbcdc681 100644 --- a/lib/Target/ARM/ARMFrameLowering.h +++ b/lib/Target/ARM/ARMFrameLowering.h @@ -14,7 +14,6 @@ #ifndef ARM_FRAMEINFO_H #define ARM_FRAMEINFO_H -#include "ARMSubtarget.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { @@ -25,10 +24,7 @@ protected: const ARMSubtarget &STI; public: - explicit ARMFrameLowering(const ARMSubtarget &sti) - : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4), - STI(sti) { - } + explicit ARMFrameLowering(const ARMSubtarget &sti); /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 08d598d7c5a6..38547cfae2e0 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -60,22 +60,17 @@ enum AddrMode2Type { }; class ARMDAGToDAGISel : public SelectionDAGISel { - ARMBaseTargetMachine &TM; - /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can /// make the right decision when generating code for different targets. 
const ARMSubtarget *Subtarget; public: - explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm, - CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), TM(tm), - Subtarget(&TM.getSubtarget<ARMSubtarget>()) { - } + explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm, CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel) {} bool runOnMachineFunction(MachineFunction &MF) override { // Reset the subtarget each time through. - Subtarget = &TM.getSubtarget<ARMSubtarget>(); + Subtarget = &MF.getTarget().getSubtarget<ARMSubtarget>(); SelectionDAGISel::runOnMachineFunction(MF); return true; } @@ -429,8 +424,8 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const { if (Use->getOpcode() == ISD::CopyToReg) return true; if (Use->isMachineOpcode()) { - const ARMBaseInstrInfo *TII = - static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo()); + const ARMBaseInstrInfo *TII = static_cast<const ARMBaseInstrInfo *>( + CurDAG->getTarget().getInstrInfo()); const MCInstrDesc &MCID = TII->get(Use->getMachineOpcode()); if (MCID.mayStore()) @@ -2444,7 +2439,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case ISD::Constant: { unsigned Val = cast<ConstantSDNode>(N)->getZExtValue(); bool UseCP = true; - if (Subtarget->useMovt()) + if (Subtarget->useMovt(*MF)) // Thumb2-aware targets have the MOVT instruction, so all immediates can // be done with MOV + MOVT, at worst. UseCP = false; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 00d07e840674..4bfa5a8b5c62 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -155,16 +155,16 @@ void ARMTargetLowering::addQRTypeForNEON(MVT VT) { addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); } -static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) { - if (TM.getSubtarget<ARMSubtarget>().isTargetMachO()) +static TargetLoweringObjectFile *createTLOF(const Triple &TT) { + if (TT.isOSBinFormatMachO()) return new TargetLoweringObjectFileMachO(); - if (TM.getSubtarget<ARMSubtarget>().isTargetWindows()) + if (TT.isOSWindows()) return new TargetLoweringObjectFileCOFF(); return new ARMElfTargetObjectFile(); } ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) - : TargetLowering(TM, createTLOF(TM)) { + : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) { Subtarget = &TM.getSubtarget<ARMSubtarget>(); RegInfo = TM.getRegisterInfo(); Itins = TM.getInstrItineraryData(); @@ -710,7 +710,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setExceptionSelectorRegister(ARM::R1); } - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); + if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + else + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); + // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use // the default expansion. 
  if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
@@ -983,6 +987,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
 
+  case ARMISD::WIN__CHKSTK:   return "ARMISD::WIN__CHKSTK";
+
   case ARMISD::VCEQ:          return "ARMISD::VCEQ";
   case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
   case ARMISD::VCGE:          return "ARMISD::VCGE";
@@ -1199,7 +1205,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
   case CallingConv::C:
     if (!Subtarget->isAAPCS_ABI())
       return CallingConv::ARM_APCS;
-    else if (Subtarget->hasVFP2() &&
+    else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() &&
             getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
             !isVarArg)
       return CallingConv::ARM_AAPCS_VFP;
@@ -1207,10 +1213,10 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
       return CallingConv::ARM_AAPCS;
   case CallingConv::Fast:
     if (!Subtarget->isAAPCS_ABI()) {
-      if (Subtarget->hasVFP2() && !isVarArg)
+      if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
        return CallingConv::Fast;
      return CallingConv::ARM_APCS;
-    } else if (Subtarget->hasVFP2() && !isVarArg)
+    } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
      return CallingConv::ARM_AAPCS_VFP;
    else
      return CallingConv::ARM_AAPCS;
@@ -1598,8 +1604,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
 
   if (EnableARMLongCalls) {
-    assert (getTargetMachine().getRelocationModel() == Reloc::Static
-            && "long-calls with non-static relocation model!");
+    assert((Subtarget->isTargetWindows() ||
+            getTargetMachine().getRelocationModel() == Reloc::Static) &&
+           "long-calls with non-static relocation model!");
     // Handle a global address or an external symbol. If it's not one of
     // those, the target's already in a register, so we don't need to do
     // anything extra.
@@ -1647,6 +1654,19 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
       Callee = DAG.getNode(ARMISD::WrapperPIC, dl, getPointerTy(),
                            DAG.getTargetGlobalAddress(GV, dl, getPointerTy()));
+    } else if (Subtarget->isTargetCOFF()) {
+      assert(Subtarget->isTargetWindows() &&
+             "Windows is the only supported COFF target");
+      unsigned TargetFlags = GV->hasDLLImportStorageClass()
+                                 ? ARMII::MO_DLLIMPORT
+                                 : ARMII::MO_NO_FLAG;
+      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), /*Offset=*/0,
+                                          TargetFlags);
+      if (GV->hasDLLImportStorageClass())
+        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                             DAG.getNode(ARMISD::Wrapper, dl, getPointerTy(),
+                                         Callee), MachinePointerInfo::getGOT(),
+                             false, false, false, 0);
     } else {
       // On ELF targets for PIC code, direct calls should go through the PLT
       unsigned OpFlags = 0;
@@ -1688,7 +1708,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // FIXME: handle tail calls differently.
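The getEffectiveCallingConv hunk above adds !isThumb1Only() to every AAPCS-VFP path: hard-float register passing requires access to the VFP register file, which Thumb1-only cores do not have even when an FPU is present. A minimal standalone sketch of that decision (flag parameters stand in for the subtarget and target-machine queries; not LLVM's API):

#include <cstdio>

enum class CC { ARM_APCS, ARM_AAPCS, ARM_AAPCS_VFP };

// Mirrors the CallingConv::C path: AAPCS-VFP needs VFP2, a non-Thumb1-only
// core, the hard-float ABI, and no varargs; otherwise fall back to AAPCS or,
// off the AAPCS ABI entirely, to APCS.
CC effectiveCC(bool aapcsABI, bool hasVFP2, bool thumb1Only, bool hardFloat,
               bool isVarArg) {
  if (!aapcsABI)
    return CC::ARM_APCS;
  if (hasVFP2 && !thumb1Only && hardFloat && !isVarArg)
    return CC::ARM_AAPCS_VFP;
  return CC::ARM_AAPCS;
}

int main() {
  // A Thumb1-only, hard-float target still gets soft AAPCS argument passing.
  std::printf("%d\n", static_cast<int>(effectiveCC(true, true, true, true, false)));
  return 0;
}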
unsigned CallOpc; - bool HasMinSizeAttr = Subtarget->isMinSize(); + bool HasMinSizeAttr = MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::MinSize); if (Subtarget->isThumb()) { if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; @@ -2326,7 +2347,8 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()), - DAG.getExternalSymbol("__tls_get_addr", PtrVT), &Args, 0); + DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args), + 0); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.first; @@ -2434,7 +2456,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, // If we have T2 ops, we can materialize the address directly via movt/movw // pair. This is always cheaper. - if (Subtarget->useMovt()) { + if (Subtarget->useMovt(DAG.getMachineFunction())) { ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes. @@ -2456,7 +2478,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); - if (Subtarget->useMovt()) + if (Subtarget->useMovt(DAG.getMachineFunction())) ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register @@ -2476,18 +2498,27 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); - assert(Subtarget->useMovt() && "Windows on ARM expects to use movw/movt"); + assert(Subtarget->useMovt(DAG.getMachineFunction()) && + "Windows on ARM expects to use movw/movt"); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + const ARMII::TOF TargetFlags = + (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG); EVT PtrVT = getPointerTy(); + SDValue Result; SDLoc DL(Op); ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes. - return DAG.getNode(ARMISD::Wrapper, DL, PtrVT, - DAG.getTargetGlobalAddress(GV, DL, PtrVT)); + Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, + DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, + TargetFlags)); + if (GV->hasDLLImportStorageClass()) + Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(), false, false, false, 0); + return Result; } SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, @@ -2535,6 +2566,11 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. 
+  case Intrinsic::arm_rbit: {
+    assert(Op.getOperand(0).getValueType() == MVT::i32 &&
+           "RBIT intrinsic must have i32 type!");
+    return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(0));
+  }
   case Intrinsic::arm_thread_pointer: {
     EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
     return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
@@ -4492,6 +4528,11 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
       BitMask <<= 8;
       ImmMask <<= 1;
     }
+
+    if (DAG.getTargetLoweringInfo().isBigEndian())
+      // swap higher and lower 32 bit word
+      Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
+
     // Op=1, Cmode=1110.
     OpCmode = 0x1e;
     VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
@@ -6078,7 +6119,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
     .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), Callee,
-               &Args, 0)
+               std::move(Args), 0)
     .setDiscardResult();
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
 
@@ -6213,6 +6254,10 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FSINCOS:      return LowerFSINCOS(Op, DAG);
   case ISD::SDIVREM:
   case ISD::UDIVREM:      return LowerDivRem(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC:
+    if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+      return LowerDYNAMIC_STACKALLOC(Op, DAG);
+    llvm_unreachable("Don't know how to custom lower this!");
   }
 }
@@ -7112,6 +7157,73 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
 }
 
 MachineBasicBlock *
+ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
+                                       MachineBasicBlock *MBB) const {
+  const TargetMachine &TM = getTargetMachine();
+  const TargetInstrInfo &TII = *TM.getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  assert(Subtarget->isTargetWindows() &&
+         "__chkstk is only supported on Windows");
+  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
+
+  // __chkstk takes the number of words to allocate on the stack in R4, and
+  // returns the stack adjustment in number of bytes in R4. This will not
+  // clobber any other registers (other than the obvious lr).
+  //
+  // Although, technically, IP should be considered a register which may be
+  // clobbered, the call itself will not touch it. Windows on ARM is a pure
+  // thumb-2 environment, so there is no interworking required. As a result, we
+  // do not expect a veneer to be emitted by the linker, clobbering IP.
+  //
+  // Each module receives its own copy of __chkstk, so no import thunk is
+  // required, again, ensuring that IP is not clobbered.
+  //
+  // Finally, although some linkers may theoretically provide a trampoline for
+  // out of range calls (which is quite common due to a 32M range limitation of
+  // branches for Thumb), we can generate the long-call version via
+  // -mcmodel=large, alleviating the need for the trampoline which may clobber
+  // IP.
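The __chkstk contract described in that comment, modeled as a standalone simulation (plain C++; chkstk here is a stand-in that only models the data flow, with a printf in place of a real stack touch): R4 carries the allocation size in words going in, the helper probes one 4 KiB page at a time so the OS guard pages are committed in order, and R4 carries the adjustment in bytes coming out, which the caller subtracts from SP.

#include <cstdint>
#include <cstdio>

constexpr uint32_t PageSize = 4096;

// R4 in: words to allocate. R4 out: byte adjustment for SP.
uint32_t chkstk(uint32_t words) {
  uint32_t bytes = words * 4;
  for (uint32_t probe = PageSize; probe < bytes; probe += PageSize)
    std::printf("probe stack page at SP-%u\n", probe);  // stand-in for a touch
  return bytes;
}

int main() {
  uint32_t sp = 0x00100000;
  uint32_t allocaBytes = 10000;
  sp -= chkstk(allocaBytes >> 2);  // the DAG lowering computes words as size >> 2
  std::printf("new SP = %#x\n", sp);
  return 0;
}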
+ + switch (TM.getCodeModel()) { + case CodeModel::Small: + case CodeModel::Medium: + case CodeModel::Default: + case CodeModel::Kernel: + BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) + .addImm((unsigned)ARMCC::AL).addReg(0) + .addExternalSymbol("__chkstk") + .addReg(ARM::R4, RegState::Implicit | RegState::Kill) + .addReg(ARM::R4, RegState::Implicit | RegState::Define) + .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead); + break; + case CodeModel::Large: + case CodeModel::JITDefault: { + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); + + BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) + .addExternalSymbol("__chkstk"); + BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) + .addImm((unsigned)ARMCC::AL).addReg(0) + .addReg(Reg, RegState::Kill) + .addReg(ARM::R4, RegState::Implicit | RegState::Kill) + .addReg(ARM::R4, RegState::Implicit | RegState::Define) + .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead); + break; + } + } + + AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), + ARM::SP) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::R4, RegState::Kill))); + + MI->eraseFromParent(); + return MBB; +} + +MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); @@ -7360,6 +7472,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::COPY_STRUCT_BYVAL_I32: ++NumLoopByVals; return EmitStructByval(MI, BB); + case ARM::WIN__CHKSTK: + return EmitLowered__chkstk(MI, BB); } } @@ -8315,6 +8429,8 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, std::min(4U, LD->getAlignment() / 2)); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); + if (DCI.DAG.getTargetLoweringInfo().isBigEndian()) + std::swap (NewLD1, NewLD2); SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); DCI.RemoveFromWorklist(LD); DAG.DeleteNode(LD); @@ -8382,7 +8498,8 @@ static SDValue PerformSTORECombine(SDNode *N, SDLoc DL(St); SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio; + for (unsigned i = 0; i < NumElems; ++i) + ShuffleVec[i] = TLI.isBigEndian() ? (i+1) * SizeRatio - 1 : i * SizeRatio; // Can't shuffle using an illegal type. if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); @@ -10471,13 +10588,39 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(InChain) - .setCallee(getLibcallCallingConv(LC), RetTy, Callee, &Args, 0) + .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0) .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); return CallInfo.first; } +SDValue +ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetWindows() && "unsupported target platform"); + SDLoc DL(Op); + + // Get the inputs. 
+ SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + + SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, + DAG.getConstant(2, MVT::i32)); + + SDValue Flag; + Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); + Flag = Chain.getValue(1); + + SDVTList NodeTys = DAG.getVTList(MVT::i32, MVT::Glue); + Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); + + SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); + Chain = NewSP.getValue(1); + + SDValue Ops[2] = { NewSP, Chain }; + return DAG.getMergeValues(Ops, DL); +} + bool ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The ARM target isn't yet aware of offsets. @@ -10635,14 +10778,20 @@ bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, bool ARMTargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { // Loads and stores less than 64-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when - // things go wrong: - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) - return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 64; - else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) - return LI->getType()->getPrimitiveSizeInBits() == 64; - - // For the real atomic operations, we have ldrex/strex up to 64 bits. - return Inst->getType()->getPrimitiveSizeInBits() <= 64; + // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit + // anything for those. + bool IsMClass = Subtarget->isMClass(); + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); + return Size == 64 && !IsMClass; + } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + return LI->getType()->getPrimitiveSizeInBits() == 64 && !IsMClass; + } + + // For the real atomic operations, we have ldrex/strex up to 32 bits, + // and up to 64 bits on the non-M profiles + unsigned AtomicLimit = IsMClass ? 32 : 64; + return Inst->getType()->getPrimitiveSizeInBits() <= AtomicLimit; } Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index c15305c5e603..1ace0f330f1a 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -95,6 +95,8 @@ namespace llvm { PRELOAD, // Preload + WIN__CHKSTK, // Windows' __chkstk call to do stack probing. + VCEQ, // Vector compare equal. VCEQZ, // Vector compare equal to zero. VCGE, // Vector compare greater than or equal. 
@@ -470,6 +472,7 @@ namespace llvm { const ARMSubtarget *ST) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; unsigned getRegisterByName(const char* RegName, EVT VT) const override; @@ -578,6 +581,9 @@ namespace llvm { MachineBasicBlock *EmitStructByval(MachineInstr *MI, MachineBasicBlock *MBB) const; + + MachineBasicBlock *EmitLowered__chkstk(MachineInstr *MI, + MachineBasicBlock *MBB) const; }; enum NEONModImmType { diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 718d5da9d05a..2bb897659559 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -270,8 +270,8 @@ def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">, def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">; // FIXME: Eventually this will be just "hasV6T2Ops". -def UseMovt : Predicate<"Subtarget->useMovt()">; -def DontUseMovt : Predicate<"!Subtarget->useMovt()">; +def UseMovt : Predicate<"Subtarget->useMovt(*MF)">; +def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">; def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">; def UseMulOps : Predicate<"Subtarget->useMulOps()">; @@ -493,7 +493,7 @@ def neon_vcvt_imm32 : Operand<i32> { // rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24. def rot_imm_XFORM: SDNodeXForm<imm, [{ switch (N->getZExtValue()){ - default: assert(0); + default: llvm_unreachable(nullptr); case 0: return CurDAG->getTargetConstant(0, MVT::i32); case 8: return CurDAG->getTargetConstant(1, MVT::i32); case 16: return CurDAG->getTargetConstant(2, MVT::i32); @@ -594,7 +594,7 @@ def so_imm2part : PatLeaf<(imm), [{ /// arm_i32imm - True for +V6T2, or true only if so_imm2part is true. 
 ///
 def arm_i32imm : PatLeaf<(imm), [{
-  if (Subtarget->useMovt())
+  if (Subtarget->useMovt(*MF))
     return true;
   return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
 }]>;
@@ -3334,8 +3334,8 @@ def SBFX  : I<(outs GPRnopc:$Rd),
   let Inst{3-0} = Rn;
 }
 
-def UBFX : I<(outs GPR:$Rd),
-              (ins GPR:$Rn, imm0_31:$lsb, imm1_32:$width),
+def UBFX : I<(outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width),
               AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
               "ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
               Requires<[IsARM, HasV6T2]> {
@@ -4443,7 +4443,7 @@ def instsyncb_opt : Operand<i32> {
   let DecoderMethod = "DecodeInstSyncBarrierOption";
 }
 
-// memory barriers protect the atomic sequences
+// Memory barriers protect the atomic sequences
 let hasSideEffects = 1 in {
 def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
                 "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
@@ -4452,7 +4452,6 @@ def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
   let Inst{31-4} = 0xf57ff05;
   let Inst{3-0} = opt;
 }
-}
 
 def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
                 "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
@@ -4464,12 +4463,13 @@ def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
 
 // ISB has only full system option
 def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary,
-                "isb", "\t$opt", []>,
+                "isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>,
                 Requires<[IsARM, HasDB]> {
   bits<4> opt;
   let Inst{31-4} = 0xf57ff06;
   let Inst{3-0} = opt;
 }
+}
 
 let usesCustomInserter = 1, Defs = [CPSR] in {
@@ -5093,6 +5093,19 @@ def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, so_imm:$a), NoItinerary,
   let Inst{11-0} = a;
 }
 
+// Dynamic stack allocation yields a _chkstk for Windows targets. These calls
+// are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in correct sequence.
+// The main point of having a separate instruction is the extra unmodelled
+// effects (compared to ordinary calls) like the stack pointer change.
+
+def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone,
+                         [SDNPHasChain, SDNPSideEffect]>;
+let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in
+  def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>;
+
 //===----------------------------------------------------------------------===//
 // TLS Instructions
 //
@@ -5100,9 +5113,11 @@ def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, so_imm:$a), NoItinerary,
 // __aeabi_read_tp preserves the registers r1-r3.
 // This is a pseudo inst so that we can get the encoding right,
 // complete with fixup for the aeabi_read_tp function.
+// TPsoft is valid for ARM mode only; for Thumb mode, a tTPsoft pattern
+// is defined in "ARMInstrThumb.td".
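Returning to the arm_i32imm predicate above: without movw/movt it falls back to ARM_AM::isSOImmTwoPartVal, asking whether the constant splits into two ARM "modified immediates" (an 8-bit value rotated right by an even amount) so a MOV+ORR pair can build it. A standalone sketch of both checks (plain C++, not the ARM_AM helpers themselves):

#include <cstdint>
#include <cstdio>

static uint32_t rotr32(uint32_t v, unsigned n) {
  n &= 31;
  return n ? (v >> n) | (v << (32 - n)) : v;
}

// Encodable as one modified immediate iff all set bits fit inside a single
// byte field rotated right by an even amount.
bool isSOImm(uint32_t v) {
  for (unsigned rot = 0; rot < 32; rot += 2)
    if ((v & ~rotr32(0xffu, rot)) == 0)
      return true;
  return false;
}

// Two-part: strip one encodable field greedily; whatever remains must itself
// be a single modified immediate (materialized with MOV then ORR).
bool isSOImmTwoPart(uint32_t v) {
  if (isSOImm(v))
    return false;  // a single MOV already works
  for (unsigned rot = 0; rot < 32; rot += 2) {
    uint32_t rest = v & ~rotr32(0xffu, rot);
    if (rest != v && isSOImm(rest))
      return true;
  }
  return false;
}

int main() {
  std::printf("%d %d\n",
              isSOImmTwoPart(0x00ff00ffu),   // 1: 0xff | (0xff << 16)
              isSOImmTwoPart(0x12345678u));  // 0: this one needs movw/movt
  return 0;
}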
let isCall = 1, Defs = [R0, R12, LR, CPSR], Uses = [SP] in { - def TPsoft : PseudoInst<(outs), (ins), IIC_Br, + def TPsoft : ARMPseudoInst<(outs), (ins), 4, IIC_Br, [(set R0, ARMthread_pointer)]>, Sched<[WriteBr]>; } diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index b32b5d24af19..c02bb3b55f5b 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -6372,6 +6372,32 @@ multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy, dsub_0)>; } +// The following class definition is basically a copy of the +// Lengthen_HalfSingle definition above, however with an additional parameter +// "RevLanes" to select the correct VREV32dXX instruction. This is to convert +// data loaded by VLD1LN into proper vector format in big endian mode. +multiclass Lengthen_HalfSingle_Big_Endian<string DestLanes, string DestTy, string SrcTy, + string InsnLanes, string InsnTy, string RevLanes> { + def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy) + (!cast<Instruction>("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)>; + def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy) + (!cast<Instruction>("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)>; + def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy) + (!cast<Instruction>("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)>; +} + // extload, zextload and sextload for a lengthening load followed by another // lengthening load, to quadruple the initial length. // @@ -6406,6 +6432,36 @@ multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy, dsub_0))>; } +// The following class definition is basically a copy of the +// Lengthen_Double definition above, however with an additional parameter +// "RevLanes" to select the correct VREV32dXX instruction. This is to convert +// data loaded by VLD1LN into proper vector format in big endian mode. 
+multiclass Lengthen_Double_Big_Endian<string DestLanes, string DestTy, string SrcTy, + string Insn1Lanes, string Insn1Ty, string Insn2Lanes, + string Insn2Ty, string RevLanes> { + def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)), + (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) + (!cast<Instruction>("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0))>; + def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)), + (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) + (!cast<Instruction>("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0))>; + def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)), + (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty) + (!cast<Instruction>("VREV32d" # RevLanes) + (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0))>; +} + // extload, zextload and sextload for a lengthening load followed by another // lengthening load, to quadruple the initial length, but which ends up only // requiring half the available lanes (a 64-bit outcome instead of a 128-bit). @@ -6443,33 +6499,102 @@ multiclass Lengthen_HalfDouble<string DestLanes, string DestTy, string SrcTy, dsub_0)>; } +// The following class definition is basically a copy of the +// Lengthen_HalfDouble definition above, however with an additional VREV16d8 +// instruction to convert data loaded by VLD1LN into proper vector format +// in big endian mode. 
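An intuition-level model of the lane problem all of these *_Big_Endian multiclasses solve (a standalone C++ sketch, deliberately simplified; real NEON big-endian lane semantics are more involved): VLD1LN loads the data as one wide element, so on a big-endian core the narrow sub-elements land in reversed lane order relative to the little-endian layout the VMOVL widening patterns assume, and a VREV swap of the sub-elements restores the expected order.

#include <cstdint>
#include <cstdio>

// Pack two 16-bit elements the way a single 32-bit lane load sees them on a
// big-endian core: the first element in memory lands in the high half.
uint32_t loadLaneBE(const uint16_t mem[2]) {
  return (uint32_t(mem[0]) << 16) | mem[1];
}

// VREV32.16-style fix: swap the 16-bit halves within each 32-bit word.
uint32_t rev32_16(uint32_t w) { return (w >> 16) | (w << 16); }

int main() {
  const uint16_t mem[2] = {0x1111, 0x2222};
  uint32_t lane = loadLaneBE(mem);
  std::printf("element 0 before: %#x\n", lane & 0xffffu);                    // 0x2222: wrong lane
  std::printf("element 0 after rev32.16: %#x\n", rev32_16(lane) & 0xffffu); // 0x1111: fixed
  return 0;
}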
+multiclass Lengthen_HalfDouble_Big_Endian<string DestLanes, string DestTy, string SrcTy, + string Insn1Lanes, string Insn1Ty, string Insn2Lanes, + string Insn2Ty> { + def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("extloadv" # SrcTy) addrmode6:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) + (!cast<Instruction>("VREV16d8") + (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)), + dsub_0)>; + def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty) + (!cast<Instruction>("VREV16d8") + (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)), + dsub_0)>; + def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy) + (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)), + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty) + (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty) + (!cast<Instruction>("VREV16d8") + (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))), + dsub_0)), + dsub_0)>; +} + defm : Lengthen_Single<"8", "i16", "8">; // v8i8 -> v8i16 defm : Lengthen_Single<"4", "i32", "16">; // v4i16 -> v4i32 defm : Lengthen_Single<"2", "i64", "32">; // v2i32 -> v2i64 -defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16 -defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32 +let Predicates = [IsLE] in { + defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16 + defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32 -// Double lengthening - v4i8 -> v4i16 -> v4i32 -defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32">; -// v2i8 -> v2i16 -> v2i32 -defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">; -// v2i16 -> v2i32 -> v2i64 -defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">; + // Double lengthening - v4i8 -> v4i16 -> v4i32 + defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32">; + // v2i8 -> v2i16 -> v2i32 + defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">; + // v2i16 -> v2i32 -> v2i64 + defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">; +} + +let Predicates = [IsBE] in { + defm : Lengthen_HalfSingle_Big_Endian<"4", "i16", "i8", "8", "i16", "8">; // v4i8 -> v4i16 + defm : Lengthen_HalfSingle_Big_Endian<"2", "i32", "i16", "4", "i32", "16">; // v2i16 -> v2i32 + + // Double lengthening - v4i8 -> v4i16 -> v4i32 + defm : Lengthen_Double_Big_Endian<"4", "i32", "i8", "8", "i16", "4", "i32", "8">; + // v2i8 -> v2i16 -> v2i32 + defm : Lengthen_HalfDouble_Big_Endian<"2", "i32", "i8", "8", "i16", "4", "i32">; + // v2i16 -> v2i32 -> v2i64 + defm : Lengthen_Double_Big_Endian<"2", "i64", "i16", "4", "i32", "2", "i64", "16">; +} // Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64 -def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)), - (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 - (VLD1LNd16 addrmode6:$addr, - (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; -def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)), - (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 - (VLD1LNd16 addrmode6:$addr, - (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; -def : Pat<(v2i64 (sextloadvi8 
addrmode6:$addr)), - (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16 - (VLD1LNd16 addrmode6:$addr, - (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; +let Predicates = [IsLE] in { + def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)), + (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; + def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)), + (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; + def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)), + (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16 + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; +} +// The following patterns are basically a copy of the patterns above, +// however with an additional VREV16d instruction to convert data +// loaded by VLD1LN into proper vector format in big endian mode. +let Predicates = [IsBE] in { + def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)), + (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 + (!cast<Instruction>("VREV16d8") + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>; + def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)), + (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 + (!cast<Instruction>("VREV16d8") + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>; + def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)), + (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16 + (!cast<Instruction>("VREV16d8") + (VLD1LNd16 addrmode6:$addr, + (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>; +} //===----------------------------------------------------------------------===// // Assembler aliases diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index c30d6abbb299..85e93516807b 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -3209,27 +3209,28 @@ def t2MOVCCi32imm let hasSideEffects = 1 in { def t2DMB : T2I<(outs), (ins memb_opt:$opt), NoItinerary, "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>, - Requires<[HasDB]> { + Requires<[IsThumb, HasDB]> { bits<4> opt; let Inst{31-4} = 0xf3bf8f5; let Inst{3-0} = opt; } -} def t2DSB : T2I<(outs), (ins memb_opt:$opt), NoItinerary, "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>, - Requires<[HasDB]> { + Requires<[IsThumb, HasDB]> { bits<4> opt; let Inst{31-4} = 0xf3bf8f4; let Inst{3-0} = opt; } def t2ISB : T2I<(outs), (ins instsyncb_opt:$opt), NoItinerary, - "isb", "\t$opt", []>, Requires<[HasDB]> { + "isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>, + Requires<[IsThumb, HasDB]> { bits<4> opt; let Inst{31-4} = 0xf3bf8f6; let Inst{3-0} = opt; } +} class T2I_ldrex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz, InstrItinClass itin, string opc, string asm, string cstr, diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp index 8821c2dd09f8..6d1114d51aab 100644 --- a/lib/Target/ARM/ARMJITInfo.cpp +++ b/lib/Target/ARM/ARMJITInfo.cpp @@ -13,6 +13,7 @@ #include "ARMJITInfo.h" #include "ARMConstantPoolValue.h" +#include "ARMMachineFunctionInfo.h" #include "ARMRelocations.h" #include "MCTargetDesc/ARMBaseInfo.h" #include "llvm/CodeGen/JITCodeEmitter.h" @@ -334,3 +335,10 @@ void ARMJITInfo::relocate(void *Function, MachineRelocation *MR, } } } + +void 
ARMJITInfo::Initialize(const MachineFunction &MF, bool isPIC) { + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + ConstPoolId2AddrMap.resize(AFI->getNumPICLabels()); + JumpTableId2AddrMap.resize(AFI->getNumJumpTables()); + IsPIC = isPIC; +} diff --git a/lib/Target/ARM/ARMJITInfo.h b/lib/Target/ARM/ARMJITInfo.h index ee4c863543e7..27e2a2013404 100644 --- a/lib/Target/ARM/ARMJITInfo.h +++ b/lib/Target/ARM/ARMJITInfo.h @@ -14,7 +14,6 @@ #ifndef ARMJITINFO_H #define ARMJITINFO_H -#include "ARMMachineFunctionInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -103,12 +102,7 @@ namespace llvm { /// Resize constant pool ids to CONSTPOOL_ENTRY addresses map; resize /// jump table ids to jump table bases map; remember if codegen relocation /// model is PIC. - void Initialize(const MachineFunction &MF, bool isPIC) { - const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - ConstPoolId2AddrMap.resize(AFI->getNumPICLabels()); - JumpTableId2AddrMap.resize(AFI->getNumJumpTables()); - IsPIC = isPIC; - } + void Initialize(const MachineFunction &MF, bool isPIC); /// getConstantPoolEntryAddr - The ARM target puts all constant /// pool entries into constant islands. This returns the address of the diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index ee7df5476c71..a03bcdbddd79 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -505,7 +505,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, // Exception: If the base register is in the input reglist, Thumb1 LDM is // non-writeback. Check for this. - if (Opcode == ARM::tLDRi && isThumb1) + if (Opcode == ARM::tLDMIA && isThumb1) for (unsigned I = 0; I < NumRegs; ++I) if (Base == Regs[I].first) { Writeback = false; @@ -519,17 +519,17 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, // Update tLDMIA with writeback if necessary. Opcode = ARM::tLDMIA_UPD; - // The base isn't dead after a merged instruction with writeback. Update - // future uses of the base with the added offset (if possible), or reset - // the base register as necessary. - if (!BaseKill) - UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg); - MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)); // Thumb1: we might need to set base writeback when building the MI. MIB.addReg(Base, getDefRegState(true)) .addReg(Base, getKillRegState(BaseKill)); + + // The base isn't dead after a merged instruction with writeback. Update + // future uses of the base with the added offset (if possible), or reset + // the base register as necessary. + if (!BaseKill) + UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg); } else { // No writeback, simply build the MachineInstr. 
MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)); @@ -1734,6 +1734,12 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { isThumb2 = AFI->isThumb2Function(); isThumb1 = AFI->isThumbFunction() && !isThumb2; + // FIXME: Temporarily disabling for Thumb-1 due to miscompiles + if (isThumb1) { + delete RS; + return false; + } + bool Modified = false; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ++MFI) { diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index 48141b14e137..023f5f8e37a0 100644 --- a/lib/Target/ARM/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp @@ -34,7 +34,7 @@ MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO, OutContext); switch (Option) { default: llvm_unreachable("Unknown target flag on symbol operand"); - case 0: + case ARMII::MO_NO_FLAG: break; case ARMII::MO_LO16: Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/lib/Target/ARM/ARMMachineFunctionInfo.cpp index af445e2f35aa..892b269fc181 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.cpp +++ b/lib/Target/ARM/ARMMachineFunctionInfo.cpp @@ -12,3 +12,13 @@ using namespace llvm; void ARMFunctionInfo::anchor() { } + +ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF) + : isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()), + hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()), + StByValParamsPadding(0), ArgRegsSaveSize(0), HasStackFrame(false), + RestoreSPFromFP(false), LRSpilledForFarJump(false), + FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), + GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), JumpTableUId(0), + PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false), + GlobalBaseReg(0) {} diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h index d7ec6eba941c..44a9e3495b90 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -130,16 +130,7 @@ public: JumpTableUId(0), PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {} - explicit ARMFunctionInfo(MachineFunction &MF) : - isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()), - hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()), - StByValParamsPadding(0), - ArgRegsSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false), - LRSpilledForFarJump(false), - FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), - GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), - JumpTableUId(0), PICLabelUId(0), - VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {} + explicit ARMFunctionInfo(MachineFunction &MF); bool isThumbFunction() const { return isThumb; } bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; } @@ -220,7 +211,7 @@ public: void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) { if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second) - assert(0 && "Duplicate entries!"); + llvm_unreachable("Duplicate entries!"); } unsigned getOriginalCPIdx(unsigned CloneIdx) const { diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp index 008ad640a476..3dcc0df349d2 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -18,10 +18,8 @@ using namespace llvm; #define DEBUG_TYPE "arm-selectiondag-info" -ARMSelectionDAGInfo::ARMSelectionDAGInfo(const TargetMachine &TM) - : TargetSelectionDAGInfo(TM), - 
Subtarget(&TM.getSubtarget<ARMSubtarget>()) { -} +ARMSelectionDAGInfo::ARMSelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) {} ARMSelectionDAGInfo::~ARMSelectionDAGInfo() { } @@ -34,6 +32,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { + const ARMSubtarget &Subtarget = DAG.getTarget().getSubtarget<ARMSubtarget>(); // Do repeated 4-byte loads and stores. To be improved. // This requires 4-byte alignment. if ((Align & 3) != 0) @@ -44,7 +43,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, if (!ConstantSize) return SDValue(); uint64_t SizeVal = ConstantSize->getZExtValue(); - if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold()) + if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold()) return SDValue(); unsigned BytesLeft = SizeVal & 3; @@ -54,7 +53,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, unsigned VTSize = 4; unsigned i = 0; // Emit a maximum of 4 loads in Thumb1 since we have fewer registers - const unsigned MAX_LOADS_IN_LDM = Subtarget->isThumb1Only() ? 4 : 6; + const unsigned MAX_LOADS_IN_LDM = Subtarget.isThumb1Only() ? 4 : 6; SDValue TFOps[6]; SDValue Loads[6]; uint64_t SrcOff = 0, DstOff = 0; @@ -151,9 +150,10 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, MachinePointerInfo DstPtrInfo) const { + const ARMSubtarget &Subtarget = DAG.getTarget().getSubtarget<ARMSubtarget>(); // Use default for non-AAPCS (or MachO) subtargets - if (!Subtarget->isAAPCS_ABI() || Subtarget->isTargetMachO() || - Subtarget->isTargetWindows()) + if (!Subtarget.isAAPCS_ABI() || Subtarget.isTargetMachO() || + Subtarget.isTargetWindows()) return SDValue(); const ARMTargetLowering &TLI = @@ -191,7 +191,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMSET), Type::getVoidTy(*DAG.getContext()), DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET), - TLI.getPointerTy()), &Args, 0) + TLI.getPointerTy()), std::move(Args), 0) .setDiscardResult(); std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h index 8c2397b52188..13769dc8efe0 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.h +++ b/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -36,12 +36,8 @@ namespace ARM_AM { } // end namespace ARM_AM class ARMSelectionDAGInfo : public TargetSelectionDAGInfo { - /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can - /// make the right decision when generating code for different targets. 
-  const ARMSubtarget *Subtarget;
-
 public:
-  explicit ARMSelectionDAGInfo(const TargetMachine &TM);
+  explicit ARMSelectionDAGInfo(const DataLayout &DL);
   ~ARMSelectionDAGInfo();
 
   SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 5b204f6ba197..0eb24ef5adf0 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -12,8 +12,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMSubtarget.h"
-#include "ARMBaseInstrInfo.h"
-#include "ARMBaseRegisterInfo.h"
+#include "ARMFrameLowering.h"
+#include "ARMISelLowering.h"
+#include "ARMInstrInfo.h"
+#include "ARMJITInfo.h"
+#include "ARMSelectionDAGInfo.h"
+#include "ARMSubtarget.h"
+#include "Thumb1FrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb2InstrInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
@@ -76,22 +83,89 @@ IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT),
                          "Allow IT blocks based on ARMv7"),
               clEnumValEnd));
 
-ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
-                           const std::string &FS, bool IsLittle,
-                           const TargetOptions &Options)
-  : ARMGenSubtargetInfo(TT, CPU, FS)
-  , ARMProcFamily(Others)
-  , ARMProcClass(None)
-  , stackAlignment(4)
-  , CPUString(CPU)
-  , IsLittle(IsLittle)
-  , TargetTriple(TT)
-  , Options(Options)
-  , TargetABI(ARM_ABI_UNKNOWN) {
+static std::string computeDataLayout(ARMSubtarget &ST) {
+  std::string Ret = "";
+
+  if (ST.isLittle())
+    // Little endian.
+    Ret += "e";
+  else
+    // Big endian.
+    Ret += "E";
+
+  Ret += DataLayout::getManglingComponent(ST.getTargetTriple());
+
+  // Pointers are 32 bits and aligned to 32 bits.
+  Ret += "-p:32:32";
+
+  // On thumb, i16, i8 and i1 have natural alignment requirements, but we try
+  // to align to 32.
+  if (ST.isThumb())
+    Ret += "-i1:8:32-i8:8:32-i16:16:32";
+
+  // ABIs other than APCS have 64 bit integers with natural alignment.
+  if (!ST.isAPCS_ABI())
+    Ret += "-i64:64";
+
+  // We have 64 bits floats. The APCS ABI requires them to be aligned to 32
+  // bits, others to 64 bits. We always try to align to 64 bits.
+  if (ST.isAPCS_ABI())
+    Ret += "-f64:32:64";
+
+  // We have 128 and 64 bit vectors. The APCS ABI aligns them to 32 bits, others
+  // to 64. We always try to give them natural alignment.
+  if (ST.isAPCS_ABI())
+    Ret += "-v64:32:64-v128:32:128";
+  else
+    Ret += "-v128:64:128";
+
+  // On thumb and APCS, only try to align aggregates to 32 bits (the default is
+  // 64 bits).
+  if (ST.isThumb() || ST.isAPCS_ABI())
+    Ret += "-a:0:32";
+
+  // Integer registers are 32 bits.
+  Ret += "-n32";
+
+  // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit
+  // aligned everywhere else.
+  if (ST.isTargetNaCl())
+    Ret += "-S128";
+  else if (ST.isAAPCS_ABI())
+    Ret += "-S64";
+  else
+    Ret += "-S32";
+
+  return Ret;
+}
+
+/// initializeSubtargetDependencies - Initializes using a CPU and feature string
+/// so that we can use initializer lists for subtarget initialization.
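The computeDataLayout function above (moved here from ARMTargetMachine.cpp) assembles the datalayout string piece by piece. A standalone mirror of its branch structure, runnable outside LLVM (the triple and ABI queries are collapsed into plain flags, and the mangling component from DataLayout::getManglingComponent is omitted for brevity):

#include <cstdio>
#include <string>

std::string armDataLayout(bool little, bool thumb, bool apcs, bool aapcs,
                          bool nacl) {
  std::string s = little ? "e" : "E";
  s += "-p:32:32";                                  // 32-bit pointers
  if (thumb)
    s += "-i1:8:32-i8:8:32-i16:16:32";              // promote small-int alignment
  if (!apcs)
    s += "-i64:64";                                 // natural i64 alignment
  if (apcs)
    s += "-f64:32:64";                              // APCS permits 32-bit-aligned f64
  s += apcs ? "-v64:32:64-v128:32:128" : "-v128:64:128";
  if (thumb || apcs)
    s += "-a:0:32";                                 // 32-bit aggregate alignment
  s += "-n32";                                      // 32-bit native integers
  s += nacl ? "-S128" : (aapcs ? "-S64" : "-S32");  // stack alignment
  return s;
}

int main() {
  // A little-endian, ARM-mode, AAPCS target:
  std::printf("%s\n", armDataLayout(true, false, false, true, false).c_str());
  return 0;
}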
+ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { initializeEnvironment(); resetSubtargetFeatures(CPU, FS); + return *this; } +ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, TargetMachine &TM, + bool IsLittle, const TargetOptions &Options) + : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), + ARMProcClass(None), stackAlignment(4), CPUString(CPU), IsLittle(IsLittle), + TargetTriple(TT), Options(Options), TargetABI(ARM_ABI_UNKNOWN), + DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS))), + TSInfo(DL), JITInfo(), + InstrInfo(isThumb1Only() + ? (ARMBaseInstrInfo *)new Thumb1InstrInfo(*this) + : !isThumb() + ? (ARMBaseInstrInfo *)new ARMInstrInfo(*this) + : (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)), + TLInfo(TM), + FrameLowering(!isThumb1Only() + ? new ARMFrameLowering(*this) + : (ARMFrameLowering *)new Thumb1FrameLowering(*this)) {} + void ARMSubtarget::initializeEnvironment() { HasV4TOps = false; HasV5TOps = false; @@ -106,7 +180,6 @@ void ARMSubtarget::initializeEnvironment() { HasVFPv4 = false; HasFPARMv8 = false; HasNEON = false; - MinSize = false; UseNEONForSinglePrecisionFP = false; UseMulOps = UseFusedMulOps; SlowFPVMLx = false; @@ -158,9 +231,6 @@ void ARMSubtarget::resetSubtargetFeatures(const MachineFunction *MF) { initializeEnvironment(); resetSubtargetFeatures(CPU, FS); } - - MinSize = - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); } void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { @@ -353,6 +423,17 @@ bool ARMSubtarget::hasSinCos() const { !getTargetTriple().isOSVersionLT(7, 0); } +// Enable the PostMachineScheduler if the target selects it instead of +// PostRAScheduler. Currently only available on the command line via +// -misched-postra. +bool ARMSubtarget::enablePostMachineScheduler() const { + return PostRAScheduler; +} + +bool ARMSubtarget::enableAtomicExpandLoadLinked() const { + return hasAnyDataBarrier() && !isThumb1Only(); +} + bool ARMSubtarget::enablePostRAScheduler( CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, @@ -360,3 +441,12 @@ bool ARMSubtarget::enablePostRAScheduler( Mode = TargetSubtargetInfo::ANTIDEP_NONE; return PostRAScheduler && OptLevel >= CodeGenOpt::Default; } + +bool ARMSubtarget::useMovt(const MachineFunction &MF) const { + // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit + // immediates as it is inherently position independent, and may be out of + // range otherwise. 
+ return UseMovt && (isTargetWindows() || + !MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::MinSize)); +} diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 38536b2e43e9..8f6c16589ecd 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -14,8 +14,20 @@ #ifndef ARMSUBTARGET_H #define ARMSUBTARGET_H + +#include "ARMFrameLowering.h" +#include "ARMISelLowering.h" +#include "ARMInstrInfo.h" +#include "ARMJITInfo.h" +#include "ARMSelectionDAGInfo.h" +#include "ARMSubtarget.h" +#include "Thumb1FrameLowering.h" +#include "Thumb1InstrInfo.h" +#include "Thumb2InstrInfo.h" +#include "ARMJITInfo.h" #include "MCTargetDesc/ARMMCTargetDesc.h" #include "llvm/ADT/Triple.h" +#include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -64,10 +76,6 @@ protected: bool HasFPARMv8; bool HasNEON; - /// MinSize - True if the function being compiled has the "minsize" attribute - /// and should be optimised for size at the expense of speed. - bool MinSize; - /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been /// specified. Use the method useNEONForSinglePrecisionFP() to /// determine if NEON should actually be used. @@ -236,7 +244,7 @@ protected: /// of the specified triple. /// ARMSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool IsLittle, + const std::string &FS, TargetMachine &TM, bool IsLittle, const TargetOptions &Options); /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size @@ -250,7 +258,31 @@ protected: /// \brief Reset the features for the ARM target. void resetSubtargetFeatures(const MachineFunction *MF) override; + + /// initializeSubtargetDependencies - Initializes using a CPU and feature string + /// so that we can use initializer lists for subtarget initialization. + ARMSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); + + const DataLayout *getDataLayout() const { return &DL; } + const ARMSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + ARMJITInfo *getJITInfo() { return &JITInfo; } + const ARMBaseInstrInfo *getInstrInfo() const { return InstrInfo.get(); } + const ARMTargetLowering *getTargetLowering() const { return &TLInfo; } + const ARMFrameLowering *getFrameLowering() const { return FrameLowering.get(); } + const ARMBaseRegisterInfo *getRegisterInfo() const { + return &InstrInfo->getRegisterInfo(); + } + private: + const DataLayout DL; + ARMSelectionDAGInfo TSInfo; + ARMJITInfo JITInfo; + // Either Thumb1InstrInfo or Thumb2InstrInfo. + std::unique_ptr<ARMBaseInstrInfo> InstrInfo; + ARMTargetLowering TLInfo; + // Either Thumb1FrameLowering or ARMFrameLowering. 
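The useMovt body above replaces the MinSize flag this commit deletes from the subtarget: instead of caching the attribute when features are reset, the predicate now reads it from the MachineFunction on every query, which avoids stale state when one subtarget serves functions with different attributes. A standalone restatement of the decision (flags stand in for the feature bit, the triple check, and the per-function minsize attribute):

#include <cstdio>

bool useMovt(bool hasMovtFeature, bool targetWindows, bool minSizeFunction) {
  // Minsize functions prefer a constant-pool load, which is smaller than a
  // movw/movt pair; Windows overrides that because it requires position
  // independence and the pair is the only safe way to build the address.
  return hasMovtFeature && (targetWindows || !minSizeFunction);
}

int main() {
  std::printf("%d\n", useMovt(true, false, true));  // 0: minsize wins on ELF/MachO
  std::printf("%d\n", useMovt(true, true, true));   // 1: Windows needs the pair anyway
  return 0;
}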
+ std::unique_ptr<ARMFrameLowering> FrameLowering; + void initializeEnvironment(); void resetSubtargetFeatures(StringRef CPU, StringRef FS); public: @@ -286,7 +318,6 @@ public: bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } bool hasVirtualization() const { return HasVirtualization; } - bool isMinSize() const { return MinSize; } bool useNEONForSinglePrecisionFP() const { return hasNEON() && UseNEONForSinglePrecisionFP; } @@ -382,7 +413,8 @@ public: bool isR9Reserved() const { return IsR9Reserved; } - bool useMovt() const { return UseMovt && !isMinSize(); } + bool useMovt(const MachineFunction &MF) const; + bool supportsTailCall() const { return SupportsTailCall; } bool allowsUnalignedMem() const { return AllowsUnalignedMem; } @@ -399,11 +431,17 @@ public: /// compiler runtime or math libraries. bool hasSinCos() const; + /// True for some subtargets at > -O0. + bool enablePostMachineScheduler() const; + /// enablePostRAScheduler - True at 'More' optimization. bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, RegClassVector& CriticalPathRCs) const override; + // enableAtomicExpandLoadLinked - True if we need to expand our atomics. + bool enableAtomicExpandLoadLinked() const override; + /// getInstrItins - Return the instruction itineraies based on subtarget /// selection. const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 887622705ed2..d85194b75ecb 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -28,6 +28,12 @@ DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden, cl::desc("Inhibit optimization of S->D register accesses on A15"), cl::init(false)); +static cl::opt<bool> +EnableAtomicTidy("arm-atomic-cfg-tidy", cl::Hidden, + cl::desc("Run SimplifyCFG after expanding atomic operations" + " to make use of cmpxchg flow-based information"), + cl::init(true)); + extern "C" void LLVMInitializeARMTarget() { // Register the target. RegisterTargetMachine<ARMLETargetMachine> X(TheARMLETarget); @@ -43,12 +49,9 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, isLittle, Options), - JITInfo(), - InstrItins(Subtarget.getInstrItineraryData()) { + CodeGenOpt::Level OL, bool isLittle) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this, isLittle, Options) { // Default to triple-appropriate float ABI if (Options.FloatABIType == FloatABI::Default) @@ -67,74 +70,11 @@ void ARMBaseTargetMachine::addAnalysisPasses(PassManagerBase &PM) { void ARMTargetMachine::anchor() { } -static std::string computeDataLayout(ARMSubtarget &ST) { - std::string Ret = ""; - - if (ST.isLittle()) - // Little endian. - Ret += "e"; - else - // Big endian. - Ret += "E"; - - Ret += DataLayout::getManglingComponent(ST.getTargetTriple()); - - // Pointers are 32 bits and aligned to 32 bits. - Ret += "-p:32:32"; - - // On thumb, i16,i18 and i1 have natural aligment requirements, but we try to - // align to 32. - if (ST.isThumb()) - Ret += "-i1:8:32-i8:8:32-i16:16:32"; - - // ABIs other than APCS have 64 bit integers with natural alignment. 
- if (!ST.isAPCS_ABI()) - Ret += "-i64:64"; - - // We have 64 bits floats. The APCS ABI requires them to be aligned to 32 - // bits, others to 64 bits. We always try to align to 64 bits. - if (ST.isAPCS_ABI()) - Ret += "-f64:32:64"; - - // We have 128 and 64 bit vectors. The APCS ABI aligns them to 32 bits, others - // to 64. We always ty to give them natural alignment. - if (ST.isAPCS_ABI()) - Ret += "-v64:32:64-v128:32:128"; - else - Ret += "-v128:64:128"; - - // On thumb and APCS, only try to align aggregates to 32 bits (the default is - // 64 bits). - if (ST.isThumb() || ST.isAPCS_ABI()) - Ret += "-a:0:32"; - - // Integer registers are 32 bits. - Ret += "-n32"; - - // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit - // aligned everywhere else. - if (ST.isTargetNaCl()) - Ret += "-S128"; - else if (ST.isAAPCS_ABI()) - Ret += "-S64"; - else - Ret += "-S32"; - - return Ret; -} - -ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, +ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle) - : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle), - InstrInfo(Subtarget), - DL(computeDataLayout(Subtarget)), - TLInfo(*this), - TSInfo(*this), - FrameLowering(Subtarget) { + CodeGenOpt::Level OL, bool isLittle) + : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) { initAsmInfo(); if (!Subtarget.hasARMOps()) report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not " @@ -143,21 +83,21 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, void ARMLETargetMachine::anchor() { } -ARMLETargetMachine:: -ARMLETargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} +ARMLETargetMachine::ARMLETargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} void ARMBETargetMachine::anchor() { } -ARMBETargetMachine:: -ARMBETargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} +ARMBETargetMachine::ARMBETargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} void ThumbTargetMachine::anchor() { } @@ -165,38 +105,29 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle) - : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle), - InstrInfo(Subtarget.hasThumb2() - ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget)) - : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))), - DL(computeDataLayout(Subtarget)), - TLInfo(*this), - TSInfo(*this), - FrameLowering(Subtarget.hasThumb2() - ? 
new ARMFrameLowering(Subtarget) - : (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)) { + CodeGenOpt::Level OL, bool isLittle) + : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, + isLittle) { initAsmInfo(); } void ThumbLETargetMachine::anchor() { } -ThumbLETargetMachine:: -ThumbLETargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} +ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} void ThumbBETargetMachine::anchor() { } -ThumbBETargetMachine:: -ThumbBETargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} +ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} namespace { /// ARM Code Generator Pass Configuration Options. @@ -213,6 +144,7 @@ public: return *getARMTargetMachine().getSubtargetImpl(); } + void addIRPasses() override; bool addPreISel() override; bool addInstSelector() override; bool addPreRegAlloc() override; @@ -225,11 +157,21 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) { return new ARMPassConfig(this, PM); } -bool ARMPassConfig::addPreISel() { +void ARMPassConfig::addIRPasses() { + addPass(createAtomicExpandLoadLinkedPass(TM)); + + // Cmpxchg instructions are often used with a subsequent comparison to + // determine whether it succeeded. We can exploit existing control-flow in + // ldrex/strex loops to simplify this, but it needs tidying up. 
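// A minimal C++ illustration of the pattern described above (an aside, not
// part of the patch): a compare-exchange whose success flag immediately
// feeds a branch. Once expanded to an ldrex/strex loop, the loop's exit
// condition already encodes "success", so the follow-up compare-and-branch
// is redundant and the SimplifyCFG run scheduled below can fold it away.
//
//   #include <atomic>
//   bool try_lock(std::atomic<int> &Flag) {
//     int Expected = 0;
//     // Succeeds iff Flag was 0; callers typically branch on the result.
//     return Flag.compare_exchange_strong(Expected, 1);
//   }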
const ARMSubtarget *Subtarget = &getARMSubtarget(); if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) - addPass(createAtomicExpandLoadLinkedPass(TM)); + if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) + addPass(createCFGSimplificationPass()); + TargetPassConfig::addIRPasses(); +} + +bool ARMPassConfig::addPreISel() { if (TM->getOptLevel() != CodeGenOpt::None) addPass(createGlobalMergePass(TM)); diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index 664c992ea165..b72b1df4af83 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -14,17 +14,9 @@ #ifndef ARMTARGETMACHINE_H #define ARMTARGETMACHINE_H -#include "ARMFrameLowering.h" -#include "ARMISelLowering.h" #include "ARMInstrInfo.h" -#include "ARMJITInfo.h" -#include "ARMSelectionDAGInfo.h" #include "ARMSubtarget.h" -#include "Thumb1FrameLowering.h" -#include "Thumb1InstrInfo.h" -#include "Thumb2InstrInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -32,10 +24,6 @@ namespace llvm { class ARMBaseTargetMachine : public LLVMTargetMachine { protected: ARMSubtarget Subtarget; -private: - ARMJITInfo JITInfo; - InstrItineraryData InstrItins; - public: ARMBaseTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, @@ -44,15 +32,29 @@ public: CodeGenOpt::Level OL, bool isLittle); - ARMJITInfo *getJITInfo() override { return &JITInfo; } const ARMSubtarget *getSubtargetImpl() const override { return &Subtarget; } + const ARMBaseRegisterInfo *getRegisterInfo() const override { + return getSubtargetImpl()->getRegisterInfo(); + } const ARMTargetLowering *getTargetLowering() const override { - // Implemented by derived classes - llvm_unreachable("getTargetLowering not implemented"); + return getSubtargetImpl()->getTargetLowering(); + } + const ARMSelectionDAGInfo *getSelectionDAGInfo() const override { + return getSubtargetImpl()->getSelectionDAGInfo(); + } + const ARMBaseInstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); + } + const ARMFrameLowering *getFrameLowering() const override { + return getSubtargetImpl()->getFrameLowering(); } const InstrItineraryData *getInstrItineraryData() const override { - return &InstrItins; + return &getSubtargetImpl()->getInstrItineraryData(); } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); + } + ARMJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); } /// \brief Register ARM analysis passes with a pass manager. 
void addAnalysisPasses(PassManagerBase &PM) override; @@ -67,35 +69,10 @@ public: /// class ARMTargetMachine : public ARMBaseTargetMachine { virtual void anchor(); - ARMInstrInfo InstrInfo; - const DataLayout DL; // Calculates type size & alignment - ARMTargetLowering TLInfo; - ARMSelectionDAGInfo TSInfo; - ARMFrameLowering FrameLowering; public: - ARMTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle); - - const ARMRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); - } - - const ARMTargetLowering *getTargetLowering() const override { - return &TLInfo; - } - - const ARMSelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; - } - const ARMFrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - const ARMInstrInfo *getInstrInfo() const override { return &InstrInfo; } - const DataLayout *getDataLayout() const override { return &DL; } + ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle); }; /// ARMLETargetMachine - ARM little endian target machine. @@ -114,10 +91,9 @@ public: class ARMBETargetMachine : public ARMTargetMachine { void anchor() override; public: - ARMBETargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); + ARMBETargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); }; /// ThumbTargetMachine - Thumb target machine. @@ -126,43 +102,10 @@ public: /// class ThumbTargetMachine : public ARMBaseTargetMachine { virtual void anchor(); - // Either Thumb1InstrInfo or Thumb2InstrInfo. - std::unique_ptr<ARMBaseInstrInfo> InstrInfo; - const DataLayout DL; // Calculates type size & alignment - ARMTargetLowering TLInfo; - ARMSelectionDAGInfo TSInfo; - // Either Thumb1FrameLowering or ARMFrameLowering. - std::unique_ptr<ARMFrameLowering> FrameLowering; public: - ThumbTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle); - - /// returns either Thumb1RegisterInfo or Thumb2RegisterInfo - const ARMBaseRegisterInfo *getRegisterInfo() const override { - return &InstrInfo->getRegisterInfo(); - } - - const ARMTargetLowering *getTargetLowering() const override { - return &TLInfo; - } - - const ARMSelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; - } - - /// returns either Thumb1InstrInfo or Thumb2InstrInfo - const ARMBaseInstrInfo *getInstrInfo() const override { - return InstrInfo.get(); - } - /// returns either Thumb1FrameLowering or ARMFrameLowering - const ARMFrameLowering *getFrameLowering() const override { - return FrameLowering.get(); - } - const DataLayout *getDataLayout() const override { return &DL; } + ThumbTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle); }; /// ThumbLETargetMachine - Thumb little endian target machine. 
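The header hunks above all apply one refactoring: state that previously lived
in each target machine subclass (instruction info, frame lowering, selection
DAG info, data layout) now lives in ARMSubtarget, and the target machines keep
thin forwarding accessors. A schematic sketch of the shape, using hypothetical
names rather than the real LLVM classes:

    #include <memory>

    struct FrameLowering {};

    class MySubtarget {
      std::unique_ptr<FrameLowering> FL;  // owned by the subtarget now
    public:
      MySubtarget() : FL(new FrameLowering()) {}
      const FrameLowering *getFrameLowering() const { return FL.get(); }
    };

    class MyTargetMachine {
      MySubtarget Subtarget;
    public:
      // No per-target-machine copy of the object; the accessor just forwards.
      const FrameLowering *getFrameLowering() const {
        return Subtarget.getFrameLowering();
      }
    };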
@@ -170,10 +113,10 @@ public: class ThumbLETargetMachine : public ThumbTargetMachine { void anchor() override; public: - ThumbLETargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); + ThumbLETargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); }; /// ThumbBETargetMachine - Thumb big endian target machine. diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 57df7da7f310..a2ace629baa1 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -443,31 +443,58 @@ unsigned ARMTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const { - // We only handle costs of reverse shuffles for now. - if (Kind != SK_Reverse) + // We only handle costs of reverse and alternate shuffles for now. + if (Kind != SK_Reverse && Kind != SK_Alternate) return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); - static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = { - // Reverse shuffle cost one instruction if we are shuffling within a double - // word (vrev) or two if we shuffle a quad word (vrev, vext). - { ISD::VECTOR_SHUFFLE, MVT::v2i32, 1 }, - { ISD::VECTOR_SHUFFLE, MVT::v2f32, 1 }, - { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 }, - { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 }, - - { ISD::VECTOR_SHUFFLE, MVT::v4i32, 2 }, - { ISD::VECTOR_SHUFFLE, MVT::v4f32, 2 }, - { ISD::VECTOR_SHUFFLE, MVT::v8i16, 2 }, - { ISD::VECTOR_SHUFFLE, MVT::v16i8, 2 } - }; + if (Kind == SK_Reverse) { + static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = { + // Reverse shuffle cost one instruction if we are shuffling within a + // double word (vrev) or two if we shuffle a quad word (vrev, vext). + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; - int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx == -1) - return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); + + int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); + if (Idx == -1) + return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); - return LT.first * NEONShuffleTbl[Idx].Cost; + return LT.first * NEONShuffleTbl[Idx].Cost; + } + if (Kind == SK_Alternate) { + static const CostTblEntry<MVT::SimpleValueType> NEONAltShuffleTbl[] = { + // Alt shuffle cost table for ARM. Cost is the number of instructions + // required to create the shuffled vector. 
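// (Illustrative aside, not from the patch: an "alternate" shuffle takes
// lanes from its two sources in alternation, e.g. for <4 x i32> the mask
// <0, 5, 2, 7> keeps the even lanes of the first vector and the odd lanes
// of the second. NEON has cheap lane-interleaving permutes for the wider
// element types, which is presumably why the entries below stay at 1-2
// while the v8i16/v16i8 rows are costed like per-lane moves.)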
+ + {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, + + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2}, + + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16}, + + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; + + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); + int Idx = + CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); + if (Idx == -1) + return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + return LT.first * NEONAltShuffleTbl[Idx].Cost; + } + return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); } unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 5cdf394443a4..b62706c45fbf 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -190,11 +190,11 @@ class ARMAsmParser : public MCTargetAsmParser { } int tryParseRegister(); - bool tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &); - int tryParseShiftRegister(SmallVectorImpl<MCParsedAsmOperand*> &); - bool parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &); - bool parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &); - bool parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &, StringRef Mnemonic); + bool tryParseRegisterWithWriteBack(OperandVector &); + int tryParseShiftRegister(OperandVector &); + bool parseRegisterList(OperandVector &); + bool parseMemory(OperandVector &); + bool parseOperand(OperandVector &, StringRef Mnemonic); bool parsePrefix(ARMMCExpr::VariantKind &RefKind); bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType, unsigned &ShiftAmount); @@ -282,54 +282,42 @@ class ARMAsmParser : public MCTargetAsmParser { /// } - OperandMatchResultTy parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parseCoprocNumOperand( - SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parseCoprocRegOperand( - SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parseCoprocOptionOperand( - SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parseMemBarrierOptOperand( - SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parseInstSyncBarrierOptOperand( - SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parseProcIFlagsOperand( - SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parseMSRMaskOperand( - SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &O, - StringRef Op, int Low, int High); - OperandMatchResultTy parsePKHLSLImm(SmallVectorImpl<MCParsedAsmOperand*> &O) { + OperandMatchResultTy parseITCondCode(OperandVector &); + OperandMatchResultTy parseCoprocNumOperand(OperandVector &); + OperandMatchResultTy parseCoprocRegOperand(OperandVector &); + OperandMatchResultTy parseCoprocOptionOperand(OperandVector &); + OperandMatchResultTy parseMemBarrierOptOperand(OperandVector &); + OperandMatchResultTy parseInstSyncBarrierOptOperand(OperandVector &); + OperandMatchResultTy parseProcIFlagsOperand(OperandVector &); + OperandMatchResultTy parseMSRMaskOperand(OperandVector &); + OperandMatchResultTy parsePKHImm(OperandVector &O, StringRef Op, int Low, + int High); + OperandMatchResultTy parsePKHLSLImm(OperandVector &O) { return parsePKHImm(O, "lsl", 0, 31); } - 
OperandMatchResultTy parsePKHASRImm(SmallVectorImpl<MCParsedAsmOperand*> &O) { + OperandMatchResultTy parsePKHASRImm(OperandVector &O) { return parsePKHImm(O, "asr", 1, 32); } - OperandMatchResultTy parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parseRotImm(SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parseBitfield(SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parsePostIdxReg(SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parseFPImm(SmallVectorImpl<MCParsedAsmOperand*>&); - OperandMatchResultTy parseVectorList(SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy parseSetEndImm(OperandVector &); + OperandMatchResultTy parseShifterImm(OperandVector &); + OperandMatchResultTy parseRotImm(OperandVector &); + OperandMatchResultTy parseBitfield(OperandVector &); + OperandMatchResultTy parsePostIdxReg(OperandVector &); + OperandMatchResultTy parseAM3Offset(OperandVector &); + OperandMatchResultTy parseFPImm(OperandVector &); + OperandMatchResultTy parseVectorList(OperandVector &); OperandMatchResultTy parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, SMLoc &EndLoc); // Asm Match Converter Methods - void cvtThumbMultiply(MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &); - void cvtThumbBranches(MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &); - - bool validateInstruction(MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &Ops); - bool processInstruction(MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &Ops); - bool shouldOmitCCOutOperand(StringRef Mnemonic, - SmallVectorImpl<MCParsedAsmOperand*> &Operands); - bool shouldOmitPredicateOperand(StringRef Mnemonic, - SmallVectorImpl<MCParsedAsmOperand*> &Operands); + void cvtThumbMultiply(MCInst &Inst, const OperandVector &); + void cvtThumbBranches(MCInst &Inst, const OperandVector &); + + bool validateInstruction(MCInst &Inst, const OperandVector &Ops); + bool processInstruction(MCInst &Inst, const OperandVector &Ops); + bool shouldOmitCCOutOperand(StringRef Mnemonic, OperandVector &Operands); + bool shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands); + public: enum ARMMatchResultTy { Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY, @@ -361,19 +349,17 @@ public: // Implementation of the MCTargetAsmParser interface: bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool - ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand*> &Operands) override; + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; - unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; unsigned checkTargetMatchPredicate(MCInst &Inst) override; bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, bool MatchingInlineAsm) override; void onLabelParsed(MCSymbol *Symbol) override; }; @@ -545,8 +531,8 @@ class ARMOperand : public MCParsedAsmOperand { struct BitfieldOp Bitfield; }; - ARMOperand(KindTy K) : 
MCParsedAsmOperand(), Kind(K) {} public: + ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} ARMOperand(const ARMOperand &o) : MCParsedAsmOperand() { Kind = o.Kind; StartLoc = o.StartLoc; @@ -2481,56 +2467,58 @@ public: void print(raw_ostream &OS) const override; - static ARMOperand *CreateITMask(unsigned Mask, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_ITCondMask); + static std::unique_ptr<ARMOperand> CreateITMask(unsigned Mask, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_ITCondMask); Op->ITMask.Mask = Mask; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateCondCode(ARMCC::CondCodes CC, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_CondCode); + static std::unique_ptr<ARMOperand> CreateCondCode(ARMCC::CondCodes CC, + SMLoc S) { + auto Op = make_unique<ARMOperand>(k_CondCode); Op->CC.Val = CC; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateCoprocNum(unsigned CopVal, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_CoprocNum); + static std::unique_ptr<ARMOperand> CreateCoprocNum(unsigned CopVal, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_CoprocNum); Op->Cop.Val = CopVal; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateCoprocReg(unsigned CopVal, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_CoprocReg); + static std::unique_ptr<ARMOperand> CreateCoprocReg(unsigned CopVal, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_CoprocReg); Op->Cop.Val = CopVal; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateCoprocOption(unsigned Val, SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_CoprocOption); + static std::unique_ptr<ARMOperand> CreateCoprocOption(unsigned Val, SMLoc S, + SMLoc E) { + auto Op = make_unique<ARMOperand>(k_CoprocOption); Op->Cop.Val = Val; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static ARMOperand *CreateCCOut(unsigned RegNum, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_CCOut); + static std::unique_ptr<ARMOperand> CreateCCOut(unsigned RegNum, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_CCOut); Op->Reg.RegNum = RegNum; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateToken(StringRef Str, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_Token); + static std::unique_ptr<ARMOperand> CreateToken(StringRef Str, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_Token); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -2538,20 +2526,20 @@ public: return Op; } - static ARMOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_Register); + static std::unique_ptr<ARMOperand> CreateReg(unsigned RegNum, SMLoc S, + SMLoc E) { + auto Op = make_unique<ARMOperand>(k_Register); Op->Reg.RegNum = RegNum; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static ARMOperand *CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, - unsigned SrcReg, - unsigned ShiftReg, - unsigned ShiftImm, - SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_ShiftedRegister); + static std::unique_ptr<ARMOperand> + CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, unsigned SrcReg, + unsigned ShiftReg, unsigned ShiftImm, SMLoc S, + SMLoc E) { + auto Op = make_unique<ARMOperand>(k_ShiftedRegister); Op->RegShiftedReg.ShiftTy = ShTy; Op->RegShiftedReg.SrcReg = SrcReg; Op->RegShiftedReg.ShiftReg = ShiftReg; @@ -2561,11 +2549,10 @@ public: return Op; } - static ARMOperand *CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, - unsigned SrcReg, - unsigned ShiftImm, - SMLoc S, SMLoc E) { - ARMOperand *Op = new 
ARMOperand(k_ShiftedImmediate); + static std::unique_ptr<ARMOperand> + CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, unsigned SrcReg, + unsigned ShiftImm, SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_ShiftedImmediate); Op->RegShiftedImm.ShiftTy = ShTy; Op->RegShiftedImm.SrcReg = SrcReg; Op->RegShiftedImm.ShiftImm = ShiftImm; @@ -2574,9 +2561,9 @@ public: return Op; } - static ARMOperand *CreateShifterImm(bool isASR, unsigned Imm, - SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_ShifterImmediate); + static std::unique_ptr<ARMOperand> CreateShifterImm(bool isASR, unsigned Imm, + SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_ShifterImmediate); Op->ShifterImm.isASR = isASR; Op->ShifterImm.Imm = Imm; Op->StartLoc = S; @@ -2584,17 +2571,18 @@ public: return Op; } - static ARMOperand *CreateRotImm(unsigned Imm, SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_RotateImmediate); + static std::unique_ptr<ARMOperand> CreateRotImm(unsigned Imm, SMLoc S, + SMLoc E) { + auto Op = make_unique<ARMOperand>(k_RotateImmediate); Op->RotImm.Imm = Imm; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static ARMOperand *CreateBitfield(unsigned LSB, unsigned Width, - SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_BitfieldDescriptor); + static std::unique_ptr<ARMOperand> + CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_BitfieldDescriptor); Op->Bitfield.LSB = LSB; Op->Bitfield.Width = Width; Op->StartLoc = S; @@ -2602,8 +2590,8 @@ public: return Op; } - static ARMOperand * - CreateRegList(SmallVectorImpl<std::pair<unsigned, unsigned> > &Regs, + static std::unique_ptr<ARMOperand> + CreateRegList(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs, SMLoc StartLoc, SMLoc EndLoc) { assert (Regs.size() > 0 && "RegList contains no registers?"); KindTy Kind = k_RegisterList; @@ -2617,7 +2605,7 @@ public: // Sort based on the register encoding values. 
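// (Aside for context: an LDM/STM-style register list is encoded as a
// bitmask, so "{r1, r0}" and "{r0, r1}" denote the same set; sorting by
// encoding value just canonicalizes the operand before any bits are emitted.)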
array_pod_sort(Regs.begin(), Regs.end()); - ARMOperand *Op = new ARMOperand(Kind); + auto Op = make_unique<ARMOperand>(Kind); for (SmallVectorImpl<std::pair<unsigned, unsigned> >::const_iterator I = Regs.begin(), E = Regs.end(); I != E; ++I) Op->Registers.push_back(I->second); @@ -2626,9 +2614,11 @@ public: return Op; } - static ARMOperand *CreateVectorList(unsigned RegNum, unsigned Count, - bool isDoubleSpaced, SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_VectorList); + static std::unique_ptr<ARMOperand> CreateVectorList(unsigned RegNum, + unsigned Count, + bool isDoubleSpaced, + SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_VectorList); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.isDoubleSpaced = isDoubleSpaced; @@ -2637,10 +2627,10 @@ public: return Op; } - static ARMOperand *CreateVectorListAllLanes(unsigned RegNum, unsigned Count, - bool isDoubleSpaced, - SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_VectorListAllLanes); + static std::unique_ptr<ARMOperand> + CreateVectorListAllLanes(unsigned RegNum, unsigned Count, bool isDoubleSpaced, + SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_VectorListAllLanes); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.isDoubleSpaced = isDoubleSpaced; @@ -2649,11 +2639,10 @@ public: return Op; } - static ARMOperand *CreateVectorListIndexed(unsigned RegNum, unsigned Count, - unsigned Index, - bool isDoubleSpaced, - SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_VectorListIndexed); + static std::unique_ptr<ARMOperand> + CreateVectorListIndexed(unsigned RegNum, unsigned Count, unsigned Index, + bool isDoubleSpaced, SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_VectorListIndexed); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.LaneIndex = Index; @@ -2663,33 +2652,30 @@ public: return Op; } - static ARMOperand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARMOperand *Op = new ARMOperand(k_VectorIndex); + static std::unique_ptr<ARMOperand> + CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique<ARMOperand>(k_VectorIndex); Op->VectorIndex.Val = Idx; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static ARMOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - ARMOperand *Op = new ARMOperand(k_Immediate); + static std::unique_ptr<ARMOperand> CreateImm(const MCExpr *Val, SMLoc S, + SMLoc E) { + auto Op = make_unique<ARMOperand>(k_Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static ARMOperand *CreateMem(unsigned BaseRegNum, - const MCConstantExpr *OffsetImm, - unsigned OffsetRegNum, - ARM_AM::ShiftOpc ShiftType, - unsigned ShiftImm, - unsigned Alignment, - bool isNegative, - SMLoc S, SMLoc E, - SMLoc AlignmentLoc = SMLoc()) { - ARMOperand *Op = new ARMOperand(k_Memory); + static std::unique_ptr<ARMOperand> + CreateMem(unsigned BaseRegNum, const MCConstantExpr *OffsetImm, + unsigned OffsetRegNum, ARM_AM::ShiftOpc ShiftType, + unsigned ShiftImm, unsigned Alignment, bool isNegative, SMLoc S, + SMLoc E, SMLoc AlignmentLoc = SMLoc()) { + auto Op = make_unique<ARMOperand>(k_Memory); Op->Memory.BaseRegNum = BaseRegNum; Op->Memory.OffsetImm = OffsetImm; Op->Memory.OffsetRegNum = OffsetRegNum; @@ -2703,11 +2689,10 @@ public: return Op; } - static ARMOperand *CreatePostIdxReg(unsigned RegNum, bool isAdd, - ARM_AM::ShiftOpc ShiftTy, - unsigned ShiftImm, - SMLoc S, SMLoc E) { - ARMOperand *Op = new 
ARMOperand(k_PostIndexRegister); + static std::unique_ptr<ARMOperand> + CreatePostIdxReg(unsigned RegNum, bool isAdd, ARM_AM::ShiftOpc ShiftTy, + unsigned ShiftImm, SMLoc S, SMLoc E) { + auto Op = make_unique<ARMOperand>(k_PostIndexRegister); Op->PostIdxReg.RegNum = RegNum; Op->PostIdxReg.isAdd = isAdd; Op->PostIdxReg.ShiftTy = ShiftTy; @@ -2717,33 +2702,35 @@ public: return Op; } - static ARMOperand *CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_MemBarrierOpt); + static std::unique_ptr<ARMOperand> CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, + SMLoc S) { + auto Op = make_unique<ARMOperand>(k_MemBarrierOpt); Op->MBOpt.Val = Opt; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt, - SMLoc S) { - ARMOperand *Op = new ARMOperand(k_InstSyncBarrierOpt); + static std::unique_ptr<ARMOperand> + CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_InstSyncBarrierOpt); Op->ISBOpt.Val = Opt; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateProcIFlags(ARM_PROC::IFlags IFlags, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_ProcIFlags); + static std::unique_ptr<ARMOperand> CreateProcIFlags(ARM_PROC::IFlags IFlags, + SMLoc S) { + auto Op = make_unique<ARMOperand>(k_ProcIFlags); Op->IFlags.Val = IFlags; Op->StartLoc = S; Op->EndLoc = S; return Op; } - static ARMOperand *CreateMSRMask(unsigned MMask, SMLoc S) { - ARMOperand *Op = new ARMOperand(k_MSRMask); + static std::unique_ptr<ARMOperand> CreateMSRMask(unsigned MMask, SMLoc S) { + auto Op = make_unique<ARMOperand>(k_MSRMask); Op->MMask.Val = MMask; Op->StartLoc = S; Op->EndLoc = S; @@ -2947,8 +2934,7 @@ int ARMAsmParser::tryParseRegister() { // occurs, return -1. An irrecoverable error is one where tokens have been // consumed in the process of trying to parse the shifter (i.e., when it is // indeed a shifter operand, but malformed). -int ARMAsmParser::tryParseShiftRegister( - SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) @@ -2972,7 +2958,8 @@ int ARMAsmParser::tryParseShiftRegister( // The source register for the shift has already been added to the // operand list, so we need to pop it off and combine it into the shifted // register operand instead. - std::unique_ptr<ARMOperand> PrevOp((ARMOperand*)Operands.pop_back_val()); + std::unique_ptr<ARMOperand> PrevOp( + (ARMOperand *)Operands.pop_back_val().release()); if (!PrevOp->isReg()) return Error(PrevOp->getStartLoc(), "shift must be of a register"); int SrcReg = PrevOp->getReg(); @@ -3049,8 +3036,7 @@ int ARMAsmParser::tryParseShiftRegister( /// /// TODO this is likely to change to allow different register types and or to /// parse for a specific register type. -bool ARMAsmParser:: -tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +bool ARMAsmParser::tryParseRegisterWithWriteBack(OperandVector &Operands) { const AsmToken &RegTok = Parser.getTok(); int RegNo = tryParseRegister(); if (RegNo == -1) @@ -3096,17 +3082,25 @@ tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { } /// MatchCoprocessorOperandName - Try to parse an coprocessor related -/// instruction with a symbolic operand name. Example: "p1", "p7", "c3", -/// "c5", ... +/// instruction with a symbolic operand name. 
+/// We accept "crN" syntax for GAS compatibility. +/// <operand-name> ::= <prefix><number> +/// If CoprocOp is 'c', then: +/// <prefix> ::= c | cr +/// If CoprocOp is 'p', then : +/// <prefix> ::= p +/// <number> ::= integer in range [0, 15] static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) { // Use the same layout as the tablegen'erated register name matcher. Ugly, // but efficient. + if (Name.size() < 2 || Name[0] != CoprocOp) + return -1; + Name = (Name[1] == 'r') ? Name.drop_front(2) : Name.drop_front(); + switch (Name.size()) { default: return -1; - case 2: - if (Name[0] != CoprocOp) - return -1; - switch (Name[1]) { + case 1: + switch (Name[0]) { default: return -1; case '0': return 0; case '1': return 1; @@ -3119,10 +3113,10 @@ static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) { case '8': return 8; case '9': return 9; } - case 3: - if (Name[0] != CoprocOp || Name[1] != '1') + case 2: + if (Name[0] != '1') return -1; - switch (Name[2]) { + switch (Name[1]) { default: return -1; // p10 and p11 are invalid for coproc instructions (reserved for FP/NEON) case '0': return CoprocOp == 'p'? -1: 10; @@ -3136,8 +3130,8 @@ static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) { } /// parseITCondCode - Try to parse a condition code for an IT instruction. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseITCondCode(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); if (!Tok.is(AsmToken::Identifier)) @@ -3173,8 +3167,8 @@ parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { /// parseCoprocNumOperand - Try to parse an coprocessor number operand. The /// token must be an Identifier when called, and if it is a coprocessor /// number, the token is eaten and the operand is added to the operand list. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseCoprocNumOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseCoprocNumOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) @@ -3192,8 +3186,8 @@ parseCoprocNumOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { /// parseCoprocRegOperand - Try to parse an coprocessor register operand. The /// token must be an Identifier when called, and if it is a coprocessor /// number, the token is eaten and the operand is added to the operand list. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseCoprocRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseCoprocRegOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) @@ -3210,8 +3204,8 @@ parseCoprocRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { /// parseCoprocOptionOperand - Try to parse an coprocessor option operand. /// coproc_option : '{' imm0_255 '}' -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseCoprocOptionOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseCoprocOptionOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); // If this isn't a '{', this isn't a coprocessor immediate operand. 
@@ -3288,8 +3282,7 @@ static unsigned getDRegFromQReg(unsigned QReg) { } /// Parse a register list. -bool ARMAsmParser:: -parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +bool ARMAsmParser::parseRegisterList(OperandVector &Operands) { assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Curly Brace"); SMLoc S = Parser.getTok().getLoc(); @@ -3470,8 +3463,8 @@ parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, SMLoc &EndLoc) { } // parse a vector register list -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseVectorList(OperandVector &Operands) { VectorLaneTy LaneKind; unsigned LaneIndex; SMLoc S = Parser.getTok().getLoc(); @@ -3721,8 +3714,8 @@ parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { } /// parseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseMemBarrierOptOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); unsigned Opt; @@ -3792,8 +3785,8 @@ parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { } /// parseInstSyncBarrierOptOperand - Try to parse ISB inst sync barrier options. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseInstSyncBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseInstSyncBarrierOptOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); unsigned Opt; @@ -3843,8 +3836,8 @@ parseInstSyncBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { /// parseProcIFlagsOperand - Try to parse iflags from CPS instruction. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseProcIFlagsOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); if (!Tok.is(AsmToken::Identifier)) @@ -3877,8 +3870,8 @@ parseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { } /// parseMSRMaskOperand - Try to parse mask flags from MSR instruction. 
-ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); if (!Tok.is(AsmToken::Identifier)) @@ -4005,9 +3998,9 @@ parseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { return MatchOperand_Success; } -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands, StringRef Op, - int Low, int High) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parsePKHImm(OperandVector &Operands, StringRef Op, int Low, + int High) { const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) { Error(Parser.getTok().getLoc(), Op + " operand expected."); @@ -4053,8 +4046,8 @@ parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands, StringRef Op, return MatchOperand_Success; } -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseSetEndImm(OperandVector &Operands) { const AsmToken &Tok = Parser.getTok(); SMLoc S = Tok.getLoc(); if (Tok.isNot(AsmToken::Identifier)) { @@ -4082,8 +4075,8 @@ parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { /// lsl #n 'n' in [0,31] /// asr #n 'n' in [1,32] /// n == 32 encoded as n == 0. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseShifterImm(OperandVector &Operands) { const AsmToken &Tok = Parser.getTok(); SMLoc S = Tok.getLoc(); if (Tok.isNot(AsmToken::Identifier)) { @@ -4152,8 +4145,8 @@ parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { /// parseRotImm - Parse the shifter immediate operand for SXTB/UXTB family /// of instructions. Legal values are: /// ror #n 'n' in {0, 8, 16, 24} -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseRotImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseRotImm(OperandVector &Operands) { const AsmToken &Tok = Parser.getTok(); SMLoc S = Tok.getLoc(); if (Tok.isNot(AsmToken::Identifier)) @@ -4198,8 +4191,8 @@ parseRotImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { return MatchOperand_Success; } -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseBitfield(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseBitfield(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); // The bitfield descriptor is really two operands, the LSB and the width. if (Parser.getTok().isNot(AsmToken::Hash) && @@ -4266,8 +4259,8 @@ parseBitfield(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { return MatchOperand_Success; } -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parsePostIdxReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parsePostIdxReg(OperandVector &Operands) { // Check for a post-index addressing register operand. 
Specifically: // postidx_reg := '+' register {, shift} // | '-' register {, shift} @@ -4315,8 +4308,8 @@ parsePostIdxReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { return MatchOperand_Success; } -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseAM3Offset(OperandVector &Operands) { // Check for a post-index addressing register operand. Specifically: // am3offset := '+' register // | '-' register @@ -4388,26 +4381,24 @@ parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { /// Convert parsed operands to MCInst. Needed here because this instruction /// only has two register operands, but multiplication is commutative so /// assemblers should accept both "mul rD, rN, rD" and "mul rD, rD, rN". -void ARMAsmParser:: -cvtThumbMultiply(MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { - ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1); - ((ARMOperand*)Operands[1])->addCCOutOperands(Inst, 1); +void ARMAsmParser::cvtThumbMultiply(MCInst &Inst, + const OperandVector &Operands) { + ((ARMOperand &)*Operands[3]).addRegOperands(Inst, 1); + ((ARMOperand &)*Operands[1]).addCCOutOperands(Inst, 1); // If we have a three-operand form, make sure to set Rn to be the operand // that isn't the same as Rd. unsigned RegOp = 4; if (Operands.size() == 6 && - ((ARMOperand*)Operands[4])->getReg() == - ((ARMOperand*)Operands[3])->getReg()) + ((ARMOperand &)*Operands[4]).getReg() == + ((ARMOperand &)*Operands[3]).getReg()) RegOp = 5; - ((ARMOperand*)Operands[RegOp])->addRegOperands(Inst, 1); + ((ARMOperand &)*Operands[RegOp]).addRegOperands(Inst, 1); Inst.addOperand(Inst.getOperand(0)); - ((ARMOperand*)Operands[2])->addCondCodeOperands(Inst, 2); + ((ARMOperand &)*Operands[2]).addCondCodeOperands(Inst, 2); } -void ARMAsmParser:: -cvtThumbBranches(MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +void ARMAsmParser::cvtThumbBranches(MCInst &Inst, + const OperandVector &Operands) { int CondOp = -1, ImmOp = -1; switch(Inst.getOpcode()) { case ARM::tB: @@ -4430,7 +4421,7 @@ cvtThumbBranches(MCInst &Inst, } else { // outside IT blocks we can only have unconditional branches with AL // condition code or conditional branches with non-AL condition code - unsigned Cond = static_cast<ARMOperand*>(Operands[CondOp])->getCondCode(); + unsigned Cond = static_cast<ARMOperand &>(*Operands[CondOp]).getCondCode(); switch(Inst.getOpcode()) { case ARM::tB: case ARM::tBcc: @@ -4447,27 +4438,26 @@ cvtThumbBranches(MCInst &Inst, switch(Inst.getOpcode()) { // classify tB as either t2B or t1B based on range of immediate operand case ARM::tB: { - ARMOperand* op = static_cast<ARMOperand*>(Operands[ImmOp]); - if(!op->isSignedOffset<11, 1>() && isThumbTwo()) + ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]); + if (!op.isSignedOffset<11, 1>() && isThumbTwo()) Inst.setOpcode(ARM::t2B); break; } // classify tBcc as either t2Bcc or t1Bcc based on range of immediate operand case ARM::tBcc: { - ARMOperand* op = static_cast<ARMOperand*>(Operands[ImmOp]); - if(!op->isSignedOffset<8, 1>() && isThumbTwo()) + ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]); + if (!op.isSignedOffset<8, 1>() && isThumbTwo()) Inst.setOpcode(ARM::t2Bcc); break; } } - ((ARMOperand*)Operands[ImmOp])->addImmOperands(Inst, 1); - ((ARMOperand*)Operands[CondOp])->addCondCodeOperands(Inst, 2); + ((ARMOperand &)*Operands[ImmOp]).addImmOperands(Inst, 1); + ((ARMOperand 
&)*Operands[CondOp]).addCondCodeOperands(Inst, 2); } /// Parse an ARM memory expression, return false if successful else return true /// or an error. The first token must be a '[' when called. -bool ARMAsmParser:: -parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +bool ARMAsmParser::parseMemory(OperandVector &Operands) { SMLoc S, E; assert(Parser.getTok().is(AsmToken::LBrac) && "Token is not a Left Bracket"); @@ -4717,8 +4707,8 @@ bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St, } /// parseFPImm - A floating point immediate expression operand. -ARMAsmParser::OperandMatchResultTy ARMAsmParser:: -parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +ARMAsmParser::OperandMatchResultTy +ARMAsmParser::parseFPImm(OperandVector &Operands) { // Anything that can accept a floating point constant as an operand // needs to go through here, as the regular parseExpression is // integer only. @@ -4744,12 +4734,12 @@ parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // integer constant. Make sure we don't try to parse an FPImm // for these: // vmov.i{8|16|32|64} <dreg|qreg>, #imm - ARMOperand *TyOp = static_cast<ARMOperand*>(Operands[2]); - bool isVmovf = TyOp->isToken() && (TyOp->getToken() == ".f32" || - TyOp->getToken() == ".f64"); - ARMOperand *Mnemonic = static_cast<ARMOperand*>(Operands[0]); - bool isFconst = Mnemonic->isToken() && (Mnemonic->getToken() == "fconstd" || - Mnemonic->getToken() == "fconsts"); + ARMOperand &TyOp = static_cast<ARMOperand &>(*Operands[2]); + bool isVmovf = TyOp.isToken() && + (TyOp.getToken() == ".f32" || TyOp.getToken() == ".f64"); + ARMOperand &Mnemonic = static_cast<ARMOperand &>(*Operands[0]); + bool isFconst = Mnemonic.isToken() && (Mnemonic.getToken() == "fconstd" || + Mnemonic.getToken() == "fconsts"); if (!(isVmovf || isFconst)) return MatchOperand_NoMatch; @@ -4798,8 +4788,7 @@ parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { /// Parse a arm instruction operand. For now this parses the operand regardless /// of the mnemonic. -bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - StringRef Mnemonic) { +bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { SMLoc S, E; // Check if the current operand has a custom associated parser, if so, try to @@ -5125,7 +5114,7 @@ getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst, } bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, - SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandVector &Operands) { // FIXME: This is all horribly hacky. We really need a better way to deal // with optional operands like this in the matcher table. @@ -5138,17 +5127,17 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, // conditionally adding the cc_out in the first place because we need // to check the type of the parsed immediate operand. if (Mnemonic == "mov" && Operands.size() > 4 && !isThumb() && - !static_cast<ARMOperand*>(Operands[4])->isARMSOImm() && - static_cast<ARMOperand*>(Operands[4])->isImm0_65535Expr() && - static_cast<ARMOperand*>(Operands[1])->getReg() == 0) + !static_cast<ARMOperand &>(*Operands[4]).isARMSOImm() && + static_cast<ARMOperand &>(*Operands[4]).isImm0_65535Expr() && + static_cast<ARMOperand &>(*Operands[1]).getReg() == 0) return true; // Register-register 'add' for thumb does not have a cc_out operand // when there are only two register operands. 
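// (For instance: the two-register "add r0, r8" is encoding T2, which never
// sets flags, so the optional cc_out operand the parser materialized has to
// be dropped; "adds r0, r1, r2" with three low registers is encoding T1,
// which does set flags outside an IT block and keeps it.)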
if (isThumb() && Mnemonic == "add" && Operands.size() == 5 && - static_cast<ARMOperand*>(Operands[3])->isReg() && - static_cast<ARMOperand*>(Operands[4])->isReg() && - static_cast<ARMOperand*>(Operands[1])->getReg() == 0) + static_cast<ARMOperand &>(*Operands[3]).isReg() && + static_cast<ARMOperand &>(*Operands[4]).isReg() && + static_cast<ARMOperand &>(*Operands[1]).getReg() == 0) return true; // Register-register 'add' for thumb does not have a cc_out operand // when it's an ADD Rdm, SP, {Rdm|#imm0_255} instruction. We do @@ -5156,13 +5145,12 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, // that can handle a different range and has a cc_out operand. if (((isThumb() && Mnemonic == "add") || (isThumbTwo() && Mnemonic == "sub")) && - Operands.size() == 6 && - static_cast<ARMOperand*>(Operands[3])->isReg() && - static_cast<ARMOperand*>(Operands[4])->isReg() && - static_cast<ARMOperand*>(Operands[4])->getReg() == ARM::SP && - static_cast<ARMOperand*>(Operands[1])->getReg() == 0 && - ((Mnemonic == "add" &&static_cast<ARMOperand*>(Operands[5])->isReg()) || - static_cast<ARMOperand*>(Operands[5])->isImm0_1020s4())) + Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() && + static_cast<ARMOperand &>(*Operands[4]).isReg() && + static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::SP && + static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 && + ((Mnemonic == "add" && static_cast<ARMOperand &>(*Operands[5]).isReg()) || + static_cast<ARMOperand &>(*Operands[5]).isImm0_1020s4())) return true; // For Thumb2, add/sub immediate does not have a cc_out operand for the // imm0_4095 variant. That's the least-preferred variant when @@ -5170,23 +5158,22 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, // should remove the cc_out operand, we have to explicitly check that // it's not one of the other variants. Ugh. if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") && - Operands.size() == 6 && - static_cast<ARMOperand*>(Operands[3])->isReg() && - static_cast<ARMOperand*>(Operands[4])->isReg() && - static_cast<ARMOperand*>(Operands[5])->isImm()) { + Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() && + static_cast<ARMOperand &>(*Operands[4]).isReg() && + static_cast<ARMOperand &>(*Operands[5]).isImm()) { // Nest conditions rather than one big 'if' statement for readability. // // If both registers are low, we're in an IT block, and the immediate is // in range, we should use encoding T1 instead, which has a cc_out. if (inITBlock() && - isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) && - isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) && - static_cast<ARMOperand*>(Operands[5])->isImm0_7()) + isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) && + isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) && + static_cast<ARMOperand &>(*Operands[5]).isImm0_7()) return false; // Check against T3. If the second register is the PC, this is an // alternate form of ADR, which uses encoding T4, so check for that too. 
- if (static_cast<ARMOperand*>(Operands[4])->getReg() != ARM::PC && - static_cast<ARMOperand*>(Operands[5])->isT2SOImm()) + if (static_cast<ARMOperand &>(*Operands[4]).getReg() != ARM::PC && + static_cast<ARMOperand &>(*Operands[5]).isT2SOImm()) return false; // Otherwise, we use encoding T4, which does not have a cc_out @@ -5198,35 +5185,34 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, // if we have a "mul" mnemonic in Thumb mode, check if we'll be able to // use the 16-bit encoding or not. if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 6 && - static_cast<ARMOperand*>(Operands[1])->getReg() == 0 && - static_cast<ARMOperand*>(Operands[3])->isReg() && - static_cast<ARMOperand*>(Operands[4])->isReg() && - static_cast<ARMOperand*>(Operands[5])->isReg() && + static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 && + static_cast<ARMOperand &>(*Operands[3]).isReg() && + static_cast<ARMOperand &>(*Operands[4]).isReg() && + static_cast<ARMOperand &>(*Operands[5]).isReg() && // If the registers aren't low regs, the destination reg isn't the // same as one of the source regs, or the cc_out operand is zero // outside of an IT block, we have to use the 32-bit encoding, so // remove the cc_out operand. - (!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) || - !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) || - !isARMLowRegister(static_cast<ARMOperand*>(Operands[5])->getReg()) || - !inITBlock() || - (static_cast<ARMOperand*>(Operands[3])->getReg() != - static_cast<ARMOperand*>(Operands[5])->getReg() && - static_cast<ARMOperand*>(Operands[3])->getReg() != - static_cast<ARMOperand*>(Operands[4])->getReg()))) + (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) || + !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) || + !isARMLowRegister(static_cast<ARMOperand &>(*Operands[5]).getReg()) || + !inITBlock() || (static_cast<ARMOperand &>(*Operands[3]).getReg() != + static_cast<ARMOperand &>(*Operands[5]).getReg() && + static_cast<ARMOperand &>(*Operands[3]).getReg() != + static_cast<ARMOperand &>(*Operands[4]).getReg()))) return true; // Also check the 'mul' syntax variant that doesn't specify an explicit // destination register. if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 5 && - static_cast<ARMOperand*>(Operands[1])->getReg() == 0 && - static_cast<ARMOperand*>(Operands[3])->isReg() && - static_cast<ARMOperand*>(Operands[4])->isReg() && + static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 && + static_cast<ARMOperand &>(*Operands[3]).isReg() && + static_cast<ARMOperand &>(*Operands[4]).isReg() && // If the registers aren't low regs or the cc_out operand is zero // outside of an IT block, we have to use the 32-bit encoding, so // remove the cc_out operand. - (!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) || - !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) || + (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) || + !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) || !inITBlock())) return true; @@ -5239,32 +5225,32 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, // anyway. 
if (isThumb() && (Mnemonic == "add" || Mnemonic == "sub") && (Operands.size() == 5 || Operands.size() == 6) && - static_cast<ARMOperand*>(Operands[3])->isReg() && - static_cast<ARMOperand*>(Operands[3])->getReg() == ARM::SP && - static_cast<ARMOperand*>(Operands[1])->getReg() == 0 && - (static_cast<ARMOperand*>(Operands[4])->isImm() || + static_cast<ARMOperand &>(*Operands[3]).isReg() && + static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::SP && + static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 && + (static_cast<ARMOperand &>(*Operands[4]).isImm() || (Operands.size() == 6 && - static_cast<ARMOperand*>(Operands[5])->isImm()))) + static_cast<ARMOperand &>(*Operands[5]).isImm()))) return true; return false; } -bool ARMAsmParser::shouldOmitPredicateOperand( - StringRef Mnemonic, SmallVectorImpl<MCParsedAsmOperand *> &Operands) { +bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic, + OperandVector &Operands) { // VRINT{Z, R, X} have a predicate operand in VFP, but not in NEON unsigned RegIdx = 3; if ((Mnemonic == "vrintz" || Mnemonic == "vrintx" || Mnemonic == "vrintr") && - static_cast<ARMOperand *>(Operands[2])->getToken() == ".f32") { - if (static_cast<ARMOperand *>(Operands[3])->isToken() && - static_cast<ARMOperand *>(Operands[3])->getToken() == ".f32") + static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32") { + if (static_cast<ARMOperand &>(*Operands[3]).isToken() && + static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32") RegIdx = 4; - if (static_cast<ARMOperand *>(Operands[RegIdx])->isReg() && - (ARMMCRegisterClasses[ARM::DPRRegClassID] - .contains(static_cast<ARMOperand *>(Operands[RegIdx])->getReg()) || - ARMMCRegisterClasses[ARM::QPRRegClassID] - .contains(static_cast<ARMOperand *>(Operands[RegIdx])->getReg()))) + if (static_cast<ARMOperand &>(*Operands[RegIdx]).isReg() && + (ARMMCRegisterClasses[ARM::DPRRegClassID].contains( + static_cast<ARMOperand &>(*Operands[RegIdx]).getReg()) || + ARMMCRegisterClasses[ARM::QPRRegClassID].contains( + static_cast<ARMOperand &>(*Operands[RegIdx]).getReg()))) return true; } return false; @@ -5309,8 +5295,7 @@ static bool RequiresVFPRegListValidation(StringRef Inst, /// Parse an arm instruction mnemonic followed by its operands. bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc NameLoc, OperandVector &Operands) { // FIXME: Can this be done via tablegen in some fashion? bool RequireVFPRegisterListCheck; bool AcceptSinglePrecisionOnly; @@ -5489,12 +5474,12 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Parser.Lex(); // Consume the EndOfStatement if (RequireVFPRegisterListCheck) { - ARMOperand *Op = static_cast<ARMOperand*>(Operands.back()); - if (AcceptSinglePrecisionOnly && !Op->isSPRRegList()) - return Error(Op->getStartLoc(), + ARMOperand &Op = static_cast<ARMOperand &>(*Operands.back()); + if (AcceptSinglePrecisionOnly && !Op.isSPRRegList()) + return Error(Op.getStartLoc(), "VFP/Neon single precision register expected"); - if (AcceptDoublePrecisionOnly && !Op->isDPRRegList()) - return Error(Op->getStartLoc(), + if (AcceptDoublePrecisionOnly && !Op.isDPRRegList()) + return Error(Op.getStartLoc(), "VFP/Neon double precision register expected"); } @@ -5505,20 +5490,14 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // try to remove a cc_out operand that was explicitly set on the the // mnemonic, of course (CarrySetting == true). 
Reason number #317 the // table driven matcher doesn't fit well with the ARM instruction set. - if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands)) { - ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]); + if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands)) Operands.erase(Operands.begin() + 1); - delete Op; - } // Some instructions have the same mnemonic, but don't always // have a predicate. Distinguish them here and delete the // predicate if needed. - if (shouldOmitPredicateOperand(Mnemonic, Operands)) { - ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]); + if (shouldOmitPredicateOperand(Mnemonic, Operands)) Operands.erase(Operands.begin() + 1); - delete Op; - } // ARM mode 'blx' need special handling, as the register operand version // is predicable, but the label operand version is not. So, we can't rely @@ -5526,11 +5505,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // a k_CondCode operand in the list. If we're trying to match the label // version, remove the k_CondCode operand here. if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 && - static_cast<ARMOperand*>(Operands[2])->isImm()) { - ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]); + static_cast<ARMOperand &>(*Operands[2]).isImm()) Operands.erase(Operands.begin() + 1); - delete Op; - } // Adjust operands of ldrexd/strexd to MCK_GPRPair. // ldrexd/strexd require even/odd GPR pair. To enforce this constraint, @@ -5543,53 +5519,50 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Mnemonic == "stlexd")) { bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd"); unsigned Idx = isLoad ? 2 : 3; - ARMOperand* Op1 = static_cast<ARMOperand*>(Operands[Idx]); - ARMOperand* Op2 = static_cast<ARMOperand*>(Operands[Idx+1]); + ARMOperand &Op1 = static_cast<ARMOperand &>(*Operands[Idx]); + ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[Idx + 1]); const MCRegisterClass& MRC = MRI->getRegClass(ARM::GPRRegClassID); // Adjust only if Op1 and Op2 are GPRs. - if (Op1->isReg() && Op2->isReg() && MRC.contains(Op1->getReg()) && - MRC.contains(Op2->getReg())) { - unsigned Reg1 = Op1->getReg(); - unsigned Reg2 = Op2->getReg(); + if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) && + MRC.contains(Op2.getReg())) { + unsigned Reg1 = Op1.getReg(); + unsigned Reg2 = Op2.getReg(); unsigned Rt = MRI->getEncodingValue(Reg1); unsigned Rt2 = MRI->getEncodingValue(Reg2); // Rt2 must be Rt + 1 and Rt must be even. if (Rt + 1 != Rt2 || (Rt & 1)) { - Error(Op2->getStartLoc(), isLoad ? - "destination operands must be sequential" : - "source operands must be sequential"); + Error(Op2.getStartLoc(), isLoad + ? 
"destination operands must be sequential" + : "source operands must be sequential"); return true; } unsigned NewReg = MRI->getMatchingSuperReg(Reg1, ARM::gsub_0, &(MRI->getRegClass(ARM::GPRPairRegClassID))); - Operands.erase(Operands.begin() + Idx, Operands.begin() + Idx + 2); - Operands.insert(Operands.begin() + Idx, ARMOperand::CreateReg( - NewReg, Op1->getStartLoc(), Op2->getEndLoc())); - delete Op1; - delete Op2; + Operands[Idx] = + ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc()); + Operands.erase(Operands.begin() + Idx + 1); } } // GNU Assembler extension (compatibility) if ((Mnemonic == "ldrd" || Mnemonic == "strd")) { - ARMOperand *Op2 = static_cast<ARMOperand *>(Operands[2]); - ARMOperand *Op3 = static_cast<ARMOperand *>(Operands[3]); - if (Op3->isMem()) { - assert(Op2->isReg() && "expected register argument"); + ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]); + ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]); + if (Op3.isMem()) { + assert(Op2.isReg() && "expected register argument"); unsigned SuperReg = MRI->getMatchingSuperReg( - Op2->getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID)); + Op2.getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID)); assert(SuperReg && "expected register pair"); unsigned PairedReg = MRI->getSubReg(SuperReg, ARM::gsub_1); - Operands.insert(Operands.begin() + 3, - ARMOperand::CreateReg(PairedReg, - Op2->getStartLoc(), - Op2->getEndLoc())); + Operands.insert( + Operands.begin() + 3, + ARMOperand::CreateReg(PairedReg, Op2.getStartLoc(), Op2.getEndLoc())); } } @@ -5599,19 +5572,13 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // so the Mnemonic is the original name "subs" and delete the predicate // operand so it will match the table entry. if (isThumbTwo() && Mnemonic == "sub" && Operands.size() == 6 && - static_cast<ARMOperand*>(Operands[3])->isReg() && - static_cast<ARMOperand*>(Operands[3])->getReg() == ARM::PC && - static_cast<ARMOperand*>(Operands[4])->isReg() && - static_cast<ARMOperand*>(Operands[4])->getReg() == ARM::LR && - static_cast<ARMOperand*>(Operands[5])->isImm()) { - ARMOperand *Op0 = static_cast<ARMOperand*>(Operands[0]); - Operands.erase(Operands.begin()); - delete Op0; - Operands.insert(Operands.begin(), ARMOperand::CreateToken(Name, NameLoc)); - - ARMOperand *Op1 = static_cast<ARMOperand*>(Operands[1]); + static_cast<ARMOperand &>(*Operands[3]).isReg() && + static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::PC && + static_cast<ARMOperand &>(*Operands[4]).isReg() && + static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::LR && + static_cast<ARMOperand &>(*Operands[5]).isImm()) { + Operands.front() = ARMOperand::CreateToken(Name, NameLoc); Operands.erase(Operands.begin() + 1); - delete Op1; } return false; } @@ -5657,9 +5624,8 @@ static bool instIsBreakpoint(const MCInst &Inst) { } // FIXME: We would really like to be able to tablegen'erate this. -bool ARMAsmParser:: -validateInstruction(MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +bool ARMAsmParser::validateInstruction(MCInst &Inst, + const OperandVector &Operands) { const MCInstrDesc &MCID = MII.get(Inst.getOpcode()); SMLoc Loc = Operands[0]->getStartLoc(); @@ -5682,7 +5648,7 @@ validateInstruction(MCInst &Inst, // Find the condition code Operand to get its SMLoc information. 
SMLoc CondLoc; for (unsigned I = 1; I < Operands.size(); ++I) - if (static_cast<ARMOperand*>(Operands[I])->isCondCode()) + if (static_cast<ARMOperand &>(*Operands[I]).isCondCode()) CondLoc = Operands[I]->getStartLoc(); return Error(CondLoc, "incorrect condition in IT block; got '" + StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) + @@ -5782,8 +5748,8 @@ validateInstruction(MCInst &Inst, // in the register list. unsigned Rn = Inst.getOperand(0).getReg(); bool HasWritebackToken = - (static_cast<ARMOperand*>(Operands[3])->isToken() && - static_cast<ARMOperand*>(Operands[3])->getToken() == "!"); + (static_cast<ARMOperand &>(*Operands[3]).isToken() && + static_cast<ARMOperand &>(*Operands[3]).getToken() == "!"); bool ListContainsBase; if (checkLowRegisterList(Inst, 3, Rn, 0, ListContainsBase) && !isThumbTwo()) return Error(Operands[3 + HasWritebackToken]->getStartLoc(), @@ -5843,11 +5809,10 @@ validateInstruction(MCInst &Inst, // this first statement is always true for the new Inst. Essentially, the // destination is unconditionally copied into the second source operand // without checking to see if it matches what we actually parsed. - if (Operands.size() == 6 && - (((ARMOperand*)Operands[3])->getReg() != - ((ARMOperand*)Operands[5])->getReg()) && - (((ARMOperand*)Operands[3])->getReg() != - ((ARMOperand*)Operands[4])->getReg())) { + if (Operands.size() == 6 && (((ARMOperand &)*Operands[3]).getReg() != + ((ARMOperand &)*Operands[5]).getReg()) && + (((ARMOperand &)*Operands[3]).getReg() != + ((ARMOperand &)*Operands[4]).getReg())) { return Error(Operands[3]->getStartLoc(), "destination register must match source register"); } @@ -5900,23 +5865,23 @@ validateInstruction(MCInst &Inst, } // Final range checking for Thumb unconditional branch instructions. case ARM::tB: - if (!(static_cast<ARMOperand*>(Operands[2]))->isSignedOffset<11, 1>()) + if (!(static_cast<ARMOperand &>(*Operands[2])).isSignedOffset<11, 1>()) return Error(Operands[2]->getStartLoc(), "branch target out of range"); break; case ARM::t2B: { int op = (Operands[2]->isImm()) ? 2 : 3; - if (!(static_cast<ARMOperand*>(Operands[op]))->isSignedOffset<24, 1>()) + if (!static_cast<ARMOperand &>(*Operands[op]).isSignedOffset<24, 1>()) return Error(Operands[op]->getStartLoc(), "branch target out of range"); break; } // Final range checking for Thumb conditional branch instructions. case ARM::tBcc: - if (!(static_cast<ARMOperand*>(Operands[2]))->isSignedOffset<8, 1>()) + if (!static_cast<ARMOperand &>(*Operands[2]).isSignedOffset<8, 1>()) return Error(Operands[2]->getStartLoc(), "branch target out of range"); break; case ARM::t2Bcc: { int Op = (Operands[2]->isImm()) ? 2 : 3; - if (!(static_cast<ARMOperand*>(Operands[Op]))->isSignedOffset<20, 1>()) + if (!static_cast<ARMOperand &>(*Operands[Op]).isSignedOffset<20, 1>()) return Error(Operands[Op]->getStartLoc(), "branch target out of range"); break; } @@ -5931,19 +5896,19 @@ validateInstruction(MCInst &Inst, // lead to bugs that are difficult to find since this is an easy mistake // to make. int i = (Operands[3]->isImm()) ? 
3 : 4; - ARMOperand *Op = static_cast<ARMOperand*>(Operands[i]); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm()); + ARMOperand &Op = static_cast<ARMOperand &>(*Operands[i]); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()); if (CE) break; - const MCExpr *E = dyn_cast<MCExpr>(Op->getImm()); + const MCExpr *E = dyn_cast<MCExpr>(Op.getImm()); if (!E) break; const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E); if (!ARM16Expr || (ARM16Expr->getKind() != ARMMCExpr::VK_ARM_HI16 && - ARM16Expr->getKind() != ARMMCExpr::VK_ARM_LO16)) { - return Error(Op->getStartLoc(), - "immediate expression for mov requires :lower16: or :upper16"); - break; - } - } + ARM16Expr->getKind() != ARMMCExpr::VK_ARM_LO16)) + return Error( + Op.getStartLoc(), + "immediate expression for mov requires :lower16: or :upper16"); + break; + } } return false; @@ -6205,9 +6170,8 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) { } } -bool ARMAsmParser:: -processInstruction(MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +bool ARMAsmParser::processInstruction(MCInst &Inst, + const OperandVector &Operands) { switch (Inst.getOpcode()) { // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction. case ARM::LDRT_POST: @@ -6264,8 +6228,8 @@ processInstruction(MCInst &Inst, // Select the narrow version if the immediate will fit. if (Inst.getOperand(1).getImm() > 0 && Inst.getOperand(1).getImm() <= 0xff && - !(static_cast<ARMOperand*>(Operands[2])->isToken() && - static_cast<ARMOperand*>(Operands[2])->getToken() == ".w")) + !(static_cast<ARMOperand &>(*Operands[2]).isToken() && + static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w")) Inst.setOpcode(ARM::tLDRpci); else Inst.setOpcode(ARM::t2LDRpci); @@ -7355,8 +7319,8 @@ processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && - !(static_cast<ARMOperand*>(Operands[3])->isToken() && - static_cast<ARMOperand*>(Operands[3])->getToken() == ".w")) { + !(static_cast<ARMOperand &>(*Operands[3]).isToken() && + static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); @@ -7559,7 +7523,7 @@ processInstruction(MCInst &Inst, case ARM::LDMIA_UPD: // If this is a load of a single register via a 'pop', then we should use // a post-indexed LDR instruction instead, per the ARM ARM. - if (static_cast<ARMOperand*>(Operands[0])->getToken() == "pop" && + if (static_cast<ARMOperand &>(*Operands[0]).getToken() == "pop" && Inst.getNumOperands() == 5) { MCInst TmpInst; TmpInst.setOpcode(ARM::LDR_POST_IMM); @@ -7577,7 +7541,7 @@ processInstruction(MCInst &Inst, case ARM::STMDB_UPD: // If this is a store of a single register via a 'push', then we should use // a pre-indexed STR instruction instead, per the ARM ARM. - if (static_cast<ARMOperand*>(Operands[0])->getToken() == "push" && + if (static_cast<ARMOperand &>(*Operands[0]).getToken() == "push" && Inst.getNumOperands() == 5) { MCInst TmpInst; TmpInst.setOpcode(ARM::STR_PRE_IMM); @@ -7593,7 +7557,7 @@ processInstruction(MCInst &Inst, case ARM::t2ADDri12: // If the immediate fits for encoding T3 (t2ADDri) and the generic "add" // mnemonic was used (not "addw"), encoding T3 is preferred. 
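// The isSignedOffset<Bits, Scale> checks above decide whether a branch
// displacement fits its encoding. A hedged re-implementation of that shape:
// the value must be a multiple of 2^Scale and lie in the signed range the
// Bits-wide payload spans after scaling.
#include <cassert>
#include <cstdint>

template <unsigned Bits, unsigned Scale>
static bool isSignedOffset(int64_t Val) {
  int64_t Align = 1LL << Scale;
  int64_t Max = Align * ((1LL << (Bits - 1)) - 1); // largest encodable offset
  int64_t Min = -Align * (1LL << (Bits - 1));      // most negative offset
  return (Val % Align) == 0 && Val >= Min && Val <= Max;
}

int main() {
  assert(isSignedOffset<11, 1>(2046));  // tB: fits 11 bits with 2-byte step
  assert(!isSignedOffset<11, 1>(2048)); // one step out of range
  assert(!isSignedOffset<11, 1>(3));    // not a multiple of the step
}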
- if (static_cast<ARMOperand*>(Operands[0])->getToken() != "add" || + if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "add" || ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1) break; Inst.setOpcode(ARM::t2ADDri); @@ -7602,7 +7566,7 @@ processInstruction(MCInst &Inst, case ARM::t2SUBri12: // If the immediate fits for encoding T3 (t2SUBri) and the generic "sub" // mnemonic was used (not "subw"), encoding T3 is preferred. - if (static_cast<ARMOperand*>(Operands[0])->getToken() != "sub" || + if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "sub" || ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1) break; Inst.setOpcode(ARM::t2SUBri); @@ -7638,9 +7602,9 @@ processInstruction(MCInst &Inst, !isARMLowRegister(Inst.getOperand(0).getReg()) || (unsigned)Inst.getOperand(2).getImm() > 255 || ((!inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != 0)) || - (static_cast<ARMOperand*>(Operands[3])->isToken() && - static_cast<ARMOperand*>(Operands[3])->getToken() == ".w")) + (inITBlock() && Inst.getOperand(5).getReg() != 0)) || + (static_cast<ARMOperand &>(*Operands[3]).isToken() && + static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) break; MCInst TmpInst; TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ? @@ -7661,8 +7625,8 @@ processInstruction(MCInst &Inst, // 'as' behaviour. Make sure the wide encoding wasn't explicit. if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() || Inst.getOperand(5).getReg() != 0 || - (static_cast<ARMOperand*>(Operands[3])->isToken() && - static_cast<ARMOperand*>(Operands[3])->getToken() == ".w")) + (static_cast<ARMOperand &>(*Operands[3]).isToken() && + static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) break; MCInst TmpInst; TmpInst.setOpcode(ARM::tADDhirr); @@ -7719,8 +7683,8 @@ processInstruction(MCInst &Inst, // an error in validateInstruction(). unsigned Rn = Inst.getOperand(0).getReg(); bool hasWritebackToken = - (static_cast<ARMOperand*>(Operands[3])->isToken() && - static_cast<ARMOperand*>(Operands[3])->getToken() == "!"); + (static_cast<ARMOperand &>(*Operands[3]).isToken() && + static_cast<ARMOperand &>(*Operands[3]).getToken() == "!"); bool listContainsBase; if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) || (!listContainsBase && !hasWritebackToken) || @@ -7782,10 +7746,10 @@ processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && (unsigned)Inst.getOperand(1).getImm() <= 255 && ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL && - Inst.getOperand(4).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(4).getReg() == 0)) && - (!static_cast<ARMOperand*>(Operands[2])->isToken() || - static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) { + Inst.getOperand(4).getReg() == ARM::CPSR) || + (inITBlock() && Inst.getOperand(4).getReg() == 0)) && + (!static_cast<ARMOperand &>(*Operands[2]).isToken() || + static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { // The operands aren't in the same order for tMOVi8... 
MCInst TmpInst; TmpInst.setOpcode(ARM::tMOVi8); @@ -7806,8 +7770,8 @@ processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(1).getReg()) && Inst.getOperand(2).getImm() == ARMCC::AL && Inst.getOperand(4).getReg() == ARM::CPSR && - (!static_cast<ARMOperand*>(Operands[2])->isToken() || - static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) { + (!static_cast<ARMOperand &>(*Operands[2]).isToken() || + static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { // The operands aren't the same for tMOV[S]r... (no cc_out) MCInst TmpInst; TmpInst.setOpcode(Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr); @@ -7829,8 +7793,8 @@ processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && isARMLowRegister(Inst.getOperand(1).getReg()) && Inst.getOperand(2).getImm() == 0 && - (!static_cast<ARMOperand*>(Operands[2])->isToken() || - static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) { + (!static_cast<ARMOperand &>(*Operands[2]).isToken() || + static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("Illegal opcode!"); @@ -7942,9 +7906,10 @@ processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(2).getReg())) && Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && - (!static_cast<ARMOperand*>(Operands[3])->isToken() || - !static_cast<ARMOperand*>(Operands[3])->getToken().equals_lower(".w"))) { + (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && + (!static_cast<ARMOperand &>(*Operands[3]).isToken() || + !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower( + ".w"))) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); @@ -7981,9 +7946,10 @@ processInstruction(MCInst &Inst, (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() || Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) && ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && - (!static_cast<ARMOperand*>(Operands[3])->isToken() || - !static_cast<ARMOperand*>(Operands[3])->getToken().equals_lower(".w"))) { + (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && + (!static_cast<ARMOperand &>(*Operands[3]).isToken() || + !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower( + ".w"))) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); @@ -8063,11 +8029,10 @@ template <> inline bool IsCPSRDead<MCInst>(MCInst *Instr) { } static const char *getSubtargetFeatureName(unsigned Val); -bool ARMAsmParser:: -MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) { +bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, unsigned &ErrorInfo, + bool MatchingInlineAsm) { MCInst Inst; unsigned MatchResult; @@ -8136,7 +8101,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); - ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc(); + ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } @@ -8144,7 +8109,7 @@ MatchAndEmitInstruction(SMLoc 
IDLoc, unsigned &Opcode, } case Match_MnemonicFail: return Error(IDLoc, "invalid instruction", - ((ARMOperand*)Operands[0])->getLocRange()); + ((ARMOperand &)*Operands[0]).getLocRange()); case Match_RequiresNotITBlock: return Error(IDLoc, "flag setting instruction only valid outside IT block"); case Match_RequiresITBlock: @@ -8154,12 +8119,12 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_RequiresThumb2: return Error(IDLoc, "instruction variant requires Thumb2"); case Match_ImmRange0_15: { - SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc(); + SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; return Error(ErrorLoc, "immediate operand must be in the range [0,15]"); } case Match_ImmRange0_239: { - SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc(); + SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; return Error(ErrorLoc, "immediate operand must be in the range [0,239]"); } @@ -8175,7 +8140,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_DupAlignedMemoryRequires64or128: case Match_AlignedMemoryRequires64or128or256: { - SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getAlignmentLoc(); + SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getAlignmentLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; switch (MatchResult) { default: @@ -8923,28 +8888,22 @@ bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) { } // RAII object to make sure parsed operands are deleted. - struct CleanupObject { - SmallVector<MCParsedAsmOperand *, 1> Operands; - ~CleanupObject() { - for (unsigned I = 0, E = Operands.size(); I != E; ++I) - delete Operands[I]; - } - } CO; + SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands; // Parse the register list - if (parseRegisterList(CO.Operands)) + if (parseRegisterList(Operands)) return false; - ARMOperand *Op = (ARMOperand*)CO.Operands[0]; - if (!IsVector && !Op->isRegList()) { + ARMOperand &Op = (ARMOperand &)*Operands[0]; + if (!IsVector && !Op.isRegList()) { Error(L, ".save expects GPR registers"); return false; } - if (IsVector && !Op->isDPRRegList()) { + if (IsVector && !Op.isDPRRegList()) { Error(L, ".vsave expects DPR registers"); return false; } - getTargetStreamer().emitRegSave(Op->getRegList(), IsVector); + getTargetStreamer().emitRegSave(Op.getRegList(), IsVector); return false; } @@ -9468,23 +9427,23 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { // Define this matcher function after the auto-generated include so we // have the match class enum definitions. -unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, +unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, unsigned Kind) { - ARMOperand *Op = static_cast<ARMOperand*>(AsmOp); + ARMOperand &Op = static_cast<ARMOperand &>(AsmOp); // If the kind is a token for a literal immediate, check if our asm // operand matches. This is for InstAliases which have a fixed-value // immediate in the syntax. 
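// The Match_* cases above all follow the same ErrorInfo convention: the
// generated matcher hands back the index of the first operand it could not
// match (~0U when it has no idea), and the hand-written code maps that index
// to a source location, falling back to the start of the instruction. Hedged
// sketch with plain integers standing in for SMLoc:
#include <cstdio>
#include <vector>

struct Operand { unsigned Loc; }; // stand-in for MCParsedAsmOperand

static unsigned diagnosticLoc(const std::vector<Operand> &Operands,
                              unsigned ErrorInfo, unsigned InstLoc) {
  if (ErrorInfo >= Operands.size()) // covers the ~0U "unknown" case too
    return InstLoc;                 // point at the whole instruction
  unsigned Loc = Operands[ErrorInfo].Loc;
  return Loc ? Loc : InstLoc;       // invalid operand loc: fall back again
}

int main() {
  std::vector<Operand> Ops = {{10}, {14}, {20}};
  std::printf("%u\n", diagnosticLoc(Ops, 1u, 10));  // 14: the bad operand
  std::printf("%u\n", diagnosticLoc(Ops, ~0u, 10)); // 10: fall back
}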
switch (Kind) { default: break; case MCK__35_0: - if (Op->isImm()) - if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm())) + if (Op.isImm()) + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm())) if (CE->getValue() == 0) return Match_Success; break; case MCK_ARMSOImm: - if (Op->isImm()) { - const MCExpr *SOExpr = Op->getImm(); + if (Op.isImm()) { + const MCExpr *SOExpr = Op.getImm(); int64_t Value; if (!SOExpr->EvaluateAsAbsolute(Value)) return Match_Success; @@ -9493,8 +9452,8 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, } break; case MCK_GPRPair: - if (Op->isReg() && - MRI->getRegClass(ARM::GPRRegClassID).contains(Op->getReg())) + if (Op.isReg() && + MRI->getRegClass(ARM::GPRRegClassID).contains(Op.getReg())) return Match_Success; break; } diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index e4b785def87c..228fb5756ca8 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -1092,13 +1092,13 @@ void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, if (isSub) { O << ", " << markup("<imm:") - << "#-" << -OffImm + << "#-" << formatImm(-OffImm) << markup(">"); } else if (AlwaysPrintImm0 || OffImm > 0) { O << ", " << markup("<imm:") - << "#" << OffImm + << "#" << formatImm(OffImm) << markup(">"); } O << "]" << markup(">"); diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h index 42a1cbb8c222..1686d76b8e1b 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h +++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -295,7 +295,12 @@ namespace ARMII { /// MO_OPTION_MASK - Most flags are mutually exclusive; this mask selects /// just that part of the flag set. - MO_OPTION_MASK = 0x7f, + MO_OPTION_MASK = 0x3f, + + /// MO_DLLIMPORT - On a symbol operand, this represents that the reference + /// to the symbol is for an import stub. This is used for DLL import + /// storage class indication on Windows. 
+    MO_DLLIMPORT = 0x40,

     /// MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it
     /// represents a symbol which, if indirect, will get special Darwin mangling
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index a4d13edd3ac7..7b5d8b01dfe6 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -992,7 +992,8 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
     return;

   const MCSymbolData &SD = Streamer.getOrCreateSymbolData(Symbol);
-  if (MCELF::GetType(SD) & (ELF::STT_FUNC << ELF_STT_Shift))
+  unsigned Type = MCELF::GetType(SD);
+  if (Type == ELF_STT_Func || Type == ELF_STT_GnuIFunc)
     Streamer.EmitThumbFunc(Symbol);
 }

@@ -1160,7 +1161,7 @@ void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
   const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::Create(
       PersonalitySym, MCSymbolRefExpr::VK_ARM_NONE, getContext());

-  AddValueSymbols(PersonalityRef);
+  visitUsedExpr(*PersonalityRef);
   MCDataFragment *DF = getOrCreateDataFragment();
   DF->getFixups().push_back(MCFixup::Create(DF->getContents().size(),
                                             PersonalityRef,
@@ -1332,6 +1333,12 @@ MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
   return S;
 }

+MCStreamer *createARMNullStreamer(MCContext &Ctx) {
+  MCStreamer *S = llvm::createNullStreamer(Ctx);
+  new ARMTargetStreamer(*S);
+  return S;
+}
+
 MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
                                     raw_ostream &OS, MCCodeEmitter *Emitter,
                                     bool RelaxAll, bool NoExecStack,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 5b51a52f828a..b8ee55574972 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -1047,8 +1047,7 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
   // we have a movt or a movw, but that led to misleading results.
   // This is now disallowed in the AsmParser in validateInstruction()
   // so this should never happen.
-  assert(0 && "expression without :upper16: or :lower16:");
-  return 0;
+  llvm_unreachable("expression without :upper16: or :lower16:");
 }

 uint32_t ARMMCCodeEmitter::
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
index 87ea8751944a..e545e3c2f301 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
@@ -41,33 +41,6 @@ ARMMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
   return false;
 }

-// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
-// that method should be made public?
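// The FIXME above is resolved by this change: the hand-rolled recursive walk
// over the MCExpr tree (deleted just below) moves behind the generic
// MCStreamer::visitUsedExpr hook, so a target expression now only forwards
// its sub-expression. A hedged miniature of that kind of tree walk, with a
// simplified Expr type standing in for MCExpr:
#include <cassert>

struct Expr {
  enum Kind { Constant, SymbolRef, Unary, Binary } K;
  const Expr *LHS, *RHS; // children, where present
};

static unsigned countSymbolRefs(const Expr *E) {
  switch (E->K) {
  case Expr::Constant:  return 0;
  case Expr::SymbolRef: return 1; // the streamer registers the symbol here
  case Expr::Unary:     return countSymbolRefs(E->LHS);
  case Expr::Binary:    return countSymbolRefs(E->LHS) + countSymbolRefs(E->RHS);
  }
  return 0;
}

int main() {
  Expr S = {Expr::SymbolRef, nullptr, nullptr};
  Expr C = {Expr::Constant, nullptr, nullptr};
  Expr B = {Expr::Binary, &S, &C};
  assert(countSymbolRefs(&B) == 1); // one symbol used by "sym + const"
}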
-static void AddValueSymbols_(const MCExpr *Value, MCAssembler *Asm) { - switch (Value->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expr!"); - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value); - AddValueSymbols_(BE->getLHS(), Asm); - AddValueSymbols_(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: - Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbols_(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm); - break; - } -} - -void ARMMCExpr::AddValueSymbols(MCAssembler *Asm) const { - AddValueSymbols_(getSubExpr(), Asm); +void ARMMCExpr::visitUsedExpr(MCStreamer &Streamer) const { + Streamer.visitUsedExpr(*getSubExpr()); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h index d8191395c5ec..c5c0b10f8ad9 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h @@ -59,7 +59,7 @@ public: void PrintImpl(raw_ostream &OS) const override; bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout) const override; - void AddValueSymbols(MCAssembler *) const override; + void visitUsedExpr(MCStreamer &Streamer) const override; const MCSection *FindAssociatedSection() const override { return getSubExpr()->FindAssociatedSection(); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 04d63a7e6d46..2b3855d793af 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -427,6 +427,12 @@ extern "C" void LLVMInitializeARMTargetMC() { TargetRegistry::RegisterAsmStreamer(TheThumbLETarget, createMCAsmStreamer); TargetRegistry::RegisterAsmStreamer(TheThumbBETarget, createMCAsmStreamer); + // Register the null streamer. + TargetRegistry::RegisterNullStreamer(TheARMLETarget, createARMNullStreamer); + TargetRegistry::RegisterNullStreamer(TheARMBETarget, createARMNullStreamer); + TargetRegistry::RegisterNullStreamer(TheThumbLETarget, createARMNullStreamer); + TargetRegistry::RegisterNullStreamer(TheThumbBETarget, createARMNullStreamer); + // Register the MCInstPrinter. 
TargetRegistry::RegisterMCInstPrinter(TheARMLETarget, createARMMCInstPrinter); TargetRegistry::RegisterMCInstPrinter(TheARMBETarget, createARMMCInstPrinter); diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index 8853a8c69c50..5326e564f363 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -51,6 +51,8 @@ MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, MCInstPrinter *InstPrint, MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst); +MCStreamer *createARMNullStreamer(MCContext &Ctx); + MCCodeEmitter *createARMLEMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI, diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index ecfa4e54b26d..186776a1944f 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -32,6 +32,7 @@ class ARMMachObjectWriter : public MCMachObjectTargetWriter { const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + unsigned Type, unsigned Log2Size, uint64_t &FixedValue); void RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, @@ -251,11 +252,11 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + unsigned Type, unsigned Log2Size, uint64_t &FixedValue) { uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); - unsigned Type = MachO::ARM_RELOC_VANILLA; // See <reloc.h>. const MCSymbol *A = &Target.getSymA()->getSymbol(); @@ -272,6 +273,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, uint32_t Value2 = 0; if (const MCSymbolRefExpr *B = Target.getSymB()) { + assert(Type == MachO::ARM_RELOC_VANILLA && "invalid reloc for 2 symbols"); const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol()); if (!B_SD->getFragment()) @@ -374,7 +376,8 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer, return RecordARMScatteredHalfRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, FixedValue); return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, - Target, Log2Size, FixedValue); + Target, RelocType, Log2Size, + FixedValue); } // Get the symbol data, if any. @@ -392,7 +395,8 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer, Offset += 1 << Log2Size; if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD)) return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, - Target, Log2Size, FixedValue); + Target, RelocType, Log2Size, + FixedValue); // See <reloc.h>. 
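// The FixupOffset line just below is the usual address arithmetic for object
// emission: a fragment sits at some offset within its section, and the fixup
// records an offset within the fragment, so the relocation address is their
// sum. Hedged toy model:
#include <cassert>

struct Fragment { unsigned SectionOffset; }; // assigned by layout
struct Fixup { unsigned OffsetInFragment; }; // recorded at encode time

static unsigned fixupAddress(const Fragment &F, const Fixup &Fx) {
  return F.SectionOffset + Fx.OffsetInFragment;
}

int main() { assert(fixupAddress({0x40}, {0x4}) == 0x44); }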
   uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index e3cfb05b379d..ad3f1caae347 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -11,147 +11,12 @@
 //
 //===----------------------------------------------------------------------===//

 #include "llvm/ADT/MapVector.h"
+#include "llvm/MC/ConstantPools.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"

 using namespace llvm;
-
-namespace {
-// A class to keep track of assembler-generated constant pools that are used to
-// implement the ldr-pseudo.
-class ConstantPool {
-  typedef SmallVector<std::pair<MCSymbol *, const MCExpr *>, 4> EntryVecTy;
-  EntryVecTy Entries;
-
-public:
-  // Initialize a new empty constant pool
-  ConstantPool() {}
-
-  // Add a new entry to the constant pool in the next slot.
-  // \param Value is the new entry to put in the constant pool.
-  //
-  // \returns a MCExpr that references the newly inserted value
-  const MCExpr *addEntry(const MCExpr *Value, MCContext &Context);
-
-  // Emit the contents of the constant pool using the provided streamer.
-  void emitEntries(MCStreamer &Streamer);
-
-  // Return true if the constant pool is empty
-  bool empty();
-};
-}
-
-namespace llvm {
-class AssemblerConstantPools {
-  // Map type used to keep track of per-Section constant pools used by the
-  // ldr-pseudo opcode. The map associates a section to its constant pool. The
-  // constant pool is a vector of (label, value) pairs. When the ldr
-  // pseudo is parsed we insert a new (label, value) pair into the constant pool
-  // for the current section and add MCSymbolRefExpr to the new label as
-  // an operand to the ldr. After we have parsed all the user input we
-  // output the (label, value) pairs in each constant pool at the end of the
-  // section.
-  //
-  // We use the MapVector for the map type to ensure stable iteration of
-  // the sections at the end of the parse. We need to iterate over the
-  // sections in a stable order to ensure that we print the
-  // constant pools in a deterministic order when printing an assembly
-  // file.
-  typedef MapVector<const MCSection *, ConstantPool> ConstantPoolMapTy;
-  ConstantPoolMapTy ConstantPools;
-
-public:
-  AssemblerConstantPools() {}
-  ~AssemblerConstantPools() {}
-
-  void emitAll(MCStreamer &Streamer);
-  void emitForCurrentSection(MCStreamer &Streamer);
-  const MCExpr *addEntry(MCStreamer &Streamer, const MCExpr *Expr);
-
-private:
-  ConstantPool *getConstantPool(const MCSection *Section);
-  ConstantPool &getOrCreateConstantPool(const MCSection *Section);
-};
-}
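// Before the (deleted) implementation below, a hedged miniature of the
// per-section constant-pool scheme the comments above describe, now shared
// through llvm/MC/ConstantPools.h: addEntry records a (label, value) pair and
// hands back the label for the ldr-pseudo to reference; the pool is flushed
// at the end of the section. Strings and ints are stand-ins for MCSymbol and
// MCExpr.
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

struct MiniPool {
  std::vector<std::pair<std::string, int>> Entries; // (label, value)

  // Returns the label the ldr-pseudo should load from.
  std::string addEntry(int Value) {
    std::string Label = ".Lcp" + std::to_string(Entries.size());
    Entries.push_back(std::make_pair(Label, Value));
    return Label;
  }

  // Called once at the end of the section: dump 4-byte literal words.
  void emitEntries() {
    for (const auto &E : Entries)
      std::printf("%s: .word %d\n", E.first.c_str(), E.second);
    Entries.clear();
  }
};

int main() {
  MiniPool Pool;
  std::printf("ldr r0, %s\n", Pool.addEntry(0x12345678).c_str());
  Pool.emitEntries(); // .Lcp0: .word 305419896
}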
-
-//
-// ConstantPool implementation
-//
-// Emit the contents of the constant pool using the provided streamer.
-void ConstantPool::emitEntries(MCStreamer &Streamer) {
-  if (Entries.empty())
-    return;
-  Streamer.EmitCodeAlignment(4); // align to 4-byte address
-  Streamer.EmitDataRegion(MCDR_DataRegion);
-  for (EntryVecTy::const_iterator I = Entries.begin(), E = Entries.end();
-       I != E; ++I) {
-    Streamer.EmitLabel(I->first);
-    Streamer.EmitValue(I->second, 4);
-  }
-  Streamer.EmitDataRegion(MCDR_DataRegionEnd);
-  Entries.clear();
-}
-
-const MCExpr *ConstantPool::addEntry(const MCExpr *Value, MCContext &Context) {
-  MCSymbol *CPEntryLabel = Context.CreateTempSymbol();
-
-  Entries.push_back(std::make_pair(CPEntryLabel, Value));
-  return MCSymbolRefExpr::Create(CPEntryLabel, Context);
-}
-
-bool ConstantPool::empty() { return Entries.empty(); }
-
-//
-// AssemblerConstantPools implementation
-//
-ConstantPool *
-AssemblerConstantPools::getConstantPool(const MCSection *Section) {
-  ConstantPoolMapTy::iterator CP = ConstantPools.find(Section);
-  if (CP == ConstantPools.end())
-    return nullptr;
-
-  return &CP->second;
-}
-
-ConstantPool &
-AssemblerConstantPools::getOrCreateConstantPool(const MCSection *Section) {
-  return ConstantPools[Section];
-}
-
-static void emitConstantPool(MCStreamer &Streamer, const MCSection *Section,
-                             ConstantPool &CP) {
-  if (!CP.empty()) {
-    Streamer.SwitchSection(Section);
-    CP.emitEntries(Streamer);
-  }
-}
-
-void AssemblerConstantPools::emitAll(MCStreamer &Streamer) {
-  // Dump contents of assembler constant pools.
-  for (ConstantPoolMapTy::iterator CPI = ConstantPools.begin(),
-                                   CPE = ConstantPools.end();
-       CPI != CPE; ++CPI) {
-    const MCSection *Section = CPI->first;
-    ConstantPool &CP = CPI->second;
-
-    emitConstantPool(Streamer, Section, CP);
-  }
-}
-
-void AssemblerConstantPools::emitForCurrentSection(MCStreamer &Streamer) {
-  const MCSection *Section = Streamer.getCurrentSection().first;
-  if (ConstantPool *CP = getConstantPool(Section)) {
-    emitConstantPool(Streamer, Section, *CP);
-  }
-}
-
-const MCExpr *AssemblerConstantPools::addEntry(MCStreamer &Streamer,
-                                               const MCExpr *Expr) {
-  const MCSection *Section = Streamer.getCurrentSection().first;
-  return getOrCreateConstantPool(Section).addEntry(Expr, Streamer.getContext());
-}
-
 //
 // ARMTargetStreamer Implementation
 //
@@ -175,78 +40,34 @@ void ARMTargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
 // The remaining callbacks should be handled separately by each
 // streamer.
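// The hunk below turns every "llvm_unreachable" stub into an empty default
// body. Hedged sketch of why: with no-op defaults, generic drivers (such as
// the new ARM null streamer) can invoke any hook safely, and concrete
// streamers override only the callbacks they actually implement.
struct TargetStreamerBase {
  virtual ~TargetStreamerBase() {}
  virtual void emitFnStart() {} // default: do nothing
  virtual void emitPad(long) {} // real streamers override as needed
};

struct ELFLikeStreamer : TargetStreamerBase {
  void emitFnStart() override { /* would emit .fnstart here */ }
};

int main() {
  ELFLikeStreamer E;
  TargetStreamerBase B;
  TargetStreamerBase *Streamers[] = {&E, &B};
  for (TargetStreamerBase *S : Streamers)
    S->emitFnStart(); // safe for every implementation; nothing crashes
}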
-void ARMTargetStreamer::emitFnStart() { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::emitFnEnd() { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::emitCantUnwind() { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::emitPersonality(const MCSymbol *Personality) { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::emitPersonalityIndex(unsigned Index) { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::emitHandlerData() { - llvm_unreachable("unimplemented"); -} +void ARMTargetStreamer::emitFnStart() {} +void ARMTargetStreamer::emitFnEnd() {} +void ARMTargetStreamer::emitCantUnwind() {} +void ARMTargetStreamer::emitPersonality(const MCSymbol *Personality) {} +void ARMTargetStreamer::emitPersonalityIndex(unsigned Index) {} +void ARMTargetStreamer::emitHandlerData() {} void ARMTargetStreamer::emitSetFP(unsigned FpReg, unsigned SpReg, - int64_t Offset) { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::emitMovSP(unsigned Reg, int64_t Offset) { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::emitPad(int64_t Offset) { - llvm_unreachable("unimplemented"); -} -void -ARMTargetStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList, - bool isVector) { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::emitUnwindRaw( - int64_t StackOffset, const SmallVectorImpl<uint8_t> &Opcodes) { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::switchVendor(StringRef Vendor) { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::emitAttribute(unsigned Attribute, unsigned Value) { - llvm_unreachable("unimplemented"); -} + int64_t Offset) {} +void ARMTargetStreamer::emitMovSP(unsigned Reg, int64_t Offset) {} +void ARMTargetStreamer::emitPad(int64_t Offset) {} +void ARMTargetStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList, + bool isVector) {} +void ARMTargetStreamer::emitUnwindRaw(int64_t StackOffset, + const SmallVectorImpl<uint8_t> &Opcodes) { +} +void ARMTargetStreamer::switchVendor(StringRef Vendor) {} +void ARMTargetStreamer::emitAttribute(unsigned Attribute, unsigned Value) {} void ARMTargetStreamer::emitTextAttribute(unsigned Attribute, - StringRef String) { - llvm_unreachable("unimplemented"); -} + StringRef String) {} void ARMTargetStreamer::emitIntTextAttribute(unsigned Attribute, - unsigned IntValue, - StringRef StringValue) { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::emitArch(unsigned Arch) { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::emitObjectArch(unsigned Arch) { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::emitFPU(unsigned FPU) { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::finishAttributeSection() { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) { - llvm_unreachable("unimplemented"); -} -void ARMTargetStreamer::AnnotateTLSDescriptorSequence( - const MCSymbolRefExpr *SRE) { - llvm_unreachable("unimplemented"); -} + unsigned IntValue, + StringRef StringValue) {} +void ARMTargetStreamer::emitArch(unsigned Arch) {} +void ARMTargetStreamer::emitObjectArch(unsigned Arch) {} +void ARMTargetStreamer::emitFPU(unsigned FPU) {} +void ARMTargetStreamer::finishAttributeSection() {} +void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {} +void +ARMTargetStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) {} -void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr 
*Value) { - llvm_unreachable("unimplemented"); -} +void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {} diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index be29dc5c28d1..baa97a7c479a 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -21,6 +21,9 @@ using namespace llvm; +Thumb1FrameLowering::Thumb1FrameLowering(const ARMSubtarget &sti) + : ARMFrameLowering(sti) {} + bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const{ const MachineFrameInfo *FFI = MF.getFrameInfo(); unsigned CFSize = FFI->getMaxCallFrameSize(); diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h index f61874b1cb0a..a227f8ece73f 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.h +++ b/lib/Target/ARM/Thumb1FrameLowering.h @@ -11,11 +11,10 @@ // //===----------------------------------------------------------------------===// -#ifndef __THUMB_FRAMEINFO_H_ -#define __THUMB_FRAMEINFO_H_ +#ifndef LLVM_ARM_THUMB1FRAMELOWERING_H +#define LLVM_ARM_THUMB1FRAMELOWERING_H #include "ARMFrameLowering.h" -#include "ARMSubtarget.h" #include "Thumb1InstrInfo.h" #include "Thumb1RegisterInfo.h" #include "llvm/Target/TargetFrameLowering.h" @@ -24,9 +23,7 @@ namespace llvm { class Thumb1FrameLowering : public ARMFrameLowering { public: - explicit Thumb1FrameLowering(const ARMSubtarget &sti) - : ARMFrameLowering(sti) { - } + explicit Thumb1FrameLowering(const ARMSubtarget &sti); /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index 6267ecf53c7d..09debe76f1b6 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -1010,7 +1010,8 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { AttributeSet FnAttrs = MF.getFunction()->getAttributes(); OptimizeSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); - MinimizeSize = STI->isMinSize(); + MinimizeSize = + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); BlockInfo.clear(); BlockInfo.resize(MF.getNumBlockIDs()); diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index 15b574dd3688..f610fbb969bc 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -1577,6 +1577,10 @@ void CppWriter::printInstruction(const Instruction *I, nl(Out) << iName << "->setName(\""; printEscapedString(cxi->getName()); Out << "\");"; + nl(Out) << iName << "->setVolatile(" + << (cxi->isVolatile() ? "true" : "false") << ");"; + nl(Out) << iName << "->setWeak(" + << (cxi->isWeak() ? "true" : "false") << ");"; break; } case Instruction::AtomicRMW: { @@ -1607,6 +1611,8 @@ void CppWriter::printInstruction(const Instruction *I, nl(Out) << iName << "->setName(\""; printEscapedString(rmwi->getName()); Out << "\");"; + nl(Out) << iName << "->setVolatile(" + << (rmwi->isVolatile() ? 
"true" : "false") << ");"; break; } case Instruction::LandingPad: { diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index d551ca9dc7f5..21df12faefae 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -165,8 +165,8 @@ void HexagonFrameLowering::emitEpilogue(MachineFunction &MF, } // Replace 'jumpr r31' instruction with dealloc_return for V4 and higher // versions. - if (STI.hasV4TOps() && MBBI->getOpcode() == Hexagon::JMPret - && !DisableDeallocRet) { + if (MF.getTarget().getSubtarget<HexagonSubtarget>().hasV4TOps() && + MBBI->getOpcode() == Hexagon::JMPret && !DisableDeallocRet) { // Check for RESTORE_DEALLOC_RET_JMP_V4 call. Don't emit an extra DEALLOC // instruction if we encounter it. MachineBasicBlock::iterator BeforeJMPR = diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h index 446af161aa75..2d4b0b9d7eb3 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/lib/Target/Hexagon/HexagonFrameLowering.h @@ -11,20 +11,16 @@ #define HEXAGON_FRAMEINFO_H #include "Hexagon.h" -#include "HexagonSubtarget.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { class HexagonFrameLowering : public TargetFrameLowering { private: - const HexagonSubtarget &STI; void determineFrameLayout(MachineFunction &MF) const; public: - explicit HexagonFrameLowering(const HexagonSubtarget &sti) - : TargetFrameLowering(StackGrowsDown, 8, 0), STI(sti) { - } + explicit HexagonFrameLowering() : TargetFrameLowering(StackGrowsDown, 8, 0) {} /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index b8e5d24f4f16..a460ea4f3420 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -463,9 +463,10 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass; SmallVector<SDValue, 8> MemOpChains; + const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>( + DAG.getTarget().getRegisterInfo()); SDValue StackPtr = - DAG.getCopyFromReg(Chain, dl, TM.getRegisterInfo()->getStackRegister(), - getPointerTy()); + DAG.getCopyFromReg(Chain, dl, QRI->getStackRegister(), getPointerTy()); // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { @@ -720,7 +721,10 @@ SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op, cast<RegisterSDNode>(Node->getOperand(i))->getReg(); // Check it to be lr - if (Reg == TM.getRegisterInfo()->getRARegister()) { + const HexagonRegisterInfo *QRI = + static_cast<const HexagonRegisterInfo *>( + DAG.getTarget().getRegisterInfo()); + if (Reg == QRI->getRARegister()) { FuncInfo->setHasClobberLR(true); break; } @@ -812,9 +816,9 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, // The Sub result contains the new stack start address, so it // must be placed in the stack pointer register. 
- SDValue CopyChain = DAG.getCopyToReg(Chain, dl, - TM.getRegisterInfo()->getStackRegister(), - Sub); + const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>( + DAG.getTarget().getRegisterInfo()); + SDValue CopyChain = DAG.getCopyToReg(Chain, dl, QRI->getStackRegister(), Sub); SDValue Ops[2] = { ArgAdjust, CopyChain }; return DAG.getMergeValues(Ops, dl); @@ -944,21 +948,6 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { } SDValue -HexagonTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue CC = Op.getOperand(4); - SDValue TrueVal = Op.getOperand(2); - SDValue FalseVal = Op.getOperand(3); - SDLoc dl(Op); - SDNode* OpNode = Op.getNode(); - EVT SVT = OpNode->getValueType(0); - - SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i1, LHS, RHS, CC); - return DAG.getNode(ISD::SELECT, dl, SVT, Cond, TrueVal, FalseVal); -} - -SDValue HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT ValTy = Op.getValueType(); SDLoc dl(Op); @@ -975,7 +964,7 @@ HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { SDValue HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo(); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MFI->setReturnAddressIsTaken(true); @@ -1001,7 +990,8 @@ HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { SDValue HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { - const HexagonRegisterInfo *TRI = TM.getRegisterInfo(); + const HexagonRegisterInfo *TRI = + static_cast<const HexagonRegisterInfo *>(DAG.getTarget().getRegisterInfo()); MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setFrameAddressIsTaken(true); @@ -1053,429 +1043,422 @@ HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // TargetLowering Implementation //===----------------------------------------------------------------------===// -HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine - &targetmachine) - : TargetLowering(targetmachine, new HexagonTargetObjectFile()), - TM(targetmachine) { - - const HexagonRegisterInfo* QRI = TM.getRegisterInfo(); +HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &targetmachine) + : TargetLowering(targetmachine, new HexagonTargetObjectFile()), + TM(targetmachine) { - // Set up the register classes. - addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass); - addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass); - - if (QRI->Subtarget.hasV5TOps()) { - addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass); - addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass); - } + const HexagonSubtarget &Subtarget = TM.getSubtarget<HexagonSubtarget>(); - addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass); + // Set up the register classes. 
+ addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass); + addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass); - computeRegisterProperties(); + if (Subtarget.hasV5TOps()) { + addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass); + addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass); + } - // Align loop entry - setPrefLoopAlignment(4); + addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass); - // Limits for inline expansion of memcpy/memmove - MaxStoresPerMemcpy = 6; - MaxStoresPerMemmove = 6; + computeRegisterProperties(); - // - // Library calls for unsupported operations - // + // Align loop entry + setPrefLoopAlignment(4); - setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf"); - setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf"); - - setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti"); - setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti"); - - setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti"); - setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti"); - - setLibcallName(RTLIB::SDIV_I32, "__hexagon_divsi3"); - setOperationAction(ISD::SDIV, MVT::i32, Expand); - setLibcallName(RTLIB::SREM_I32, "__hexagon_umodsi3"); - setOperationAction(ISD::SREM, MVT::i32, Expand); - - setLibcallName(RTLIB::SDIV_I64, "__hexagon_divdi3"); - setOperationAction(ISD::SDIV, MVT::i64, Expand); - setLibcallName(RTLIB::SREM_I64, "__hexagon_moddi3"); - setOperationAction(ISD::SREM, MVT::i64, Expand); - - setLibcallName(RTLIB::UDIV_I32, "__hexagon_udivsi3"); - setOperationAction(ISD::UDIV, MVT::i32, Expand); - - setLibcallName(RTLIB::UDIV_I64, "__hexagon_udivdi3"); - setOperationAction(ISD::UDIV, MVT::i64, Expand); - - setLibcallName(RTLIB::UREM_I32, "__hexagon_umodsi3"); - setOperationAction(ISD::UREM, MVT::i32, Expand); - - setLibcallName(RTLIB::UREM_I64, "__hexagon_umoddi3"); - setOperationAction(ISD::UREM, MVT::i64, Expand); - - setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3"); - setOperationAction(ISD::FDIV, MVT::f32, Expand); - - setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3"); - setOperationAction(ISD::FDIV, MVT::f64, Expand); - - setOperationAction(ISD::FSQRT, MVT::f32, Expand); - setOperationAction(ISD::FSQRT, MVT::f64, Expand); - setOperationAction(ISD::FSIN, MVT::f32, Expand); - setOperationAction(ISD::FSIN, MVT::f64, Expand); - - if (QRI->Subtarget.hasV5TOps()) { - // Hexagon V5 Support. 
- setOperationAction(ISD::FADD, MVT::f32, Legal); - setOperationAction(ISD::FADD, MVT::f64, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); - setCondCodeAction(ISD::SETOEQ, MVT::f32, Legal); - setCondCodeAction(ISD::SETOEQ, MVT::f64, Legal); - setCondCodeAction(ISD::SETUEQ, MVT::f32, Legal); - setCondCodeAction(ISD::SETUEQ, MVT::f64, Legal); - - setCondCodeAction(ISD::SETOGE, MVT::f32, Legal); - setCondCodeAction(ISD::SETOGE, MVT::f64, Legal); - setCondCodeAction(ISD::SETUGE, MVT::f32, Legal); - setCondCodeAction(ISD::SETUGE, MVT::f64, Legal); - - setCondCodeAction(ISD::SETOGT, MVT::f32, Legal); - setCondCodeAction(ISD::SETOGT, MVT::f64, Legal); - setCondCodeAction(ISD::SETUGT, MVT::f32, Legal); - setCondCodeAction(ISD::SETUGT, MVT::f64, Legal); - - setCondCodeAction(ISD::SETOLE, MVT::f32, Legal); - setCondCodeAction(ISD::SETOLE, MVT::f64, Legal); - setCondCodeAction(ISD::SETOLT, MVT::f32, Legal); - setCondCodeAction(ISD::SETOLT, MVT::f64, Legal); - - setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - setOperationAction(ISD::ConstantFP, MVT::f64, Legal); - - setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote); - setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); - - setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); - setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); - - setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); - setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); - - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); - - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); - - setOperationAction(ISD::FABS, MVT::f32, Legal); - setOperationAction(ISD::FABS, MVT::f64, Expand); - - setOperationAction(ISD::FNEG, MVT::f32, Legal); - setOperationAction(ISD::FNEG, MVT::f64, Expand); - } else { + // Limits for inline expansion of memcpy/memmove + MaxStoresPerMemcpy = 6; + MaxStoresPerMemmove = 6; - // Expand fp<->uint. 
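// The setOperationAction / setLibcallName pairs throughout this constructor
// are the standard legalization vocabulary: Legal means the subtarget has an
// instruction for the node, Expand means rewrite it (often into the named
// libcall), Promote means widen the type first. Hedged mini-model of how a
// legalizer consumes such a table; the opcode and type values are made up:
#include <cassert>
#include <map>
#include <utility>

enum Action { Legal, Expand, Promote };
static std::map<std::pair<int, int>, Action> Table; // (opcode, type) -> action

static void setOperationAction(int Op, int VT, Action A) { Table[{Op, VT}] = A; }

int main() {
  const int ISD_FADD = 0, MVT_f32 = 0, MVT_f64 = 1;
  setOperationAction(ISD_FADD, MVT_f32, Legal);  // e.g. V5: hardware FP add
  setOperationAction(ISD_FADD, MVT_f64, Expand); // e.g. pre-V5: __hexagon_adddf3
  assert(Table[{ISD_FADD, MVT_f32}] == Legal);
  assert(Table[{ISD_FADD, MVT_f64}] == Expand);
}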
- setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); + // + // Library calls for unsupported operations + // - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); + setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf"); + setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf"); + + setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti"); + setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti"); + + setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti"); + setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti"); + + setLibcallName(RTLIB::SDIV_I32, "__hexagon_divsi3"); + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setLibcallName(RTLIB::SREM_I32, "__hexagon_umodsi3"); + setOperationAction(ISD::SREM, MVT::i32, Expand); + + setLibcallName(RTLIB::SDIV_I64, "__hexagon_divdi3"); + setOperationAction(ISD::SDIV, MVT::i64, Expand); + setLibcallName(RTLIB::SREM_I64, "__hexagon_moddi3"); + setOperationAction(ISD::SREM, MVT::i64, Expand); + + setLibcallName(RTLIB::UDIV_I32, "__hexagon_udivsi3"); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + + setLibcallName(RTLIB::UDIV_I64, "__hexagon_udivdi3"); + setOperationAction(ISD::UDIV, MVT::i64, Expand); + + setLibcallName(RTLIB::UREM_I32, "__hexagon_umodsi3"); + setOperationAction(ISD::UREM, MVT::i32, Expand); + + setLibcallName(RTLIB::UREM_I64, "__hexagon_umoddi3"); + setOperationAction(ISD::UREM, MVT::i64, Expand); + + setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3"); + setOperationAction(ISD::FDIV, MVT::f32, Expand); + + setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3"); + setOperationAction(ISD::FDIV, MVT::f64, Expand); + + setOperationAction(ISD::FSQRT, MVT::f32, Expand); + setOperationAction(ISD::FSQRT, MVT::f64, Expand); + setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FSIN, MVT::f64, Expand); + + if (Subtarget.hasV5TOps()) { + // Hexagon V5 Support. 
+ setOperationAction(ISD::FADD, MVT::f32, Legal); + setOperationAction(ISD::FADD, MVT::f64, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); + setCondCodeAction(ISD::SETOEQ, MVT::f32, Legal); + setCondCodeAction(ISD::SETOEQ, MVT::f64, Legal); + setCondCodeAction(ISD::SETUEQ, MVT::f32, Legal); + setCondCodeAction(ISD::SETUEQ, MVT::f64, Legal); + + setCondCodeAction(ISD::SETOGE, MVT::f32, Legal); + setCondCodeAction(ISD::SETOGE, MVT::f64, Legal); + setCondCodeAction(ISD::SETUGE, MVT::f32, Legal); + setCondCodeAction(ISD::SETUGE, MVT::f64, Legal); + + setCondCodeAction(ISD::SETOGT, MVT::f32, Legal); + setCondCodeAction(ISD::SETOGT, MVT::f64, Legal); + setCondCodeAction(ISD::SETUGT, MVT::f32, Legal); + setCondCodeAction(ISD::SETUGT, MVT::f64, Legal); + + setCondCodeAction(ISD::SETOLE, MVT::f32, Legal); + setCondCodeAction(ISD::SETOLE, MVT::f64, Legal); + setCondCodeAction(ISD::SETOLT, MVT::f32, Legal); + setCondCodeAction(ISD::SETOLT, MVT::f64, Legal); + + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + + setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); + + setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); + + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); + + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); + + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); + + setOperationAction(ISD::FABS, MVT::f32, Legal); + setOperationAction(ISD::FABS, MVT::f64, Expand); + + setOperationAction(ISD::FNEG, MVT::f32, Legal); + setOperationAction(ISD::FNEG, MVT::f64, Expand); + } else { - setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf"); - setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf"); + // Expand fp<->uint. 
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); - setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf"); - setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf"); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); - setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf"); - setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf"); + setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf"); + setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf"); - setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf"); - setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf"); + setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf"); + setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf"); - setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi"); - setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi"); + setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf"); + setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf"); - setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi"); - setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi"); + setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf"); + setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf"); - setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi"); - setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi"); + setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi"); + setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi"); - setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3"); - setOperationAction(ISD::FADD, MVT::f64, Expand); + setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi"); + setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi"); - setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3"); - setOperationAction(ISD::FADD, MVT::f32, Expand); + setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi"); + setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi"); - setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2"); - setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand); + setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3"); + setOperationAction(ISD::FADD, MVT::f64, Expand); - setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2"); - setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); + setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3"); + setOperationAction(ISD::FADD, MVT::f32, Expand); - setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2"); - setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); + setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2"); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand); - setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2"); - setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); + setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2"); + setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); - setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2"); - setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); + setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2"); + setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); - setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2"); - setCondCodeAction(ISD::SETOGT, MVT::f32, Expand); + setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2"); + setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); - setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2"); - setCondCodeAction(ISD::SETOGT, 
MVT::f64, Expand); + setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2"); + setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); - setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi"); - setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand); + setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2"); + setCondCodeAction(ISD::SETOGT, MVT::f32, Expand); - setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi"); - setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand); + setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2"); + setCondCodeAction(ISD::SETOGT, MVT::f64, Expand); - setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2"); - setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); + setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi"); + setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand); - setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2"); - setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); + setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi"); + setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand); - setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2"); - setCondCodeAction(ISD::SETOLT, MVT::f64, Expand); + setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2"); + setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); - setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2"); - setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); + setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2"); + setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); - setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3"); - setOperationAction(ISD::FMUL, MVT::f64, Expand); + setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2"); + setCondCodeAction(ISD::SETOLT, MVT::f64, Expand); - setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3"); - setOperationAction(ISD::MUL, MVT::f32, Expand); + setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2"); + setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); - setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2"); - setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); + setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3"); + setOperationAction(ISD::FMUL, MVT::f64, Expand); - setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2"); + setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3"); + setOperationAction(ISD::MUL, MVT::f32, Expand); - setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3"); - setOperationAction(ISD::SUB, MVT::f64, Expand); + setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2"); + setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); - setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3"); - setOperationAction(ISD::SUB, MVT::f32, Expand); + setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2"); - setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2"); - setOperationAction(ISD::FP_ROUND, MVT::f64, Expand); + setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3"); + setOperationAction(ISD::SUB, MVT::f64, Expand); - setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2"); - setCondCodeAction(ISD::SETUO, MVT::f64, Expand); + setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3"); + setOperationAction(ISD::SUB, MVT::f32, Expand); - setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2"); - setCondCodeAction(ISD::SETO, MVT::f64, Expand); + setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2"); + setOperationAction(ISD::FP_ROUND, MVT::f64, Expand); - setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2"); - setCondCodeAction(ISD::SETO, MVT::f32, Expand); + setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2"); + setCondCodeAction(ISD::SETUO, MVT::f64, Expand); - setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2"); - 
setCondCodeAction(ISD::SETUO, MVT::f32, Expand); + setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2"); + setCondCodeAction(ISD::SETO, MVT::f64, Expand); - setOperationAction(ISD::FABS, MVT::f32, Expand); - setOperationAction(ISD::FABS, MVT::f64, Expand); - setOperationAction(ISD::FNEG, MVT::f32, Expand); - setOperationAction(ISD::FNEG, MVT::f64, Expand); - } + setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2"); + setCondCodeAction(ISD::SETO, MVT::f32, Expand); - setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3"); - setOperationAction(ISD::SREM, MVT::i32, Expand); - - setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal); - setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal); - setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); - setIndexedLoadAction(ISD::POST_INC, MVT::i64, Legal); - - setIndexedStoreAction(ISD::POST_INC, MVT::i8, Legal); - setIndexedStoreAction(ISD::POST_INC, MVT::i16, Legal); - setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); - setIndexedStoreAction(ISD::POST_INC, MVT::i64, Legal); - - setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); - - // Turn FP extload into load/fextend. - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - // Hexagon has a i1 sign extending load. - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); - // Turn FP truncstore into trunc + store. - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - - // Custom legalize GlobalAddress nodes into CONST32. - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i8, Custom); - setOperationAction(ISD::BlockAddress, MVT::i32, Custom); - // Truncate action? - setOperationAction(ISD::TRUNCATE, MVT::i64, Expand); - - // Hexagon doesn't have sext_inreg, replace them with shl/sra. - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); - - // Hexagon has no REM or DIVREM operations. - setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::SREM, MVT::i32, Expand); - setOperationAction(ISD::SDIVREM, MVT::i32, Expand); - setOperationAction(ISD::UDIVREM, MVT::i32, Expand); - setOperationAction(ISD::SREM, MVT::i64, Expand); - setOperationAction(ISD::SDIVREM, MVT::i64, Expand); - setOperationAction(ISD::UDIVREM, MVT::i64, Expand); - - setOperationAction(ISD::BSWAP, MVT::i64, Expand); - - // Lower SELECT_CC to SETCC and SELECT. - setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); - - if (QRI->Subtarget.hasV5TOps()) { - - // We need to make the operation type of SELECT node to be Custom, - // such that we don't go into the infinite loop of - // select -> setcc -> select_cc -> select loop. - setOperationAction(ISD::SELECT, MVT::f32, Custom); - setOperationAction(ISD::SELECT, MVT::f64, Custom); - - setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2"); + setCondCodeAction(ISD::SETUO, MVT::f32, Expand); - } else { - - // Hexagon has no select or setcc: expand to SELECT_CC. - setOperationAction(ISD::SELECT, MVT::f32, Expand); - setOperationAction(ISD::SELECT, MVT::f64, Expand); + setOperationAction(ISD::FABS, MVT::f32, Expand); + setOperationAction(ISD::FABS, MVT::f64, Expand); + setOperationAction(ISD::FNEG, MVT::f32, Expand); + setOperationAction(ISD::FNEG, MVT::f64, Expand); + } - // This is a workaround documented in DAGCombiner.cpp:2892 We don't - // support SELECT_CC on every type. 
- setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3"); + setOperationAction(ISD::SREM, MVT::i32, Expand); + + setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal); + setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal); + setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); + setIndexedLoadAction(ISD::POST_INC, MVT::i64, Legal); + + setIndexedStoreAction(ISD::POST_INC, MVT::i8, Legal); + setIndexedStoreAction(ISD::POST_INC, MVT::i16, Legal); + setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); + setIndexedStoreAction(ISD::POST_INC, MVT::i64, Legal); + + setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); + + // Turn FP extload into load/fextend. + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + // Hexagon has an i1 sign-extending load. + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); + // Turn FP truncstore into trunc + store. + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + // Custom legalize GlobalAddress nodes into CONST32. + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i8, Custom); + setOperationAction(ISD::BlockAddress, MVT::i32, Custom); + // Truncate action? + setOperationAction(ISD::TRUNCATE, MVT::i64, Expand); + + // Hexagon doesn't have sext_inreg, replace them with shl/sra. + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + // Hexagon has no REM or DIVREM operations. + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + setOperationAction(ISD::UDIVREM, MVT::i64, Expand); + + setOperationAction(ISD::BSWAP, MVT::i64, Expand); + + // Lower SELECT_CC to SETCC and SELECT. + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + + if (Subtarget.hasV5TOps()) { + + // We need to make the operation type of SELECT node to be Custom, + // such that we don't go into the infinite loop of + // select -> setcc -> select_cc -> select loop. + setOperationAction(ISD::SELECT, MVT::f32, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); - } + } else { - if (EmitJumpTables) { - setOperationAction(ISD::BR_JT, MVT::Other, Custom); - } else { - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - } - // Increase jump tables cutover to 5, was 4. - setMinimumJumpTableEntries(5); - - setOperationAction(ISD::BR_CC, MVT::f32, Expand); - setOperationAction(ISD::BR_CC, MVT::f64, Expand); - setOperationAction(ISD::BR_CC, MVT::i1, Expand); - setOperationAction(ISD::BR_CC, MVT::i32, Expand); - setOperationAction(ISD::BR_CC, MVT::i64, Expand); - - setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); - - setOperationAction(ISD::FSIN , MVT::f64, Expand); - setOperationAction(ISD::FCOS , MVT::f64, Expand); - setOperationAction(ISD::FREM , MVT::f64, Expand); - setOperationAction(ISD::FSIN , MVT::f32, Expand); - setOperationAction(ISD::FCOS , MVT::f32, Expand); - setOperationAction(ISD::FREM , MVT::f32, Expand); - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - - // In V4, we have double word add/sub with carry.
The problem with - // modelling this instruction is that it produces 2 results - Rdd and Px. - // To model update of Px, we will have to use Defs[p0..p3] which will - // cause any predicate live range to spill. So, we pretend we dont't - // have these instructions. - setOperationAction(ISD::ADDE, MVT::i8, Expand); - setOperationAction(ISD::ADDE, MVT::i16, Expand); - setOperationAction(ISD::ADDE, MVT::i32, Expand); - setOperationAction(ISD::ADDE, MVT::i64, Expand); - setOperationAction(ISD::SUBE, MVT::i8, Expand); - setOperationAction(ISD::SUBE, MVT::i16, Expand); - setOperationAction(ISD::SUBE, MVT::i32, Expand); - setOperationAction(ISD::SUBE, MVT::i64, Expand); - setOperationAction(ISD::ADDC, MVT::i8, Expand); - setOperationAction(ISD::ADDC, MVT::i16, Expand); - setOperationAction(ISD::ADDC, MVT::i32, Expand); - setOperationAction(ISD::ADDC, MVT::i64, Expand); - setOperationAction(ISD::SUBC, MVT::i8, Expand); - setOperationAction(ISD::SUBC, MVT::i16, Expand); - setOperationAction(ISD::SUBC, MVT::i32, Expand); - setOperationAction(ISD::SUBC, MVT::i64, Expand); - - setOperationAction(ISD::CTPOP, MVT::i32, Expand); - setOperationAction(ISD::CTPOP, MVT::i64, Expand); - setOperationAction(ISD::CTTZ , MVT::i32, Expand); - setOperationAction(ISD::CTTZ , MVT::i64, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::CTLZ , MVT::i32, Expand); - setOperationAction(ISD::CTLZ , MVT::i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::ROTL , MVT::i32, Expand); - setOperationAction(ISD::ROTR , MVT::i32, Expand); - setOperationAction(ISD::BSWAP, MVT::i32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - setOperationAction(ISD::FPOW , MVT::f64, Expand); - setOperationAction(ISD::FPOW , MVT::f32, Expand); - - setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); - setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); - setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); - - setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); - setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); - - setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); - - setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); - - if (TM.getSubtargetImpl()->isSubtargetV2()) { - setExceptionPointerRegister(Hexagon::R20); - setExceptionSelectorRegister(Hexagon::R21); - } else { - setExceptionPointerRegister(Hexagon::R0); - setExceptionSelectorRegister(Hexagon::R1); - } + // Hexagon has no select or setcc: expand to SELECT_CC. + setOperationAction(ISD::SELECT, MVT::f32, Expand); + setOperationAction(ISD::SELECT, MVT::f64, Expand); + } - // VASTART needs to be custom lowered to use the VarArgsFrameIndex. - setOperationAction(ISD::VASTART , MVT::Other, Custom); + if (EmitJumpTables) { + setOperationAction(ISD::BR_JT, MVT::Other, Custom); + } else { + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + } + // Increase jump tables cutover to 5, was 4. 
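To make the infinite-loop comment above concrete: lowering SELECT_CC yields a SETCC feeding a SELECT, so if SELECT on the same type were in turn lowered back into SELECT_CC, legalization would cycle select -> setcc -> select_cc -> select forever; marking SELECT as Custom is what breaks the cycle. A scalar model of the decomposition, in plain C++ rather than SelectionDAG nodes:

    #include <cstdio>

    // select_cc(lhs, rhs, t, f) under an "ordered less than" condition,
    // split the way the legalizer splits it: a SETCC, then a SELECT.
    static double select_cc_olt(double lhs, double rhs, double t, double f) {
      bool cond = lhs < rhs; // the SETCC node
      return cond ? t : f;   // the SELECT node; must not re-lower to SELECT_CC
    }

    int main() {
      std::printf("%g\n", select_cc_olt(1.0, 2.0, 10.0, 20.0)); // prints 10
      return 0;
    }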
+ setMinimumJumpTableEntries(5); + + setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::BR_CC, MVT::f64, Expand); + setOperationAction(ISD::BR_CC, MVT::i1, Expand); + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_CC, MVT::i64, Expand); + + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); + + setOperationAction(ISD::FSIN, MVT::f64, Expand); + setOperationAction(ISD::FCOS, MVT::f64, Expand); + setOperationAction(ISD::FREM, MVT::f64, Expand); + setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FCOS, MVT::f32, Expand); + setOperationAction(ISD::FREM, MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + + // In V4, we have double word add/sub with carry. The problem with + // modelling this instruction is that it produces 2 results - Rdd and Px. + // To model update of Px, we will have to use Defs[p0..p3] which will + // cause any predicate live range to spill. So, we pretend we don't + // have these instructions. + setOperationAction(ISD::ADDE, MVT::i8, Expand); + setOperationAction(ISD::ADDE, MVT::i16, Expand); + setOperationAction(ISD::ADDE, MVT::i32, Expand); + setOperationAction(ISD::ADDE, MVT::i64, Expand); + setOperationAction(ISD::SUBE, MVT::i8, Expand); + setOperationAction(ISD::SUBE, MVT::i16, Expand); + setOperationAction(ISD::SUBE, MVT::i32, Expand); + setOperationAction(ISD::SUBE, MVT::i64, Expand); + setOperationAction(ISD::ADDC, MVT::i8, Expand); + setOperationAction(ISD::ADDC, MVT::i16, Expand); + setOperationAction(ISD::ADDC, MVT::i32, Expand); + setOperationAction(ISD::ADDC, MVT::i64, Expand); + setOperationAction(ISD::SUBC, MVT::i8, Expand); + setOperationAction(ISD::SUBC, MVT::i16, Expand); + setOperationAction(ISD::SUBC, MVT::i32, Expand); + setOperationAction(ISD::SUBC, MVT::i64, Expand); + + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + setOperationAction(ISD::CTPOP, MVT::i64, Expand); + setOperationAction(ISD::CTTZ, MVT::i32, Expand); + setOperationAction(ISD::CTTZ, MVT::i64, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTLZ, MVT::i32, Expand); + setOperationAction(ISD::CTLZ, MVT::i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTR, MVT::i32, Expand); + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FPOW, MVT::f64, Expand); + setOperationAction(ISD::FPOW, MVT::f32, Expand); + + setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); + + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + + setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + + setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); + + if (Subtarget.isSubtargetV2()) { + setExceptionPointerRegister(Hexagon::R20); + setExceptionSelectorRegister(Hexagon::R21); + } else { + setExceptionPointerRegister(Hexagon::R0); + setExceptionSelectorRegister(Hexagon::R1); + } - // Use the default
implementation. - setOperationAction(ISD::VAARG , MVT::Other, Expand); - setOperationAction(ISD::VACOPY , MVT::Other, Expand); - setOperationAction(ISD::VAEND , MVT::Other, Expand); - setOperationAction(ISD::STACKSAVE , MVT::Other, Expand); - setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand); + // VASTART needs to be custom lowered to use the VarArgsFrameIndex. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + // Use the default implementation. + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); - setOperationAction(ISD::INLINEASM , MVT::Other, Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + setOperationAction(ISD::INLINEASM, MVT::Other, Custom); - setMinFunctionAlignment(2); + setMinFunctionAlignment(2); - // Needed for DYNAMIC_STACKALLOC expansion. - unsigned StackRegister = TM.getRegisterInfo()->getStackRegister(); - setStackPointerRegisterToSaveRestore(StackRegister); - setSchedulingPreference(Sched::VLIW); + // Needed for DYNAMIC_STACKALLOC expansion. + const HexagonRegisterInfo *QRI = + static_cast<const HexagonRegisterInfo *>(TM.getRegisterInfo()); + setStackPointerRegisterToSaveRestore(QRI->getStackRegister()); + setSchedulingPreference(Sched::VLIW); } - const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { @@ -1577,7 +1560,6 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::SELECT: return Op; case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INLINEASM: return LowerINLINEASM(Op, DAG); @@ -1641,8 +1623,7 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(const /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - const HexagonRegisterInfo* QRI = TM.getRegisterInfo(); - return QRI->Subtarget.hasV5TOps(); + return TM.getSubtarget<HexagonSubtarget>().hasV5TOps(); } /// isLegalAddressingMode - Return true if the addressing mode represented by diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 4f27c276564c..ec16cc8f894b 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -74,8 +74,8 @@ namespace llvm { unsigned& RetSize) const; public: - HexagonTargetMachine &TM; - explicit HexagonTargetLowering(HexagonTargetMachine &targetmachine); + const TargetMachine &TM; + explicit HexagonTargetLowering(const TargetMachine &targetmachine); /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. 
Targets which want to do tail call @@ -124,7 +124,6 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, SDValue Callee) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index ea6367a89bce..1c95e06c8923 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1538,14 +1538,13 @@ int HexagonInstrInfo::GetDotOldOp(const int opc) const { int NewOp = opc; if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form NewOp = Hexagon::getPredOldOpcode(NewOp); - if (NewOp < 0) - assert(0 && "Couldn't change predicate new instruction to its old form."); + assert(NewOp >= 0 && + "Couldn't change predicate new instruction to its old form."); } if (isNewValueStore(NewOp)) { // Convert into non-new-value format NewOp = Hexagon::getNonNVStore(NewOp); - if (NewOp < 0) - assert(0 && "Couldn't change new-value store to its old form."); + assert(NewOp >= 0 && "Couldn't change new-value store to its old form."); } return NewOp; } diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp index 7dd6e956ccb6..6fcaa2057404 100644 --- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp +++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp @@ -20,7 +20,7 @@ using namespace llvm; #define DEBUG_TYPE "misched" -/// Platform specific modifications to DAG. +/// Platform-specific modifications to DAG. void VLIWMachineScheduler::postprocessDAG() { SUnit* LastSequentialCall = nullptr; // Currently we only catch the situation when compare gets scheduled @@ -150,7 +150,7 @@ void VLIWMachineScheduler::schedule() { buildDAGWithRegPressure(); - // Postprocess the DAG to add platform specific artificial dependencies. + // Postprocess the DAG to add platform-specific artificial dependencies. postprocessDAG(); SmallVector<SUnit*, 8> TopRoots, BotRoots; diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h index 99100a141e6e..8c4108609606 100644 --- a/lib/Target/Hexagon/HexagonMachineScheduler.h +++ b/lib/Target/Hexagon/HexagonMachineScheduler.h @@ -100,7 +100,7 @@ public: /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's /// time to do some work. virtual void schedule() override; - /// Perform platform specific DAG postprocessing. + /// Perform platform-specific DAG postprocessing. 
void postprocessDAG(); }; diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp index 9e1e0fdf3d48..b5db997eb1b8 100644 --- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp +++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp @@ -18,10 +18,8 @@ using namespace llvm; bool llvm::flag_aligned_memcpy; -HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const HexagonTargetMachine - &TM) - : TargetSelectionDAGInfo(TM) { -} +HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) {} HexagonSelectionDAGInfo::~HexagonSelectionDAGInfo() { } diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h index 8ba6108bdfad..b40b30320cf1 100644 --- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h +++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h @@ -18,11 +18,9 @@ namespace llvm { -class HexagonTargetMachine; - class HexagonSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit HexagonSelectionDAGInfo(const HexagonTargetMachine &TM); + explicit HexagonSelectionDAGInfo(const DataLayout &DL); ~HexagonSelectionDAGInfo(); SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp index 70c87fa19d1a..657893f32fee 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -48,10 +48,8 @@ EnableIEEERndNear( cl::Hidden, cl::ZeroOrMore, cl::init(false), cl::desc("Generate non-chopped conversion from fp to int.")); -HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS): - HexagonGenSubtargetInfo(TT, CPU, FS), - CPUString(CPU.str()) { - +HexagonSubtarget & +HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { // If the programmer has not specified a Hexagon version, default to -mv4. if (CPUString.empty()) CPUString = "hexagonv4"; @@ -70,6 +68,15 @@ HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS): } ParseSubtargetFeatures(CPUString, FS); + return *this; +} + +HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS, + const TargetMachine &TM) + : HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU.str()), + DL("e-m:e-p:32:32-i1:32-i64:64-a:0-n32"), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM), + TSInfo(DL), FrameLowering() { // Initialize scheduling itinerary for the specified CPU. InstrItins = getInstrItineraryForCPU(CPUString); diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h index 690bef0d7296..b184e62b4d0d 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.h +++ b/lib/Target/Hexagon/HexagonSubtarget.h @@ -14,6 +14,11 @@ #ifndef Hexagon_SUBTARGET_H #define Hexagon_SUBTARGET_H +#include "HexagonFrameLowering.h" +#include "HexagonInstrInfo.h" +#include "HexagonISelLowering.h" +#include "HexagonSelectionDAGInfo.h" +#include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -28,6 +33,7 @@ namespace llvm { class HexagonSubtarget : public HexagonGenSubtargetInfo { virtual void anchor(); + bool UseMemOps; bool ModeIEEERndNear; @@ -37,16 +43,35 @@ public: }; HexagonArchEnum HexagonArchVersion; +private: std::string CPUString; + const DataLayout DL; // Calculates type size & alignment. 
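The initializeSubtargetDependencies helper introduced above exists because C++ constructs data members in declaration order: the CPU string must be defaulted and the feature string parsed before InstrInfo's constructor reads the result, and calling the helper inside InstrInfo's initializer sequences exactly that. A minimal standalone model of the idiom, with hypothetical names rather than the LLVM classes:

    #include <cassert>
    #include <string>

    struct Subtarget;

    struct InstrInfo {
      explicit InstrInfo(const Subtarget &ST); // reads ST.CPUString
    };

    struct Subtarget {
      std::string CPUString; // declared, and thus constructed, first
      InstrInfo II;          // constructed only after the helper has run

      // Runs inside II's initializer, before InstrInfo's constructor body.
      Subtarget &initializeSubtargetDependencies(const std::string &CPU) {
        CPUString = CPU.empty() ? "hexagonv4" : CPU; // defaulting logic
        return *this;
      }

      explicit Subtarget(const std::string &CPU)
          : II(initializeSubtargetDependencies(CPU)) {}
    };

    InstrInfo::InstrInfo(const Subtarget &ST) {
      assert(!ST.CPUString.empty() && "helper must have run first");
    }

    int main() {
      Subtarget ST(""); // picks up the "hexagonv4" default
      return ST.CPUString == "hexagonv4" ? 0 : 1;
    }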
+ HexagonInstrInfo InstrInfo; + HexagonTargetLowering TLInfo; + HexagonSelectionDAGInfo TSInfo; + HexagonFrameLowering FrameLowering; InstrItineraryData InstrItins; public: - HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS); + HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS, + const TargetMachine &TM); /// getInstrItins - Return the instruction itineraries based on subtarget /// selection. const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + const HexagonInstrInfo *getInstrInfo() const { return &InstrInfo; } + const HexagonRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + const HexagonTargetLowering *getTargetLowering() const { return &TLInfo; } + const HexagonFrameLowering *getFrameLowering() const { + return &FrameLowering; + } + const HexagonSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + const DataLayout *getDataLayout() const { return &DL; } + HexagonSubtarget &initializeSubtargetDependencies(StringRef CPU, + StringRef FS); /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index b9237647ff4a..78314100d18a 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -67,15 +67,10 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler", HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, - CodeModel::Model CM, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - DL("e-m:e-p:32:32-i1:32-i64:64-a:0-n32") , - Subtarget(TT, CPU, FS), InstrInfo(Subtarget), TLInfo(*this), - TSInfo(*this), - FrameLowering(Subtarget), - InstrItins(&Subtarget.getInstrItineraryData()) { + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h index 70b835e61eef..d88178e052e7 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/lib/Target/Hexagon/HexagonTargetMachine.h @@ -14,12 +14,8 @@ #ifndef HexagonTARGETMACHINE_H #define HexagonTARGETMACHINE_H -#include "HexagonFrameLowering.h" -#include "HexagonISelLowering.h" #include "HexagonInstrInfo.h" -#include "HexagonSelectionDAGInfo.h" #include "HexagonSubtarget.h" -#include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -27,13 +23,7 @@ namespace llvm { class Module; class HexagonTargetMachine : public LLVMTargetMachine { - const DataLayout DL; // Calculates type size & alignment.
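For reference, the DataLayout string that moves from the target machine into the subtarget here decodes, under LLVM's datalayout grammar, roughly as:

    e          little-endian
    m:e        ELF-style symbol mangling
    p:32:32    32-bit pointers, 32-bit aligned
    i1:32      i1 carries 32-bit ABI alignment
    i64:64     i64 carries 64-bit ABI alignment
    a:0        aggregates have no fixed ABI alignment of their own
    n32        the native integer width is 32 bits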
HexagonSubtarget Subtarget; - HexagonInstrInfo InstrInfo; - HexagonTargetLowering TLInfo; - HexagonSelectionDAGInfo TSInfo; - HexagonFrameLowering FrameLowering; - const InstrItineraryData* InstrItins; public: HexagonTargetMachine(const Target &T, StringRef TT,StringRef CPU, @@ -42,33 +32,29 @@ public: CodeGenOpt::Level OL); const HexagonInstrInfo *getInstrInfo() const override { - return &InstrInfo; + return getSubtargetImpl()->getInstrInfo(); } const HexagonSubtarget *getSubtargetImpl() const override { return &Subtarget; } const HexagonRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); + return getSubtargetImpl()->getRegisterInfo(); } - const InstrItineraryData* getInstrItineraryData() const override { - return InstrItins; + return &getSubtargetImpl()->getInstrItineraryData(); } - - const HexagonTargetLowering* getTargetLowering() const override { - return &TLInfo; + return getSubtargetImpl()->getTargetLowering(); } - const HexagonFrameLowering* getFrameLowering() const override { - return &FrameLowering; + return getSubtargetImpl()->getFrameLowering(); } - const HexagonSelectionDAGInfo* getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); + } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); } - - const DataLayout *getDataLayout() const override { return &DL; } static unsigned getModuleMatchQuality(const Module &M); TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h index d464dd99698a..fadfeedd1853 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.h +++ b/lib/Target/MSP430/MSP430FrameLowering.h @@ -15,20 +15,15 @@ #define MSP430_FRAMEINFO_H #include "MSP430.h" -#include "MSP430Subtarget.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { - class MSP430Subtarget; - class MSP430FrameLowering : public TargetFrameLowering { protected: - const MSP430Subtarget &STI; public: - explicit MSP430FrameLowering(const MSP430Subtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2), - STI(sti) {} + explicit MSP430FrameLowering() + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2) {} /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index c5901bcd6e47..3d3ee92d4bcb 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -57,11 +57,8 @@ HWMultMode("msp430-hwmult-mode", cl::Hidden, "Assume hardware multiplier cannot be used inside interrupts"), clEnumValEnd)); -MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) : - TargetLowering(tm, new TargetLoweringObjectFileELF()), - Subtarget(*tm.getSubtargetImpl()) { - - TD = getDataLayout(); +MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM) + : TargetLowering(TM, new TargetLoweringObjectFileELF()) { // Set up the register classes. addRegisterClass(MVT::i8, &MSP430::GR8RegClass); @@ -1032,7 +1029,7 @@ MSP430TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. 
- uint64_t SlotSize = TD->getPointerSize(); + uint64_t SlotSize = getDataLayout()->getPointerSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, true); FuncInfo->setRAIndex(ReturnAddrIndex); @@ -1055,7 +1052,7 @@ SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op, if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = - DAG.getConstant(TD->getPointerSize(), MVT::i16); + DAG.getConstant(getDataLayout()->getPointerSize(), MVT::i16); return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, getPointerTy(), FrameAddr, Offset), diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h index 3ced61de3f4f..3e2f344aeb5a 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.h +++ b/lib/Target/MSP430/MSP430ISelLowering.h @@ -66,12 +66,9 @@ namespace llvm { }; } - class MSP430Subtarget; - class MSP430TargetMachine; - class MSP430TargetLowering : public TargetLowering { public: - explicit MSP430TargetLowering(MSP430TargetMachine &TM); + explicit MSP430TargetLowering(const TargetMachine &TM); MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; } @@ -170,9 +167,6 @@ namespace llvm { SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override; - - const MSP430Subtarget &Subtarget; - const DataLayout *TD; }; } // namespace llvm diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp index 0c04ddbb110b..ccb6c09e3f41 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -30,9 +30,9 @@ using namespace llvm; // Pin the vtable to this file. void MSP430InstrInfo::anchor() {} -MSP430InstrInfo::MSP430InstrInfo(MSP430TargetMachine &tm) +MSP430InstrInfo::MSP430InstrInfo(MSP430Subtarget &STI) : MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP), - RI(tm) {} + RI() {} void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h index 1ffcebb01543..e6baaefe4842 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.h +++ b/lib/Target/MSP430/MSP430InstrInfo.h @@ -22,7 +22,7 @@ namespace llvm { -class MSP430TargetMachine; +class MSP430Subtarget; /// MSP430II - This namespace holds all of the target specific flags that /// instruction info tracks. @@ -44,7 +44,7 @@ class MSP430InstrInfo : public MSP430GenInstrInfo { const MSP430RegisterInfo RI; virtual void anchor(); public: - explicit MSP430InstrInfo(MSP430TargetMachine &TM); + explicit MSP430InstrInfo(MSP430Subtarget &STI); /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 341fb64b8fcb..691bceef0de0 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -32,10 +32,8 @@ using namespace llvm; #include "MSP430GenRegisterInfo.inc" // FIXME: Provide proper call frame setup / destroy opcodes. 
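Both MSP430 hunks above lean on pointer-size arithmetic: the return-address slot is one pointer wide at offset -SlotSize, and the depth > 0 path of LowerRETURNADDR loads the word one pointer size above the frame address. A toy model of that layout, assuming a hypothetical 16-bit frame chain rather than the real MSP430 frame format:

    #include <cstdint>
    #include <cstdio>

    // Toy 16-bit memory, indexed by byte address / 2.
    static uint16_t load16(const uint16_t *mem, uint16_t addr) {
      return mem[addr / 2];
    }

    // Walk the saved-FP chain `depth` times, then read the word one
    // pointer size (2 bytes) above the resulting frame address.
    static uint16_t returnAddress(const uint16_t *mem, uint16_t fp,
                                  unsigned depth) {
      while (depth--)
        fp = load16(mem, fp); // *fp holds the caller's frame pointer
      return load16(mem, fp + 2);
    }

    int main() {
      uint16_t mem[32] = {0};
      mem[0x10 / 2] = 0x20;   // frame at 0x10 links to frame at 0x20
      mem[0x12 / 2] = 0xBEEF; // current frame's return address
      mem[0x22 / 2] = 0xCAFE; // caller's return address
      std::printf("depth 0: %#x, depth 1: %#x\n",
                  returnAddress(mem, 0x10, 0), returnAddress(mem, 0x10, 1));
      return 0;
    }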
-MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm) - : MSP430GenRegisterInfo(MSP430::PCW), TM(tm) { - StackAlign = TM.getFrameLowering()->getStackAlignment(); -} +MSP430RegisterInfo::MSP430RegisterInfo() + : MSP430GenRegisterInfo(MSP430::PCW) {} const MCPhysReg* MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h index a607528fd118..cb0196134a8d 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.h +++ b/lib/Target/MSP430/MSP430RegisterInfo.h @@ -21,18 +21,9 @@ namespace llvm { -class TargetInstrInfo; -class MSP430TargetMachine; - struct MSP430RegisterInfo : public MSP430GenRegisterInfo { -private: - MSP430TargetMachine &TM; - - /// StackAlign - Default stack alignment. - /// - unsigned StackAlign; public: - MSP430RegisterInfo(MSP430TargetMachine &tm); + MSP430RegisterInfo(); /// Code Generation virtual methods... const MCPhysReg * diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp index c700383b5ccc..3897ef684d4d 100644 --- a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp +++ b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp @@ -16,9 +16,8 @@ using namespace llvm; #define DEBUG_TYPE "msp430-selectiondag-info" -MSP430SelectionDAGInfo::MSP430SelectionDAGInfo(const MSP430TargetMachine &TM) - : TargetSelectionDAGInfo(TM) { -} +MSP430SelectionDAGInfo::MSP430SelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) {} MSP430SelectionDAGInfo::~MSP430SelectionDAGInfo() { } diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.h b/lib/Target/MSP430/MSP430SelectionDAGInfo.h index fa8194830ff8..cb04adc0de1e 100644 --- a/lib/Target/MSP430/MSP430SelectionDAGInfo.h +++ b/lib/Target/MSP430/MSP430SelectionDAGInfo.h @@ -22,7 +22,7 @@ class MSP430TargetMachine; class MSP430SelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit MSP430SelectionDAGInfo(const MSP430TargetMachine &TM); + explicit MSP430SelectionDAGInfo(const DataLayout &DL); ~MSP430SelectionDAGInfo(); }; diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp index 68ad0911aff4..dbddc5243db0 100644 --- a/lib/Target/MSP430/MSP430Subtarget.cpp +++ b/lib/Target/MSP430/MSP430Subtarget.cpp @@ -25,12 +25,15 @@ using namespace llvm; void MSP430Subtarget::anchor() { } -MSP430Subtarget::MSP430Subtarget(const std::string &TT, - const std::string &CPU, - const std::string &FS) : - MSP430GenSubtargetInfo(TT, CPU, FS) { - std::string CPUName = "generic"; - - // Parse features string. - ParseSubtargetFeatures(CPUName, FS); +MSP430Subtarget &MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { + ParseSubtargetFeatures("generic", FS); + return *this; } + +MSP430Subtarget::MSP430Subtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, const TargetMachine &TM) + : MSP430GenSubtargetInfo(TT, CPU, FS), + // FIXME: Check DataLayout string. 
+ DL("e-m:e-p:16:16-i32:16:32-n8:16"), FrameLowering(), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM), + TSInfo(DL) {} diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h index 4d8792eede7f..0152ad19dfcd 100644 --- a/lib/Target/MSP430/MSP430Subtarget.h +++ b/lib/Target/MSP430/MSP430Subtarget.h @@ -14,6 +14,12 @@ #ifndef LLVM_TARGET_MSP430_SUBTARGET_H #define LLVM_TARGET_MSP430_SUBTARGET_H +#include "MSP430FrameLowering.h" +#include "MSP430InstrInfo.h" +#include "MSP430ISelLowering.h" +#include "MSP430RegisterInfo.h" +#include "MSP430SelectionDAGInfo.h" +#include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -26,16 +32,33 @@ class StringRef; class MSP430Subtarget : public MSP430GenSubtargetInfo { virtual void anchor(); bool ExtendedInsts; + const DataLayout DL; // Calculates type size & alignment + MSP430FrameLowering FrameLowering; + MSP430InstrInfo InstrInfo; + MSP430TargetLowering TLInfo; + MSP430SelectionDAGInfo TSInfo; + public: /// This constructor initializes the data members to match that /// of the specified triple. /// MSP430Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS); + const std::string &FS, const TargetMachine &TM); + + MSP430Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; } + const MSP430InstrInfo *getInstrInfo() const { return &InstrInfo; } + const DataLayout *getDataLayout() const { return &DL; } + const TargetRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + const MSP430TargetLowering *getTargetLowering() const { return &TLInfo; } + const MSP430SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } }; } // End llvm namespace diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index 50be2be3007b..5ca36f2e4e77 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -24,19 +24,13 @@ extern "C" void LLVMInitializeMSP430Target() { RegisterTargetMachine<MSP430TargetMachine> X(TheMSP430Target); } -MSP430TargetMachine::MSP430TargetMachine(const Target &T, - StringRef TT, - StringRef CPU, - StringRef FS, +MSP430TargetMachine::MSP430TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS), - // FIXME: Check DataLayout string. 
- DL("e-m:e-p:16:16-i32:16:32-n8:16"), - InstrInfo(*this), TLInfo(*this), TSInfo(*this), - FrameLowering(Subtarget) { + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h index ea5d4073700b..efa84039ff6b 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.h +++ b/lib/Target/MSP430/MSP430TargetMachine.h @@ -15,13 +15,7 @@ #ifndef LLVM_TARGET_MSP430_TARGETMACHINE_H #define LLVM_TARGET_MSP430_TARGETMACHINE_H -#include "MSP430FrameLowering.h" -#include "MSP430ISelLowering.h" -#include "MSP430InstrInfo.h" -#include "MSP430RegisterInfo.h" -#include "MSP430SelectionDAGInfo.h" #include "MSP430Subtarget.h" -#include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" @@ -31,11 +25,6 @@ namespace llvm { /// class MSP430TargetMachine : public LLVMTargetMachine { MSP430Subtarget Subtarget; - const DataLayout DL; // Calculates type size & alignment - MSP430InstrInfo InstrInfo; - MSP430TargetLowering TLInfo; - MSP430SelectionDAGInfo TSInfo; - MSP430FrameLowering FrameLowering; public: MSP430TargetMachine(const Target &T, StringRef TT, @@ -44,22 +33,25 @@ public: CodeGenOpt::Level OL); const TargetFrameLowering *getFrameLowering() const override { - return &FrameLowering; + return getSubtargetImpl()->getFrameLowering(); + } + const MSP430InstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); + } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); + } + const MSP430Subtarget *getSubtargetImpl() const override { + return &Subtarget; } - const MSP430InstrInfo *getInstrInfo() const override { return &InstrInfo; } - const DataLayout *getDataLayout() const override { return &DL;} - const MSP430Subtarget *getSubtargetImpl() const override { return &Subtarget; } - const TargetRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); + return getSubtargetImpl()->getRegisterInfo(); } - const MSP430TargetLowering *getTargetLowering() const override { - return &TLInfo; + return getSubtargetImpl()->getTargetLowering(); } - - const MSP430SelectionDAGInfo* getSelectionDAGInfo() const override { - return &TSInfo; + const MSP430SelectionDAGInfo *getSelectionDAGInfo() const override { + return getSubtargetImpl()->getSelectionDAGInfo(); } TargetPassConfig *createPassConfig(PassManagerBase &PM) override; }; // MSP430TargetMachine. 
diff --git a/lib/Target/Mips/Android.mk b/lib/Target/Mips/Android.mk index 4e8831c0e166..9f437f8196c6 100644 --- a/lib/Target/Mips/Android.mk +++ b/lib/Target/Mips/Android.mk @@ -8,6 +8,7 @@ mips_codegen_TBLGEN_TABLES := \ MipsGenMCPseudoLowering.inc \ MipsGenAsmWriter.inc \ MipsGenDAGISel.inc \ + MipsGenFastISel.inc \ MipsGenCallingConv.inc \ MipsGenSubtargetInfo.inc diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 86fd386967f9..0c06be8c7d99 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -72,72 +72,69 @@ class MipsAsmParser : public MCTargetAsmParser { #define GET_ASSEMBLER_HEADER #include "MipsGenAsmMatcher.inc" + unsigned checkTargetMatchPredicate(MCInst &Inst) override; + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand *> &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, bool MatchingInlineAsm) override; /// Parse a register as used in CFI directives bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseParenSuffix(StringRef Name, - SmallVectorImpl<MCParsedAsmOperand *> &Operands); + bool ParseParenSuffix(StringRef Name, OperandVector &Operands); - bool ParseBracketSuffix(StringRef Name, - SmallVectorImpl<MCParsedAsmOperand *> &Operands); + bool ParseBracketSuffix(StringRef Name, OperandVector &Operands); - bool - ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand *> &Operands) override; + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; - MipsAsmParser::OperandMatchResultTy - parseMemOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands); - - MipsAsmParser::OperandMatchResultTy MatchAnyRegisterNameWithoutDollar( - SmallVectorImpl<MCParsedAsmOperand *> &Operands, StringRef Identifier, - SMLoc S); + MipsAsmParser::OperandMatchResultTy parseMemOperand(OperandVector &Operands); MipsAsmParser::OperandMatchResultTy - MatchAnyRegisterWithoutDollar(SmallVectorImpl<MCParsedAsmOperand *> &Operands, - SMLoc S); + MatchAnyRegisterNameWithoutDollar(OperandVector &Operands, + StringRef Identifier, SMLoc S); MipsAsmParser::OperandMatchResultTy - ParseAnyRegister(SmallVectorImpl<MCParsedAsmOperand *> &Operands); + MatchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S); - MipsAsmParser::OperandMatchResultTy - ParseImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands); + MipsAsmParser::OperandMatchResultTy ParseAnyRegister(OperandVector &Operands); - MipsAsmParser::OperandMatchResultTy - ParseJumpTarget(SmallVectorImpl<MCParsedAsmOperand *> &Operands); + MipsAsmParser::OperandMatchResultTy ParseImm(OperandVector &Operands); - MipsAsmParser::OperandMatchResultTy - parseInvNum(SmallVectorImpl<MCParsedAsmOperand *> &Operands); + MipsAsmParser::OperandMatchResultTy ParseJumpTarget(OperandVector &Operands); - MipsAsmParser::OperandMatchResultTy - ParseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands); + MipsAsmParser::OperandMatchResultTy parseInvNum(OperandVector &Operands); - bool searchSymbolAlias(SmallVectorImpl<MCParsedAsmOperand *> &Operands); + MipsAsmParser::OperandMatchResultTy ParseLSAImm(OperandVector &Operands); - bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &, - StringRef Mnemonic); + bool 
searchSymbolAlias(OperandVector &Operands); + + bool ParseOperand(OperandVector &, StringRef Mnemonic); bool needsExpansion(MCInst &Inst); - void expandInstruction(MCInst &Inst, SMLoc IDLoc, + // Expands assembly pseudo instructions. + // Returns false on success, true otherwise. + bool expandInstruction(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); - void expandLoadImm(MCInst &Inst, SMLoc IDLoc, + + bool expandLoadImm(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); - void expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc, + + bool expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); - void expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc, + + bool expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); + void expandMemInst(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions, bool isLoad, bool isImmOpnd); - bool reportParseError(StringRef ErrorMsg); - bool reportParseError(SMLoc Loc, StringRef ErrorMsg); + bool reportParseError(Twine ErrorMsg); + bool reportParseError(SMLoc Loc, Twine ErrorMsg); bool parseMemOffset(const MCExpr *&Res, bool isParenExpr); bool parseRelocOperand(const MCExpr *&Res); @@ -159,32 +156,20 @@ class MipsAsmParser : public MCTargetAsmParser { bool parseSetReorderDirective(); bool parseSetNoReorderDirective(); bool parseSetNoMips16Directive(); + bool parseSetFpDirective(); bool parseSetAssignment(); bool parseDataDirective(unsigned Size, SMLoc L); bool parseDirectiveGpWord(); bool parseDirectiveGpDWord(); + bool parseDirectiveModule(); + bool parseDirectiveModuleFP(); + bool parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI, + StringRef Directive); MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol); - bool isGP64() const { - return (STI.getFeatureBits() & Mips::FeatureGP64Bit) != 0; - } - - bool isFP64() const { - return (STI.getFeatureBits() & Mips::FeatureFP64Bit) != 0; - } - - bool isN32() const { return STI.getFeatureBits() & Mips::FeatureN32; } - bool isN64() const { return STI.getFeatureBits() & Mips::FeatureN64; } - - bool isMicroMips() const { - return STI.getFeatureBits() & Mips::FeatureMicroMips; - } - - bool parseRegister(unsigned &RegNum); - bool eatComma(StringRef ErrorStr); int matchCPURegisterName(StringRef Symbol); @@ -205,7 +190,7 @@ class MipsAsmParser : public MCTargetAsmParser { unsigned getGPR(int RegNo); - int getATReg(); + int getATReg(SMLoc Loc); bool processInstruction(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); @@ -230,23 +215,85 @@ class MipsAsmParser : public MCTargetAsmParser { } public: + enum MipsMatchResultTy { + Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY +#define GET_OPERAND_DIAGNOSTIC_TYPES +#include "MipsGenAsmMatcher.inc" +#undef GET_OPERAND_DIAGNOSTIC_TYPES + + }; + MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, - const MCInstrInfo &MII, - const MCTargetOptions &Options) + const MCInstrInfo &MII, const MCTargetOptions &Options) : MCTargetAsmParser(), STI(sti), Parser(parser) { // Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + getTargetStreamer().updateABIInfo(*this); + // Assert exactly one ABI was chosen. 
assert((((STI.getFeatureBits() & Mips::FeatureO32) != 0) + ((STI.getFeatureBits() & Mips::FeatureEABI) != 0) + ((STI.getFeatureBits() & Mips::FeatureN32) != 0) + ((STI.getFeatureBits() & Mips::FeatureN64) != 0)) == 1); + + if (!isABI_O32() && !allowOddSPReg() != 0) + report_fatal_error("-mno-odd-spreg requires the O32 ABI"); } MCAsmParser &getParser() const { return Parser; } MCAsmLexer &getLexer() const { return Parser.getLexer(); } + /// True if all of $fcc0 - $fcc7 exist for the current ISA. + bool hasEightFccRegisters() const { return hasMips4() || hasMips32(); } + + bool isGP64bit() const { return STI.getFeatureBits() & Mips::FeatureGP64Bit; } + bool isFP64bit() const { return STI.getFeatureBits() & Mips::FeatureFP64Bit; } + bool isABI_N32() const { return STI.getFeatureBits() & Mips::FeatureN32; } + bool isABI_N64() const { return STI.getFeatureBits() & Mips::FeatureN64; } + bool isABI_O32() const { return STI.getFeatureBits() & Mips::FeatureO32; } + bool isABI_FPXX() const { return false; } // TODO: add check for FeatureXX + + bool allowOddSPReg() const { + return !(STI.getFeatureBits() & Mips::FeatureNoOddSPReg); + } + + bool inMicroMipsMode() const { + return STI.getFeatureBits() & Mips::FeatureMicroMips; + } + bool hasMips1() const { return STI.getFeatureBits() & Mips::FeatureMips1; } + bool hasMips2() const { return STI.getFeatureBits() & Mips::FeatureMips2; } + bool hasMips3() const { return STI.getFeatureBits() & Mips::FeatureMips3; } + bool hasMips4() const { return STI.getFeatureBits() & Mips::FeatureMips4; } + bool hasMips5() const { return STI.getFeatureBits() & Mips::FeatureMips5; } + bool hasMips32() const { + return (STI.getFeatureBits() & Mips::FeatureMips32); + } + bool hasMips64() const { + return (STI.getFeatureBits() & Mips::FeatureMips64); + } + bool hasMips32r2() const { + return (STI.getFeatureBits() & Mips::FeatureMips32r2); + } + bool hasMips64r2() const { + return (STI.getFeatureBits() & Mips::FeatureMips64r2); + } + bool hasMips32r6() const { + return (STI.getFeatureBits() & Mips::FeatureMips32r6); + } + bool hasMips64r6() const { + return (STI.getFeatureBits() & Mips::FeatureMips64r6); + } + bool hasDSP() const { return (STI.getFeatureBits() & Mips::FeatureDSP); } + bool hasDSPR2() const { return (STI.getFeatureBits() & Mips::FeatureDSPR2); } + bool hasMSA() const { return (STI.getFeatureBits() & Mips::FeatureMSA); } + + bool inMips16Mode() const { + return STI.getFeatureBits() & Mips::FeatureMips16; + } + // TODO: see how can we get this info. + bool mipsSEUsesSoftFloat() const { return false; } + /// Warn if RegNo is the current assembler temporary. void WarnIfAssemblerTemporary(int RegNo, SMLoc Loc); }; @@ -261,9 +308,9 @@ public: /// Broad categories of register classes /// The exact class is finalized by the render method. 
enum RegKind { - RegKind_GPR = 1, /// GPR32 and GPR64 (depending on isGP64()) + RegKind_GPR = 1, /// GPR32 and GPR64 (depending on isGP64bit()) RegKind_FGR = 2, /// FGR32, FGR64, AFGR64 (depending on context and - /// isFP64()) + /// isFP64bit()) RegKind_FCC = 4, /// FCC RegKind_MSA128 = 8, /// MSA128[BHWD] (makes no difference which) RegKind_MSACtrl = 16, /// MSA control registers @@ -289,9 +336,11 @@ private: k_Token /// A simple token } Kind; +public: MipsOperand(KindTy K, MipsAsmParser &Parser) : MCParsedAsmOperand(), Kind(K), AsmParser(Parser) {} +private: /// For diagnostics, and checking the assembler temporary MipsAsmParser &AsmParser; @@ -330,10 +379,11 @@ private: SMLoc StartLoc, EndLoc; /// Internal constructor for register kinds - static MipsOperand *CreateReg(unsigned Index, RegKind RegKind, - const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, - MipsAsmParser &Parser) { - MipsOperand *Op = new MipsOperand(k_RegisterIndex, Parser); + static std::unique_ptr<MipsOperand> CreateReg(unsigned Index, RegKind RegKind, + const MCRegisterInfo *RegInfo, + SMLoc S, SMLoc E, + MipsAsmParser &Parser) { + auto Op = make_unique<MipsOperand>(k_RegisterIndex, Parser); Op->RegIdx.Index = Index; Op->RegIdx.RegInfo = RegInfo; Op->RegIdx.Kind = RegKind; @@ -521,6 +571,10 @@ public: void addFGR32AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::CreateReg(getFGR32Reg())); + // FIXME: We ought to do this for -integrated-as without -via-file-asm too. + if (!AsmParser.allowOddSPReg() && RegIdx.Index & 1) + AsmParser.Error(StartLoc, "-mno-odd-spreg prohibits the use of odd FPU " + "registers"); } void addFGRH32AsmRegOperands(MCInst &Inst, unsigned N) const { @@ -612,6 +666,12 @@ public: return Kind == k_Token; } bool isMem() const override { return Kind == k_Memory; } + bool isConstantMemOff() const { + return isMem() && dyn_cast<MCConstantExpr>(getMemOff()); + } + template <unsigned Bits> bool isMemWithSimmOffset() const { + return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff()); + } bool isInvNum() const { return Kind == k_Immediate; } bool isLSAImm() const { if (!isConstantImm()) @@ -656,9 +716,13 @@ public: return Mem.Off; } - static MipsOperand *CreateToken(StringRef Str, SMLoc S, - MipsAsmParser &Parser) { - MipsOperand *Op = new MipsOperand(k_Token, Parser); + int64_t getConstantMemOff() const { + return static_cast<const MCConstantExpr *>(getMemOff())->getValue(); + } + + static std::unique_ptr<MipsOperand> CreateToken(StringRef Str, SMLoc S, + MipsAsmParser &Parser) { + auto Op = make_unique<MipsOperand>(k_Token, Parser); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -668,74 +732,75 @@ public: /// Create a numeric register (e.g. $1). The exact register remains /// unresolved until an instruction successfully matches - static MipsOperand *CreateNumericReg(unsigned Index, - const MCRegisterInfo *RegInfo, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { + static std::unique_ptr<MipsOperand> + CreateNumericReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, + SMLoc E, MipsAsmParser &Parser) { DEBUG(dbgs() << "CreateNumericReg(" << Index << ", ...)\n"); return CreateReg(Index, RegKind_Numeric, RegInfo, S, E, Parser); } /// Create a register that is definitely a GPR. /// This is typically only used for named registers such as $gp. 
- static MipsOperand *CreateGPRReg(unsigned Index, - const MCRegisterInfo *RegInfo, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { + static std::unique_ptr<MipsOperand> + CreateGPRReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, + MipsAsmParser &Parser) { return CreateReg(Index, RegKind_GPR, RegInfo, S, E, Parser); } /// Create a register that is definitely a FGR. /// This is typically only used for named registers such as $f0. - static MipsOperand *CreateFGRReg(unsigned Index, - const MCRegisterInfo *RegInfo, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { + static std::unique_ptr<MipsOperand> + CreateFGRReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, + MipsAsmParser &Parser) { return CreateReg(Index, RegKind_FGR, RegInfo, S, E, Parser); } /// Create a register that is definitely an FCC. /// This is typically only used for named registers such as $fcc0. - static MipsOperand *CreateFCCReg(unsigned Index, - const MCRegisterInfo *RegInfo, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { + static std::unique_ptr<MipsOperand> + CreateFCCReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, + MipsAsmParser &Parser) { return CreateReg(Index, RegKind_FCC, RegInfo, S, E, Parser); } /// Create a register that is definitely an ACC. /// This is typically only used for named registers such as $ac0. - static MipsOperand *CreateACCReg(unsigned Index, - const MCRegisterInfo *RegInfo, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { + static std::unique_ptr<MipsOperand> + CreateACCReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, + MipsAsmParser &Parser) { return CreateReg(Index, RegKind_ACC, RegInfo, S, E, Parser); } /// Create a register that is definitely an MSA128. /// This is typically only used for named registers such as $w0. - static MipsOperand *CreateMSA128Reg(unsigned Index, - const MCRegisterInfo *RegInfo, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { + static std::unique_ptr<MipsOperand> + CreateMSA128Reg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, + SMLoc E, MipsAsmParser &Parser) { return CreateReg(Index, RegKind_MSA128, RegInfo, S, E, Parser); } /// Create a register that is definitely an MSACtrl. /// This is typically only used for named registers such as $msaaccess. 
- static MipsOperand *CreateMSACtrlReg(unsigned Index, - const MCRegisterInfo *RegInfo, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { + static std::unique_ptr<MipsOperand> + CreateMSACtrlReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, + SMLoc E, MipsAsmParser &Parser) { return CreateReg(Index, RegKind_MSACtrl, RegInfo, S, E, Parser); } - static MipsOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, - MipsAsmParser &Parser) { - MipsOperand *Op = new MipsOperand(k_Immediate, Parser); + static std::unique_ptr<MipsOperand> + CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MipsAsmParser &Parser) { + auto Op = make_unique<MipsOperand>(k_Immediate, Parser); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static MipsOperand *CreateMem(MipsOperand *Base, const MCExpr *Off, SMLoc S, - SMLoc E, MipsAsmParser &Parser) { - MipsOperand *Op = new MipsOperand(k_Memory, Parser); - Op->Mem.Base = Base; + static std::unique_ptr<MipsOperand> + CreateMem(std::unique_ptr<MipsOperand> Base, const MCExpr *Off, SMLoc S, + SMLoc E, MipsAsmParser &Parser) { + auto Op = make_unique<MipsOperand>(k_Memory, Parser); + Op->Mem.Base = Base.release(); Op->Mem.Off = Off; Op->StartLoc = S; Op->EndLoc = E; @@ -756,7 +821,11 @@ public: return isRegIdx() && RegIdx.Kind & RegKind_CCR && RegIdx.Index <= 31; } bool isFCCAsmReg() const { - return isRegIdx() && RegIdx.Kind & RegKind_FCC && RegIdx.Index <= 7; + if (!(isRegIdx() && RegIdx.Kind & RegKind_FCC)) + return false; + if (!AsmParser.hasEightFccRegisters()) + return RegIdx.Index == 0; + return RegIdx.Index <= 7; } bool isACCAsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_ACC && RegIdx.Index <= 3; @@ -849,9 +918,10 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, Offset = Inst.getOperand(2); if (!Offset.isImm()) break; // We'll deal with this situation later on when applying fixups. - if (!isIntN(isMicroMips() ? 17 : 18, Offset.getImm())) + if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), 1LL << (isMicroMips() ? 1 : 2))) + if (OffsetToAlignment(Offset.getImm(), + 1LL << (inMicroMipsMode() ? 1 : 2))) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BGEZ: @@ -874,14 +944,23 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, Offset = Inst.getOperand(1); if (!Offset.isImm()) break; // We'll deal with this situation later on when applying fixups. - if (!isIntN(isMicroMips() ? 17 : 18, Offset.getImm())) + if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm())) return Error(IDLoc, "branch target out of range"); - if (OffsetToAlignment(Offset.getImm(), 1LL << (isMicroMips() ? 1 : 2))) + if (OffsetToAlignment(Offset.getImm(), + 1LL << (inMicroMipsMode() ? 1 : 2))) return Error(IDLoc, "branch to misaligned address"); break; } } + // SSNOP is deprecated on MIPS32r6/MIPS64r6 + // We still accept it but it is a normal nop. + if (hasMips32r6() && Inst.getOpcode() == Mips::SSNOP) { + std::string ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6"; + Warning(IDLoc, "ssnop is deprecated for " + ISA + " and is equivalent to a " + "nop instruction"); + } + if (MCID.hasDelaySlot() && Options.isReorder()) { // If this instruction has a delay slot and .set reorder is active, // emit a NOP after it. 
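
The next hunk turns the expand* pseudo-instruction helpers from void into bool so a failed expansion can abort matching (false means success, true means error, the usual MC-parser convention). A minimal standalone sketch of that convention follows; expandLoadImm and processInstruction here are hypothetical stand-ins for the real methods, not the LLVM API:

#include <cstdio>

// Hypothetical stand-ins for the MC-layer methods; illustrative only.
static bool expandLoadImm(long long Imm) {
  if (Imm > 0xffffffffLL) {      // pretend 64-bit support is unavailable
    std::fprintf(stderr, "error: immediate requires a 64-bit CPU feature\n");
    return true;                 // true == failure, as in the patch
  }
  std::printf("expanding li for %lld\n", Imm);
  return false;                  // false == success
}

static bool processInstruction(long long Imm) {
  // Mirrors `return expandInstruction(Inst, IDLoc, Instructions);`:
  // the expansion's result becomes processInstruction's result.
  return expandLoadImm(Imm);
}

int main() { return processInstruction(0x1234) ? 1 : 0; }
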
@@ -930,7 +1009,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, } // if load/store if (needsExpansion(Inst)) - expandInstruction(Inst, IDLoc, Instructions); + return expandInstruction(Inst, IDLoc, Instructions); else Instructions.push_back(Inst); @@ -943,17 +1022,27 @@ bool MipsAsmParser::needsExpansion(MCInst &Inst) { case Mips::LoadImm32Reg: case Mips::LoadAddr32Imm: case Mips::LoadAddr32Reg: + case Mips::LoadImm64Reg: return true; default: return false; } } -void MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc, +bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { switch (Inst.getOpcode()) { + default: + assert(0 && "unimplemented expansion"); + return true; case Mips::LoadImm32Reg: return expandLoadImm(Inst, IDLoc, Instructions); + case Mips::LoadImm64Reg: + if (!isGP64bit()) { + Error(IDLoc, "instruction requires a CPU feature not currently enabled"); + return true; + } + return expandLoadImm(Inst, IDLoc, Instructions); case Mips::LoadAddr32Imm: return expandLoadAddressImm(Inst, IDLoc, Instructions); case Mips::LoadAddr32Reg: @@ -961,7 +1050,31 @@ void MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc, } } -void MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc, +namespace { +template <int Shift, bool PerformShift> +void createShiftOr(int64_t Value, unsigned RegNo, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + if (PerformShift) { + tmpInst.setOpcode(Mips::DSLL); + tmpInst.addOperand(MCOperand::CreateReg(RegNo)); + tmpInst.addOperand(MCOperand::CreateReg(RegNo)); + tmpInst.addOperand(MCOperand::CreateImm(16)); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); + tmpInst.clear(); + } + tmpInst.setOpcode(Mips::ORi); + tmpInst.addOperand(MCOperand::CreateReg(RegNo)); + tmpInst.addOperand(MCOperand::CreateReg(RegNo)); + tmpInst.addOperand( + MCOperand::CreateImm(((Value & (0xffffLL << Shift)) >> Shift))); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} +} + +bool MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { MCInst tmpInst; const MCOperand &ImmOp = Inst.getOperand(1); @@ -969,8 +1082,10 @@ void MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc, const MCOperand &RegOp = Inst.getOperand(0); assert(RegOp.isReg() && "expected register operand kind"); - int ImmValue = ImmOp.getImm(); + int64_t ImmValue = ImmOp.getImm(); tmpInst.setLoc(IDLoc); + // FIXME: gas has a special case for values that are 000...1111, which + // becomes a li -1 and then a dsrl if (0 <= ImmValue && ImmValue <= 65535) { // For 0 <= j <= 65535. // li d,j => ori d,$zero,j @@ -987,25 +1102,76 @@ void MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc, tmpInst.addOperand(MCOperand::CreateReg(Mips::ZERO)); tmpInst.addOperand(MCOperand::CreateImm(ImmValue)); Instructions.push_back(tmpInst); - } else { - // For any other value of j that is representable as a 32-bit integer. 
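// A hedged aside: the added branches below materialize 32-, 48- and 64-bit
// immediates as a lui followed by ori/dsll pairs via createShiftOr. This is
// a standalone model of the arithmetic only (plain C++, no MCInst; emitOri
// and emitDsll16 are illustrative placeholders for the instruction building):

#include <cstdint>
#include <cstdio>

static void emitOri(uint16_t Chunk) { std::printf("ori  d, d, 0x%04x\n", Chunk); }
static void emitDsll16()            { std::printf("dsll d, d, 16\n"); }

// Equivalent of createShiftOr<Shift, PerformShift>: optionally shift the
// partial result left by 16 bits, then OR in the next 16-bit chunk of Value.
template <int Shift, bool PerformShift> static void shiftOr(uint64_t Value) {
  if (PerformShift)
    emitDsll16();
  emitOri(static_cast<uint16_t>((Value >> Shift) & 0xffff));
}

int main() {
  uint64_t Value = 0x0000123456789abcULL; // fits in 48 bits
  std::printf("lui  d, 0x%04x\n", unsigned((Value >> 32) & 0xffff));
  shiftOr<16, false>(Value); // bits 31..16; no shift needed right after lui
  shiftOr<0, true>(Value);   // dsll 16, then bits 15..0
  return 0;
}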
+ } else if ((ImmValue & 0xffffffff) == ImmValue) { + // For any value of j that is representable as a 32-bit integer, create + // a sequence of: // li d,j => lui d,hi16(j) // ori d,d,lo16(j) tmpInst.setOpcode(Mips::LUi); tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg())); tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16)); Instructions.push_back(tmpInst); - tmpInst.clear(); - tmpInst.setOpcode(Mips::ORi); + createShiftOr<0, false>(ImmValue, RegOp.getReg(), IDLoc, Instructions); + } else if ((ImmValue & (0xffffLL << 48)) == 0) { + if (!isGP64bit()) { + Error(IDLoc, "instruction requires a CPU feature not currently enabled"); + return true; + } + + // <------- lo32 ------> + // <------- hi32 ------> + // <- hi16 -> <- lo16 -> + // _________________________________ + // | | | | + // | 16-bytes | 16-bytes | 16-bytes | + // |__________|__________|__________| + // + // For any value of j that is representable as a 48-bit integer, create + // a sequence of: + // li d,j => lui d,hi16(j) + // ori d,d,hi16(lo32(j)) + // dsll d,d,16 + // ori d,d,lo16(lo32(j)) + tmpInst.setOpcode(Mips::LUi); tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg())); + tmpInst.addOperand( + MCOperand::CreateImm((ImmValue & (0xffffLL << 32)) >> 32)); + Instructions.push_back(tmpInst); + createShiftOr<16, false>(ImmValue, RegOp.getReg(), IDLoc, Instructions); + createShiftOr<0, true>(ImmValue, RegOp.getReg(), IDLoc, Instructions); + } else { + if (!isGP64bit()) { + Error(IDLoc, "instruction requires a CPU feature not currently enabled"); + return true; + } + + // <------- hi32 ------> <------- lo32 ------> + // <- hi16 -> <- lo16 -> + // ___________________________________________ + // | | | | | + // | 16-bytes | 16-bytes | 16-bytes | 16-bytes | + // |__________|__________|__________|__________| + // + // For any value of j that isn't representable as a 48-bit integer. + // li d,j => lui d,hi16(j) + // ori d,d,lo16(hi32(j)) + // dsll d,d,16 + // ori d,d,hi16(lo32(j)) + // dsll d,d,16 + // ori d,d,lo16(lo32(j)) + tmpInst.setOpcode(Mips::LUi); tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg())); - tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff)); - tmpInst.setLoc(IDLoc); + tmpInst.addOperand( + MCOperand::CreateImm((ImmValue & (0xffffLL << 48)) >> 48)); Instructions.push_back(tmpInst); + createShiftOr<32, false>(ImmValue, RegOp.getReg(), IDLoc, Instructions); + createShiftOr<16, true>(ImmValue, RegOp.getReg(), IDLoc, Instructions); + createShiftOr<0, true>(ImmValue, RegOp.getReg(), IDLoc, Instructions); } + return false; } -void +bool MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { MCInst tmpInst; @@ -1046,9 +1212,10 @@ MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc, tmpInst.addOperand(MCOperand::CreateReg(SrcRegOp.getReg())); Instructions.push_back(tmpInst); } + return false; } -void +bool MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { MCInst tmpInst; @@ -1080,6 +1247,7 @@ MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc, tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff)); Instructions.push_back(tmpInst); } + return false; } void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, @@ -1090,8 +1258,6 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, unsigned ImmOffset, HiOffset, LoOffset; const MCExpr *ExprOffset; unsigned TmpRegNum; - unsigned AtRegNum = getReg( - (isGP64()) ? 
Mips::GPR64RegClassID : Mips::GPR32RegClassID, getATReg()); // 1st operand is either the source or destination register. assert(Inst.getOperand(0).isReg() && "expected register operand kind"); unsigned RegOpNum = Inst.getOperand(0).getReg(); @@ -1111,10 +1277,46 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, ExprOffset = Inst.getOperand(2).getExpr(); // All instructions will have the same location. TempInst.setLoc(IDLoc); - // 1st instruction in expansion is LUi. For load instruction we can use - // the dst register as a temporary if base and dst are different, - // but for stores we must use $at. - TmpRegNum = (isLoad && (BaseRegNum != RegOpNum)) ? RegOpNum : AtRegNum; + // These are some of the types of expansions we perform here: + // 1) lw $8, sym => lui $8, %hi(sym) + // lw $8, %lo(sym)($8) + // 2) lw $8, offset($9) => lui $8, %hi(offset) + // add $8, $8, $9 + // lw $8, %lo(offset)($9) + // 3) lw $8, offset($8) => lui $at, %hi(offset) + // add $at, $at, $8 + // lw $8, %lo(offset)($at) + // 4) sw $8, sym => lui $at, %hi(sym) + // sw $8, %lo(sym)($at) + // 5) sw $8, offset($8) => lui $at, %hi(offset) + // add $at, $at, $8 + // sw $8, %lo(offset)($at) + // 6) ldc1 $f0, sym => lui $at, %hi(sym) + // ldc1 $f0, %lo(sym)($at) + // + // For load instructions we can use the destination register as a temporary + // if base and dst are different (examples 1 and 2) and if the base register + // is general purpose otherwise we must use $at (example 6) and error if it's + // not available. For stores we must use $at (examples 4 and 5) because we + // must not clobber the source register setting up the offset. + const MCInstrDesc &Desc = getInstDesc(Inst.getOpcode()); + int16_t RegClassOp0 = Desc.OpInfo[0].RegClass; + unsigned RegClassIDOp0 = + getContext().getRegisterInfo()->getRegClass(RegClassOp0).getID(); + bool IsGPR = (RegClassIDOp0 == Mips::GPR32RegClassID) || + (RegClassIDOp0 == Mips::GPR64RegClassID); + if (isLoad && IsGPR && (BaseRegNum != RegOpNum)) + TmpRegNum = RegOpNum; + else { + int AT = getATReg(IDLoc); + // At this point we need AT to perform the expansions and we exit if it is + // not available. + if (!AT) + return; + TmpRegNum = getReg( + (isGP64bit()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, AT); + } + TempInst.setOpcode(Mips::LUi); TempInst.addOperand(MCOperand::CreateReg(TmpRegNum)); if (isImmOpnd) @@ -1164,10 +1366,24 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, TempInst.clear(); } -bool MipsAsmParser::MatchAndEmitInstruction( - SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand *> &Operands, MCStreamer &Out, - unsigned &ErrorInfo, bool MatchingInlineAsm) { +unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) { + // As described by the Mips32r2 spec, the registers Rd and Rs for + // jalr.hb must be different. 
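// A hedged aside: checkTargetMatchPredicate is the MCTargetAsmParser hook
// for constraints the generated matcher tables cannot express, here the
// MIPS32r2 rule that jalr.hb needs distinct rd and rs. A minimal standalone
// model of the check (the enum values are illustrative, not LLVM's):

#include <cassert>

enum MatchResult { Match_Success, Match_RequiresDifferentSrcAndDst };

// Reject a hazard-barrier jump whose destination equals its source.
static MatchResult checkJalrHb(unsigned Rd, unsigned Rs) {
  return Rd == Rs ? Match_RequiresDifferentSrcAndDst : Match_Success;
}

int main() {
  assert(checkJalrHb(31, 25) == Match_Success);
  assert(checkJalrHb(25, 25) == Match_RequiresDifferentSrcAndDst);
  return 0;
}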
+ unsigned Opcode = Inst.getOpcode(); + + if (Opcode == Mips::JALR_HB && + (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg())) + return Match_RequiresDifferentSrcAndDst; + + return Match_Success; +} + +bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) { + MCInst Inst; SmallVector<MCInst, 8> Instructions; unsigned MatchResult = @@ -1192,7 +1408,7 @@ bool MipsAsmParser::MatchAndEmitInstruction( if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); - ErrorLoc = ((MipsOperand *)Operands[ErrorInfo])->getStartLoc(); + ErrorLoc = ((MipsOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } @@ -1201,6 +1417,8 @@ bool MipsAsmParser::MatchAndEmitInstruction( } case Match_MnemonicFail: return Error(IDLoc, "invalid instruction"); + case Match_RequiresDifferentSrcAndDst: + return Error(IDLoc, "source and destination must be different"); } return true; } @@ -1254,7 +1472,7 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) { .Case("t9", 25) .Default(-1); - if (isN32() || isN64()) { + if (isABI_N32() || isABI_N64()) { // Although SGI documentation just cuts out t0-t3 for n32/n64, // GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7 // We are supporting both cases, so for t0-t3 we'll just push them to t4-t7. @@ -1354,10 +1572,11 @@ bool MipsAssemblerOptions::setATReg(unsigned Reg) { return true; } -int MipsAsmParser::getATReg() { +int MipsAsmParser::getATReg(SMLoc Loc) { int AT = Options.getATRegNum(); if (AT == 0) - TokError("Pseudo instruction requires $at, which is not available"); + reportParseError(Loc, + "Pseudo instruction requires $at, which is not available"); return AT; } @@ -1366,7 +1585,7 @@ unsigned MipsAsmParser::getReg(int RC, int RegNo) { } unsigned MipsAsmParser::getGPR(int RegNo) { - return getReg(isGP64() ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, + return getReg(isGP64bit() ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, RegNo); } @@ -1378,9 +1597,7 @@ int MipsAsmParser::matchRegisterByNumber(unsigned RegNum, unsigned RegClass) { return getReg(RegClass, RegNum); } -bool -MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands, - StringRef Mnemonic) { +bool MipsAsmParser::ParseOperand(OperandVector &Operands, StringRef Mnemonic) { DEBUG(dbgs() << "ParseOperand\n"); // Check if the current operand has a custom associated parser, if so, try to @@ -1431,6 +1648,7 @@ MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands, case AsmToken::Minus: case AsmToken::Plus: case AsmToken::Integer: + case AsmToken::Tilde: case AsmToken::String: { DEBUG(dbgs() << ".. 
generic integer\n"); OperandMatchResultTy ResTy = ParseImm(Operands); @@ -1578,11 +1796,11 @@ bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) { bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { - SmallVector<MCParsedAsmOperand *, 1> Operands; + SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands; OperandMatchResultTy ResTy = ParseAnyRegister(Operands); if (ResTy == MatchOperand_Success) { assert(Operands.size() == 1); - MipsOperand &Operand = *static_cast<MipsOperand *>(Operands.front()); + MipsOperand &Operand = static_cast<MipsOperand &>(*Operands.front()); StartLoc = Operand.getStartLoc(); EndLoc = Operand.getEndLoc(); @@ -1592,11 +1810,9 @@ bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, // register is a parse error. if (Operand.isGPRAsmReg()) { // Resolve to GPR32 or GPR64 appropriately. - RegNo = isGP64() ? Operand.getGPR64Reg() : Operand.getGPR32Reg(); + RegNo = isGP64bit() ? Operand.getGPR64Reg() : Operand.getGPR32Reg(); } - delete &Operand; - return (RegNo == (unsigned)-1); } @@ -1632,8 +1848,8 @@ bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) { return Result; } -MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( - SmallVectorImpl<MCParsedAsmOperand *> &Operands) { +MipsAsmParser::OperandMatchResultTy +MipsAsmParser::parseMemOperand(OperandVector &Operands) { DEBUG(dbgs() << "parseMemOperand\n"); const MCExpr *IdVal = nullptr; SMLoc S; @@ -1653,8 +1869,8 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( const AsmToken &Tok = Parser.getTok(); // Get the next token. if (Tok.isNot(AsmToken::LParen)) { - MipsOperand *Mnemonic = static_cast<MipsOperand *>(Operands[0]); - if (Mnemonic->getToken() == "la") { + MipsOperand &Mnemonic = static_cast<MipsOperand &>(*Operands[0]); + if (Mnemonic.getToken() == "la") { SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this)); @@ -1666,9 +1882,10 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( // Zero register assumed, add a memory operand with ZERO as its base. // "Base" will be managed by k_Memory. - MipsOperand *Base = MipsOperand::CreateGPRReg( - 0, getContext().getRegisterInfo(), S, E, *this); - Operands.push_back(MipsOperand::CreateMem(Base, IdVal, S, E, *this)); + auto Base = MipsOperand::CreateGPRReg(0, getContext().getRegisterInfo(), + S, E, *this); + Operands.push_back( + MipsOperand::CreateMem(std::move(Base), IdVal, S, E, *this)); return MatchOperand_Success; } Error(Parser.getTok().getLoc(), "'(' expected"); @@ -1695,7 +1912,8 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( IdVal = MCConstantExpr::Create(0, getContext()); // Replace the register operand with the memory operand. - MipsOperand *op = static_cast<MipsOperand *>(Operands.back()); + std::unique_ptr<MipsOperand> op( + static_cast<MipsOperand *>(Operands.back().release())); // Remove the register from the operands. // "op" will be managed by k_Memory. 
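// A hedged aside: just below, the parsed base register leaves the operand
// vector as a std::unique_ptr and is release()d into the raw Mem.Base
// pointer, whose lifetime the k_Memory operand then manages. A minimal
// standalone model of that handoff (the types here are illustrative):

#include <memory>

struct Operand { int Index = 0; };

struct MemOperand {
  Operand *Base; // owned here, mirroring how k_Memory owns Mem.Base
  explicit MemOperand(std::unique_ptr<Operand> B) : Base(B.release()) {}
  ~MemOperand() { delete Base; }
};

int main() {
  std::unique_ptr<Operand> Base(new Operand());
  MemOperand Mem(std::move(Base)); // ownership moved; Base is now null
  return Mem.Base ? 0 : 1;
}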
Operands.pop_back(); @@ -1709,12 +1927,11 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( getContext()); } - Operands.push_back(MipsOperand::CreateMem(op, IdVal, S, E, *this)); + Operands.push_back(MipsOperand::CreateMem(std::move(op), IdVal, S, E, *this)); return MatchOperand_Success; } -bool MipsAsmParser::searchSymbolAlias( - SmallVectorImpl<MCParsedAsmOperand *> &Operands) { +bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) { MCSymbol *Sym = getContext().LookupSymbol(Parser.getTok().getIdentifier()); if (Sym) { @@ -1740,9 +1957,8 @@ bool MipsAsmParser::searchSymbolAlias( } else if (Expr->getKind() == MCExpr::Constant) { Parser.Lex(); const MCConstantExpr *Const = static_cast<const MCConstantExpr *>(Expr); - MipsOperand *op = - MipsOperand::CreateImm(Const, S, Parser.getTok().getLoc(), *this); - Operands.push_back(op); + Operands.push_back( + MipsOperand::CreateImm(Const, S, Parser.getTok().getLoc(), *this)); return true; } } @@ -1750,9 +1966,9 @@ bool MipsAsmParser::searchSymbolAlias( } MipsAsmParser::OperandMatchResultTy -MipsAsmParser::MatchAnyRegisterNameWithoutDollar( - SmallVectorImpl<MCParsedAsmOperand *> &Operands, StringRef Identifier, - SMLoc S) { +MipsAsmParser::MatchAnyRegisterNameWithoutDollar(OperandVector &Operands, + StringRef Identifier, + SMLoc S) { int Index = matchCPURegisterName(Identifier); if (Index != -1) { Operands.push_back(MipsOperand::CreateGPRReg( @@ -1799,8 +2015,7 @@ MipsAsmParser::MatchAnyRegisterNameWithoutDollar( } MipsAsmParser::OperandMatchResultTy -MipsAsmParser::MatchAnyRegisterWithoutDollar( - SmallVectorImpl<MCParsedAsmOperand *> &Operands, SMLoc S) { +MipsAsmParser::MatchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S) { auto Token = Parser.getLexer().peekTok(false); if (Token.is(AsmToken::Identifier)) { @@ -1822,8 +2037,8 @@ MipsAsmParser::MatchAnyRegisterWithoutDollar( return MatchOperand_NoMatch; } -MipsAsmParser::OperandMatchResultTy MipsAsmParser::ParseAnyRegister( - SmallVectorImpl<MCParsedAsmOperand *> &Operands) { +MipsAsmParser::OperandMatchResultTy +MipsAsmParser::ParseAnyRegister(OperandVector &Operands) { DEBUG(dbgs() << "ParseAnyRegister\n"); auto Token = Parser.getTok(); @@ -1850,7 +2065,7 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::ParseAnyRegister( } MipsAsmParser::OperandMatchResultTy -MipsAsmParser::ParseImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) { +MipsAsmParser::ParseImm(OperandVector &Operands) { switch (getLexer().getKind()) { default: return MatchOperand_NoMatch; @@ -1858,6 +2073,7 @@ MipsAsmParser::ParseImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) { case AsmToken::Minus: case AsmToken::Plus: case AsmToken::Integer: + case AsmToken::Tilde: case AsmToken::String: break; } @@ -1872,8 +2088,8 @@ MipsAsmParser::ParseImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) { return MatchOperand_Success; } -MipsAsmParser::OperandMatchResultTy MipsAsmParser::ParseJumpTarget( - SmallVectorImpl<MCParsedAsmOperand *> &Operands) { +MipsAsmParser::OperandMatchResultTy +MipsAsmParser::ParseJumpTarget(OperandVector &Operands) { DEBUG(dbgs() << "ParseJumpTarget\n"); SMLoc S = getLexer().getLoc(); @@ -1899,7 +2115,7 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::ParseJumpTarget( } MipsAsmParser::OperandMatchResultTy -MipsAsmParser::parseInvNum(SmallVectorImpl<MCParsedAsmOperand *> &Operands) { +MipsAsmParser::parseInvNum(OperandVector &Operands) { const MCExpr *IdVal; // If the first token is '$' we may have register operand. 
if (Parser.getTok().is(AsmToken::Dollar)) @@ -1917,7 +2133,7 @@ MipsAsmParser::parseInvNum(SmallVectorImpl<MCParsedAsmOperand *> &Operands) { } MipsAsmParser::OperandMatchResultTy -MipsAsmParser::ParseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) { +MipsAsmParser::ParseLSAImm(OperandVector &Operands) { switch (getLexer().getKind()) { default: return MatchOperand_NoMatch; @@ -1996,8 +2212,7 @@ MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) { /// ::= '(', register, ')' /// handle it before we iterate so we don't get tripped up by the lack of /// a comma. -bool MipsAsmParser::ParseParenSuffix( - StringRef Name, SmallVectorImpl<MCParsedAsmOperand *> &Operands) { +bool MipsAsmParser::ParseParenSuffix(StringRef Name, OperandVector &Operands) { if (getLexer().is(AsmToken::LParen)) { Operands.push_back( MipsOperand::CreateToken("(", getLexer().getLoc(), *this)); @@ -2025,8 +2240,8 @@ bool MipsAsmParser::ParseParenSuffix( /// ::= '[', integer, ']' /// handle it before we iterate so we don't get tripped up by the lack of /// a comma. -bool MipsAsmParser::ParseBracketSuffix( - StringRef Name, SmallVectorImpl<MCParsedAsmOperand *> &Operands) { +bool MipsAsmParser::ParseBracketSuffix(StringRef Name, + OperandVector &Operands) { if (getLexer().is(AsmToken::LBrac)) { Operands.push_back( MipsOperand::CreateToken("[", getLexer().getLoc(), *this)); @@ -2048,10 +2263,12 @@ bool MipsAsmParser::ParseBracketSuffix( return false; } -bool MipsAsmParser::ParseInstruction( - ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand *> &Operands) { +bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { DEBUG(dbgs() << "ParseInstruction\n"); + // We have reached first instruction, module directive after + // this is forbidden. + getTargetStreamer().setCanHaveModuleDir(false); // Check if we have valid mnemonic if (!mnemonicIsValid(Name, 0)) { Parser.eatToEndOfStatement(); @@ -2098,13 +2315,13 @@ bool MipsAsmParser::ParseInstruction( return false; } -bool MipsAsmParser::reportParseError(StringRef ErrorMsg) { +bool MipsAsmParser::reportParseError(Twine ErrorMsg) { SMLoc Loc = getLexer().getLoc(); Parser.eatToEndOfStatement(); return Error(Loc, ErrorMsg); } -bool MipsAsmParser::reportParseError(SMLoc Loc, StringRef ErrorMsg) { +bool MipsAsmParser::reportParseError(SMLoc Loc, Twine ErrorMsg) { return Error(Loc, ErrorMsg); } @@ -2238,6 +2455,32 @@ bool MipsAsmParser::parseSetNoMips16Directive() { return false; } +bool MipsAsmParser::parseSetFpDirective() { + MipsABIFlagsSection::FpABIKind FpAbiVal; + // Line can be: .set fp=32 + // .set fp=xx + // .set fp=64 + Parser.Lex(); // Eat fp token + AsmToken Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Equal)) { + reportParseError("unexpected token in statement"); + return false; + } + Parser.Lex(); // Eat '=' token. + Tok = Parser.getTok(); + + if (!parseFpABIValue(FpAbiVal, ".set")) + return false; + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token in statement"); + return false; + } + getTargetStreamer().emitDirectiveSetFp(FpAbiVal); + Parser.Lex(); // Consume the EndOfStatement. 
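// A hedged aside: a toy scanner for the ".set fp=<value>" shape handled in
// parseSetFpDirective above; "fp" must be followed by '=', a value, then the
// end of the statement. Illustrative only; the real parser consumes
// AsmTokens, not raw strings.

#include <sstream>
#include <string>

static bool parseSetFp(const std::string &Line, std::string &Value) {
  std::istringstream In(Line);
  std::string Set, Rest;
  In >> Set >> Rest;                        // expect ".set" then "fp=<value>"
  if (Set != ".set" || Rest.compare(0, 3, "fp=") != 0)
    return false;                           // "unexpected token in statement"
  Value = Rest.substr(3);
  return !Value.empty() && !(In >> Rest);   // nothing may trail the value
}

int main() {
  std::string V;
  return parseSetFp(".set fp=64", V) && V == "64" ? 0 : 1;
}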
+ return false; +} + bool MipsAsmParser::parseSetAssignment() { StringRef Name; const MCExpr *Value; @@ -2296,25 +2539,6 @@ bool MipsAsmParser::parseSetFeature(uint64_t Feature) { return false; } -bool MipsAsmParser::parseRegister(unsigned &RegNum) { - if (!getLexer().is(AsmToken::Dollar)) - return false; - - Parser.Lex(); - - const AsmToken &Reg = Parser.getTok(); - if (Reg.is(AsmToken::Identifier)) { - RegNum = matchCPURegisterName(Reg.getIdentifier()); - } else if (Reg.is(AsmToken::Integer)) { - RegNum = Reg.getIntVal(); - } else { - return false; - } - - Parser.Lex(); - return true; -} - bool MipsAsmParser::eatComma(StringRef ErrorStr) { if (getLexer().isNot(AsmToken::Comma)) { SMLoc Loc = getLexer().getLoc(); @@ -2332,21 +2556,20 @@ bool MipsAsmParser::parseDirectiveCPLoad(SMLoc Loc) { // FIXME: Warn if cpload is used in Mips16 mode. - SmallVector<MCParsedAsmOperand *, 1> Reg; + SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Reg; OperandMatchResultTy ResTy = ParseAnyRegister(Reg); if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) { reportParseError("expected register containing function address"); return false; } - MipsOperand *RegOpnd = static_cast<MipsOperand *>(Reg[0]); - if (!RegOpnd->isGPRAsmReg()) { - reportParseError(RegOpnd->getStartLoc(), "invalid register"); + MipsOperand &RegOpnd = static_cast<MipsOperand &>(*Reg[0]); + if (!RegOpnd.isGPRAsmReg()) { + reportParseError(RegOpnd.getStartLoc(), "invalid register"); return false; } - getTargetStreamer().emitDirectiveCpload(RegOpnd->getGPR32Reg()); - delete RegOpnd; + getTargetStreamer().emitDirectiveCpload(RegOpnd.getGPR32Reg()); return false; } @@ -2355,23 +2578,48 @@ bool MipsAsmParser::parseDirectiveCPSetup() { unsigned Save; bool SaveIsReg = true; - if (!parseRegister(FuncReg)) - return reportParseError("expected register containing function address"); - FuncReg = getGPR(FuncReg); + SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> TmpReg; + OperandMatchResultTy ResTy = ParseAnyRegister(TmpReg); + if (ResTy == MatchOperand_NoMatch) { + reportParseError("expected register containing function address"); + Parser.eatToEndOfStatement(); + return false; + } + + MipsOperand &FuncRegOpnd = static_cast<MipsOperand &>(*TmpReg[0]); + if (!FuncRegOpnd.isGPRAsmReg()) { + reportParseError(FuncRegOpnd.getStartLoc(), "invalid register"); + Parser.eatToEndOfStatement(); + return false; + } + + FuncReg = FuncRegOpnd.getGPR32Reg(); + TmpReg.clear(); if (!eatComma("expected comma parsing directive")) return true; - if (!parseRegister(Save)) { + ResTy = ParseAnyRegister(TmpReg); + if (ResTy == MatchOperand_NoMatch) { const AsmToken &Tok = Parser.getTok(); if (Tok.is(AsmToken::Integer)) { Save = Tok.getIntVal(); SaveIsReg = false; Parser.Lex(); - } else - return reportParseError("expected save register or stack offset"); - } else - Save = getGPR(Save); + } else { + reportParseError("expected save register or stack offset"); + Parser.eatToEndOfStatement(); + return false; + } + } else { + MipsOperand &SaveOpnd = static_cast<MipsOperand &>(*TmpReg[0]); + if (!SaveOpnd.isGPRAsmReg()) { + reportParseError(SaveOpnd.getStartLoc(), "invalid register"); + Parser.eatToEndOfStatement(); + return false; + } + Save = SaveOpnd.getGPR32Reg(); + } if (!eatComma("expected comma parsing directive")) return true; @@ -2414,6 +2662,8 @@ bool MipsAsmParser::parseDirectiveSet() { return parseSetNoAtDirective(); } else if (Tok.getString() == "at") { return parseSetAtDirective(); + } else if (Tok.getString() == "fp") { + return 
parseSetFpDirective(); } else if (Tok.getString() == "reorder") { return parseSetReorderDirective(); } else if (Tok.getString() == "noreorder") { @@ -2546,6 +2796,134 @@ bool MipsAsmParser::parseDirectiveOption() { return false; } +/// parseDirectiveModule +/// ::= .module oddspreg +/// ::= .module nooddspreg +/// ::= .module fp=value +bool MipsAsmParser::parseDirectiveModule() { + MCAsmLexer &Lexer = getLexer(); + SMLoc L = Lexer.getLoc(); + + if (!getTargetStreamer().getCanHaveModuleDir()) { + // TODO : get a better message. + reportParseError(".module directive must appear before any code"); + return false; + } + + if (Lexer.is(AsmToken::Identifier)) { + StringRef Option = Parser.getTok().getString(); + Parser.Lex(); + + if (Option == "oddspreg") { + getTargetStreamer().emitDirectiveModuleOddSPReg(true, isABI_O32()); + clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("Expected end of statement"); + return false; + } + + return false; + } else if (Option == "nooddspreg") { + if (!isABI_O32()) { + Error(L, "'.module nooddspreg' requires the O32 ABI"); + return false; + } + + getTargetStreamer().emitDirectiveModuleOddSPReg(false, isABI_O32()); + setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("Expected end of statement"); + return false; + } + + return false; + } else if (Option == "fp") { + return parseDirectiveModuleFP(); + } + + return Error(L, "'" + Twine(Option) + "' is not a valid .module option."); + } + + return false; +} + +/// parseDirectiveModuleFP +/// ::= =32 +/// ::= =xx +/// ::= =64 +bool MipsAsmParser::parseDirectiveModuleFP() { + MCAsmLexer &Lexer = getLexer(); + + if (Lexer.isNot(AsmToken::Equal)) { + reportParseError("unexpected token in statement"); + return false; + } + Parser.Lex(); // Eat '=' token. + + MipsABIFlagsSection::FpABIKind FpABI; + if (!parseFpABIValue(FpABI, ".module")) + return false; + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token in statement"); + return false; + } + + // Emit appropriate flags. + getTargetStreamer().emitDirectiveModuleFP(FpABI, isABI_O32()); + Parser.Lex(); // Consume the EndOfStatement. 
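// A hedged aside: parseFpABIValue (below) only accepts fp=xx and fp=32 under
// the O32 ABI, while fp=64 passes everywhere. A standalone validity table as
// a sketch (FpABIKind mirrors the enum used by the patch):

#include <cstdio>

enum class FpABIKind { ANY, XX, S32, S64 };

// True if the requested FP ABI is legal for the current base ABI.
static bool fpAbiAllowed(FpABIKind K, bool IsO32) {
  switch (K) {
  case FpABIKind::XX:
  case FpABIKind::S32:
    return IsO32;   // ".module fp=xx" and "fp=32" require the O32 ABI
  case FpABIKind::S64:
  case FpABIKind::ANY:
    return true;
  }
  return false;
}

int main() {
  std::printf("fp=xx under N64: %s\n",
              fpAbiAllowed(FpABIKind::XX, false) ? "ok" : "rejected");
  return 0;
}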
+ return false; +} + +bool MipsAsmParser::parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI, + StringRef Directive) { + MCAsmLexer &Lexer = getLexer(); + + if (Lexer.is(AsmToken::Identifier)) { + StringRef Value = Parser.getTok().getString(); + Parser.Lex(); + + if (Value != "xx") { + reportParseError("unsupported value, expected 'xx', '32' or '64'"); + return false; + } + + if (!isABI_O32()) { + reportParseError("'" + Directive + " fp=xx' requires the O32 ABI"); + return false; + } + + FpABI = MipsABIFlagsSection::FpABIKind::XX; + return true; + } + + if (Lexer.is(AsmToken::Integer)) { + unsigned Value = Parser.getTok().getIntVal(); + Parser.Lex(); + + if (Value != 32 && Value != 64) { + reportParseError("unsupported value, expected 'xx', '32' or '64'"); + return false; + } + + if (Value == 32) { + if (!isABI_O32()) { + reportParseError("'" + Directive + " fp=32' requires the O32 ABI"); + return false; + } + + FpABI = MipsABIFlagsSection::FpABIKind::S32; + } else + FpABI = MipsABIFlagsSection::FpABIKind::S64; + + return true; + } + + return false; +} + bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); @@ -2624,6 +3002,9 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".cpsetup") return parseDirectiveCPSetup(); + if (IDVal == ".module") + return parseDirectiveModule(); + return true; } diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index 95670aa4440c..902b87759dc0 100644 --- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -57,16 +57,24 @@ class MipsDisassembler : public MipsDisassemblerBase { public: /// Constructor - Initializes the disassembler. /// - MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, - bool bigEndian) : - MipsDisassemblerBase(STI, Ctx, bigEndian) { - IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips; - } + MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool bigEndian) + : MipsDisassemblerBase(STI, Ctx, bigEndian) { + IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips; + } - bool isMips32r6() const { + bool hasMips3() const { return STI.getFeatureBits() & Mips::FeatureMips3; } + bool hasMips32() const { return STI.getFeatureBits() & Mips::FeatureMips32; } + bool hasMips32r6() const { return STI.getFeatureBits() & Mips::FeatureMips32r6; } + bool isGP64() const { return STI.getFeatureBits() & Mips::FeatureGP64Bit; } + + bool hasCOP3() const { + // Only present in MIPS-I and MIPS-II + return !hasMips32() && !hasMips3(); + } + /// getInstruction - See MCDisassembler. 
DecodeStatus getInstruction(MCInst &instr, uint64_t &size, @@ -149,6 +157,10 @@ static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -260,6 +272,11 @@ static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeSimm16(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -285,6 +302,9 @@ static DecodeStatus DecodeExtSize(MCInst &Inst, static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); + /// INSVE_[BHWD] have an implicit operand that the generated decoder doesn't /// handle. template <typename InsnType> @@ -316,6 +336,11 @@ static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, const void *Decoder); +template <typename InsnType> +static DecodeStatus +DecodeBlezGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, + const void *Decoder); + namespace llvm { extern Target TheMipselTarget, TheMipsTarget, TheMips64Target, TheMips64elTarget; @@ -511,6 +536,7 @@ static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, InsnType Rs = fieldFromInstruction(insn, 21, 5); InsnType Rt = fieldFromInstruction(insn, 16, 5); InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2; + bool HasRs = false; if (Rt == 0) return MCDisassembler::Fail; @@ -518,8 +544,14 @@ static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, MI.setOpcode(Mips::BLEZC); else if (Rs == Rt) MI.setOpcode(Mips::BGEZC); - else - return MCDisassembler::Fail; // FIXME: BGEC is not implemented yet. + else { + HasRs = true; + MI.setOpcode(Mips::BGEC); + } + + if (HasRs) + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rs))); MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, Rt))); @@ -544,6 +576,8 @@ static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, // BLTZC if rs == rt && rt != 0 // BLTC if rs != rt && rs != 0 && rt != 0 + bool HasRs = false; + InsnType Rs = fieldFromInstruction(insn, 21, 5); InsnType Rt = fieldFromInstruction(insn, 16, 5); InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2; @@ -554,8 +588,14 @@ static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, MI.setOpcode(Mips::BGTZC); else if (Rs == Rt) MI.setOpcode(Mips::BLTZC); - else - return MCDisassembler::Fail; // FIXME: BLTC is not implemented yet. 
+ else { + MI.setOpcode(Mips::BLTC); + HasRs = true; + } + + if (HasRs) + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rs))); MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, Rt))); @@ -595,8 +635,11 @@ static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, } else if (Rs == Rt) { MI.setOpcode(Mips::BLTZALC); HasRs = true; - } else - return MCDisassembler::Fail; // BLTUC not implemented yet + } else { + MI.setOpcode(Mips::BLTUC); + HasRs = true; + HasRt = true; + } if (HasRs) MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, @@ -611,6 +654,48 @@ static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, return MCDisassembler::Success; } +template <typename InsnType> +static DecodeStatus DecodeBlezGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const void *Decoder) { + // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled + // (otherwise we would have matched the BLEZL instruction from the earlier + // ISA's instead). + // + // We have: + // 0b000110 sssss ttttt iiiiiiiiiiiiiiii + // Invalid if rs == 0 + // BLEZALC if rs == 0 && rt != 0 + // BGEZALC if rs == rt && rt != 0 + // BGEUC if rs != rt && rs != 0 && rt != 0 + + InsnType Rs = fieldFromInstruction(insn, 21, 5); + InsnType Rt = fieldFromInstruction(insn, 16, 5); + InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2; + bool HasRs = false; + + if (Rt == 0) + return MCDisassembler::Fail; + else if (Rs == 0) + MI.setOpcode(Mips::BLEZALC); + else if (Rs == Rt) + MI.setOpcode(Mips::BGEZALC); + else { + HasRs = true; + MI.setOpcode(Mips::BGEUC); + } + + if (HasRs) + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rs))); + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rt))); + + MI.addOperand(MCOperand::CreateImm(Imm)); + + return MCDisassembler::Success; +} + /// readInstruction - read four bytes from the MemoryObject /// and return 32 bit word sorted according to the given endianess static DecodeStatus readInstruction32(const MemoryObject ®ion, @@ -670,6 +755,7 @@ MipsDisassembler::getInstruction(MCInst &instr, return MCDisassembler::Fail; if (IsMicroMips) { + DEBUG(dbgs() << "Trying MicroMips32 table (32-bit opcodes):\n"); // Calling the auto-generated decoder function. Result = decodeInstruction(DecoderTableMicroMips32, instr, Insn, Address, this, STI); @@ -680,7 +766,28 @@ MipsDisassembler::getInstruction(MCInst &instr, return MCDisassembler::Fail; } - if (isMips32r6()) { + if (hasCOP3()) { + DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n"); + Result = + decodeInstruction(DecoderTableCOP3_32, instr, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + } + + if (hasMips32r6() && isGP64()) { + DEBUG(dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n"); + Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, instr, Insn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + } + + if (hasMips32r6()) { + DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n"); Result = decodeInstruction(DecoderTableMips32r6_64r632, instr, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -689,6 +796,7 @@ MipsDisassembler::getInstruction(MCInst &instr, } } + DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n"); // Calling the auto-generated decoder function. 
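// A hedged aside: getInstruction now walks several TableGen'erated decoder
// tables in priority order (microMIPS, COP3, MIPS32r6/64r6 with GPR64,
// MIPS32r6/64r6, then the base MIPS table), taking the first table that
// decodes the word. A standalone model of that fallback chain, with toy
// decoders standing in for the generated tables:

#include <cstdio>

enum DecodeStatus { Fail, Success };
typedef DecodeStatus (*DecoderTable)(unsigned Insn);

static DecodeStatus tryR6(unsigned Insn)   { return Insn == 0xd8 ? Success : Fail; }
static DecodeStatus tryBase(unsigned Insn) { return Insn != 0 ? Success : Fail; }

// First matching table wins, so more specific tables must come first.
static DecodeStatus decode(unsigned Insn) {
  const DecoderTable Tables[] = {tryR6, tryBase};
  for (DecoderTable T : Tables)
    if (T(Insn) == Success)
      return Success;
  return Fail;
}

int main() { return decode(0xd8) == Success ? 0 : 1; }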
Result = decodeInstruction(DecoderTableMips32, instr, Insn, Address, this, STI); @@ -840,6 +948,17 @@ static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + unsigned Reg = getReg(Decoder, Mips::FGRCCRegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Reg)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeMem(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -965,6 +1084,27 @@ static DecodeStatus DecodeFMem(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int64_t Offset = SignExtend64<9>((Insn >> 7) & 0x1ff); + unsigned Rt = fieldFromInstruction(Insn, 16, 5); + unsigned Base = fieldFromInstruction(Insn, 21, 5); + + Rt = getReg(Decoder, Mips::GPR32RegClassID, Rt); + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + if(Inst.getOpcode() == Mips::SC_R6 || Inst.getOpcode() == Mips::SCD_R6){ + Inst.addOperand(MCOperand::CreateReg(Rt)); + } + + Inst.addOperand(MCOperand::CreateReg(Rt)); + Inst.addOperand(MCOperand::CreateReg(Base)); + Inst.addOperand(MCOperand::CreateImm(Offset)); + + return MCDisassembler::Success; +} static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -1197,3 +1337,9 @@ static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, Inst.addOperand(MCOperand::CreateImm(SignExtend32<19>(Insn) << 2)); return MCDisassembler::Success; } + +static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(SignExtend32<18>(Insn) << 3)); + return MCDisassembler::Success; +} diff --git a/lib/Target/Mips/MCTargetDesc/Android.mk b/lib/Target/Mips/MCTargetDesc/Android.mk index 7ee11a1ace83..c8d18fc5009b 100644 --- a/lib/Target/Mips/MCTargetDesc/Android.mk +++ b/lib/Target/Mips/MCTargetDesc/Android.mk @@ -7,6 +7,7 @@ mips_mc_desc_TBLGEN_TABLES := \ MipsGenSubtargetInfo.inc mips_mc_desc_SRC_FILES := \ + MipsABIFlagsSection.cpp \ MipsAsmBackend.cpp \ MipsELFObjectWriter.cpp \ MipsELFStreamer.cpp \ diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt index d3e2fd75498c..c14ee355f80a 100644 --- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_library(LLVMMipsDesc + MipsABIFlagsSection.cpp MipsAsmBackend.cpp MipsELFObjectWriter.cpp MipsELFStreamer.cpp diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp new file mode 100644 index 000000000000..52d5dd3ea5c3 --- /dev/null +++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp @@ -0,0 +1,60 @@ +//===-- MipsABIFlagsSection.cpp - Mips ELF ABI Flags Section ---*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MipsABIFlagsSection.h" + +using namespace llvm; + +uint8_t MipsABIFlagsSection::getFpABIValue() { + switch (FpABI) { + case FpABIKind::ANY: + return Val_GNU_MIPS_ABI_FP_ANY; + case FpABIKind::XX: + return Val_GNU_MIPS_ABI_FP_XX; + case FpABIKind::S32: + return Val_GNU_MIPS_ABI_FP_DOUBLE; + case FpABIKind::S64: + if (Is32BitABI) + return OddSPReg ? Val_GNU_MIPS_ABI_FP_64 : Val_GNU_MIPS_ABI_FP_64A; + return Val_GNU_MIPS_ABI_FP_DOUBLE; + } + + llvm_unreachable("unexpected fp abi value"); +} + +StringRef MipsABIFlagsSection::getFpABIString(FpABIKind Value) { + switch (Value) { + case FpABIKind::XX: + return "xx"; + case FpABIKind::S32: + return "32"; + case FpABIKind::S64: + return "64"; + default: + llvm_unreachable("unsupported fp abi value"); + } +} + +namespace llvm { +MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) { + // Write out a Elf_Internal_ABIFlags_v0 struct + OS.EmitIntValue(ABIFlagsSection.getVersionValue(), 2); // version + OS.EmitIntValue(ABIFlagsSection.getISALevelValue(), 1); // isa_level + OS.EmitIntValue(ABIFlagsSection.getISARevisionValue(), 1); // isa_rev + OS.EmitIntValue(ABIFlagsSection.getGPRSizeValue(), 1); // gpr_size + OS.EmitIntValue(ABIFlagsSection.getCPR1SizeValue(), 1); // cpr1_size + OS.EmitIntValue(ABIFlagsSection.getCPR2SizeValue(), 1); // cpr2_size + OS.EmitIntValue(ABIFlagsSection.getFpABIValue(), 1); // fp_abi + OS.EmitIntValue(ABIFlagsSection.getISAExtensionSetValue(), 4); // isa_ext + OS.EmitIntValue(ABIFlagsSection.getASESetValue(), 4); // ases + OS.EmitIntValue(ABIFlagsSection.getFlags1Value(), 4); // flags1 + OS.EmitIntValue(ABIFlagsSection.getFlags2Value(), 4); // flags2 + return OS; +} +} diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h new file mode 100644 index 000000000000..ab18c4413efd --- /dev/null +++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h @@ -0,0 +1,237 @@ +//===-- MipsABIFlagsSection.h - Mips ELF ABI Flags Section -----*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef MIPSABIFLAGSSECTION_H +#define MIPSABIFLAGSSECTION_H + +#include "llvm/MC/MCStreamer.h" + +namespace llvm { + +class MCStreamer; + +struct MipsABIFlagsSection { + // Values for the xxx_size bytes of an ABI flags structure. + enum AFL_REG { + AFL_REG_NONE = 0x00, // No registers. + AFL_REG_32 = 0x01, // 32-bit registers. + AFL_REG_64 = 0x02, // 64-bit registers. + AFL_REG_128 = 0x03 // 128-bit registers. + }; + + // Masks for the ases word of an ABI flags structure. + enum AFL_ASE { + AFL_ASE_DSP = 0x00000001, // DSP ASE. + AFL_ASE_DSPR2 = 0x00000002, // DSP R2 ASE. + AFL_ASE_EVA = 0x00000004, // Enhanced VA Scheme. + AFL_ASE_MCU = 0x00000008, // MCU (MicroController) ASE. + AFL_ASE_MDMX = 0x00000010, // MDMX ASE. + AFL_ASE_MIPS3D = 0x00000020, // MIPS-3D ASE. + AFL_ASE_MT = 0x00000040, // MT ASE. + AFL_ASE_SMARTMIPS = 0x00000080, // SmartMIPS ASE. + AFL_ASE_VIRT = 0x00000100, // VZ ASE. + AFL_ASE_MSA = 0x00000200, // MSA ASE. + AFL_ASE_MIPS16 = 0x00000400, // MIPS16 ASE. + AFL_ASE_MICROMIPS = 0x00000800, // MICROMIPS ASE. + AFL_ASE_XPA = 0x00001000 // XPA ASE. + }; + + // Values for the isa_ext word of an ABI flags structure. 
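// A hedged aside: the operator<< in MipsABIFlagsSection.cpp above emits a
// 24-byte Elf_Internal_ABIFlags_v0 record. A standalone packed-struct model
// of the field widths, in the emission order used there (a sketch, not the
// definitive on-disk definition):

#include <cstdint>
#include <cstdio>

#pragma pack(push, 1)
struct ABIFlagsV0 {
  uint16_t version;   // 2 bytes
  uint8_t isa_level;  // 1
  uint8_t isa_rev;    // 1
  uint8_t gpr_size;   // 1
  uint8_t cpr1_size;  // 1
  uint8_t cpr2_size;  // 1
  uint8_t fp_abi;     // 1
  uint32_t isa_ext;   // 4
  uint32_t ases;      // 4
  uint32_t flags1;    // 4
  uint32_t flags2;    // 4 -> 24 bytes total
};
#pragma pack(pop)

int main() {
  static_assert(sizeof(ABIFlagsV0) == 24, "must match the emitted record");
  std::printf("record size: %u\n", unsigned(sizeof(ABIFlagsV0)));
  return 0;
}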
+ enum AFL_EXT { + AFL_EXT_XLR = 1, // RMI Xlr instruction. + AFL_EXT_OCTEON2 = 2, // Cavium Networks Octeon2. + AFL_EXT_OCTEONP = 3, // Cavium Networks OcteonP. + AFL_EXT_LOONGSON_3A = 4, // Loongson 3A. + AFL_EXT_OCTEON = 5, // Cavium Networks Octeon. + AFL_EXT_5900 = 6, // MIPS R5900 instruction. + AFL_EXT_4650 = 7, // MIPS R4650 instruction. + AFL_EXT_4010 = 8, // LSI R4010 instruction. + AFL_EXT_4100 = 9, // NEC VR4100 instruction. + AFL_EXT_3900 = 10, // Toshiba R3900 instruction. + AFL_EXT_10000 = 11, // MIPS R10000 instruction. + AFL_EXT_SB1 = 12, // Broadcom SB-1 instruction. + AFL_EXT_4111 = 13, // NEC VR4111/VR4181 instruction. + AFL_EXT_4120 = 14, // NEC VR4120 instruction. + AFL_EXT_5400 = 15, // NEC VR5400 instruction. + AFL_EXT_5500 = 16, // NEC VR5500 instruction. + AFL_EXT_LOONGSON_2E = 17, // ST Microelectronics Loongson 2E. + AFL_EXT_LOONGSON_2F = 18 // ST Microelectronics Loongson 2F. + }; + + // Values for the fp_abi word of an ABI flags structure. + enum Val_GNU_MIPS_ABI { + Val_GNU_MIPS_ABI_FP_ANY = 0, + Val_GNU_MIPS_ABI_FP_DOUBLE = 1, + Val_GNU_MIPS_ABI_FP_XX = 5, + Val_GNU_MIPS_ABI_FP_64 = 6, + Val_GNU_MIPS_ABI_FP_64A = 7 + }; + + enum AFL_FLAGS1 { + AFL_FLAGS1_ODDSPREG = 1 + }; + + // Internal representation of the values used in .module fp=value + enum class FpABIKind { ANY, XX, S32, S64 }; + + // Version of flags structure. + uint16_t Version; + // The level of the ISA: 1-5, 32, 64. + uint8_t ISALevel; + // The revision of ISA: 0 for MIPS V and below, 1-n otherwise. + uint8_t ISARevision; + // The size of general purpose registers. + AFL_REG GPRSize; + // The size of co-processor 1 registers. + AFL_REG CPR1Size; + // The size of co-processor 2 registers. + AFL_REG CPR2Size; + // Processor-specific extension. + uint32_t ISAExtensionSet; + // Mask of ASEs used. + uint32_t ASESet; + + bool OddSPReg; + + bool Is32BitABI; + +protected: + // The floating-point ABI. 
+ FpABIKind FpABI; + +public: + MipsABIFlagsSection() + : Version(0), ISALevel(0), ISARevision(0), GPRSize(AFL_REG_NONE), + CPR1Size(AFL_REG_NONE), CPR2Size(AFL_REG_NONE), ISAExtensionSet(0), + ASESet(0), OddSPReg(false), Is32BitABI(false), FpABI(FpABIKind::ANY) {} + + uint16_t getVersionValue() { return (uint16_t)Version; } + uint8_t getISALevelValue() { return (uint8_t)ISALevel; } + uint8_t getISARevisionValue() { return (uint8_t)ISARevision; } + uint8_t getGPRSizeValue() { return (uint8_t)GPRSize; } + uint8_t getCPR1SizeValue() { return (uint8_t)CPR1Size; } + uint8_t getCPR2SizeValue() { return (uint8_t)CPR2Size; } + uint8_t getFpABIValue(); + uint32_t getISAExtensionSetValue() { return (uint32_t)ISAExtensionSet; } + uint32_t getASESetValue() { return (uint32_t)ASESet; } + + uint32_t getFlags1Value() { + uint32_t Value = 0; + + if (OddSPReg) + Value |= (uint32_t)AFL_FLAGS1_ODDSPREG; + + return Value; + } + + uint32_t getFlags2Value() { return 0; } + + FpABIKind getFpABI() { return FpABI; } + void setFpABI(FpABIKind Value, bool IsABI32Bit) { + FpABI = Value; + Is32BitABI = IsABI32Bit; + } + StringRef getFpABIString(FpABIKind Value); + + template <class PredicateLibrary> + void setISALevelAndRevisionFromPredicates(const PredicateLibrary &P) { + if (P.hasMips64()) { + ISALevel = 64; + if (P.hasMips64r6()) + ISARevision = 6; + else if (P.hasMips64r2()) + ISARevision = 2; + else + ISARevision = 1; + } else if (P.hasMips32()) { + ISALevel = 32; + if (P.hasMips32r6()) + ISARevision = 6; + else if (P.hasMips32r2()) + ISARevision = 2; + else + ISARevision = 1; + } else { + ISARevision = 0; + if (P.hasMips5()) + ISALevel = 5; + else if (P.hasMips4()) + ISALevel = 4; + else if (P.hasMips3()) + ISALevel = 3; + else if (P.hasMips2()) + ISALevel = 2; + else if (P.hasMips1()) + ISALevel = 1; + else + llvm_unreachable("Unknown ISA level!"); + } + } + + template <class PredicateLibrary> + void setGPRSizeFromPredicates(const PredicateLibrary &P) { + GPRSize = P.isGP64bit() ? AFL_REG_64 : AFL_REG_32; + } + + template <class PredicateLibrary> + void setCPR1SizeFromPredicates(const PredicateLibrary &P) { + if (P.mipsSEUsesSoftFloat()) + CPR1Size = AFL_REG_NONE; + else if (P.hasMSA()) + CPR1Size = AFL_REG_128; + else + CPR1Size = P.isFP64bit() ? 
AFL_REG_64 : AFL_REG_32;
+  }
+
+  template <class PredicateLibrary>
+  void setASESetFromPredicates(const PredicateLibrary &P) {
+    ASESet = 0;
+    if (P.hasDSP())
+      ASESet |= AFL_ASE_DSP;
+    if (P.hasDSPR2())
+      ASESet |= AFL_ASE_DSPR2;
+    if (P.hasMSA())
+      ASESet |= AFL_ASE_MSA;
+    if (P.inMicroMipsMode())
+      ASESet |= AFL_ASE_MICROMIPS;
+    if (P.inMips16Mode())
+      ASESet |= AFL_ASE_MIPS16;
+  }
+
+  template <class PredicateLibrary>
+  void setFpAbiFromPredicates(const PredicateLibrary &P) {
+    Is32BitABI = P.isABI_O32();
+
+    FpABI = FpABIKind::ANY;
+    if (P.isABI_N32() || P.isABI_N64())
+      FpABI = FpABIKind::S64;
+    else if (P.isABI_O32()) {
+      if (P.isFP64bit())
+        FpABI = FpABIKind::S64;
+      else if (P.isABI_FPXX())
+        FpABI = FpABIKind::XX;
+      else
+        FpABI = FpABIKind::S32;
+    }
+  }
+
+  template <class PredicateLibrary>
+  void setAllFromPredicates(const PredicateLibrary &P) {
+    setISALevelAndRevisionFromPredicates(P);
+    setGPRSizeFromPredicates(P);
+    setCPR1SizeFromPredicates(P);
+    setASESetFromPredicates(P);
+    setFpAbiFromPredicates(P);
+  }
+};
+
+MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection);
+}
+
+#endif
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 5375a00fd110..d8e6128cd54d 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -70,6 +70,13 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     if (!isIntN(16, Value) && Ctx)
       Ctx->FatalError(Fixup.getLoc(), "out of range PC16 fixup");
     break;
+  case Mips::fixup_MIPS_PC19_S2:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 4;
+    // We now check if Value can be encoded as a 19-bit signed immediate.
+    if (!isIntN(19, Value) && Ctx)
+      Ctx->FatalError(Fixup.getLoc(), "out of range PC19 fixup");
+    break;
   case Mips::fixup_Mips_26:
     // So far we are only using this type for jumps.
     // The displacement is then divided by 4 to give us a 28 bit
@@ -104,6 +111,13 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     if (!isIntN(16, Value) && Ctx)
       Ctx->FatalError(Fixup.getLoc(), "out of range PC16 fixup");
     break;
+  case Mips::fixup_MIPS_PC18_S3:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 8;
+    // We now check if Value can be encoded as an 18-bit signed immediate.
+    if (!isIntN(18, Value) && Ctx)
+      Ctx->FatalError(Fixup.getLoc(), "out of range PC18 fixup");
+    break;
   case Mips::fixup_MIPS_PC21_S2:
     Value -= 4;
     // Forcing a signed division because Value can be negative.
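[Editor's note] Both new fixups divide the byte offset before range-checking
it, so each reaches roughly +/-1 MiB from the PC: PC19_S2 stores offset/4 in
19 signed bits, and PC18_S3 stores offset/8 in 18 signed bits. A minimal
standalone sketch of the same check, assuming Shift-aligned offsets
(fitsScaledPCRel is a hypothetical helper name, not part of the patch):

#include <cassert>
#include <cstdint>

// True when (Offset / 2^Shift) fits in a Bits-wide signed field -- the same
// test adjustFixupValue performs above via isIntN.
static bool fitsScaledPCRel(int64_t Offset, unsigned Bits, unsigned Shift) {
  int64_t Scaled = Offset / (1LL << Shift); // signed division: Offset may be negative
  int64_t Limit = 1LL << (Bits - 1);
  return Scaled >= -Limit && Scaled < Limit;
}

int main() {
  assert(fitsScaledPCRel(-(1 << 20), 19, 2));    // PC19_S2 reaches -1 MiB...
  assert(!fitsScaledPCRel(1 << 20, 19, 2));      // ...but not quite +1 MiB
  assert(fitsScaledPCRel((1 << 20) - 8, 18, 3)); // PC18_S3: top of its range
  return 0;
}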
@@ -247,6 +261,8 @@ getFixupKindInfo(MCFixupKind Kind) const {
     { "fixup_Mips_GOT_LO16",     0,  16,   0 },
     { "fixup_Mips_CALL_HI16",    0,  16,   0 },
     { "fixup_Mips_CALL_LO16",    0,  16,   0 },
+    { "fixup_MIPS_PC18_S3",      0,  18,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC19_S2",      0,  19,  MCFixupKindInfo::FKF_IsPCRel },
     { "fixup_MIPS_PC21_S2",      0,  21,  MCFixupKindInfo::FKF_IsPCRel },
     { "fixup_MIPS_PC26_S2",      0,  26,  MCFixupKindInfo::FKF_IsPCRel },
     { "fixup_MIPS_PCHI16",       0,  16,  MCFixupKindInfo::FKF_IsPCRel },
@@ -308,6 +324,8 @@ getFixupKindInfo(MCFixupKind Kind) const {
     { "fixup_Mips_GOT_LO16",    16,  16,   0 },
     { "fixup_Mips_CALL_HI16",   16,  16,   0 },
     { "fixup_Mips_CALL_LO16",   16,  16,   0 },
+    { "fixup_MIPS_PC18_S3",     14,  18,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC19_S2",     13,  19,  MCFixupKindInfo::FKF_IsPCRel },
     { "fixup_MIPS_PC21_S2",     11,  21,  MCFixupKindInfo::FKF_IsPCRel },
     { "fixup_MIPS_PC26_S2",      6,  26,  MCFixupKindInfo::FKF_IsPCRel },
     { "fixup_MIPS_PCHI16",      16,  16,  MCFixupKindInfo::FKF_IsPCRel },
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index bc695e6d4f74..d5c3dbc47880 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -65,7 +65,7 @@ public:
                             const MCRelaxableFragment *DF,
                             const MCAsmLayout &Layout) const override {
     // FIXME.
-    assert(0 && "RelaxInstruction() unimplemented");
+    llvm_unreachable("RelaxInstruction() unimplemented");
     return false;
   }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 74c12ff3428c..49ac25690b98 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -193,6 +193,12 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
   case Mips::fixup_MICROMIPS_TLS_TPREL_LO16:
     Type = ELF::R_MICROMIPS_TLS_TPREL_LO16;
     break;
+  case Mips::fixup_MIPS_PC19_S2:
+    Type = ELF::R_MIPS_PC19_S2;
+    break;
+  case Mips::fixup_MIPS_PC18_S3:
+    Type = ELF::R_MIPS_PC18_S3;
+    break;
   case Mips::fixup_MIPS_PC21_S2:
     Type = ELF::R_MIPS_PC21_S2;
     break;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index 3079004f9ac6..05080f046f89 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -128,6 +128,12 @@ namespace Mips {
     // resulting in - R_MIPS_CALL_LO16
     fixup_Mips_CALL_LO16,

+    // resulting in - R_MIPS_PC18_S3
+    fixup_MIPS_PC18_S3,
+
+    // resulting in - R_MIPS_PC19_S2
+    fixup_MIPS_PC19_S2,
+
     // resulting in - R_MIPS_PC21_S2
     fixup_MIPS_PC21_S2,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 6aa3c762d9db..e415412ab6cb 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -38,7 +38,7 @@ MipsMCAsmInfo::MipsMCAsmInfo(StringRef TT) {
   ZeroDirective = "\t.space\t";
   GPRel32Directive = "\t.gpword\t";
   GPRel64Directive = "\t.gpdword\t";
-  DebugLabelSuffix = "=.";
+  UseAssignmentForEHBegin = true;
   SupportsDebugInformation = true;
   ExceptionsType = ExceptionHandling::DwarfCFI;
   HasLEB128 = true;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 85e0bf1569a7..43fc52136ddc 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -621,11 +621,42 @@ unsigned
MipsMCCodeEmitter::getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo,
                                          SmallVectorImpl<MCFixup> &Fixups,
                                          const MCSubtargetInfo &STI) const {
-  assert(MI.getOperand(OpNo).isImm());
-  // The immediate is encoded as 'immediate << 2'.
-  unsigned Res = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
-  assert((Res & 3) == 0);
-  return Res >> 2;
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm()) {
+    // The immediate is encoded as 'immediate << 2'.
+    unsigned Res = getMachineOpValue(MI, MO, Fixups, STI);
+    assert((Res & 3) == 0);
+    return Res >> 2;
+  }
+
+  assert(MO.isExpr() &&
+         "getSimm19Lsl2Encoding expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_MIPS_PC19_S2)));
+  return 0;
+}
+
+unsigned
+MipsMCCodeEmitter::getSimm18Lsl3Encoding(const MCInst &MI, unsigned OpNo,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm()) {
+    // The immediate is encoded as 'immediate << 3'.
+    unsigned Res = getMachineOpValue(MI, MO, Fixups, STI);
+    assert((Res & 7) == 0);
+    return Res >> 3;
+  }
+
+  assert(MO.isExpr() &&
+         "getSimm18Lsl3Encoding expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_MIPS_PC18_S3)));
+  return 0;
 }

 #include "MipsGenMCCodeEmitter.inc"
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index 3f7daabfefaa..304167fd03db 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -141,6 +141,10 @@ public:
                                SmallVectorImpl<MCFixup> &Fixups,
                                const MCSubtargetInfo &STI) const;

+  unsigned getSimm18Lsl3Encoding(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
   unsigned getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
                           const MCSubtargetInfo &STI) const;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 21ccc3c58b34..5bba3e5b7ae2 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -11,6 +11,7 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"

 using namespace llvm;

@@ -83,33 +84,6 @@ MipsMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
   return getSubExpr()->EvaluateAsRelocatable(Res, Layout);
 }

-// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
-// that method should be made public?
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { - switch (Value->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expr!"); - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value); - AddValueSymbolsImpl(BE->getLHS(), Asm); - AddValueSymbolsImpl(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: - Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm); - break; - } -} - -void MipsMCExpr::AddValueSymbols(MCAssembler *Asm) const { - AddValueSymbolsImpl(getSubExpr(), Asm); +void MipsMCExpr::visitUsedExpr(MCStreamer &Streamer) const { + Streamer.visitUsedExpr(*getSubExpr()); } diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h index 8d7aacde31a1..f193dc9b9d50 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h @@ -49,7 +49,7 @@ public: void PrintImpl(raw_ostream &OS) const override; bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout) const override; - void AddValueSymbols(MCAssembler *) const override; + void visitUsedExpr(MCStreamer &Streamer) const override; const MCSection *FindAssociatedSection() const override { return getSubExpr()->FindAssociatedSection(); } diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index 660e5a7be1b8..d2b929bea334 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -133,6 +133,12 @@ createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, return S; } +static MCStreamer *createMipsNullStreamer(MCContext &Ctx) { + MCStreamer *S = llvm::createNullStreamer(Ctx); + new MipsTargetStreamer(*S); + return S; +} + extern "C" void LLVMInitializeMipsTargetMC() { // Register the MC asm info. RegisterMCAsmInfoFn X(TheMipsTarget, createMipsMCAsmInfo); @@ -187,6 +193,12 @@ extern "C" void LLVMInitializeMipsTargetMC() { TargetRegistry::RegisterAsmStreamer(TheMips64Target, createMCAsmStreamer); TargetRegistry::RegisterAsmStreamer(TheMips64elTarget, createMCAsmStreamer); + TargetRegistry::RegisterNullStreamer(TheMipsTarget, createMipsNullStreamer); + TargetRegistry::RegisterNullStreamer(TheMipselTarget, createMipsNullStreamer); + TargetRegistry::RegisterNullStreamer(TheMips64Target, createMipsNullStreamer); + TargetRegistry::RegisterNullStreamer(TheMips64elTarget, + createMipsNullStreamer); + // Register the asm backend. TargetRegistry::RegisterMCAsmBackend(TheMipsTarget, createMipsAsmBackendEB32); diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp index cd6be734dfe8..6cde8f9ae3e4 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp @@ -48,7 +48,13 @@ private: bool PendingCall; bool isIndirectJump(const MCInst &MI) { - return MI.getOpcode() == Mips::JR || MI.getOpcode() == Mips::RET; + if (MI.getOpcode() == Mips::JALR) { + // MIPS32r6/MIPS64r6 doesn't have a JR instruction and uses JALR instead. + // JALR is an indirect branch if the link register is $0. 
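// [Editor's aside, not part of the patch: MIPS32r6/MIPS64r6 encode "jr $rs"
//  as "jalr $zero, $rs". Since operand 0 of JALR is the link register rd,
//  the check below reduces to:
//    MI.getOpcode() == Mips::JALR && MI.getOperand(0).getReg() == Mips::ZERO]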
+ assert(MI.getOperand(0).isReg()); + return MI.getOperand(0).getReg() == Mips::ZERO; + } + return MI.getOpcode() == Mips::JR; } bool isStackPointerFirstOperand(const MCInst &MI) { @@ -56,7 +62,9 @@ private: && MI.getOperand(0).getReg() == Mips::SP); } - bool isCall(unsigned Opcode, bool *IsIndirectCall) { + bool isCall(const MCInst &MI, bool *IsIndirectCall) { + unsigned Opcode = MI.getOpcode(); + *IsIndirectCall = false; switch (Opcode) { @@ -64,12 +72,19 @@ private: return false; case Mips::JAL: + case Mips::BAL: case Mips::BAL_BR: case Mips::BLTZAL: case Mips::BGEZAL: return true; case Mips::JALR: + // JALR is only a call if the link register is not $0. Otherwise it's an + // indirect branch. + assert(MI.getOperand(0).isReg()); + if (MI.getOperand(0).getReg() == Mips::ZERO) + return false; + *IsIndirectCall = true; return true; } @@ -137,24 +152,23 @@ public: &IsStore); bool IsSPFirstOperand = isStackPointerFirstOperand(Inst); if (IsMemAccess || IsSPFirstOperand) { - if (PendingCall) - report_fatal_error("Dangerous instruction in branch delay slot!"); - bool MaskBefore = (IsMemAccess && baseRegNeedsLoadStoreMask(Inst.getOperand(AddrIdx) .getReg())); bool MaskAfter = IsSPFirstOperand && !IsStore; - if (MaskBefore || MaskAfter) + if (MaskBefore || MaskAfter) { + if (PendingCall) + report_fatal_error("Dangerous instruction in branch delay slot!"); sandboxLoadStoreStackChange(Inst, AddrIdx, STI, MaskBefore, MaskAfter); - else - MipsELFStreamer::EmitInstruction(Inst, STI); - return; + return; + } + // fallthrough } // Sandbox calls by aligning call and branch delay to the bundle end. // For indirect calls, emit the mask before the call. bool IsIndirectCall; - if (isCall(Inst.getOpcode(), &IsIndirectCall)) { + if (isCall(Inst, &IsIndirectCall)) { if (PendingCall) report_fatal_error("Dangerous instruction in branch delay slot!"); @@ -203,6 +217,7 @@ bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx, case Mips::LWC1: case Mips::LDC1: case Mips::LL: + case Mips::LL_R6: case Mips::LWL: case Mips::LWR: *AddrIdx = 1; @@ -223,6 +238,7 @@ bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx, // Store instructions with base address register in position 2. case Mips::SC: + case Mips::SC_R6: *AddrIdx = 2; if (IsStore) *IsStore = true; diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index a8fa27244d94..fbe375b89641 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -27,10 +27,43 @@ using namespace llvm; -// Pin vtable to this file. 
-void MipsTargetStreamer::anchor() {} - -MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} +MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S) + : MCTargetStreamer(S), canHaveModuleDirective(true) {} +void MipsTargetStreamer::emitDirectiveSetMicroMips() {} +void MipsTargetStreamer::emitDirectiveSetNoMicroMips() {} +void MipsTargetStreamer::emitDirectiveSetMips16() {} +void MipsTargetStreamer::emitDirectiveSetNoMips16() {} +void MipsTargetStreamer::emitDirectiveSetReorder() {} +void MipsTargetStreamer::emitDirectiveSetNoReorder() {} +void MipsTargetStreamer::emitDirectiveSetMacro() {} +void MipsTargetStreamer::emitDirectiveSetNoMacro() {} +void MipsTargetStreamer::emitDirectiveSetAt() {} +void MipsTargetStreamer::emitDirectiveSetNoAt() {} +void MipsTargetStreamer::emitDirectiveEnd(StringRef Name) {} +void MipsTargetStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {} +void MipsTargetStreamer::emitDirectiveAbiCalls() {} +void MipsTargetStreamer::emitDirectiveNaN2008() {} +void MipsTargetStreamer::emitDirectiveNaNLegacy() {} +void MipsTargetStreamer::emitDirectiveOptionPic0() {} +void MipsTargetStreamer::emitDirectiveOptionPic2() {} +void MipsTargetStreamer::emitFrame(unsigned StackReg, unsigned StackSize, + unsigned ReturnReg) {} +void MipsTargetStreamer::emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) {} +void MipsTargetStreamer::emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) { +} +void MipsTargetStreamer::emitDirectiveSetMips32R2() {} +void MipsTargetStreamer::emitDirectiveSetMips64() {} +void MipsTargetStreamer::emitDirectiveSetMips64R2() {} +void MipsTargetStreamer::emitDirectiveSetDsp() {} +void MipsTargetStreamer::emitDirectiveCpload(unsigned RegNo) {} +void MipsTargetStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, + const MCSymbol &Sym, bool IsReg) { +} +void MipsTargetStreamer::emitDirectiveModuleOddSPReg(bool Enabled, + bool IsO32ABI) { + if (!Enabled && !IsO32ABI) + report_fatal_error("+nooddspreg is only valid for O32"); +} MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) @@ -38,42 +71,52 @@ MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S, void MipsTargetAsmStreamer::emitDirectiveSetMicroMips() { OS << "\t.set\tmicromips\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetNoMicroMips() { OS << "\t.set\tnomicromips\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetMips16() { OS << "\t.set\tmips16\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetNoMips16() { OS << "\t.set\tnomips16\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetReorder() { OS << "\t.set\treorder\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetNoReorder() { OS << "\t.set\tnoreorder\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetMacro() { OS << "\t.set\tmacro\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetNoMacro() { OS << "\t.set\tnomacro\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetAt() { OS << "\t.set\tat\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetNoAt() { OS << "\t.set\tnoat\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveEnd(StringRef Name) { @@ -110,24 +153,28 @@ void MipsTargetAsmStreamer::emitFrame(unsigned StackReg, unsigned StackSize, void 
MipsTargetAsmStreamer::emitDirectiveSetMips32R2() { OS << "\t.set\tmips32r2\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetMips64() { OS << "\t.set\tmips64\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetMips64R2() { OS << "\t.set\tmips64r2\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveSetDsp() { OS << "\t.set\tdsp\n"; + setCanHaveModuleDir(false); } // Print a 32 bit hex number with all numbers. static void printHex32(unsigned Value, raw_ostream &OS) { OS << "0x"; for (int i = 7; i >= 0; i--) - OS.write_hex((Value & (0xF << (i*4))) >> (i*4)); + OS.write_hex((Value & (0xF << (i * 4))) >> (i * 4)); } void MipsTargetAsmStreamer::emitMask(unsigned CPUBitmask, @@ -147,6 +194,7 @@ void MipsTargetAsmStreamer::emitFMask(unsigned FPUBitmask, void MipsTargetAsmStreamer::emitDirectiveCpload(unsigned RegNo) { OS << "\t.cpload\t$" << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n"; + setCanHaveModuleDir(false); } void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo, @@ -165,6 +213,34 @@ void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo, OS << ", "; OS << Sym.getName() << "\n"; + setCanHaveModuleDir(false); +} + +void MipsTargetAsmStreamer::emitDirectiveModuleFP( + MipsABIFlagsSection::FpABIKind Value, bool Is32BitABI) { + MipsTargetStreamer::emitDirectiveModuleFP(Value, Is32BitABI); + + StringRef ModuleValue; + OS << "\t.module\tfp="; + OS << ABIFlagsSection.getFpABIString(Value) << "\n"; +} + +void MipsTargetAsmStreamer::emitDirectiveSetFp( + MipsABIFlagsSection::FpABIKind Value) { + StringRef ModuleValue; + OS << "\t.set\tfp="; + OS << ABIFlagsSection.getFpABIString(Value) << "\n"; +} + +void MipsTargetAsmStreamer::emitMipsAbiFlags() { + // No action required for text output. +} + +void MipsTargetAsmStreamer::emitDirectiveModuleOddSPReg(bool Enabled, + bool IsO32ABI) { + MipsTargetStreamer::emitDirectiveModuleOddSPReg(Enabled, IsO32ABI); + + OS << "\t.module\t" << (Enabled ? "" : "no") << "oddspreg\n"; } // This part is for ELF object output. @@ -174,7 +250,7 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S, MCAssembler &MCA = getStreamer().getAssembler(); uint64_t Features = STI.getFeatureBits(); Triple T(STI.getTargetTriple()); - Pic = (MCA.getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_) + Pic = (MCA.getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_) ? 
true : false; @@ -182,16 +258,28 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S, unsigned EFlags = 0; // Architecture - if (Features & Mips::FeatureMips64r2) + if (Features & Mips::FeatureMips64r6) + EFlags |= ELF::EF_MIPS_ARCH_64R6; + else if (Features & Mips::FeatureMips64r2) EFlags |= ELF::EF_MIPS_ARCH_64R2; else if (Features & Mips::FeatureMips64) EFlags |= ELF::EF_MIPS_ARCH_64; + else if (Features & Mips::FeatureMips5) + EFlags |= ELF::EF_MIPS_ARCH_5; else if (Features & Mips::FeatureMips4) EFlags |= ELF::EF_MIPS_ARCH_4; + else if (Features & Mips::FeatureMips3) + EFlags |= ELF::EF_MIPS_ARCH_3; + else if (Features & Mips::FeatureMips32r6) + EFlags |= ELF::EF_MIPS_ARCH_32R6; else if (Features & Mips::FeatureMips32r2) EFlags |= ELF::EF_MIPS_ARCH_32R2; else if (Features & Mips::FeatureMips32) EFlags |= ELF::EF_MIPS_ARCH_32; + else if (Features & Mips::FeatureMips2) + EFlags |= ELF::EF_MIPS_ARCH_2; + else + EFlags |= ELF::EF_MIPS_ARCH_1; if (T.isArch64Bit()) { if (Features & Mips::FeatureN32) @@ -244,17 +332,17 @@ void MipsTargetELFStreamer::finish() { ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, SectionKind::getMetadata()); OS.SwitchSection(Sec); - OS.EmitIntValue(1, 1); // kind + OS.EmitIntValue(1, 1); // kind OS.EmitIntValue(40, 1); // size - OS.EmitIntValue(0, 2); // section - OS.EmitIntValue(0, 4); // info - OS.EmitIntValue(0, 4); // ri_gprmask - OS.EmitIntValue(0, 4); // pad - OS.EmitIntValue(0, 4); // ri_cpr[0]mask - OS.EmitIntValue(0, 4); // ri_cpr[1]mask - OS.EmitIntValue(0, 4); // ri_cpr[2]mask - OS.EmitIntValue(0, 4); // ri_cpr[3]mask - OS.EmitIntValue(0, 8); // ri_gp_value + OS.EmitIntValue(0, 2); // section + OS.EmitIntValue(0, 4); // info + OS.EmitIntValue(0, 4); // ri_gprmask + OS.EmitIntValue(0, 4); // pad + OS.EmitIntValue(0, 4); // ri_cpr[0]mask + OS.EmitIntValue(0, 4); // ri_cpr[1]mask + OS.EmitIntValue(0, 4); // ri_cpr[2]mask + OS.EmitIntValue(0, 4); // ri_cpr[3]mask + OS.EmitIntValue(0, 8); // ri_gp_value } else { const MCSectionELF *Sec = Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO, ELF::SHF_ALLOC, @@ -268,6 +356,7 @@ void MipsTargetELFStreamer::finish() { OS.EmitIntValue(0, 4); // ri_cpr[3]mask OS.EmitIntValue(0, 4); // ri_gp_value } + emitMipsAbiFlags(); } void MipsTargetELFStreamer::emitAssignment(MCSymbol *Symbol, @@ -276,11 +365,11 @@ void MipsTargetELFStreamer::emitAssignment(MCSymbol *Symbol, if (Value->getKind() != MCExpr::SymbolRef) return; const MCSymbol &RhsSym = - static_cast<const MCSymbolRefExpr *>(Value)->getSymbol(); + static_cast<const MCSymbolRefExpr *>(Value)->getSymbol(); MCSymbolData &Data = getStreamer().getOrCreateSymbolData(&RhsSym); uint8_t Type = MCELF::GetType(Data); - if ((Type != ELF::STT_FUNC) - || !(MCELF::getOther(Data) & (ELF::STO_MIPS_MICROMIPS >> 2))) + if ((Type != ELF::STT_FUNC) || + !(MCELF::getOther(Data) & (ELF::STO_MIPS_MICROMIPS >> 2))) return; MCSymbolData &SymbolData = getStreamer().getOrCreateSymbolData(Symbol); @@ -305,6 +394,7 @@ void MipsTargetELFStreamer::emitDirectiveSetMicroMips() { void MipsTargetELFStreamer::emitDirectiveSetNoMicroMips() { MicroMipsEnabled = false; + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetMips16() { @@ -312,14 +402,17 @@ void MipsTargetELFStreamer::emitDirectiveSetMips16() { unsigned Flags = MCA.getELFHeaderEFlags(); Flags |= ELF::EF_MIPS_ARCH_ASE_M16; MCA.setELFHeaderEFlags(Flags); + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetNoMips16() { // FIXME: implement. 
+ setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetReorder() { // FIXME: implement. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetNoReorder() { @@ -327,22 +420,27 @@ void MipsTargetELFStreamer::emitDirectiveSetNoReorder() { unsigned Flags = MCA.getELFHeaderEFlags(); Flags |= ELF::EF_MIPS_NOREORDER; MCA.setELFHeaderEFlags(Flags); + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetMacro() { // FIXME: implement. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetNoMacro() { // FIXME: implement. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetAt() { // FIXME: implement. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetNoAt() { // FIXME: implement. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) { @@ -411,19 +509,19 @@ void MipsTargetELFStreamer::emitFMask(unsigned FPUBitmask, } void MipsTargetELFStreamer::emitDirectiveSetMips32R2() { - // No action required for ELF output. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetMips64() { - // No action required for ELF output. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetMips64R2() { - // No action required for ELF output. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveSetDsp() { - // No action required for ELF output. + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveCpload(unsigned RegNo) { @@ -473,6 +571,8 @@ void MipsTargetELFStreamer::emitDirectiveCpload(unsigned RegNo) { TmpInst.addOperand(MCOperand::CreateReg(Mips::GP)); TmpInst.addOperand(MCOperand::CreateReg(RegNo)); getStreamer().EmitInstruction(TmpInst, STI); + + setCanHaveModuleDir(false); } void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, @@ -528,4 +628,27 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, Inst.addOperand(MCOperand::CreateReg(Mips::GP)); Inst.addOperand(MCOperand::CreateReg(RegNo)); getStreamer().EmitInstruction(Inst, STI); + + setCanHaveModuleDir(false); +} + +void MipsTargetELFStreamer::emitMipsAbiFlags() { + MCAssembler &MCA = getStreamer().getAssembler(); + MCContext &Context = MCA.getContext(); + MCStreamer &OS = getStreamer(); + const MCSectionELF *Sec = + Context.getELFSection(".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS, + ELF::SHF_ALLOC, SectionKind::getMetadata()); + MCSectionData &ABIShndxSD = MCA.getOrCreateSectionData(*Sec); + ABIShndxSD.setAlignment(8); + OS.SwitchSection(Sec); + + OS << ABIFlagsSection; +} + +void MipsTargetELFStreamer::emitDirectiveModuleOddSPReg(bool Enabled, + bool IsO32ABI) { + MipsTargetStreamer::emitDirectiveModuleOddSPReg(Enabled, IsO32ABI); + + ABIFlagsSection.OddSPReg = Enabled; } diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td index d95f9b07b955..b93017a7400a 100644 --- a/lib/Target/Mips/MicroMipsInstrFPU.td +++ b/lib/Target/Mips/MicroMipsInstrFPU.td @@ -24,13 +24,13 @@ def LDC1_MM : MMRel, LW_FT<"ldc1", AFGR64Opnd, II_LDC1, load>, LW_FM_MM<0x2f>; def SDC1_MM : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>, LW_FM_MM<0x2e>; def LWXC1_MM : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, - LWXC1_FM_MM<0x48>; + LWXC1_FM_MM<0x48>, INSN_MIPS4_32R2_NOT_32R6_64R6; def SWXC1_MM : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, - SWXC1_FM_MM<0x88>; + SWXC1_FM_MM<0x88>, INSN_MIPS4_32R2_NOT_32R6_64R6; def LUXC1_MM : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, 
II_LUXC1>, - LWXC1_FM_MM<0x148>, INSN_MIPS5_32R2; + LWXC1_FM_MM<0x148>, INSN_MIPS5_32R2_NOT_32R6_64R6; def SUXC1_MM : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, - SWXC1_FM_MM<0x188>, INSN_MIPS5_32R2; + SWXC1_FM_MM<0x188>, INSN_MIPS5_32R2_NOT_32R6_64R6; def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM_MM<0>; @@ -38,9 +38,9 @@ def FCMP_D32_MM : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM_MM<1>; def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, IIBranch, MIPS_BRANCH_F>, - BC1F_FM_MM<0x1c>; + BC1F_FM_MM<0x1c>, ISA_MIPS1_NOT_32R6_64R6; def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, IIBranch, MIPS_BRANCH_T>, - BC1F_FM_MM<0x1d>; + BC1F_FM_MM<0x1d>, ISA_MIPS1_NOT_32R6_64R6; def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, ROUND_W_FM_MM<0, 0x6c>; diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td index 9904bc690c23..87a3a3e29ca2 100644 --- a/lib/Target/Mips/MicroMipsInstrInfo.td +++ b/lib/Target/Mips/MicroMipsInstrInfo.td @@ -246,7 +246,6 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { } def JR_MM : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>; def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>; - def RET_MM : MMRel, RetBase<"ret", GPR32Opnd>, JR_FM_MM<0x3c>; /// Branch Instructions def BEQ_MM : MMRel, CBranch<"beq", brtarget_mm, seteq, GPR32Opnd>, diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index ea16331f71d8..dd3bc9b08fc8 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -61,6 +61,8 @@ def FeatureGP64Bit : SubtargetFeature<"gp64", "IsGP64bit", "true", "General Purpose Registers are 64-bit wide.">; def FeatureFP64Bit : SubtargetFeature<"fp64", "IsFP64bit", "true", "Support 64-bit FP registers.">; +def FeatureFPXX : SubtargetFeature<"fpxx", "IsFPXX", "true", + "Support for FPXX.">; def FeatureNaN2008 : SubtargetFeature<"nan2008", "IsNaN2008bit", "true", "IEEE 754-2008 NaN encoding.">; def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat", @@ -73,6 +75,9 @@ def FeatureN64 : SubtargetFeature<"n64", "MipsABI", "N64", "Enable n64 ABI">; def FeatureEABI : SubtargetFeature<"eabi", "MipsABI", "EABI", "Enable eabi ABI">; +def FeatureNoOddSPReg : SubtargetFeature<"nooddspreg", "UseOddSPReg", "false", + "Disable odd numbered single-precision " + "registers">; def FeatureVFPU : SubtargetFeature<"vfpu", "HasVFPU", "true", "Enable vector FPU instructions.">; def FeatureMips1 : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1", diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp index c01d03a1ddf6..93706c2c3fcc 100644 --- a/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/lib/Target/Mips/Mips16FrameLowering.cpp @@ -16,6 +16,7 @@ #include "Mips16InstrInfo.h" #include "MipsInstrInfo.h" #include "MipsRegisterInfo.h" +#include "MipsSubtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -28,6 +29,9 @@ using namespace llvm; +Mips16FrameLowering::Mips16FrameLowering(const MipsSubtarget &STI) + : MipsFrameLowering(STI, STI.stackAlignment()) {} + void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); MachineFrameInfo *MFI = MF.getFrameInfo(); diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h index 3f7829dd6b3f..1fb7eda0e915 100644 --- a/lib/Target/Mips/Mips16FrameLowering.h +++ 
b/lib/Target/Mips/Mips16FrameLowering.h @@ -19,8 +19,7 @@ namespace llvm { class Mips16FrameLowering : public MipsFrameLowering { public: - explicit Mips16FrameLowering(const MipsSubtarget &STI) - : MipsFrameLowering(STI, STI.stackAlignment()) {} + explicit Mips16FrameLowering(const MipsSubtarget &STI); /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp index 4e86a27e323c..6672aef110a3 100644 --- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp +++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp @@ -37,7 +37,7 @@ using namespace llvm; #define DEBUG_TYPE "mips-isel" bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - if (!Subtarget.inMips16Mode()) + if (!Subtarget->inMips16Mode()) return false; return MipsDAGToDAGISel::runOnMachineFunction(MF); } @@ -226,9 +226,9 @@ bool Mips16DAGToDAGISel::selectAddr16( const LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(Parent); if (LS) { - if (LS->getMemoryVT() == MVT::f32 && Subtarget.hasMips4_32r2()) + if (LS->getMemoryVT() == MVT::f32 && Subtarget->hasMips4_32r2()) return false; - if (LS->getMemoryVT() == MVT::f64 && Subtarget.hasMips4_32r2()) + if (LS->getMemoryVT() == MVT::f64 && Subtarget->hasMips4_32r2()) return false; } } diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp index 9102450dc533..81a05df1a70d 100644 --- a/lib/Target/Mips/Mips16ISelLowering.cpp +++ b/lib/Target/Mips/Mips16ISelLowering.cpp @@ -120,13 +120,6 @@ static const Mips16IntrinsicHelperType Mips16IntrinsicHelper[] = { Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM) : MipsTargetLowering(TM) { - // - // set up as if mips32 and then revert so we can test the mechanism - // for switching - addRegisterClass(MVT::i32, &Mips::GPR32RegClass); - addRegisterClass(MVT::f32, &Mips::FGR32RegClass); - computeRegisterProperties(); - clearRegisterClasses(); // Set up the register classes addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass); diff --git a/lib/Target/Mips/Mips16ISelLowering.h b/lib/Target/Mips/Mips16ISelLowering.h index df883339b26b..2a5eec5b40d4 100644 --- a/lib/Target/Mips/Mips16ISelLowering.h +++ b/lib/Target/Mips/Mips16ISelLowering.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef Mips16ISELLOWERING_H -#define Mips16ISELLOWERING_H +#ifndef MIPS16ISELLOWERING_H +#define MIPS16ISELLOWERING_H #include "MipsISelLowering.h" diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td index 11166c45a880..5e4eebb62c12 100644 --- a/lib/Target/Mips/Mips16InstrInfo.td +++ b/lib/Target/Mips/Mips16InstrInfo.td @@ -1370,9 +1370,11 @@ def : Mips16Pat<(MipsJmpLink (i32 texternalsym:$dst)), (Jal16 texternalsym:$dst)>; // Indirect branch -def: Mips16Pat< - (brind CPU16Regs:$rs), - (JrcRx16 CPU16Regs:$rs)>; +def: Mips16Pat<(brind CPU16Regs:$rs), (JrcRx16 CPU16Regs:$rs)> { + // Ensure that the addition of MIPS32r6/MIPS64r6 support does not change + // MIPS16's behaviour. 
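// [Editor's note, not part of the patch: AddedComplexity breaks ties during
//  instruction selection, so this MIPS16 pattern for (brind ...) continues
//  to win over the otherwise equally ranked patterns added for R6.]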
+ let AddedComplexity = 1; +} // Jump and Link (Call) let isCall=1, hasDelaySlot=0 in diff --git a/lib/Target/Mips/Mips32r6InstrFormats.td b/lib/Target/Mips/Mips32r6InstrFormats.td index a3f9df52edb4..e4ec96a92f5b 100644 --- a/lib/Target/Mips/Mips32r6InstrFormats.td +++ b/lib/Target/Mips/Mips32r6InstrFormats.td @@ -39,7 +39,10 @@ def OPGROUP_DAUI : OPGROUP<0b011101>; def OPGROUP_PCREL : OPGROUP<0b111011>; def OPGROUP_REGIMM : OPGROUP<0b000001>; def OPGROUP_SPECIAL : OPGROUP<0b000000>; +// The spec occasionally names this value LL, LLD, SC, or SCD. def OPGROUP_SPECIAL3 : OPGROUP<0b011111>; +// The spec names this constant LWC2, LDC2, SWC2, and SDC2 in different places. +def OPGROUP_COP2LDST : OPGROUP<0b010010>; class OPCODE2<bits<2> Val> { bits<2> Value = Val; @@ -48,6 +51,11 @@ def OPCODE2_ADDIUPC : OPCODE2<0b00>; def OPCODE2_LWPC : OPCODE2<0b01>; def OPCODE2_LWUPC : OPCODE2<0b10>; +class OPCODE3<bits<3> Val> { + bits<3> Value = Val; +} +def OPCODE3_LDPC : OPCODE3<0b110>; + class OPCODE5<bits<5> Val> { bits<5> Value = Val; } @@ -59,6 +67,13 @@ def OPCODE5_BC1EQZ : OPCODE5<0b01001>; def OPCODE5_BC1NEZ : OPCODE5<0b01101>; def OPCODE5_BC2EQZ : OPCODE5<0b01001>; def OPCODE5_BC2NEZ : OPCODE5<0b01101>; +def OPCODE5_BGEZAL : OPCODE5<0b10001>; +// The next four constants are unnamed in the spec. These names are taken from +// the OPGROUP names they are used with. +def OPCODE5_LDC2 : OPCODE5<0b01110>; +def OPCODE5_LWC2 : OPCODE5<0b01010>; +def OPCODE5_SDC2 : OPCODE5<0b01111>; +def OPCODE5_SWC2 : OPCODE5<0b01011>; class OPCODE6<bits<6> Val> { bits<6> Value = Val; @@ -67,6 +82,22 @@ def OPCODE6_ALIGN : OPCODE6<0b100000>; def OPCODE6_DALIGN : OPCODE6<0b100100>; def OPCODE6_BITSWAP : OPCODE6<0b100000>; def OPCODE6_DBITSWAP : OPCODE6<0b100100>; +def OPCODE6_JALR : OPCODE6<0b001001>; +def OPCODE6_CACHE : OPCODE6<0b100101>; +def OPCODE6_PREF : OPCODE6<0b110101>; +// The next four constants are unnamed in the spec. These names are taken from +// the OPGROUP names they are used with. +def OPCODE6_LL : OPCODE6<0b110110>; +def OPCODE6_LLD : OPCODE6<0b110111>; +def OPCODE6_SC : OPCODE6<0b100110>; +def OPCODE6_SCD : OPCODE6<0b100111>; +def OPCODE6_CLO : OPCODE6<0b010001>; +def OPCODE6_CLZ : OPCODE6<0b010000>; +def OPCODE6_DCLO : OPCODE6<0b010011>; +def OPCODE6_DCLZ : OPCODE6<0b010010>; +def OPCODE6_LSA : OPCODE6<0b000101>; +def OPCODE6_DLSA : OPCODE6<0b010101>; +def OPCODE6_SDBBP : OPCODE6<0b001110>; class FIELD_FMT<bits<5> Val> { bits<5> Value = Val; @@ -77,22 +108,23 @@ def FIELD_FMT_D : FIELD_FMT<0b10001>; class FIELD_CMP_COND<bits<5> Val> { bits<5> Value = Val; } -def FIELD_CMP_COND_F : FIELD_CMP_COND<0b00000>; +// Note: The CMP_COND_FMT names differ from the C_COND_FMT names. 
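// [Editor's note] The renames applied in this hunk, old C_COND-style name ->
// new CMP_COND name (the S-prefixed conditions are the signaling variants):
//   F -> AF,     OLT -> LT,   OLE -> LE,   SF -> SAF,   NGLE -> SUN,
//   NGL -> SUEQ, LT -> SLT,   NGE -> SULT, LE -> SLE,   NGT -> SULE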
+def FIELD_CMP_COND_AF : FIELD_CMP_COND<0b00000>; def FIELD_CMP_COND_UN : FIELD_CMP_COND<0b00001>; def FIELD_CMP_COND_EQ : FIELD_CMP_COND<0b00010>; def FIELD_CMP_COND_UEQ : FIELD_CMP_COND<0b00011>; -def FIELD_CMP_COND_OLT : FIELD_CMP_COND<0b00100>; +def FIELD_CMP_COND_LT : FIELD_CMP_COND<0b00100>; def FIELD_CMP_COND_ULT : FIELD_CMP_COND<0b00101>; -def FIELD_CMP_COND_OLE : FIELD_CMP_COND<0b00110>; +def FIELD_CMP_COND_LE : FIELD_CMP_COND<0b00110>; def FIELD_CMP_COND_ULE : FIELD_CMP_COND<0b00111>; -def FIELD_CMP_COND_SF : FIELD_CMP_COND<0b01000>; -def FIELD_CMP_COND_NGLE : FIELD_CMP_COND<0b01001>; +def FIELD_CMP_COND_SAF : FIELD_CMP_COND<0b01000>; +def FIELD_CMP_COND_SUN : FIELD_CMP_COND<0b01001>; def FIELD_CMP_COND_SEQ : FIELD_CMP_COND<0b01010>; -def FIELD_CMP_COND_NGL : FIELD_CMP_COND<0b01011>; -def FIELD_CMP_COND_LT : FIELD_CMP_COND<0b01100>; -def FIELD_CMP_COND_NGE : FIELD_CMP_COND<0b01101>; -def FIELD_CMP_COND_LE : FIELD_CMP_COND<0b01110>; -def FIELD_CMP_COND_NGT : FIELD_CMP_COND<0b01111>; +def FIELD_CMP_COND_SUEQ : FIELD_CMP_COND<0b01011>; +def FIELD_CMP_COND_SLT : FIELD_CMP_COND<0b01100>; +def FIELD_CMP_COND_SULT : FIELD_CMP_COND<0b01101>; +def FIELD_CMP_COND_SLE : FIELD_CMP_COND<0b01110>; +def FIELD_CMP_COND_SULE : FIELD_CMP_COND<0b01111>; class FIELD_CMP_FORMAT<bits<5> Val> { bits<5> Value = Val; @@ -139,6 +171,17 @@ class DAUI_FM : AUI_FM { let Inst{31-26} = OPGROUP_DAUI.Value; } +class BAL_FM : MipsR6Inst { + bits<16> offset; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_REGIMM.Value; + let Inst{25-21} = 0b00000; + let Inst{20-16} = OPCODE5_BGEZAL.Value; + let Inst{15-0} = offset; +} + class COP1_2R_FM<bits<6> funct, FIELD_FMT Format> : MipsR6Inst { bits<5> fs; bits<5> fd; @@ -216,6 +259,18 @@ class PCREL19_FM<OPCODE2 Operation> : MipsR6Inst { let Inst{18-0} = imm; } +class PCREL18_FM<OPCODE3 Operation> : MipsR6Inst { + bits<5> rs; + bits<18> imm; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_PCREL.Value; + let Inst{25-21} = rs; + let Inst{20-18} = Operation.Value; + let Inst{17-0} = imm; +} + class SPECIAL3_2R_FM<OPCODE6 Operation> : MipsR6Inst { bits<5> rd; bits<5> rt; @@ -230,6 +285,36 @@ class SPECIAL3_2R_FM<OPCODE6 Operation> : MipsR6Inst { let Inst{5-0} = Operation.Value; } +class SPECIAL3_MEM_FM<OPCODE6 Operation> : MipsR6Inst { + bits<21> addr; + bits<5> hint; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL3.Value; + let Inst{25-21} = base; + let Inst{20-16} = hint; + let Inst{15-7} = offset; + let Inst{6} = 0; + let Inst{5-0} = Operation.Value; +} + +class SPECIAL_2R_FM<OPCODE6 Operation> : MipsR6Inst { + bits<5> rd; + bits<5> rs; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL.Value; + let Inst{25-21} = rs; + let Inst{20-16} = 0b00000; + let Inst{15-11} = rd; + let Inst{10-6} = 0b00001; + let Inst{5-0} = Operation.Value; +} + class SPECIAL_3R_FM<bits<5> mulop, bits<6> funct> : MipsR6Inst { bits<5> rd; bits<5> rs; @@ -245,6 +330,16 @@ class SPECIAL_3R_FM<bits<5> mulop, bits<6> funct> : MipsR6Inst { let Inst{5-0} = funct; } +class SPECIAL_SDBBP_FM : MipsR6Inst { + bits<20> code_; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL.Value; + let Inst{25-6} = code_; + let Inst{5-0} = OPCODE6_SDBBP.Value; +} + // This class is ambiguous with other branches: // BEQC/BNEC require that rs > rt class CMP_BRANCH_2R_OFF16_FM<OPGROUP funct> : MipsR6Inst { @@ -355,6 +450,40 @@ class SPECIAL3_DALIGN_FM<OPCODE6 Operation> : MipsR6Inst { let Inst{5-0} = Operation.Value; } +class 
SPECIAL3_LL_SC_FM<OPCODE6 Operation> : MipsR6Inst { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL3.Value; + let Inst{25-21} = base; + let Inst{20-16} = rt; + let Inst{15-7} = offset; + let Inst{5-0} = Operation.Value; + + string DecoderMethod = "DecodeSpecial3LlSc"; +} + +class SPECIAL_LSA_FM<OPCODE6 Operation> : MipsR6Inst { + bits<5> rd; + bits<5> rs; + bits<5> rt; + bits<2> imm2; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL.Value; + let Inst{25-21} = rs; + let Inst{20-16} = rt; + let Inst{15-11} = rd; + let Inst{10-8} = 0b000; + let Inst{7-6} = imm2; + let Inst{5-0} = Operation.Value; +} + class REGIMM_FM<OPCODE5 Operation> : MipsR6Inst { bits<5> rs; bits<16> imm; @@ -384,3 +513,31 @@ class COP1_CMP_CONDN_FM<FIELD_CMP_FORMAT Format, let Inst{4-0} = Cond.Value; } +class JR_HB_R6_FM<OPCODE6 Operation> : MipsR6Inst { + bits<5> rs; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL.Value; + let Inst{25-21} = rs; + let Inst{20-16} = 0; + let Inst{15-11} = 0; + let Inst{10} = 1; + let Inst{9-6} = 0; + let Inst{5-0} = Operation.Value; +} + +class COP2LDST_FM<OPCODE5 Operation> : MipsR6Inst { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<11> offset = addr{10-0}; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_COP2LDST.Value; + let Inst{25-21} = Operation.Value; + let Inst{20-16} = rt; + let Inst{15-11} = base; + let Inst{10-0} = offset; +} diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td index ffaf9657b6c7..d06e5ca22584 100644 --- a/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/lib/Target/Mips/Mips32r6InstrInfo.td @@ -14,39 +14,8 @@ include "Mips32r6InstrFormats.td" // Notes about removals/changes from MIPS32r6: -// Unclear: ssnop -// Reencoded: cache, pref -// Reencoded: clo, clz // Reencoded: jr -> jalr // Reencoded: jr.hb -> jalr.hb -// Reencoded: ldc2 -// Reencoded: ll, sc -// Reencoded: lwc2 -// Reencoded: sdbbp -// Reencoded: sdc2 -// Reencoded: swc2 -// Removed: bc1any2, bc1any4 -// Removed: bc2[ft] -// Removed: bc2f, bc2t -// Removed: bgezal -// Removed: bltzal -// Removed: c.cond.fmt, bc1[ft] -// Removed: div, divu -// Removed: jalx -// Removed: ldxc1 -// Removed: luxc1 -// Removed: lwxc1 -// Removed: madd.[ds], nmadd.[ds], nmsub.[ds], sub.[ds] -// Removed: mfhi, mflo, mthi, mtlo, madd, maddu, msub, msubu, mul -// Removed: movf, movt -// Removed: movf.fmt, movt.fmt, movn.fmt, movz.fmt -// Removed: movn, movz -// Removed: mult, multu -// Removed: prefx -// Removed: sdxc1 -// Removed: suxc1 -// Removed: swxc1 -// Rencoded: [ls][wd]c2 def brtarget21 : Operand<OtherVT> { let EncoderMethod = "getBranchTarget21OpValue"; @@ -84,6 +53,7 @@ class ALUIPC_ENC : PCREL16_FM<OPCODE5_ALUIPC>; class AUI_ENC : AUI_FM; class AUIPC_ENC : PCREL16_FM<OPCODE5_AUIPC>; +class BAL_ENC : BAL_FM; class BALC_ENC : BRANCH_OFF26_FM<0b111010>; class BC_ENC : BRANCH_OFF26_FM<0b110010>; class BEQC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_ADDI>, @@ -97,11 +67,20 @@ class BNEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_DADDI>, class BLTZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BGTZL>, DecodeDisambiguates<"BgtzlGroupBranch">; +class BGEC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BLEZL>, + DecodeDisambiguatedBy<"BlezlGroupBranch">; +class BGEUC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BLEZ>, + DecodeDisambiguatedBy<"BlezGroupBranch">; class BGEZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BLEZL>, DecodeDisambiguates<"BlezlGroupBranch">; class BGTZALC_ENC : 
CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BGTZ>, DecodeDisambiguatedBy<"BgtzGroupBranch">; +class BLTC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BGTZL>, + DecodeDisambiguatedBy<"BgtzlGroupBranch">; +class BLTUC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BGTZ>, + DecodeDisambiguatedBy<"BgtzGroupBranch">; + class BLEZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BLEZL>, DecodeDisambiguatedBy<"BlezlGroupBranch">; class BLTZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BGTZ>, @@ -110,7 +89,8 @@ class BGTZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BGTZL>, DecodeDisambiguatedBy<"BgtzlGroupBranch">; class BEQZC_ENC : CMP_BRANCH_OFF21_FM<0b110110>; -class BGEZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BLEZ>; +class BGEZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BLEZ>, + DecodeDisambiguates<"BlezGroupBranch">; class BNEZC_ENC : CMP_BRANCH_OFF21_FM<0b111110>; class BC1EQZ_ENC : COP1_BCCZ_FM<OPCODE5_BC1EQZ>; @@ -120,9 +100,10 @@ class BC2NEZ_ENC : COP2_BCCZ_FM<OPCODE5_BC2NEZ>; class JIALC_ENC : JMP_IDX_COMPACT_FM<0b111110>; class JIC_ENC : JMP_IDX_COMPACT_FM<0b110110>; - +class JR_HB_R6_ENC : JR_HB_R6_FM<OPCODE6_JALR>; class BITSWAP_ENC : SPECIAL3_2R_FM<OPCODE6_BITSWAP>; -class BLEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BLEZ>; +class BLEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BLEZ>, + DecodeDisambiguatedBy<"BlezGroupBranch">; class BNVC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_DADDI>, DecodeDisambiguatedBy<"DaddiGroupBranch">; class BOVC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_ADDI>, @@ -170,12 +151,23 @@ class RINT_D_ENC : COP1_2R_FM<0b011010, FIELD_FMT_D>; class CLASS_S_ENC : COP1_2R_FM<0b011011, FIELD_FMT_S>; class CLASS_D_ENC : COP1_2R_FM<0b011011, FIELD_FMT_D>; -class CMP_CONDN_DESC_BASE<string CondStr, string Typestr, RegisterOperand FGROpnd> { - dag OutOperandList = (outs FGROpnd:$fd); - dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft); - string AsmString = !strconcat("cmp.", CondStr, ".", Typestr, "\t$fd, $fs, $ft"); - list<dag> Pattern = []; -} +class CACHE_ENC : SPECIAL3_MEM_FM<OPCODE6_CACHE>; +class PREF_ENC : SPECIAL3_MEM_FM<OPCODE6_PREF>; + +class LDC2_R6_ENC : COP2LDST_FM<OPCODE5_LDC2>; +class LWC2_R6_ENC : COP2LDST_FM<OPCODE5_LWC2>; +class SDC2_R6_ENC : COP2LDST_FM<OPCODE5_SDC2>; +class SWC2_R6_ENC : COP2LDST_FM<OPCODE5_SWC2>; + +class LSA_R6_ENC : SPECIAL_LSA_FM<OPCODE6_LSA>; + +class LL_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_LL>; +class SC_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_SC>; + +class CLO_R6_ENC : SPECIAL_2R_FM<OPCODE6_CLO>; +class CLZ_R6_ENC : SPECIAL_2R_FM<OPCODE6_CLZ>; + +class SDBBP_R6_ENC : SPECIAL_SDBBP_FM; //===----------------------------------------------------------------------===// // @@ -183,56 +175,65 @@ class CMP_CONDN_DESC_BASE<string CondStr, string Typestr, RegisterOperand FGROpn // //===----------------------------------------------------------------------===// +class CMP_CONDN_DESC_BASE<string CondStr, string Typestr, + RegisterOperand FGROpnd, + SDPatternOperator Op = null_frag> { + dag OutOperandList = (outs FGRCCOpnd:$fd); + dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft); + string AsmString = !strconcat("cmp.", CondStr, ".", Typestr, "\t$fd, $fs, $ft"); + list<dag> Pattern = [(set FGRCCOpnd:$fd, (Op FGROpnd:$fs, FGROpnd:$ft))]; +} + multiclass CMP_CC_M <FIELD_CMP_FORMAT Format, string Typestr, RegisterOperand FGROpnd>{ - def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_F>, - CMP_CONDN_DESC_BASE<"f", Typestr, FGROpnd>, + def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>, + CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>, ISA_MIPS32R6; def CMP_UN_#NAME : 
COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>, - CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd>, + CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>, ISA_MIPS32R6; def CMP_EQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>, - CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd>, + CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, setoeq>, ISA_MIPS32R6; def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UEQ>, - CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd>, - ISA_MIPS32R6; - def CMP_OLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_OLT>, - CMP_CONDN_DESC_BASE<"olt", Typestr, FGROpnd>, + CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>, ISA_MIPS32R6; + def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>, + CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, setolt>, + ISA_MIPS32R6; def CMP_ULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULT>, - CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd>, - ISA_MIPS32R6; - def CMP_OLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_OLE>, - CMP_CONDN_DESC_BASE<"ole", Typestr, FGROpnd>, + CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>, ISA_MIPS32R6; + def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>, + CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, setole>, + ISA_MIPS32R6; def CMP_ULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULE>, - CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd>, + CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>, + ISA_MIPS32R6; + def CMP_SAF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SAF>, + CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_SUN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUN>, + CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>, ISA_MIPS32R6; - def CMP_SF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SF>, - CMP_CONDN_DESC_BASE<"sf", Typestr, FGROpnd>, - ISA_MIPS32R6; - def CMP_NGLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_NGLE>, - CMP_CONDN_DESC_BASE<"ngle", Typestr, FGROpnd>, - ISA_MIPS32R6; def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SEQ>, CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, ISA_MIPS32R6; - def CMP_NGL_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_NGL>, - CMP_CONDN_DESC_BASE<"ngl", Typestr, FGROpnd>, - ISA_MIPS32R6; - def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>, - CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd>, - ISA_MIPS32R6; - def CMP_NGE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_NGE>, - CMP_CONDN_DESC_BASE<"nge", Typestr, FGROpnd>, + def CMP_SUEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUEQ>, + CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_SLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLT>, + CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>, ISA_MIPS32R6; - def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>, - CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd>, - ISA_MIPS32R6; - def CMP_NGT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_NGT>, - CMP_CONDN_DESC_BASE<"ngt", Typestr, FGROpnd>, + def CMP_SULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULT>, + CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_SLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLE>, + CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>, ISA_MIPS32R6; + def CMP_SULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULE>, + CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>, + ISA_MIPS32R6; } //===----------------------------------------------------------------------===// @@ -241,16 +242,17 @@ multiclass CMP_CC_M <FIELD_CMP_FORMAT Format, string 
Typestr, // //===----------------------------------------------------------------------===// -class PCREL19_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { +class PCREL_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, + Operand ImmOpnd> { dag OutOperandList = (outs GPROpnd:$rs); - dag InOperandList = (ins simm19_lsl2:$imm); + dag InOperandList = (ins ImmOpnd:$imm); string AsmString = !strconcat(instr_asm, "\t$rs, $imm"); list<dag> Pattern = []; } -class ADDIUPC_DESC : PCREL19_DESC_BASE<"addiupc", GPR32Opnd>; -class LWPC_DESC: PCREL19_DESC_BASE<"lwpc", GPR32Opnd>; -class LWUPC_DESC: PCREL19_DESC_BASE<"lwupc", GPR32Opnd>; +class ADDIUPC_DESC : PCREL_DESC_BASE<"addiupc", GPR32Opnd, simm19_lsl2>; +class LWPC_DESC: PCREL_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2>; +class LWUPC_DESC: PCREL_DESC_BASE<"lwupc", GPR32Opnd, simm19_lsl2>; class ALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, Operand ImmOpnd> { @@ -318,15 +320,26 @@ class CMP_CBR_RT_Z_DESC_BASE<string instr_asm, DAGOperand opnd, list<Register> Defs = [AT]; } +class BAL_DESC : BC_DESC_BASE<"bal", brtarget> { + bit isCall = 1; + bit hasDelaySlot = 1; + list<Register> Defs = [RA]; +} + class BALC_DESC : BC_DESC_BASE<"balc", brtarget26> { bit isCall = 1; list<Register> Defs = [RA]; } class BC_DESC : BC_DESC_BASE<"bc", brtarget26>; +class BGEC_DESC : CMP_BC_DESC_BASE<"bgec", brtarget, GPR32Opnd>; +class BGEUC_DESC : CMP_BC_DESC_BASE<"bgeuc", brtarget, GPR32Opnd>; class BEQC_DESC : CMP_BC_DESC_BASE<"beqc", brtarget, GPR32Opnd>; class BNEC_DESC : CMP_BC_DESC_BASE<"bnec", brtarget, GPR32Opnd>; +class BLTC_DESC : CMP_BC_DESC_BASE<"bltc", brtarget, GPR32Opnd>; +class BLTUC_DESC : CMP_BC_DESC_BASE<"bltuc", brtarget, GPR32Opnd>; + class BLTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzc", brtarget, GPR32Opnd>; class BGEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezc", brtarget, GPR32Opnd>; @@ -380,6 +393,14 @@ class JIC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, GPR32Opnd> { list<Register> Defs = [AT]; } +class JR_HB_R6_DESC : JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> { + bit isBranch = 1; + bit isIndirectBranch = 1; + bit hasDelaySlot = 1; + bit isTerminator=1; + bit isBarrier=1; +} + class BITSWAP_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { dag OutOperandList = (outs GPROpnd:$rd); dag InOperandList = (ins GPROpnd:$rt); @@ -389,17 +410,22 @@ class BITSWAP_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { class BITSWAP_DESC : BITSWAP_DESC_BASE<"bitswap", GPR32Opnd>; -class DIVMOD_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { +class DIVMOD_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, + SDPatternOperator Op=null_frag> { dag OutOperandList = (outs GPROpnd:$rd); dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt); string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); - list<dag> Pattern = []; + list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))]; + + // This instruction doesn't trap division by zero itself. We must insert + // teq instructions as well. 
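// [Editor's aside, not part of the patch: with usesCustomInserter set below,
//  lowering emits an explicit divide-by-zero trap after the R6 divide,
//  roughly:
//    div  $v0, $a0, $a1
//    teq  $a1, $zero, 7   # trap if the divisor is zero (break code 7)
//  Treat the exact sequence as illustrative.]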
+ bit usesCustomInserter = 1; } -class DIV_DESC : DIVMOD_DESC_BASE<"div", GPR32Opnd>; -class DIVU_DESC : DIVMOD_DESC_BASE<"divu", GPR32Opnd>; -class MOD_DESC : DIVMOD_DESC_BASE<"mod", GPR32Opnd>; -class MODU_DESC : DIVMOD_DESC_BASE<"modu", GPR32Opnd>; +class DIV_DESC : DIVMOD_DESC_BASE<"div", GPR32Opnd, sdiv>; +class DIVU_DESC : DIVMOD_DESC_BASE<"divu", GPR32Opnd, udiv>; +class MOD_DESC : DIVMOD_DESC_BASE<"mod", GPR32Opnd, srem>; +class MODU_DESC : DIVMOD_DESC_BASE<"modu", GPR32Opnd, urem>; class BEQZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"beqzalc", brtarget, GPR32Opnd> { list<Register> Defs = [RA]; @@ -424,28 +450,35 @@ class BLTZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzalc", brtarget, GPR32Opnd> { class BNEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bnezalc", brtarget, GPR32Opnd> { list<Register> Defs = [RA]; } -class MUL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + +class MUL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, + SDPatternOperator Op=null_frag> { dag OutOperandList = (outs GPROpnd:$rd); dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt); string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); - list<dag> Pattern = []; + list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))]; } -class MUH_DESC : MUL_R6_DESC_BASE<"muh", GPR32Opnd>; -class MUHU_DESC : MUL_R6_DESC_BASE<"muhu", GPR32Opnd>; -class MUL_R6_DESC : MUL_R6_DESC_BASE<"mul", GPR32Opnd>; +class MUH_DESC : MUL_R6_DESC_BASE<"muh", GPR32Opnd, mulhs>; +class MUHU_DESC : MUL_R6_DESC_BASE<"muhu", GPR32Opnd, mulhu>; +class MUL_R6_DESC : MUL_R6_DESC_BASE<"mul", GPR32Opnd, mul>; class MULU_DESC : MUL_R6_DESC_BASE<"mulu", GPR32Opnd>; -class COP1_4R_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> { +class COP1_SEL_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> { dag OutOperandList = (outs FGROpnd:$fd); - dag InOperandList = (ins FGROpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft); + dag InOperandList = (ins FGRCCOpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft); string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft"); - list<dag> Pattern = []; + list<dag> Pattern = [(set FGROpnd:$fd, (select FGRCCOpnd:$fd_in, + FGROpnd:$ft, + FGROpnd:$fs))]; string Constraints = "$fd_in = $fd"; } -class SEL_D_DESC : COP1_4R_DESC_BASE<"sel.d", FGR64Opnd>; -class SEL_S_DESC : COP1_4R_DESC_BASE<"sel.s", FGR32Opnd>; +class SEL_D_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd> { + // We must insert a SUBREG_TO_REG around $fd_in + bit usesCustomInserter = 1; +} +class SEL_S_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd>; class SELEQNE_Z_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { dag OutOperandList = (outs GPROpnd:$rd); @@ -457,6 +490,14 @@ class SELEQNE_Z_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { class SELEQZ_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR32Opnd>; class SELNEZ_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR32Opnd>; +class COP1_4R_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> { + dag OutOperandList = (outs FGROpnd:$fd); + dag InOperandList = (ins FGROpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft); + string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft"); + list<dag> Pattern = []; + string Constraints = "$fd_in = $fd"; +} + class MADDF_S_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd>; class MADDF_D_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd>; class MSUBF_S_DESC : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd>; @@ -503,6 +544,96 @@ class RINT_D_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>; class CLASS_S_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>; class CLASS_D_DESC : CLASS_RINT_DESC_BASE<"class.d", 
FGR64Opnd>; +class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd, + RegisterOperand GPROpnd> { + dag OutOperandList = (outs); + dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint); + string AsmString = !strconcat(instr_asm, "\t$hint, $addr"); + list<dag> Pattern = []; +} + +class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, GPR32Opnd>; +class PREF_DESC : CACHE_HINT_DESC<"pref", mem_simm9, GPR32Opnd>; + +class COP2LD_DESC_BASE<string instr_asm, RegisterOperand COPOpnd> { + dag OutOperandList = (outs COPOpnd:$rt); + dag InOperandList = (ins mem_simm11:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + bit mayLoad = 1; +} + +class LDC2_R6_DESC : COP2LD_DESC_BASE<"ldc2", COP2Opnd>; +class LWC2_R6_DESC : COP2LD_DESC_BASE<"lwc2", COP2Opnd>; + +class COP2ST_DESC_BASE<string instr_asm, RegisterOperand COPOpnd> { + dag OutOperandList = (outs); + dag InOperandList = (ins COPOpnd:$rt, mem_simm11:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + bit mayStore = 1; +} + +class SDC2_R6_DESC : COP2ST_DESC_BASE<"sdc2", COP2Opnd>; +class SWC2_R6_DESC : COP2ST_DESC_BASE<"swc2", COP2Opnd>; + +class LSA_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, + Operand ImmOpnd> { + dag OutOperandList = (outs GPROpnd:$rd); + dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$imm2); + string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $imm2"); + list<dag> Pattern = []; +} + +class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2>; + +class LL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + bit mayLoad = 1; +} + +class LL_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd>; + +class SC_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$dst); + dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + bit mayStore = 1; + string Constraints = "$rt = $dst"; +} + +class SC_R6_DESC : SC_R6_DESC_BASE<"sc", GPR32Opnd>; + +class CLO_CLZ_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$rd); + dag InOperandList = (ins GPROpnd:$rs); + string AsmString = !strconcat(instr_asm, "\t$rd, $rs"); +} + +class CLO_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> : + CLO_CLZ_R6_DESC_BASE<instr_asm, GPROpnd> { + list<dag> Pattern = [(set GPROpnd:$rd, (ctlz (not GPROpnd:$rs)))]; +} + +class CLZ_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> : + CLO_CLZ_R6_DESC_BASE<instr_asm, GPROpnd> { + list<dag> Pattern = [(set GPROpnd:$rd, (ctlz GPROpnd:$rs))]; +} + +class CLO_R6_DESC : CLO_R6_DESC_BASE<"clo", GPR32Opnd>; +class CLZ_R6_DESC : CLZ_R6_DESC_BASE<"clz", GPR32Opnd>; + +class SDBBP_R6_DESC { + dag OutOperandList = (outs); + dag InOperandList = (ins uimm20:$code_); + string AsmString = "sdbbp\t$code_"; + list<dag> Pattern = []; +} + //===----------------------------------------------------------------------===// // // Instruction Definitions @@ -514,6 +645,7 @@ def ALIGN : ALIGN_ENC, ALIGN_DESC, ISA_MIPS32R6; def ALUIPC : ALUIPC_ENC, ALUIPC_DESC, ISA_MIPS32R6; def AUI : AUI_ENC, AUI_DESC, ISA_MIPS32R6; def AUIPC : AUIPC_ENC, AUIPC_DESC, ISA_MIPS32R6; +def BAL : BAL_ENC, BAL_DESC, ISA_MIPS32R6; def BALC : BALC_ENC, BALC_DESC, ISA_MIPS32R6; def BC1EQZ : 
BC1EQZ_ENC, BC1EQZ_DESC, ISA_MIPS32R6; def BC1NEZ : BC1NEZ_ENC, BC1NEZ_DESC, ISA_MIPS32R6; @@ -523,8 +655,8 @@ def BC : BC_ENC, BC_DESC, ISA_MIPS32R6; def BEQC : BEQC_ENC, BEQC_DESC, ISA_MIPS32R6; def BEQZALC : BEQZALC_ENC, BEQZALC_DESC, ISA_MIPS32R6; def BEQZC : BEQZC_ENC, BEQZC_DESC, ISA_MIPS32R6; -def BGEC; // Also aliased to blec with operands swapped -def BGEUC; // Also aliased to bleuc with operands swapped +def BGEC : BGEC_ENC, BGEC_DESC, ISA_MIPS32R6; +def BGEUC : BGEUC_ENC, BGEUC_DESC, ISA_MIPS32R6; def BGEZALC : BGEZALC_ENC, BGEZALC_DESC, ISA_MIPS32R6; def BGEZC : BGEZC_ENC, BGEZC_DESC, ISA_MIPS32R6; def BGTZALC : BGTZALC_ENC, BGTZALC_DESC, ISA_MIPS32R6; @@ -532,8 +664,8 @@ def BGTZC : BGTZC_ENC, BGTZC_DESC, ISA_MIPS32R6; def BITSWAP : BITSWAP_ENC, BITSWAP_DESC, ISA_MIPS32R6; def BLEZALC : BLEZALC_ENC, BLEZALC_DESC, ISA_MIPS32R6; def BLEZC : BLEZC_ENC, BLEZC_DESC, ISA_MIPS32R6; -def BLTC; // Also aliased to bgtc with operands swapped -def BLTUC; // Also aliased to bgtuc with operands swapped +def BLTC : BLTC_ENC, BLTC_DESC, ISA_MIPS32R6; +def BLTUC : BLTUC_ENC, BLTUC_DESC, ISA_MIPS32R6; def BLTZALC : BLTZALC_ENC, BLTZALC_DESC, ISA_MIPS32R6; def BLTZC : BLTZC_ENC, BLTZC_DESC, ISA_MIPS32R6; def BNEC : BNEC_ENC, BNEC_DESC, ISA_MIPS32R6; @@ -541,15 +673,22 @@ def BNEZALC : BNEZALC_ENC, BNEZALC_DESC, ISA_MIPS32R6; def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6; def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6; def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6; +def CACHE_R6 : CACHE_ENC, CACHE_DESC, ISA_MIPS32R6; def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6; def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6; +def CLO_R6 : CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6; +def CLZ_R6 : CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6; defm S : CMP_CC_M<FIELD_CMP_FORMAT_S, "s", FGR32Opnd>; defm D : CMP_CC_M<FIELD_CMP_FORMAT_D, "d", FGR64Opnd>; def DIV : DIV_ENC, DIV_DESC, ISA_MIPS32R6; def DIVU : DIVU_ENC, DIVU_DESC, ISA_MIPS32R6; def JIALC : JIALC_ENC, JIALC_DESC, ISA_MIPS32R6; def JIC : JIC_ENC, JIC_DESC, ISA_MIPS32R6; -// def LSA; // See MSA +def JR_HB_R6 : JR_HB_R6_ENC, JR_HB_R6_DESC, ISA_MIPS32R6; +def LDC2_R6 : LDC2_R6_ENC, LDC2_R6_DESC, ISA_MIPS32R6; +def LL_R6 : LL_R6_ENC, LL_R6_DESC, ISA_MIPS32R6; +def LSA_R6 : LSA_R6_ENC, LSA_R6_DESC, ISA_MIPS32R6; +def LWC2_R6 : LWC2_R6_ENC, LWC2_R6_DESC, ISA_MIPS32R6; def LWPC : LWPC_ENC, LWPC_DESC, ISA_MIPS32R6; def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6; def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6; @@ -571,13 +710,115 @@ def MUHU : MUHU_ENC, MUHU_DESC, ISA_MIPS32R6; def MUL_R6 : MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6; def MULU : MULU_ENC, MULU_DESC, ISA_MIPS32R6; def NAL; // BAL with rd=0 +def PREF_R6 : PREF_ENC, PREF_DESC, ISA_MIPS32R6; def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6; def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6; -def SELEQZ : SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6; +def SC_R6 : SC_R6_ENC, SC_R6_DESC, ISA_MIPS32R6; +def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6; +def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6; +def SELEQZ : SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32; def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6; def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6; -def SELNEZ : SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6; +def SELNEZ : SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32; def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6; def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6; def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6; def SEL_S : SEL_S_ENC, SEL_S_DESC, 
ISA_MIPS32R6;
+def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Aliases
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsInstAlias<"sdbbp", (SDBBP_R6 0)>, ISA_MIPS32R6;
+def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6;
+
+//===----------------------------------------------------------------------===//
+//
+// Patterns and Pseudo Instructions
+//
+//===----------------------------------------------------------------------===//
+
+// f32 comparisons supported via another comparison
+def : MipsPat<(setone f32:$lhs, f32:$rhs),
+              (NOR (CMP_UEQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(seto f32:$lhs, f32:$rhs),
+              (NOR (CMP_UN_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(setune f32:$lhs, f32:$rhs),
+              (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(seteq f32:$lhs, f32:$rhs), (CMP_EQ_S f32:$lhs, f32:$rhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setgt f32:$lhs, f32:$rhs), (CMP_LE_S f32:$rhs, f32:$lhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setge f32:$lhs, f32:$rhs), (CMP_LT_S f32:$rhs, f32:$lhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LT_S f32:$lhs, f32:$rhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setle f32:$lhs, f32:$rhs), (CMP_LE_S f32:$lhs, f32:$rhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setne f32:$lhs, f32:$rhs),
+              (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
+
+// f64 comparisons supported via another comparison
+def : MipsPat<(setone f64:$lhs, f64:$rhs),
+              (NOR (CMP_UEQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(seto f64:$lhs, f64:$rhs),
+              (NOR (CMP_UN_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(setune f64:$lhs, f64:$rhs),
+              (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(seteq f64:$lhs, f64:$rhs), (CMP_EQ_D f64:$lhs, f64:$rhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setgt f64:$lhs, f64:$rhs), (CMP_LE_D f64:$rhs, f64:$lhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setge f64:$lhs, f64:$rhs), (CMP_LT_D f64:$rhs, f64:$lhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LT_D f64:$lhs, f64:$rhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setle f64:$lhs, f64:$rhs), (CMP_LE_D f64:$lhs, f64:$rhs)>,
+      ISA_MIPS32R6;
+def : MipsPat<(setne f64:$lhs, f64:$rhs),
+              (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
+
+// i32 selects
+def : MipsPat<(select i32:$cond, i32:$t, i32:$f),
+              (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>,
+              ISA_MIPS32R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, i32:$f),
+              (OR (SELEQZ i32:$t, i32:$cond), (SELNEZ i32:$f, i32:$cond))>,
+              ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, i32:$f),
+              (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>,
+              ISA_MIPS32R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i32:$t, i32:$f),
+              (OR (SELEQZ i32:$t, (XORi i32:$cond, immZExt16:$imm)),
+                  (SELNEZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>,
+              ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i32:$t, i32:$f),
+              (OR (SELNEZ i32:$t, (XORi i32:$cond, immZExt16:$imm)),
+                  (SELEQZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>,
+              ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setgt i32:$cond, immSExt16Plus1:$imm)), i32:$t,
+                      i32:$f),
+              (OR (SELEQZ i32:$t, (SLTi i32:$cond, (Plus1 imm:$imm))),
+                  (SELNEZ i32:$f, (SLTi i32:$cond, (Plus1 imm:$imm))))>,
+              ISA_MIPS32R6;
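+// (The immediate-compare patterns above reduce the comparison to the zero
+//  test the sel instructions need: "xori $c, imm" is zero exactly when
+//  $c == imm, and "slti $c, imm + 1" is zero exactly when $c > imm; the
+//  setugt variant below does the same with sltiu.)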
+def : MipsPat<(select (i32 (setugt i32:$cond, immSExt16Plus1:$imm)), + i32:$t, i32:$f), + (OR (SELEQZ i32:$t, (SLTiu i32:$cond, (Plus1 imm:$imm))), + (SELNEZ i32:$f, (SLTiu i32:$cond, (Plus1 imm:$imm))))>, + ISA_MIPS32R6; + +def : MipsPat<(select i32:$cond, i32:$t, immz), + (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6; +def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, immz), + (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6; +def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, immz), + (SELEQZ i32:$t, i32:$cond)>, ISA_MIPS32R6; +def : MipsPat<(select i32:$cond, immz, i32:$f), + (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6; +def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i32:$f), + (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6; +def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i32:$f), + (SELNEZ i32:$f, i32:$cond)>, ISA_MIPS32R6; diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 924b32529b2d..f0b6814e37cc 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -23,6 +23,8 @@ def uimm16_64 : Operand<i64> { // Signed Operand def simm10_64 : Operand<i64>; +def imm64: Operand<i64>; + // Transformation Function - get Imm - 32. def Subtract32 : SDNodeXForm<imm, [{ return getImm(N, (unsigned)N->getZExtValue() - 32); @@ -36,6 +38,9 @@ def immZExt6 : ImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>; def immSExt10_64 : PatLeaf<(i64 imm), [{ return isInt<10>(N->getSExtValue()); }]>; +def immZExt16_64 : PatLeaf<(i64 imm), + [{ return isInt<16>(N->getZExtValue()); }]>; + //===----------------------------------------------------------------------===// // Instructions specific format //===----------------------------------------------------------------------===// @@ -62,7 +67,7 @@ let isPseudo = 1, isCodeGenOnly = 1 in { let DecoderNamespace = "Mips64" in { /// Arithmetic Instructions (ALU Immediate) def DADDi : ArithLogicI<"daddi", simm16_64, GPR64Opnd>, ADDI_FM<0x18>, - ISA_MIPS3; + ISA_MIPS3_NOT_32R6_64R6; def DADDiu : ArithLogicI<"daddiu", simm16_64, GPR64Opnd, II_DADDIU, immSExt16, add>, ADDI_FM<0x19>, IsAsCheapAsAMove, ISA_MIPS3; @@ -164,49 +169,58 @@ def SDR : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, II_SDR>, LW_FM<0x2d>, ISA_MIPS3_NOT_32R6_64R6; /// Load-linked, Store-conditional -def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>, ISA_MIPS3; -def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3; +def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>, ISA_MIPS3_NOT_32R6_64R6; +def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3_NOT_32R6_64R6; /// Jump and Branch Instructions let isCodeGenOnly = 1 in { -def JR64 : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>; -def BEQ64 : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>; -def BNE64 : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>; -def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>; -def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>; -def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>; -def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>; -def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM; -def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>; -def TAILCALL64_R : TailCallReg<GPR64Opnd, JR, GPR32Opnd>; + def JR64 : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>; + def BEQ64 : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>; + def BNE64 : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>; + def BGEZ64 : 
CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>; + def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>; + def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>; + def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>; + def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM; + def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>; + def TAILCALL64_R : TailCallReg<GPR64Opnd, JR, GPR32Opnd>; } +def PseudoReturn64 : PseudoReturnBase<GPR64Opnd>; +def PseudoIndirectBranch64 : PseudoIndirectBranchBase<GPR64Opnd>; + /// Multiply and Divide Instructions. def DMULT : Mult<"dmult", II_DMULT, GPR64Opnd, [HI0_64, LO0_64]>, - MULT_FM<0, 0x1c>, ISA_MIPS3; + MULT_FM<0, 0x1c>, ISA_MIPS3_NOT_32R6_64R6; def DMULTu : Mult<"dmultu", II_DMULTU, GPR64Opnd, [HI0_64, LO0_64]>, - MULT_FM<0, 0x1d>, ISA_MIPS3; + MULT_FM<0, 0x1d>, ISA_MIPS3_NOT_32R6_64R6; def PseudoDMULT : MultDivPseudo<DMULT, ACC128, GPR64Opnd, MipsMult, - II_DMULT>; + II_DMULT>, ISA_MIPS3_NOT_32R6_64R6; def PseudoDMULTu : MultDivPseudo<DMULTu, ACC128, GPR64Opnd, MipsMultu, - II_DMULTU>; + II_DMULTU>, ISA_MIPS3_NOT_32R6_64R6; def DSDIV : Div<"ddiv", II_DDIV, GPR64Opnd, [HI0_64, LO0_64]>, - MULT_FM<0, 0x1e>, ISA_MIPS3; + MULT_FM<0, 0x1e>, ISA_MIPS3_NOT_32R6_64R6; def DUDIV : Div<"ddivu", II_DDIVU, GPR64Opnd, [HI0_64, LO0_64]>, - MULT_FM<0, 0x1f>, ISA_MIPS3; + MULT_FM<0, 0x1f>, ISA_MIPS3_NOT_32R6_64R6; def PseudoDSDIV : MultDivPseudo<DSDIV, ACC128, GPR64Opnd, MipsDivRem, - II_DDIV, 0, 1, 1>; + II_DDIV, 0, 1, 1>, ISA_MIPS3_NOT_32R6_64R6; def PseudoDUDIV : MultDivPseudo<DUDIV, ACC128, GPR64Opnd, MipsDivRemU, - II_DDIVU, 0, 1, 1>; + II_DDIVU, 0, 1, 1>, ISA_MIPS3_NOT_32R6_64R6; let isCodeGenOnly = 1 in { -def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI0_64]>, MTLO_FM<0x11>; -def MTLO64 : MoveToLOHI<"mtlo", GPR64Opnd, [LO0_64]>, MTLO_FM<0x13>; -def MFHI64 : MoveFromLOHI<"mfhi", GPR64Opnd, AC0_64>, MFLO_FM<0x10>; -def MFLO64 : MoveFromLOHI<"mflo", GPR64Opnd, AC0_64>, MFLO_FM<0x12>; -def PseudoMFHI64 : PseudoMFLOHI<GPR64, ACC128, MipsMFHI>; -def PseudoMFLO64 : PseudoMFLOHI<GPR64, ACC128, MipsMFLO>; -def PseudoMTLOHI64 : PseudoMTLOHI<ACC128, GPR64>; +def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI0_64]>, MTLO_FM<0x11>, + ISA_MIPS3_NOT_32R6_64R6; +def MTLO64 : MoveToLOHI<"mtlo", GPR64Opnd, [LO0_64]>, MTLO_FM<0x13>, + ISA_MIPS3_NOT_32R6_64R6; +def MFHI64 : MoveFromLOHI<"mfhi", GPR64Opnd, AC0_64>, MFLO_FM<0x10>, + ISA_MIPS3_NOT_32R6_64R6; +def MFLO64 : MoveFromLOHI<"mflo", GPR64Opnd, AC0_64>, MFLO_FM<0x12>, + ISA_MIPS3_NOT_32R6_64R6; +def PseudoMFHI64 : PseudoMFLOHI<GPR64, ACC128, MipsMFHI>, + ISA_MIPS3_NOT_32R6_64R6; +def PseudoMFLO64 : PseudoMFLOHI<GPR64, ACC128, MipsMFLO>, + ISA_MIPS3_NOT_32R6_64R6; +def PseudoMTLOHI64 : PseudoMTLOHI<ACC128, GPR64>, ISA_MIPS3_NOT_32R6_64R6; /// Sign Ext In Register Instructions. 
def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd, II_SEB>, SEB_FM<0x10, 0x20>, @@ -216,8 +230,8 @@ def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd, II_SEH>, SEB_FM<0x18, 0x20>, } /// Count Leading -def DCLZ : CountLeading0<"dclz", GPR64Opnd>, CLO_FM<0x24>, ISA_MIPS64; -def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>, ISA_MIPS64; +def DCLZ : CountLeading0<"dclz", GPR64Opnd>, CLO_FM<0x24>, ISA_MIPS64_NOT_64R6; +def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>, ISA_MIPS64_NOT_64R6; /// Double Word Swap Bytes/HalfWords def DSBH : SubwordSwap<"dsbh", GPR64Opnd>, SEB_FM<2, 0x24>, ISA_MIPS64R2; @@ -431,13 +445,13 @@ def : MipsInstAlias<"daddu $rs, $rt, $imm", 0>; def : MipsInstAlias<"dadd $rs, $rt, $imm", (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm), - 0>; + 0>, ISA_MIPS3_NOT_32R6_64R6; def : MipsInstAlias<"daddu $rs, $imm", (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm), 0>; def : MipsInstAlias<"dadd $rs, $imm", (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm), - 0>; + 0>, ISA_MIPS3_NOT_32R6_64R6; def : MipsInstAlias<"add $rs, $imm", (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>; @@ -450,10 +464,22 @@ def : MipsInstAlias<"dsll $rd, $rt, $rs", def : MipsInstAlias<"dsubu $rt, $rs, $imm", (DADDiu GPR64Opnd:$rt, GPR64Opnd:$rs, InvertedImOperand64:$imm), 0>; +def : MipsInstAlias<"dsubi $rs, $rt, $imm", + (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, + InvertedImOperand64:$imm), + 0>, ISA_MIPS3_NOT_32R6_64R6; +def : MipsInstAlias<"dsubi $rs, $imm", + (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, + InvertedImOperand64:$imm), + 0>, ISA_MIPS3_NOT_32R6_64R6; +def : MipsInstAlias<"dsub $rs, $rt, $imm", + (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, + InvertedImOperand64:$imm), + 0>, ISA_MIPS3_NOT_32R6_64R6; def : MipsInstAlias<"dsub $rs, $imm", (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, InvertedImOperand64:$imm), - 0>; + 0>, ISA_MIPS3_NOT_32R6_64R6; def : MipsInstAlias<"dsubu $rs, $imm", (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, InvertedImOperand64:$imm), @@ -465,6 +491,11 @@ def : MipsInstAlias<"dsrl $rd, $rt, $rs", (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS3; +class LoadImm64< string instr_asm, Operand Od, RegisterOperand RO> : + MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm64), + !strconcat(instr_asm, "\t$rt, $imm64")> ; +def LoadImm64Reg : LoadImm64<"dli", imm64, GPR64Opnd>; + /// Move between CPU and coprocessor registers let DecoderNamespace = "Mips64", Predicates = [HasMips64] in { def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd>, MFC3OP_FM<0x10, 1>; diff --git a/lib/Target/Mips/Mips64r6InstrInfo.td b/lib/Target/Mips/Mips64r6InstrInfo.td index f971218779d9..63cf60b44cdc 100644 --- a/lib/Target/Mips/Mips64r6InstrInfo.td +++ b/lib/Target/Mips/Mips64r6InstrInfo.td @@ -13,10 +13,6 @@ // Notes about removals/changes from MIPS32r6: // Reencoded: dclo, dclz -// Reencoded: lld, scd -// Removed: daddi -// Removed: ddiv, ddivu, dmult, dmultu -// Removed: div, divu //===----------------------------------------------------------------------===// // @@ -29,14 +25,20 @@ class DAUI_ENC : DAUI_FM; class DAHI_ENC : REGIMM_FM<OPCODE5_DAHI>; class DATI_ENC : REGIMM_FM<OPCODE5_DATI>; class DBITSWAP_ENC : SPECIAL3_2R_FM<OPCODE6_DBITSWAP>; +class DCLO_R6_ENC : SPECIAL_2R_FM<OPCODE6_DCLO>; +class DCLZ_R6_ENC : SPECIAL_2R_FM<OPCODE6_DCLZ>; class DDIV_ENC : SPECIAL_3R_FM<0b00010, 0b011110>; class DDIVU_ENC : SPECIAL_3R_FM<0b00010, 0b011111>; +class DLSA_R6_ENC : SPECIAL_LSA_FM<OPCODE6_DLSA>; class DMOD_ENC : SPECIAL_3R_FM<0b00011, 0b011110>; class DMODU_ENC : SPECIAL_3R_FM<0b00011, 0b011111>; -class 
DMUH_ENC : SPECIAL_3R_FM<0b00011, 0b111000>; -class DMUHU_ENC : SPECIAL_3R_FM<0b00011, 0b111001>; -class DMUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b111000>; -class DMULU_ENC : SPECIAL_3R_FM<0b00010, 0b111001>; +class DMUH_ENC : SPECIAL_3R_FM<0b00011, 0b011100>; +class DMUHU_ENC : SPECIAL_3R_FM<0b00011, 0b011101>; +class DMUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b011100>; +class DMULU_ENC : SPECIAL_3R_FM<0b00010, 0b011101>; +class LDPC_ENC : PCREL18_FM<OPCODE3_LDPC>; +class LLD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_LLD>; +class SCD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_SCD>; //===----------------------------------------------------------------------===// // @@ -56,14 +58,22 @@ class DAHI_DESC : AHI_ATI_DESC_BASE<"dahi", GPR64Opnd>; class DATI_DESC : AHI_ATI_DESC_BASE<"dati", GPR64Opnd>; class DAUI_DESC : AUI_DESC_BASE<"daui", GPR64Opnd>; class DBITSWAP_DESC : BITSWAP_DESC_BASE<"dbitswap", GPR64Opnd>; -class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd>; -class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd>; -class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd>; -class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd>; -class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd>; -class DMUHU_DESC : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd>; -class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd>; +class DCLO_R6_DESC : CLO_R6_DESC_BASE<"dclo", GPR64Opnd>; +class DCLZ_R6_DESC : CLZ_R6_DESC_BASE<"dclz", GPR64Opnd>; +class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd, sdiv>; +class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd, udiv>; +class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2>; +class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd, srem>; +class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd, urem>; +class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd, mulhs>; +class DMUHU_DESC : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd, mulhu>; +class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd, mul>; class DMULU_DESC : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd>; +class LDPC_DESC : PCREL_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3>; +class LLD_R6_DESC : LL_R6_DESC_BASE<"lld", GPR64Opnd>; +class SCD_R6_DESC : SC_R6_DESC_BASE<"scd", GPR64Opnd>; +class SELEQZ64_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR64Opnd>; +class SELNEZ64_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR64Opnd>; //===----------------------------------------------------------------------===// // @@ -76,13 +86,132 @@ def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6; def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6; def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6; def DBITSWAP : DBITSWAP_ENC, DBITSWAP_DESC, ISA_MIPS64R6; +def DCLO_R6 : DCLO_R6_ENC, DCLO_R6_DESC, ISA_MIPS64R6; +def DCLZ_R6 : DCLZ_R6_ENC, DCLZ_R6_DESC, ISA_MIPS64R6; def DDIV : DDIV_ENC, DDIV_DESC, ISA_MIPS64R6; def DDIVU : DDIVU_ENC, DDIVU_DESC, ISA_MIPS64R6; -// def DLSA; // See MSA +def DLSA_R6 : DLSA_R6_ENC, DLSA_R6_DESC, ISA_MIPS64R6; def DMOD : DMOD_ENC, DMOD_DESC, ISA_MIPS64R6; def DMODU : DMODU_ENC, DMODU_DESC, ISA_MIPS64R6; def DMUH: DMUH_ENC, DMUH_DESC, ISA_MIPS64R6; def DMUHU: DMUHU_ENC, DMUHU_DESC, ISA_MIPS64R6; def DMUL_R6: DMUL_R6_ENC, DMUL_R6_DESC, ISA_MIPS64R6; def DMULU: DMULU_ENC, DMULU_DESC, ISA_MIPS64R6; -def LDPC; +def LDPC: LDPC_ENC, LDPC_DESC, ISA_MIPS64R6; +def LLD_R6 : LLD_R6_ENC, LLD_R6_DESC, ISA_MIPS32R6; +def SCD_R6 : SCD_R6_ENC, SCD_R6_DESC, ISA_MIPS32R6; +let DecoderNamespace = "Mips32r6_64r6_GP64" in { + def SELEQZ64 : SELEQZ_ENC, SELEQZ64_DESC, ISA_MIPS32R6, GPR_64; + def SELNEZ64 : SELNEZ_ENC, SELNEZ64_DESC, ISA_MIPS32R6, GPR_64; +} + 
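A compact model of the select lowering used by the i32 patterns earlier and the i64 patterns that follow may help here; this is plain C++ mimicking the R6 selnez/seleqz semantics (per the MIPS manuals), not LLVM API, and the function names are invented for the sketch:

    #include <cstdint>

    // R6 semantics: selnez rd, rs, rt  =>  rd = (rt != 0) ? rs : 0
    //               seleqz rd, rs, rt  =>  rd = (rt != 0) ? 0  : rs
    uint64_t selnez(uint64_t rs, uint64_t rt) { return rt != 0 ? rs : 0; }
    uint64_t seleqz(uint64_t rs, uint64_t rt) { return rt != 0 ? 0 : rs; }

    // (select cond, t, f) as the MipsPat defs expand it: exactly one of the
    // two sel results is nonzero, so a plain OR merges them branch-free.
    uint64_t selectModel(uint64_t cond, uint64_t t, uint64_t f) {
      return selnez(t, cond) | seleqz(f, cond);
    }

The seteq/setne variants simply swap which operand goes through selnez, and the compare-with-immediate variants first fold the comparison into a zero test via xori or slti.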
+//===----------------------------------------------------------------------===// +// +// Instruction Aliases +// +//===----------------------------------------------------------------------===// + +def : MipsInstAlias<"jr $rs", (JALR64 ZERO_64, GPR64Opnd:$rs), 1>, ISA_MIPS64R6; + +//===----------------------------------------------------------------------===// +// +// Patterns and Pseudo Instructions +// +//===----------------------------------------------------------------------===// + +// i64 selects +def : MipsPat<(select i64:$cond, i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$t, i64:$cond), + (SELEQZ64 i64:$f, i64:$cond))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (seteq i64:$cond, immz)), i64:$t, i64:$f), + (OR64 (SELEQZ64 i64:$t, i64:$cond), + (SELNEZ64 i64:$f, i64:$cond))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (setne i64:$cond, immz)), i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$t, i64:$cond), + (SELEQZ64 i64:$f, i64:$cond))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (seteq i64:$cond, immZExt16_64:$imm)), i64:$t, i64:$f), + (OR64 (SELEQZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)), + (SELNEZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)))>, + ISA_MIPS64R6; +def : MipsPat<(select (i32 (setne i64:$cond, immZExt16_64:$imm)), i64:$t, i64:$f), + (OR64 (SELNEZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)), + (SELEQZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)))>, + ISA_MIPS64R6; +def : MipsPat< + (select (i32 (setgt i64:$cond, immSExt16Plus1:$imm)), i64:$t, i64:$f), + (OR64 (SELEQZ64 i64:$t, + (SUBREG_TO_REG (i64 0), (SLTi64 i64:$cond, (Plus1 imm:$imm)), + sub_32)), + (SELNEZ64 i64:$f, + (SUBREG_TO_REG (i64 0), (SLTi64 i64:$cond, (Plus1 imm:$imm)), + sub_32)))>, + ISA_MIPS64R6; +def : MipsPat< + (select (i32 (setugt i64:$cond, immSExt16Plus1:$imm)), i64:$t, i64:$f), + (OR64 (SELEQZ64 i64:$t, + (SUBREG_TO_REG (i64 0), (SLTiu64 i64:$cond, (Plus1 imm:$imm)), + sub_32)), + (SELNEZ64 i64:$f, + (SUBREG_TO_REG (i64 0), (SLTiu64 i64:$cond, (Plus1 imm:$imm)), + sub_32)))>, + ISA_MIPS64R6; + +def : MipsPat<(select (i32 (setne i64:$cond, immz)), i64:$t, immz), + (SELNEZ64 i64:$t, i64:$cond)>, ISA_MIPS64R6; +def : MipsPat<(select (i32 (seteq i64:$cond, immz)), i64:$t, immz), + (SELEQZ64 i64:$t, i64:$cond)>, ISA_MIPS64R6; +def : MipsPat<(select (i32 (setne i64:$cond, immz)), immz, i64:$f), + (SELEQZ64 i64:$f, i64:$cond)>, ISA_MIPS64R6; +def : MipsPat<(select (i32 (seteq i64:$cond, immz)), immz, i64:$f), + (SELNEZ64 i64:$f, i64:$cond)>, ISA_MIPS64R6; + +// i64 selects from an i32 comparison +// One complicating factor here is that bits 32-63 of an i32 are undefined. +// FIXME: Ideally, setcc would always produce an i64 on MIPS64 targets. +// This would allow us to remove the sign-extensions here. 
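+// (SLL64_32 below expands to "sll ..., 0"; on MIPS64, 32-bit ALU results are
+// sign-extended to 64 bits, so this gives the condition well-defined upper
+// bits before the 64-bit seleqz/selnez consume it.)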
+def : MipsPat<(select i32:$cond, i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, (SLL64_32 i32:$cond)),
+                    (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i64:$t, i64:$f),
+              (OR64 (SELEQZ64 i64:$t, (SLL64_32 i32:$cond)),
+                    (SELNEZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, (SLL64_32 i32:$cond)),
+                    (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i64:$t, i64:$f),
+              (OR64 (SELEQZ64 i64:$t, (SLL64_32 (XORi i32:$cond,
+                                                      immZExt16:$imm))),
+                    (SELNEZ64 i64:$f, (SLL64_32 (XORi i32:$cond,
+                                                      immZExt16:$imm))))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, (SLL64_32 (XORi i32:$cond,
+                                                      immZExt16:$imm))),
+                    (SELEQZ64 i64:$f, (SLL64_32 (XORi i32:$cond,
+                                                      immZExt16:$imm))))>,
+              ISA_MIPS64R6;
+
+def : MipsPat<(select i32:$cond, i64:$t, immz),
+              (SELNEZ64 i64:$t, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i64:$t, immz),
+              (SELNEZ64 i64:$t, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i64:$t, immz),
+              (SELEQZ64 i64:$t, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select i32:$cond, immz, i64:$f),
+              (SELEQZ64 i64:$f, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i64:$f),
+              (SELEQZ64 i64:$f, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i64:$f),
+              (SELNEZ64 i64:$f, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 6df90aa75a11..1fb75a290b98 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -91,7 +91,46 @@ bool MipsAsmPrinter::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) {
 #include "MipsGenMCPseudoLowering.inc"
 
+// Lower PseudoReturn/PseudoIndirectBranch/PseudoIndirectBranch64 to JR, JR_MM,
+// JALR, or JALR64 as appropriate for the target
+void MipsAsmPrinter::emitPseudoIndirectBranch(MCStreamer &OutStreamer,
+                                              const MachineInstr *MI) {
+  bool HasLinkReg = false;
+  MCInst TmpInst0;
+
+  if (Subtarget->hasMips64r6()) {
+    // MIPS64r6 should use (JALR64 ZERO_64, $rs)
+    TmpInst0.setOpcode(Mips::JALR64);
+    HasLinkReg = true;
+  } else if (Subtarget->hasMips32r6()) {
+    // MIPS32r6 should use (JALR ZERO, $rs)
+    TmpInst0.setOpcode(Mips::JALR);
+    HasLinkReg = true;
+  } else if (Subtarget->inMicroMipsMode())
+    // microMIPS should use (JR_MM $rs)
+    TmpInst0.setOpcode(Mips::JR_MM);
+  else {
+    // Everything else should use (JR $rs)
+    TmpInst0.setOpcode(Mips::JR);
+  }
+
+  MCOperand MCOp;
+
+  if (HasLinkReg) {
+    unsigned ZeroReg = Subtarget->isGP64bit() ?
Mips::ZERO_64 : Mips::ZERO; + TmpInst0.addOperand(MCOperand::CreateReg(ZeroReg)); + } + + lowerOperand(MI->getOperand(0), MCOp); + TmpInst0.addOperand(MCOp); + + EmitToStreamer(OutStreamer, TmpInst0); +} + void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { + MipsTargetStreamer &TS = getTargetStreamer(); + TS.setCanHaveModuleDir(false); + if (MI->isDebugValue()) { SmallString<128> Str; raw_svector_ostream OS(Str); @@ -141,6 +180,14 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (emitPseudoExpansionLowering(OutStreamer, &*I)) continue; + if (I->getOpcode() == Mips::PseudoReturn || + I->getOpcode() == Mips::PseudoReturn64 || + I->getOpcode() == Mips::PseudoIndirectBranch || + I->getOpcode() == Mips::PseudoIndirectBranch64) { + emitPseudoIndirectBranch(OutStreamer, &*I); + continue; + } + // The inMips16Mode() test is not permanent. // Some instructions are marked as pseudo right now which // would make the test fail for the wrong reason but @@ -657,6 +704,13 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { OutContext.getELFSection(".gcc_compiled_long64", ELF::SHT_PROGBITS, 0, SectionKind::getDataRel())); } + + getTargetStreamer().updateABIInfo(*Subtarget); + getTargetStreamer().emitDirectiveModuleFP(); + + if (Subtarget->isABI_O32()) + getTargetStreamer().emitDirectiveModuleOddSPReg(Subtarget->useOddSPReg(), + Subtarget->isABI_O32()); } void MipsAsmPrinter::EmitJal(MCSymbol *Symbol) { @@ -852,7 +906,7 @@ void MipsAsmPrinter::EmitFPCallStub( TS.emitDirectiveSetNoMicroMips(); // // .ent __call_stub_fp_xxxx - // .type __call_stub_fp_xxxx,@function + // .type __call_stub_fp_xxxx,@function // __call_stub_fp_xxxx: // std::string x = "__call_stub_fp_" + std::string(Symbol); diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h index e82b145925d7..967aa0b16416 100644 --- a/lib/Target/Mips/MipsAsmPrinter.h +++ b/lib/Target/Mips/MipsAsmPrinter.h @@ -40,6 +40,12 @@ private: bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, const MachineInstr *MI); + // Emit PseudoReturn, PseudoReturn64, PseudoIndirectBranch, + // and PseudoIndirectBranch64 as a JR, JR_MM, JALR, or JALR64 as appropriate + // for the target. + void emitPseudoIndirectBranch(MCStreamer &OutStreamer, + const MachineInstr *MI); + // lowerOperand - Convert a MachineOperand into the equivalent MCOperand. 
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp); diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td index c83d880cdb2f..007213c848f8 100644 --- a/lib/Target/Mips/MipsCallingConv.td +++ b/lib/Target/Mips/MipsCallingConv.td @@ -239,6 +239,11 @@ def RetCC_Mips : CallingConv<[ def CSR_SingleFloatOnly : CalleeSavedRegs<(add (sequence "F%u", 31, 20), RA, FP, (sequence "S%u", 7, 0))>; +def CSR_O32_FPXX : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP, + (sequence "S%u", 7, 0))> { + let OtherPreserved = (add (decimate (sequence "F%u", 30, 20), 2)); +} + def CSR_O32 : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP, (sequence "S%u", 7, 0))>; diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp index 13fa546b9eb2..151ef134e1da 100644 --- a/lib/Target/Mips/MipsCodeEmitter.cpp +++ b/lib/Target/Mips/MipsCodeEmitter.cpp @@ -124,6 +124,7 @@ private: unsigned getSizeInsEncoding(const MachineInstr &MI, unsigned OpNo) const; unsigned getLSAImmEncoding(const MachineInstr &MI, unsigned OpNo) const; unsigned getSimm19Lsl2Encoding(const MachineInstr &MI, unsigned OpNo) const; + unsigned getSimm18Lsl3Encoding(const MachineInstr &MI, unsigned OpNo) const; /// Expand pseudo instructions with accumulator register operands. void expandACCInstr(MachineBasicBlock::instr_iterator MI, @@ -273,6 +274,12 @@ unsigned MipsCodeEmitter::getLSAImmEncoding(const MachineInstr &MI, return 0; } +unsigned MipsCodeEmitter::getSimm18Lsl3Encoding(const MachineInstr &MI, + unsigned OpNo) const { + llvm_unreachable("Unimplemented function."); + return 0; +} + unsigned MipsCodeEmitter::getSimm19Lsl2Encoding(const MachineInstr &MI, unsigned OpNo) const { llvm_unreachable("Unimplemented function."); diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td index 7177f6544a39..690f62608504 100644 --- a/lib/Target/Mips/MipsCondMov.td +++ b/lib/Target/Mips/MipsCondMov.td @@ -104,136 +104,162 @@ multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst, // Instantiation of instructions. 
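// (movz/movn only write $rd when the condition holds, which is why their
// formats tie an extra input operand to $rd; R6 drops them in favour of the
// always-writing seleqz/selnez, hence the INSN_MIPS4_32_NOT_32R6_64R6
// predicates added throughout this hunk.)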
def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, II_MOVZ>, - ADD_FM<0, 0xa>, INSN_MIPS4_32; + ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6; let isCodeGenOnly = 1 in { def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>, - ADD_FM<0, 0xa>; + ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6; def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>, - ADD_FM<0, 0xa>; + ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6; def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, II_MOVZ>, - ADD_FM<0, 0xa>; + ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6; } def MOVN_I_I : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>, - ADD_FM<0, 0xb>, INSN_MIPS4_32; + ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6; let isCodeGenOnly = 1 in { def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>, - ADD_FM<0, 0xb>; + ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6; def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>, - ADD_FM<0, 0xb>; + ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6; def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, II_MOVN>, - ADD_FM<0, 0xb>; + ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6; } def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>, - CMov_I_F_FM<18, 16>, INSN_MIPS4_32; + CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6; let isCodeGenOnly = 1 in def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, II_MOVZ_S>, - CMov_I_F_FM<18, 16>, AdditionalRequires<[HasMips64]>; + CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6, + AdditionalRequires<[HasMips64]>; def MOVN_I_S : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, II_MOVN_S>, - CMov_I_F_FM<19, 16>, INSN_MIPS4_32; + CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6; let isCodeGenOnly = 1 in def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, II_MOVN_S>, - CMov_I_F_FM<19, 16>, AdditionalRequires<[IsGP64bit]>; + CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6, + AdditionalRequires<[IsGP64bit]>; def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd, II_MOVZ_D>, CMov_I_F_FM<18, 17>, - INSN_MIPS4_32, FGR_32; + INSN_MIPS4_32_NOT_32R6_64R6, FGR_32; def MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd, II_MOVN_D>, CMov_I_F_FM<19, 17>, - INSN_MIPS4_32, FGR_32; + INSN_MIPS4_32_NOT_32R6_64R6, FGR_32; let DecoderNamespace = "Mips64" in { def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, II_MOVZ_D>, - CMov_I_F_FM<18, 17>, INSN_MIPS4_32, FGR_64; + CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, II_MOVN_D>, - CMov_I_F_FM<19, 17>, INSN_MIPS4_32, FGR_64; + CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; let isCodeGenOnly = 1 in { - def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, - II_MOVZ_D>, CMov_I_F_FM<18, 17>, FGR_64; - def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, - II_MOVN_D>, CMov_I_F_FM<19, 17>, FGR_64; + def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, II_MOVZ_D>, + CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; + def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, II_MOVN_D>, + CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; } } def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>, - CMov_F_I_FM<1>, INSN_MIPS4_32; + CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6; let isCodeGenOnly = 1 in def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, II_MOVT, MipsCMovFP_T>, - CMov_F_I_FM<1>, AdditionalRequires<[IsGP64bit]>; + 
CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6, + AdditionalRequires<[IsGP64bit]>; def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>, - CMov_F_I_FM<0>, INSN_MIPS4_32; + CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6; let isCodeGenOnly = 1 in def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, II_MOVF, MipsCMovFP_F>, - CMov_F_I_FM<0>, AdditionalRequires<[IsGP64bit]>; + CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6, + AdditionalRequires<[IsGP64bit]>; def MOVT_S : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S, MipsCMovFP_T>, - CMov_F_F_FM<16, 1>, INSN_MIPS4_32; + CMov_F_F_FM<16, 1>, INSN_MIPS4_32_NOT_32R6_64R6; def MOVF_S : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S, MipsCMovFP_F>, - CMov_F_F_FM<16, 0>, INSN_MIPS4_32; + CMov_F_F_FM<16, 0>, INSN_MIPS4_32_NOT_32R6_64R6; def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D, MipsCMovFP_T>, CMov_F_F_FM<17, 1>, - INSN_MIPS4_32, FGR_32; + INSN_MIPS4_32_NOT_32R6_64R6, FGR_32; def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D, MipsCMovFP_F>, CMov_F_F_FM<17, 0>, - INSN_MIPS4_32, FGR_32; + INSN_MIPS4_32_NOT_32R6_64R6, FGR_32; let DecoderNamespace = "Mips64" in { def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, II_MOVT_D, MipsCMovFP_T>, - CMov_F_F_FM<17, 1>, INSN_MIPS4_32, FGR_64; + CMov_F_F_FM<17, 1>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, II_MOVF_D, MipsCMovFP_F>, - CMov_F_F_FM<17, 0>, INSN_MIPS4_32, FGR_64; + CMov_F_F_FM<17, 0>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; } // Instantiation of conditional move patterns. -defm : MovzPats0<GPR32, GPR32, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>; -defm : MovzPats1<GPR32, GPR32, MOVZ_I_I, XOR>; -defm : MovzPats2<GPR32, GPR32, MOVZ_I_I, XORi>; +defm : MovzPats0<GPR32, GPR32, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>, + INSN_MIPS4_32_NOT_32R6_64R6; +defm : MovzPats1<GPR32, GPR32, MOVZ_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6; +defm : MovzPats2<GPR32, GPR32, MOVZ_I_I, XORi>, INSN_MIPS4_32_NOT_32R6_64R6; -defm : MovzPats0<GPR32, GPR64, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>, GPR_64; +defm : MovzPats0<GPR32, GPR64, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; defm : MovzPats0<GPR64, GPR32, MOVZ_I_I, SLT64, SLTu64, SLTi64, SLTiu64>, - GPR_64; + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; defm : MovzPats0<GPR64, GPR64, MOVZ_I_I64, SLT64, SLTu64, SLTi64, SLTiu64>, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; +defm : MovzPats1<GPR32, GPR64, MOVZ_I_I64, XOR>, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; +defm : MovzPats1<GPR64, GPR32, MOVZ_I64_I, XOR64>, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; +defm : MovzPats1<GPR64, GPR64, MOVZ_I64_I64, XOR64>, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; +defm : MovzPats2<GPR32, GPR64, MOVZ_I_I64, XORi>, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; +defm : MovzPats2<GPR64, GPR32, MOVZ_I64_I, XORi64>, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; +defm : MovzPats2<GPR64, GPR64, MOVZ_I64_I64, XORi64>, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; + +defm : MovnPats<GPR32, GPR32, MOVN_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6; + +defm : MovnPats<GPR32, GPR64, MOVN_I_I64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6, + GPR_64; +defm : MovnPats<GPR64, GPR32, MOVN_I64_I, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6, + GPR_64; +defm : MovnPats<GPR64, GPR64, MOVN_I64_I64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; -defm : MovzPats1<GPR32, GPR64, MOVZ_I_I64, XOR>, GPR_64; -defm : MovzPats1<GPR64, GPR32, MOVZ_I64_I, XOR64>, GPR_64; -defm : MovzPats1<GPR64, GPR64, MOVZ_I64_I64, XOR64>, GPR_64; -defm : MovzPats2<GPR32, GPR64, MOVZ_I_I64, XORi>, GPR_64; -defm : 
MovzPats2<GPR64, GPR32, MOVZ_I64_I, XORi64>, GPR_64; -defm : MovzPats2<GPR64, GPR64, MOVZ_I64_I64, XORi64>, GPR_64; - -defm : MovnPats<GPR32, GPR32, MOVN_I_I, XOR>; - -defm : MovnPats<GPR32, GPR64, MOVN_I_I64, XOR>, GPR_64; -defm : MovnPats<GPR64, GPR32, MOVN_I64_I, XOR64>, GPR_64; -defm : MovnPats<GPR64, GPR64, MOVN_I64_I64, XOR64>, GPR_64; -defm : MovzPats0<GPR32, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>; -defm : MovzPats1<GPR32, FGR32, MOVZ_I_S, XOR>; -defm : MovnPats<GPR32, FGR32, MOVN_I_S, XOR>; +defm : MovzPats0<GPR32, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>, + INSN_MIPS4_32_NOT_32R6_64R6; +defm : MovzPats1<GPR32, FGR32, MOVZ_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6; +defm : MovnPats<GPR32, FGR32, MOVN_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6; defm : MovzPats0<GPR64, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64, SLTiu64>, + INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; +defm : MovzPats1<GPR64, FGR32, MOVZ_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6, + GPR_64; +defm : MovnPats<GPR64, FGR32, MOVN_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64; -defm : MovzPats1<GPR64, FGR32, MOVZ_I64_S, XOR64>, GPR_64; -defm : MovnPats<GPR64, FGR32, MOVN_I64_S, XOR64>, GPR_64; -defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>, FGR_32; -defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32, XOR>, FGR_32; -defm : MovnPats<GPR32, AFGR64, MOVN_I_D32, XOR>, FGR_32; +defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>, + INSN_MIPS4_32_NOT_32R6_64R6, FGR_32; +defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6, + FGR_32; +defm : MovnPats<GPR32, AFGR64, MOVN_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6, + FGR_32; -defm : MovzPats0<GPR32, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>, FGR_64; +defm : MovzPats0<GPR32, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>, + INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; defm : MovzPats0<GPR64, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64, SLTiu64>, + INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; +defm : MovzPats1<GPR32, FGR64, MOVZ_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6, + FGR_64; +defm : MovzPats1<GPR64, FGR64, MOVZ_I64_D64, XOR64>, + INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; +defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6, + FGR_64; +defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64; -defm : MovzPats1<GPR32, FGR64, MOVZ_I_D64, XOR>, FGR_64; -defm : MovzPats1<GPR64, FGR64, MOVZ_I64_D64, XOR64>, FGR_64; -defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>, FGR_64; -defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>, FGR_64; diff --git a/lib/Target/Mips/MipsDSPInstrFormats.td b/lib/Target/Mips/MipsDSPInstrFormats.td index cf09113cd8ad..b5d52ced9d3d 100644 --- a/lib/Target/Mips/MipsDSPInstrFormats.td +++ b/lib/Target/Mips/MipsDSPInstrFormats.td @@ -7,9 +7,9 @@ // //===----------------------------------------------------------------------===// -def HasDSP : Predicate<"Subtarget.hasDSP()">, +def HasDSP : Predicate<"Subtarget->hasDSP()">, AssemblerPredicate<"FeatureDSP">; -def HasDSPR2 : Predicate<"Subtarget.hasDSPR2()">, +def HasDSPR2 : Predicate<"Subtarget->hasDSPR2()">, AssemblerPredicate<"FeatureDSPR2">; // Fields. 
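The HasDSP/HasDSPR2 predicate strings above switch from "Subtarget." to "Subtarget->" because TableGen splices them verbatim into the generated selector code, and this patch (see the MipsISelDAGToDAG changes further down) turns the selector's cached subtarget into a pointer that is re-fetched per machine function. A minimal sketch of that shape, with hypothetical class names:

    struct MipsSubtarget {
      bool hasDSP() const { return true; }
    };

    // Before: a reference member, bound once at construction, hence "Subtarget.".
    struct SelectorBefore {
      const MipsSubtarget &Subtarget;
      bool dspAllowed() const { return Subtarget.hasDSP(); }
    };

    // After: a pointer member, reset at the start of each function (as
    // MipsDAGToDAGISel::runOnMachineFunction now does), hence "Subtarget->".
    struct SelectorAfter {
      const MipsSubtarget *Subtarget = nullptr;
      bool dspAllowed() const { return Subtarget->hasDSP(); }
    };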
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index d6c7cac27300..bcfbc12df01c 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetInstrInfo.h" @@ -177,6 +178,13 @@ namespace { for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) Changed |= runOnMachineBasicBlock(*FI); + + // This pass invalidates liveness information when it reorders + // instructions to fill delay slot. Without this, -verify-machineinstrs + // will fail. + if (Changed) + F.getRegInfo().invalidateLiveness(); + return Changed; } diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index 268a0ed591c4..617801ba8c67 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -12,6 +12,7 @@ #include "MipsISelLowering.h" #include "MipsMachineFunction.h" #include "MipsSubtarget.h" +#include "MipsTargetMachine.h" using namespace llvm; @@ -36,11 +37,11 @@ class MipsFastISel final : public FastISel { /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can /// make the right decision when generating code for different targets. - const MipsSubtarget *Subtarget; Module &M; const TargetMachine &TM; const TargetInstrInfo &TII; const TargetLowering &TLI; + const MipsSubtarget *Subtarget; MipsFunctionInfo *MFI; // Convenience variables to avoid some queries. @@ -54,8 +55,8 @@ public: : FastISel(funcInfo, libInfo), M(const_cast<Module &>(*funcInfo.Fn->getParent())), TM(funcInfo.MF->getTarget()), TII(*TM.getInstrInfo()), - TLI(*TM.getTargetLowering()) { - Subtarget = &TM.getSubtarget<MipsSubtarget>(); + TLI(*TM.getTargetLowering()), + Subtarget(&TM.getSubtarget<MipsSubtarget>()) { MFI = funcInfo.MF->getInfo<MipsFunctionInfo>(); Context = &funcInfo.Fn->getContext(); TargetSupported = ((Subtarget->getRelocationModel() == Reloc::PIC_) && @@ -68,8 +69,11 @@ public: bool ComputeAddress(const Value *Obj, Address &Addr); private: + bool EmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, + unsigned Alignment = 0); bool EmitStore(MVT VT, unsigned SrcReg, Address &Addr, unsigned Alignment = 0); + bool SelectLoad(const Instruction *I); bool SelectRet(const Instruction *I); bool SelectStore(const Instruction *I); @@ -80,6 +84,36 @@ private: unsigned MaterializeGV(const GlobalValue *GV, MVT VT); unsigned MaterializeInt(const Constant *C, MVT VT); unsigned Materialize32BitInt(int64_t Imm, const TargetRegisterClass *RC); + + // for some reason, this default is not generated by tablegen + // so we explicitly generate it here. 
+ // + unsigned FastEmitInst_riir(uint64_t inst, const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, uint64_t imm1, + uint64_t imm2, unsigned Op3, bool Op3IsKill) { + return 0; + } + + MachineInstrBuilder EmitInst(unsigned Opc) { + return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + } + + MachineInstrBuilder EmitInst(unsigned Opc, unsigned DstReg) { + return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), + DstReg); + } + + MachineInstrBuilder EmitInstStore(unsigned Opc, unsigned SrcReg, + unsigned MemReg, int64_t MemOffset) { + return EmitInst(Opc).addReg(SrcReg).addReg(MemReg).addImm(MemOffset); + } + + MachineInstrBuilder EmitInstLoad(unsigned Opc, unsigned DstReg, + unsigned MemReg, int64_t MemOffset) { + return EmitInst(Opc, DstReg).addReg(MemReg).addImm(MemOffset); + } + +#include "MipsGenFastISel.inc" }; bool MipsFastISel::isTypeLegal(Type *Ty, MVT &VT) { @@ -100,6 +134,8 @@ bool MipsFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) { // We will extend this in a later patch: // If this is a type than can be sign or zero-extended to a basic operation // go ahead and accept it now. + if (VT == MVT::i8 || VT == MVT::i16) + return true; return false; } @@ -116,6 +152,45 @@ bool MipsFastISel::ComputeAddress(const Value *Obj, Address &Addr) { return Addr.Base.Reg != 0; } +bool MipsFastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, + unsigned Alignment) { + // + // more cases will be handled here in following patches. + // + unsigned Opc; + switch (VT.SimpleTy) { + case MVT::i32: { + ResultReg = createResultReg(&Mips::GPR32RegClass); + Opc = Mips::LW; + break; + } + case MVT::i16: { + ResultReg = createResultReg(&Mips::GPR32RegClass); + Opc = Mips::LHu; + break; + } + case MVT::i8: { + ResultReg = createResultReg(&Mips::GPR32RegClass); + Opc = Mips::LBu; + break; + } + case MVT::f32: { + ResultReg = createResultReg(&Mips::FGR32RegClass); + Opc = Mips::LWC1; + break; + } + case MVT::f64: { + ResultReg = createResultReg(&Mips::AFGR64RegClass); + Opc = Mips::LDC1; + break; + } + default: + return false; + } + EmitInstLoad(Opc, ResultReg, Addr.Base.Reg, Addr.Offset); + return true; +} + // Materialize a constant into a register, and return the register // number (or zero if we failed to handle it). unsigned MipsFastISel::TargetMaterializeConstant(const Constant *C) { @@ -141,12 +216,49 @@ bool MipsFastISel::EmitStore(MVT VT, unsigned SrcReg, Address &Addr, // // more cases will be handled here in following patches. // - if (VT != MVT::i32) + unsigned Opc; + switch (VT.SimpleTy) { + case MVT::i8: + Opc = Mips::SB; + break; + case MVT::i16: + Opc = Mips::SH; + break; + case MVT::i32: + Opc = Mips::SW; + break; + case MVT::f32: + Opc = Mips::SWC1; + break; + case MVT::f64: + Opc = Mips::SDC1; + break; + default: return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::SW)) - .addReg(SrcReg) - .addReg(Addr.Base.Reg) - .addImm(Addr.Offset); + } + EmitInstStore(Opc, SrcReg, Addr.Base.Reg, Addr.Offset); + return true; +} + +bool MipsFastISel::SelectLoad(const Instruction *I) { + // Atomic loads need special handling. + if (cast<LoadInst>(I)->isAtomic()) + return false; + + // Verify we have a legal type before going any further. + MVT VT; + if (!isLoadTypeLegal(I->getType(), VT)) + return false; + + // See if we can handle this address. 
+ Address Addr; + if (!ComputeAddress(I->getOperand(0), Addr)) + return false; + + unsigned ResultReg; + if (!EmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment())) + return false; + UpdateValueMap(I, ResultReg); return true; } @@ -186,8 +298,7 @@ bool MipsFastISel::SelectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { return false; } - unsigned RetOpc = Mips::RetRA; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(RetOpc)); + EmitInst(Mips::RetRA); return true; } @@ -197,6 +308,8 @@ bool MipsFastISel::TargetSelectInstruction(const Instruction *I) { switch (I->getOpcode()) { default: break; + case Instruction::Load: + return SelectLoad(I); case Instruction::Store: return SelectStore(I); case Instruction::Ret: @@ -207,6 +320,22 @@ bool MipsFastISel::TargetSelectInstruction(const Instruction *I) { } unsigned MipsFastISel::MaterializeFP(const ConstantFP *CFP, MVT VT) { + int64_t Imm = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); + if (VT == MVT::f32) { + const TargetRegisterClass *RC = &Mips::FGR32RegClass; + unsigned DestReg = createResultReg(RC); + unsigned TempReg = Materialize32BitInt(Imm, &Mips::GPR32RegClass); + EmitInst(Mips::MTC1, DestReg).addReg(TempReg); + return DestReg; + } else if (VT == MVT::f64) { + const TargetRegisterClass *RC = &Mips::AFGR64RegClass; + unsigned DestReg = createResultReg(RC); + unsigned TempReg1 = Materialize32BitInt(Imm >> 32, &Mips::GPR32RegClass); + unsigned TempReg2 = + Materialize32BitInt(Imm & 0xFFFFFFFF, &Mips::GPR32RegClass); + EmitInst(Mips::BuildPairF64, DestReg).addReg(TempReg2).addReg(TempReg1); + return DestReg; + } return 0; } @@ -221,9 +350,8 @@ unsigned MipsFastISel::MaterializeGV(const GlobalValue *GV, MVT VT) { // TLS not supported at this time. if (IsThreadLocal) return 0; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LW), DestReg) - .addReg(MFI->getGlobalBaseReg()) - .addGlobalAddress(GV, 0, MipsII::MO_GOT); + EmitInst(Mips::LW, DestReg).addReg(MFI->getGlobalBaseReg()).addGlobalAddress( + GV, 0, MipsII::MO_GOT); return DestReg; } unsigned MipsFastISel::MaterializeInt(const Constant *C, MVT VT) { @@ -245,15 +373,10 @@ unsigned MipsFastISel::Materialize32BitInt(int64_t Imm, if (isInt<16>(Imm)) { unsigned Opc = Mips::ADDiu; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(Mips::ZERO) - .addImm(Imm); + EmitInst(Opc, ResultReg).addReg(Mips::ZERO).addImm(Imm); return ResultReg; } else if (isUInt<16>(Imm)) { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::ORi), - ResultReg) - .addReg(Mips::ZERO) - .addImm(Imm); + EmitInst(Mips::ORi, ResultReg).addReg(Mips::ZERO).addImm(Imm); return ResultReg; } unsigned Lo = Imm & 0xFFFF; @@ -261,16 +384,10 @@ unsigned MipsFastISel::Materialize32BitInt(int64_t Imm, if (Lo) { // Both Lo and Hi have nonzero bits. 
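    // (e.g. Imm = 0x12345678: "lui TmpReg, 0x1234" sets the upper halfword,
    // then "ori ResultReg, TmpReg, 0x5678" merges in the lower one.)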
unsigned TmpReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LUi), - TmpReg).addImm(Hi); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::ORi), - ResultReg) - .addReg(TmpReg) - .addImm(Lo); - + EmitInst(Mips::LUi, TmpReg).addImm(Hi); + EmitInst(Mips::ORi, ResultReg).addReg(TmpReg).addImm(Lo); } else { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LUi), - ResultReg).addImm(Hi); + EmitInst(Mips::LUi, ResultReg).addImm(Hi); } return ResultReg; } diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h index e10a3a551f9f..8e9196c30fad 100644 --- a/lib/Target/Mips/MipsFrameLowering.h +++ b/lib/Target/Mips/MipsFrameLowering.h @@ -15,7 +15,6 @@ #define MIPS_FRAMEINFO_H #include "Mips.h" -#include "MipsSubtarget.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 90cff631931f..0bdabf37f63b 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -47,6 +47,7 @@ using namespace llvm; //===----------------------------------------------------------------------===// bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &TM.getSubtarget<MipsSubtarget>(); bool Ret = SelectionDAGISel::runOnMachineFunction(MF); processFunctionAfterISel(MF); @@ -202,7 +203,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { #ifndef NDEBUG case ISD::LOAD: case ISD::STORE: - assert((Subtarget.systemSupportsUnalignedAccess() || + assert((Subtarget->systemSupportsUnalignedAccess() || cast<MemSDNode>(Node)->getMemoryVT().getSizeInBits() / 8 <= cast<MemSDNode>(Node)->getAlignment()) && "Unexpected unaligned loads/stores."); diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h index 13becb6b5bb9..2a6c875f4e35 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.h +++ b/lib/Target/Mips/MipsISelDAGToDAG.h @@ -32,7 +32,7 @@ namespace llvm { class MipsDAGToDAGISel : public SelectionDAGISel { public: explicit MipsDAGToDAGISel(MipsTargetMachine &TM) - : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<MipsSubtarget>()) {} + : SelectionDAGISel(TM), Subtarget(&TM.getSubtarget<MipsSubtarget>()) {} // Pass Name const char *getPassName() const override { @@ -46,7 +46,7 @@ protected: /// Keep a pointer to the MipsSubtarget around so that we can make the right /// decision when generating code for different targets. - const MipsSubtarget &Subtarget; + const MipsSubtarget *Subtarget; private: // Include the pieces autogenerated from the target description. diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index bfe5ea1846d7..b7af2d4aaf48 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -215,6 +215,11 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) // setcc operations results (slt, sgt, ...). setBooleanContents(ZeroOrOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + // The cmp.cond.fmt instruction in MIPS32r6/MIPS64r6 uses 0 and -1 like MSA + // does. Integer booleans still use 0 and 1. 
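+  // (The two-argument setBooleanContents overload below sets the integer and
+  // floating-point boolean contents separately, so only the FP side switches
+  // to 0/-1 here.)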
+ if (Subtarget->hasMips32r6()) + setBooleanContents(ZeroOrOneBooleanContent, + ZeroOrNegativeOneBooleanContent); // Load extented operations for i1 types must be promoted setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); @@ -251,7 +256,7 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); - if (isGP64bit()) { + if (Subtarget->isGP64bit()) { setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::BlockAddress, MVT::i64, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); @@ -263,14 +268,14 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); } - if (!isGP64bit()) { + if (!Subtarget->isGP64bit()) { setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } setOperationAction(ISD::ADD, MVT::i32, Custom); - if (isGP64bit()) + if (Subtarget->isGP64bit()) setOperationAction(ISD::ADD, MVT::i64, Custom); setOperationAction(ISD::SDIV, MVT::i32, Expand); @@ -287,7 +292,8 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::BR_CC, MVT::f64, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Expand); setOperationAction(ISD::BR_CC, MVT::i64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); @@ -368,7 +374,7 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) if (!Subtarget->hasMips64r2()) setOperationAction(ISD::BSWAP, MVT::i64, Expand); - if (isGP64bit()) { + if (Subtarget->isGP64bit()) { setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::i32, Custom); @@ -384,12 +390,13 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::ADD); - setMinFunctionAlignment(isGP64bit() ? 3 : 2); + setMinFunctionAlignment(Subtarget->isGP64bit() ? 3 : 2); - setStackPointerRegisterToSaveRestore(isN64() ? Mips::SP_64 : Mips::SP); + setStackPointerRegisterToSaveRestore(Subtarget->isABI_N64() ? Mips::SP_64 + : Mips::SP); - setExceptionPointerRegister(isN64() ? Mips::A0_64 : Mips::A0); - setExceptionSelectorRegister(isN64() ? Mips::A1_64 : Mips::A1); + setExceptionPointerRegister(Subtarget->isABI_N64() ? Mips::A0_64 : Mips::A0); + setExceptionSelectorRegister(Subtarget->isABI_N64() ? Mips::A1_64 : Mips::A1); MaxStoresPerMemcpy = 16; @@ -815,10 +822,10 @@ addLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC) return VReg; } -static MachineBasicBlock *expandPseudoDIV(MachineInstr *MI, - MachineBasicBlock &MBB, - const TargetInstrInfo &TII, - bool Is64Bit) { +static MachineBasicBlock *insertDivByZeroTrap(MachineInstr *MI, + MachineBasicBlock &MBB, + const TargetInstrInfo &TII, + bool Is64Bit) { if (NoZeroDivCheck) return &MBB; @@ -836,6 +843,10 @@ static MachineBasicBlock *expandPseudoDIV(MachineInstr *MI, // Clear Divisor's kill flag. 
 Divisor.setIsKill(false);
+
+  // We would normally delete the original instruction here but in this case
+  // we only needed to inject an additional instruction rather than replace it.
+
   return &MBB;
 }
 
@@ -918,10 +929,22 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     return emitAtomicCmpSwap(MI, BB, 8);
   case Mips::PseudoSDIV:
   case Mips::PseudoUDIV:
-    return expandPseudoDIV(MI, *BB, *getTargetMachine().getInstrInfo(), false);
+  case Mips::DIV:
+  case Mips::DIVU:
+  case Mips::MOD:
+  case Mips::MODU:
+    return insertDivByZeroTrap(MI, *BB, *getTargetMachine().getInstrInfo(),
+                               false);
   case Mips::PseudoDSDIV:
   case Mips::PseudoDUDIV:
-    return expandPseudoDIV(MI, *BB, *getTargetMachine().getInstrInfo(), true);
+  case Mips::DDIV:
+  case Mips::DDIVU:
+  case Mips::DMOD:
+  case Mips::DMODU:
+    return insertDivByZeroTrap(MI, *BB, *getTargetMachine().getInstrInfo(),
+                               true);
+  case Mips::SEL_D:
+    return emitSEL_D(MI, BB);
   }
 }
 
@@ -941,16 +964,20 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   unsigned LL, SC, AND, NOR, ZERO, BEQ;
 
   if (Size == 4) {
-    LL = isMicroMips ? Mips::LL_MM : Mips::LL;
-    SC = isMicroMips ? Mips::SC_MM : Mips::SC;
+    if (isMicroMips) {
+      LL = Mips::LL_MM;
+      SC = Mips::SC_MM;
+    } else {
+      LL = Subtarget->hasMips32r6() ? Mips::LL_R6 : Mips::LL;
+      SC = Subtarget->hasMips32r6() ? Mips::SC_R6 : Mips::SC;
+    }
     AND = Mips::AND;
     NOR = Mips::NOR;
     ZERO = Mips::ZERO;
     BEQ = Mips::BEQ;
-  }
-  else {
-    LL = Mips::LLD;
-    SC = Mips::SCD;
+  } else {
+    LL = Subtarget->hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
+    SC = Subtarget->hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
     AND = Mips::AND64;
     NOR = Mips::NOR64;
     ZERO = Mips::ZERO_64;
@@ -1012,11 +1039,39 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   return exitMBB;
 }
 
-MachineBasicBlock *
-MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
-                                             MachineBasicBlock *BB,
-                                             unsigned Size, unsigned BinOpcode,
-                                             bool Nand) const {
+MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
+    MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned DstReg,
+    unsigned SrcReg) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Subtarget->hasMips32r2() && Size == 1) {
+    BuildMI(BB, DL, TII->get(Mips::SEB), DstReg).addReg(SrcReg);
+    return BB;
+  }
+
+  if (Subtarget->hasMips32r2() && Size == 2) {
+    BuildMI(BB, DL, TII->get(Mips::SEH), DstReg).addReg(SrcReg);
+    return BB;
+  }
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  unsigned ScrReg = RegInfo.createVirtualRegister(RC);
+
+  assert(Size < 32);
+  int64_t ShiftImm = 32 - (Size * 8);
+
+  BuildMI(BB, DL, TII->get(Mips::SLL), ScrReg).addReg(SrcReg).addImm(ShiftImm);
+  BuildMI(BB, DL, TII->get(Mips::SRA), DstReg).addReg(ScrReg).addImm(ShiftImm);
+
+  return BB;
+}
+
+MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
+    MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
+    bool Nand) const {
   assert((Size == 1 || Size == 2) &&
          "Unsupported size for EmitAtomicBinaryPartial.");
 
@@ -1046,7 +1101,6 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
   unsigned StoreVal = RegInfo.createVirtualRegister(RC);
   unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
   unsigned SrlRes = RegInfo.createVirtualRegister(RC);
-  unsigned SllRes = RegInfo.createVirtualRegister(RC);
   unsigned Success =
       RegInfo.createVirtualRegister(RC);
 
   // insert new blocks after the current block
@@ -1152,19 +1206,14 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
   //  sinkMBB:
   //    and     maskedoldval1,oldval,mask
   //    srl     srlres,maskedoldval1,shiftamt
-  //    sll     sllres,srlres,24
-  //    sra     dest,sllres,24
+  //    sign_extend dest,srlres
   BB = sinkMBB;
-  int64_t ShiftImm = (Size == 1) ? 24 : 16;
 
   BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
     .addReg(OldVal).addReg(Mask);
   BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
     .addReg(MaskedOldVal1).addReg(ShiftAmt);
-  BuildMI(BB, DL, TII->get(Mips::SLL), SllRes)
-    .addReg(SrlRes).addImm(ShiftImm);
-  BuildMI(BB, DL, TII->get(Mips::SRA), Dest)
-    .addReg(SllRes).addImm(ShiftImm);
+  BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
 
   MI->eraseFromParent(); // The instruction is gone now.
 
@@ -1285,7 +1334,6 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
   unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
   unsigned StoreVal = RegInfo.createVirtualRegister(RC);
   unsigned SrlRes = RegInfo.createVirtualRegister(RC);
-  unsigned SllRes = RegInfo.createVirtualRegister(RC);
   unsigned Success = RegInfo.createVirtualRegister(RC);
 
   // insert new blocks after the current block
@@ -1382,23 +1430,44 @@
   //  sinkMBB:
   //    srl     srlres,maskedoldval0,shiftamt
-  //    sll     sllres,srlres,24
-  //    sra     dest,sllres,24
+  //    sign_extend dest,srlres
   BB = sinkMBB;
-  int64_t ShiftImm = (Size == 1) ? 24 : 16;
 
   BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
       .addReg(MaskedOldVal0).addReg(ShiftAmt);
-  BuildMI(BB, DL, TII->get(Mips::SLL), SllRes)
-      .addReg(SrlRes).addImm(ShiftImm);
-  BuildMI(BB, DL, TII->get(Mips::SRA), Dest)
-      .addReg(SllRes).addImm(ShiftImm);
+  BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
 
   MI->eraseFromParent();   // The instruction is gone now.
 
   return exitMBB;
 }
 
+MachineBasicBlock *MipsTargetLowering::emitSEL_D(MachineInstr *MI,
+                                                 MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  MachineBasicBlock::iterator II(MI);
+
+  unsigned Fc = MI->getOperand(1).getReg();
+  const auto &FGR64RegClass = TRI->getRegClass(Mips::FGR64RegClassID);
+
+  unsigned Fc2 = RegInfo.createVirtualRegister(FGR64RegClass);
+
+  BuildMI(*BB, II, DL, TII->get(Mips::SUBREG_TO_REG), Fc2)
+      .addImm(0)
+      .addReg(Fc)
+      .addImm(Mips::sub_lo);
+
+  // We don't erase the original instruction, we just replace the condition
+  // register with the 64-bit super-register.
+  MI->getOperand(1).setReg(Fc2);
+
+  return BB;
+}
+
 //===----------------------------------------------------------------------===//
 //  Misc Lower Operation implementation
 //===----------------------------------------------------------------------===//
@@ -1421,7 +1490,8 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
                       0);
   Chain = Addr.getValue(1);
 
-  if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) || isN64()) {
+  if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) ||
+      Subtarget->isABI_N64()) {
     // For PIC, the sequence is:
     // BRIND(load(Jumptable + index) + RelocBase)
     // RelocBase can be JumpTable, GOT or some sort of global base.
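
The new emitSignExtendToI32InReg helper replaces the open-coded sll/sra pairs in both partword atomic expansions, using the single-instruction seb/seh forms when MIPS32r2 is available. A standalone sketch of the same selection logic for the partword (1- and 2-byte) cases; the enum and helper names here are illustrative, not LLVM API:

    #include <cassert>
    #include <cstdint>

    // Which instruction sequence sign-extends a sub-word value in a GPR.
    enum class SignExtSeq { SEB, SEH, SLL_SRA };

    SignExtSeq chooseSignExtend(bool HasMips32r2, unsigned SizeInBytes) {
      assert((SizeInBytes == 1 || SizeInBytes == 2) && "sub-word sizes only");
      if (HasMips32r2)
        return SizeInBytes == 1 ? SignExtSeq::SEB : SignExtSeq::SEH;
      return SignExtSeq::SLL_SRA; // pre-r2 fallback: shift left, then right
    }

    // Shift immediate used by the sll/sra fallback, matching the helper's
    // 32 - (Size * 8): 24 for bytes, 16 for halfwords.
    int64_t shiftImm(unsigned SizeInBytes) { return 32 - SizeInBytes * 8; }
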
@@ -1439,6 +1509,7 @@ SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Dest = Op.getOperand(2); SDLoc DL(Op); + assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6()); SDValue CondRes = createFPCmp(DAG, Op.getOperand(1)); // Return if flag is not set by a floating point comparison. @@ -1458,6 +1529,7 @@ SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue MipsTargetLowering:: lowerSELECT(SDValue Op, SelectionDAG &DAG) const { + assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6()); SDValue Cond = createFPCmp(DAG, Op.getOperand(0)); // Return if flag is not set by a floating point comparison. @@ -1483,6 +1555,7 @@ lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const } SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const { + assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6()); SDValue Cond = createFPCmp(DAG, Op); assert(Cond.getOpcode() == MipsISD::FPCmp && @@ -1502,7 +1575,8 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op, GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = N->getGlobal(); - if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64()) { + if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && + !Subtarget->isABI_N64()) { const MipsTargetObjectFile &TLOF = (const MipsTargetObjectFile&)getObjFileLowering(); @@ -1521,15 +1595,18 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op, } if (GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa<Function>(GV))) - return getAddrLocal(N, Ty, DAG, isN32() || isN64()); + return getAddrLocal(N, Ty, DAG, + Subtarget->isABI_N32() || Subtarget->isABI_N64()); if (LargeGOT) return getAddrGlobalLargeGOT(N, Ty, DAG, MipsII::MO_GOT_HI16, MipsII::MO_GOT_LO16, DAG.getEntryNode(), MachinePointerInfo::getGOT()); - return getAddrGlobal(N, Ty, DAG, (isN32() || isN64()) ? MipsII::MO_GOT_DISP - : MipsII::MO_GOT16, + return getAddrGlobal(N, Ty, DAG, + (Subtarget->isABI_N32() || Subtarget->isABI_N64()) + ? 
MipsII::MO_GOT_DISP + : MipsII::MO_GOT16, DAG.getEntryNode(), MachinePointerInfo::getGOT()); } @@ -1538,10 +1615,12 @@ SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op, BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op); EVT Ty = Op.getValueType(); - if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64()) + if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && + !Subtarget->isABI_N64()) return getAddrNonPIC(N, Ty, DAG); - return getAddrLocal(N, Ty, DAG, isN32() || isN64()); + return getAddrLocal(N, Ty, DAG, + Subtarget->isABI_N32() || Subtarget->isABI_N64()); } SDValue MipsTargetLowering:: @@ -1579,7 +1658,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(DL).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::C, PtrTy, TlsGetAddr, &Args, 0); + .setCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args), 0); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); SDValue Ret = CallResult.first; @@ -1629,10 +1708,12 @@ lowerJumpTable(SDValue Op, SelectionDAG &DAG) const JumpTableSDNode *N = cast<JumpTableSDNode>(Op); EVT Ty = Op.getValueType(); - if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64()) + if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && + !Subtarget->isABI_N64()) return getAddrNonPIC(N, Ty, DAG); - return getAddrLocal(N, Ty, DAG, isN32() || isN64()); + return getAddrLocal(N, Ty, DAG, + Subtarget->isABI_N32() || Subtarget->isABI_N64()); } SDValue MipsTargetLowering:: @@ -1650,10 +1731,12 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op); EVT Ty = Op.getValueType(); - if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64()) + if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && + !Subtarget->isABI_N64()) return getAddrNonPIC(N, Ty, DAG); - return getAddrLocal(N, Ty, DAG, isN32() || isN64()); + return getAddrLocal(N, Ty, DAG, + Subtarget->isABI_N32() || Subtarget->isABI_N64()); } SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const { @@ -1784,8 +1867,9 @@ lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MFI->setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); - SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, - isN64() ? Mips::FP_64 : Mips::FP, VT); + SDValue FrameAddr = + DAG.getCopyFromReg(DAG.getEntryNode(), DL, + Subtarget->isABI_N64() ? Mips::FP_64 : Mips::FP, VT); return FrameAddr; } @@ -1801,7 +1885,7 @@ SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MVT VT = Op.getSimpleValueType(); - unsigned RA = isN64() ? Mips::RA_64 : Mips::RA; + unsigned RA = Subtarget->isABI_N64() ? Mips::RA_64 : Mips::RA; MFI->setReturnAddressIsTaken(true); // Return RA, which contains the return address. Mark it an implicit live-in. @@ -1823,12 +1907,12 @@ SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG) SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); SDLoc DL(Op); - EVT Ty = isN64() ? MVT::i64 : MVT::i32; + EVT Ty = Subtarget->isABI_N64() ? MVT::i64 : MVT::i32; // Store stack offset in V1, store jump target in V0. Glue CopyToReg and // EH_RETURN nodes, so that instructions are emitted back-to-back. - unsigned OffsetReg = isN64() ? Mips::V1_64 : Mips::V1; - unsigned AddrReg = isN64() ? Mips::V0_64 : Mips::V0; + unsigned OffsetReg = Subtarget->isABI_N64() ? 
Mips::V1_64 : Mips::V1; + unsigned AddrReg = Subtarget->isABI_N64() ? Mips::V0_64 : Mips::V0; Chain = DAG.getCopyToReg(Chain, DL, OffsetReg, Offset, SDValue()); Chain = DAG.getCopyToReg(Chain, DL, AddrReg, Handler, Chain.getValue(1)); return DAG.getNode(MipsISD::EH_RETURN, DL, MVT::Other, Chain, @@ -2256,8 +2340,8 @@ getOpndList(SmallVectorImpl<SDValue> &Ops, // in PIC mode) allow symbols to be resolved via lazy binding. // The lazy binding stub requires GP to point to the GOT. if (IsPICCall && !InternalLinkage) { - unsigned GPReg = isN64() ? Mips::GP_64 : Mips::GP; - EVT Ty = isN64() ? MVT::i64 : MVT::i32; + unsigned GPReg = Subtarget->isABI_N64() ? Mips::GP_64 : Mips::GP; + EVT Ty = Subtarget->isABI_N64() ? MVT::i64 : MVT::i32; RegsToPass.push_back(std::make_pair(GPReg, getGlobalReg(CLI.DAG, Ty))); } @@ -2326,8 +2410,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, getTargetMachine(), ArgLocs, *DAG.getContext()); MipsCC::SpecialCallingConvType SpecialCallingConv = getSpecialCallingConv(Callee); - MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo, - SpecialCallingConv); + MipsCC MipsCCInfo(CallConv, Subtarget->isABI_O32(), Subtarget->isFP64bit(), + CCInfo, SpecialCallingConv); MipsCCInfo.analyzeCallOperands(Outs, IsVarArg, Subtarget->mipsSEUsesSoftFloat(), @@ -2360,7 +2444,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL); SDValue StackPtr = DAG.getCopyFromReg( - Chain, DL, isN64() ? Mips::SP_64 : Mips::SP, getPointerTy()); + Chain, DL, Subtarget->isABI_N64() ? Mips::SP_64 : Mips::SP, + getPointerTy()); // With EABI is it possible to have 16 args on registers. std::deque< std::pair<unsigned, SDValue> > RegsToPass; @@ -2446,8 +2531,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. 
- bool IsPICCall = (isN64() || IsPIC); // true if calls are translated to - // jalr $25 + bool IsPICCall = + (Subtarget->isABI_N64() || IsPIC); // true if calls are translated to + // jalr $25 bool GlobalOrExternal = false, InternalLinkage = false; SDValue CalleeLo; EVT Ty = Callee.getValueType(); @@ -2458,7 +2544,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InternalLinkage = Val->hasInternalLinkage(); if (InternalLinkage) - Callee = getAddrLocal(G, Ty, DAG, isN32() || isN64()); + Callee = getAddrLocal(G, Ty, DAG, + Subtarget->isABI_N32() || Subtarget->isABI_N64()); else if (LargeGOT) Callee = getAddrGlobalLargeGOT(G, Ty, DAG, MipsII::MO_CALL_HI16, MipsII::MO_CALL_LO16, Chain, @@ -2474,7 +2561,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { const char *Sym = S->getSymbol(); - if (!isN64() && !IsPIC) // !N64 && static + if (!Subtarget->isABI_N64() && !IsPIC) // !N64 && static Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), MipsII::MO_NO_FLAG); else if (LargeGOT) @@ -2525,7 +2612,8 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), getTargetMachine(), RVLocs, *DAG.getContext()); - MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo); + MipsCC MipsCCInfo(CallConv, Subtarget->isABI_O32(), Subtarget->isFP64bit(), + CCInfo); MipsCCInfo.analyzeCallResult(Ins, Subtarget->mipsSEUsesSoftFloat(), CallNode, RetTy); @@ -2572,7 +2660,8 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), getTargetMachine(), ArgLocs, *DAG.getContext()); - MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo); + MipsCC MipsCCInfo(CallConv, Subtarget->isABI_O32(), Subtarget->isFP64bit(), + CCInfo); Function::const_arg_iterator FuncArg = DAG.getMachineFunction().getFunction()->arg_begin(); bool UseSoftFloat = Subtarget->mipsSEUsesSoftFloat(); @@ -2634,7 +2723,8 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, (RegVT == MVT::i64 && ValVT == MVT::f64) || (RegVT == MVT::f64 && ValVT == MVT::i64)) ArgValue = DAG.getNode(ISD::BITCAST, DL, ValVT, ArgValue); - else if (isO32() && RegVT == MVT::i32 && ValVT == MVT::f64) { + else if (Subtarget->isABI_O32() && RegVT == MVT::i32 && + ValVT == MVT::f64) { unsigned Reg2 = addLiveIn(DAG.getMachineFunction(), getNextIntArgReg(ArgReg), RC); SDValue ArgValue2 = DAG.getCopyFromReg(Chain, DL, Reg2, RegVT); @@ -2672,7 +2762,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, unsigned Reg = MipsFI->getSRetReturnReg(); if (!Reg) { Reg = MF.getRegInfo().createVirtualRegister( - getRegClassFor(isN64() ? MVT::i64 : MVT::i32)); + getRegClassFor(Subtarget->isABI_N64() ? MVT::i64 : MVT::i32)); MipsFI->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[i]); @@ -2723,7 +2813,8 @@ MipsTargetLowering::LowerReturn(SDValue Chain, // CCState - Info about the registers and stack slot. CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(), RVLocs, *DAG.getContext()); - MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo); + MipsCC MipsCCInfo(CallConv, Subtarget->isABI_O32(), Subtarget->isFP64bit(), + CCInfo); // Analyze return values. 
MipsCCInfo.analyzeReturn(Outs, Subtarget->mipsSEUsesSoftFloat(), @@ -2759,7 +2850,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain, if (!Reg) llvm_unreachable("sret virtual register not created in the entry block"); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); - unsigned V0 = isN64() ? Mips::V0_64 : Mips::V0; + unsigned V0 = Subtarget->isABI_N64() ? Mips::V0_64 : Mips::V0; Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag); Flag = Chain.getValue(1); @@ -2980,9 +3071,9 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const return std::make_pair(0U, &Mips::CPU16RegsRegClass); return std::make_pair(0U, &Mips::GPR32RegClass); } - if (VT == MVT::i64 && !isGP64bit()) + if (VT == MVT::i64 && !Subtarget->isGP64bit()) return std::make_pair(0U, &Mips::GPR32RegClass); - if (VT == MVT::i64 && isGP64bit()) + if (VT == MVT::i64 && Subtarget->isGP64bit()) return std::make_pair(0U, &Mips::GPR64RegClass); // This will generate an error message return std::make_pair(0U, nullptr); @@ -3169,7 +3260,7 @@ bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { } unsigned MipsTargetLowering::getJumpTableEncoding() const { - if (isN64()) + if (Subtarget->isABI_N64()) return MachineJumpTableInfo::EK_GPRel64BlockAddress; return TargetLowering::getJumpTableEncoding(); diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 4ac33bf1142c..4701bc412739 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -17,7 +17,6 @@ #include "MCTargetDesc/MipsBaseInfo.h" #include "Mips.h" -#include "MipsSubtarget.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/Function.h" @@ -210,6 +209,7 @@ namespace llvm { // TargetLowering Implementation //===--------------------------------------------------------------------===// class MipsFunctionInfo; + class MipsSubtarget; class MipsTargetLowering : public TargetLowering { bool isMicroMips; @@ -438,12 +438,6 @@ namespace llvm { // Subtarget Info const MipsSubtarget *Subtarget; - bool hasMips64() const { return Subtarget->hasMips64(); } - bool isGP64bit() const { return Subtarget->isGP64bit(); } - bool isO32() const { return Subtarget->isABI_O32(); } - bool isN32() const { return Subtarget->isABI_N32(); } - bool isN64() const { return Subtarget->isABI_N64(); } - private: // Create a TargetGlobalAddress node. SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG, @@ -598,6 +592,12 @@ namespace llvm { unsigned getJumpTableEncoding() const override; + /// Emit a sign-extension using sll/sra, seb, or seh appropriately. + MachineBasicBlock *emitSignExtendToI32InReg(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size, unsigned DstReg, + unsigned SrcRec) const; + MachineBasicBlock *emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode, bool Nand = false) const; MachineBasicBlock *emitAtomicBinaryPartword(MachineInstr *MI, @@ -607,6 +607,7 @@ namespace llvm { MachineBasicBlock *BB, unsigned Size) const; MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr *MI, MachineBasicBlock *BB, unsigned Size) const; + MachineBasicBlock *emitSEL_D(MachineInstr *MI, MachineBasicBlock *BB) const; }; /// Create MipsTargetLowering objects. 
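
A pattern repeated across MipsISelDAGToDAG and MipsISelLowering in this commit: the convenience forwarders (isN64(), isO32(), isGP64bit(), ...) are deleted and every call site queries a stored MipsSubtarget pointer directly, with the DAG-to-DAG pass re-fetching that pointer in runOnMachineFunction. A reduced sketch of the shape of the change, using stub types rather than the real LLVM classes:

    // Stub standing in for MipsSubtarget; only the query used below is modeled.
    struct SubtargetStub {
      bool IsN64ABI;
      bool isABI_N64() const { return IsN64ABI; }
    };

    struct LoweringStub {
      // A pointer (not a reference or cached bools) so it can be re-bound
      // for each function being compiled.
      const SubtargetStub *Subtarget;

      // Before: `isN64() ? Mips::SP_64 : Mips::SP` via a private wrapper.
      // After: the subtarget is named explicitly at every call site.
      int stackPointerReg() const {
        return Subtarget->isABI_N64() ? /*Mips::SP_64*/ 1 : /*Mips::SP*/ 0;
      }

      // Per-function re-binding, as runOnMachineFunction now does.
      void rebind(const SubtargetStub *STI) { Subtarget = STI; }
    };

Switching the member from a reference to a pointer is what makes the per-function re-binding legal C++.
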
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 32cda3b278a5..2260d53567f6 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -57,13 +57,13 @@ let PrintMethod = "printFCCOperand", DecoderMethod = "DecodeCondCode" in
 // Feature predicates.
 //===----------------------------------------------------------------------===//
-def IsFP64bit : Predicate<"Subtarget.isFP64bit()">,
+def IsFP64bit : Predicate<"Subtarget->isFP64bit()">,
                 AssemblerPredicate<"FeatureFP64Bit">;
-def NotFP64bit : Predicate<"!Subtarget.isFP64bit()">,
+def NotFP64bit : Predicate<"!Subtarget->isFP64bit()">,
                  AssemblerPredicate<"!FeatureFP64Bit">;
-def IsSingleFloat : Predicate<"Subtarget.isSingleFloat()">,
+def IsSingleFloat : Predicate<"Subtarget->isSingleFloat()">,
                     AssemblerPredicate<"FeatureSingleFloat">;
-def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">,
+def IsNotSingleFloat : Predicate<"!Subtarget->isSingleFloat()">,
                        AssemblerPredicate<"!FeatureSingleFloat">;
 
 //===----------------------------------------------------------------------===//
@@ -153,6 +153,15 @@ class MTC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
   InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"),
          [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR, opstr>;
 
+class MTC1_64_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+                 InstrItinClass Itin> :
+  InstSE<(outs DstRC:$fs), (ins DstRC:$fs_in, SrcRC:$rt),
+         !strconcat(opstr, "\t$rt, $fs"), [], Itin, FrmFR, opstr> {
+  // $fs_in is part of a white lie to work around a widespread bug in the FPU
+  // implementation. See expandBuildPairF64 for details.
+  let Constraints = "$fs = $fs_in";
+}
+
 class LW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
             SDPatternOperator OpNode= null_frag> :
   InstSE<(outs RC:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
@@ -249,11 +258,11 @@ multiclass C_COND_M<string TypeStr, RegisterOperand RC, bits<5> fmt,
   def C_NGT_#NAME : C_COND_FT<"ngt", TypeStr, RC, itin>, C_COND_FM<fmt, 15>;
 }
 
-defm S : C_COND_M<"s", FGR32Opnd, 16, II_C_CC_S>;
-defm D32 : C_COND_M<"d", AFGR64Opnd, 17, II_C_CC_D>,
+defm S : C_COND_M<"s", FGR32Opnd, 16, II_C_CC_S>, ISA_MIPS1_NOT_32R6_64R6;
+defm D32 : C_COND_M<"d", AFGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6,
            AdditionalRequires<[NotFP64bit]>;
 let DecoderNamespace = "Mips64" in
-defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>,
+defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6,
            AdditionalRequires<[IsFP64bit]>;
 
 //===----------------------------------------------------------------------===//
@@ -355,8 +364,12 @@ def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1, bitconvert>,
            MFC1_FM<4>;
 def MFHC1 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, II_MFHC1>,
             MFC1_FM<3>, ISA_MIPS32R2;
-def MTHC1 : MMRel, MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, II_MTHC1>,
-            MFC1_FM<7>, ISA_MIPS32R2;
+def MTHC1_D32 : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
+                MFC1_FM<7>, ISA_MIPS32R2, AdditionalRequires<[NotFP64bit]>;
+def MTHC1_D64 : MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>,
+                MFC1_FM<7>, ISA_MIPS32R2, AdditionalRequires<[IsFP64bit]> {
+  let DecoderNamespace = "Mips64";
+}
 def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, II_DMFC1,
                     bitconvert>, MFC1_FM<1>, ISA_MIPS3;
 def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, II_DMTC1,
@@ -390,56 +403,64 @@ def SDC1 : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>, LW_FM<0x3d>,
 // Cop2 Memory Instructions
// FIXME: These aren't really FPU instructions and as such don't belong in this // file -def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>; -def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>; -def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>, ISA_MIPS2; -def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>, ISA_MIPS2; +def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>, + ISA_MIPS1_NOT_32R6_64R6; +def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>, + ISA_MIPS1_NOT_32R6_64R6; +def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>, + ISA_MIPS2_NOT_32R6_64R6; +def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>, + ISA_MIPS2_NOT_32R6_64R6; // Cop3 Memory Instructions // FIXME: These aren't really FPU instructions and as such don't belong in this // file -def LWC3 : LW_FT<"lwc3", COP3Opnd, NoItinerary, load>, LW_FM<0x33>; -def SWC3 : SW_FT<"swc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3b>; -def LDC3 : LW_FT<"ldc3", COP3Opnd, NoItinerary, load>, LW_FM<0x37>, ISA_MIPS2; -def SDC3 : SW_FT<"sdc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3f>, ISA_MIPS2; +let DecoderNamespace = "COP3_" in { + def LWC3 : LW_FT<"lwc3", COP3Opnd, NoItinerary, load>, LW_FM<0x33>; + def SWC3 : SW_FT<"swc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3b>; + def LDC3 : LW_FT<"ldc3", COP3Opnd, NoItinerary, load>, LW_FM<0x37>, + ISA_MIPS2; + def SDC3 : SW_FT<"sdc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3f>, + ISA_MIPS2; +} // Indexed loads and stores. // Base register + offset register addressing mode (indicated by "x" in the // instruction mnemonic) is disallowed under NaCl. let AdditionalPredicates = [IsNotNaCl] in { def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>, - INSN_MIPS4_32R2; + INSN_MIPS4_32R2_NOT_32R6_64R6; def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>, - INSN_MIPS4_32R2; + INSN_MIPS4_32R2_NOT_32R6_64R6; } let AdditionalPredicates = [NotInMicroMips, IsNotNaCl] in { def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>, - INSN_MIPS4_32R2, FGR_32; + INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32; def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>, - INSN_MIPS4_32R2, FGR_32; + INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32; } let DecoderNamespace="Mips64" in { def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>, - INSN_MIPS4_32R2, FGR_64; + INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64; def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>, - INSN_MIPS4_32R2, FGR_64; + INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64; } // Load/store doubleword indexed unaligned. 
let AdditionalPredicates = [IsNotNaCl] in { def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>, - INSN_MIPS5_32R2, FGR_32; + INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32; def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>, - INSN_MIPS5_32R2, FGR_32; + INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32; } let DecoderNamespace="Mips64" in { def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>, - INSN_MIPS5_32R2, FGR_64; + INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64; def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>, - INSN_MIPS5_32R2, FGR_64; + INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64; } /// Floating-point Aritmetic @@ -457,42 +478,42 @@ def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>, defm FSUB : ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>; def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>, - MADDS_FM<4, 0>, ISA_MIPS32R2; + MADDS_FM<4, 0>, ISA_MIPS32R2_NOT_32R6_64R6; def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>, - MADDS_FM<5, 0>, ISA_MIPS32R2; + MADDS_FM<5, 0>, ISA_MIPS32R2_NOT_32R6_64R6; let AdditionalPredicates = [NoNaNsFPMath] in { def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>, - MADDS_FM<6, 0>, ISA_MIPS32R2; + MADDS_FM<6, 0>, ISA_MIPS32R2_NOT_32R6_64R6; def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>, - MADDS_FM<7, 0>, ISA_MIPS32R2; + MADDS_FM<7, 0>, ISA_MIPS32R2_NOT_32R6_64R6; } def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>, - MADDS_FM<4, 1>, ISA_MIPS32R2, FGR_32; + MADDS_FM<4, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32; def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>, - MADDS_FM<5, 1>, ISA_MIPS32R2, FGR_32; + MADDS_FM<5, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32; let AdditionalPredicates = [NoNaNsFPMath] in { def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>, - MADDS_FM<6, 1>, ISA_MIPS32R2, FGR_32; + MADDS_FM<6, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32; def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>, - MADDS_FM<7, 1>, ISA_MIPS32R2, FGR_32; + MADDS_FM<7, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32; } let isCodeGenOnly=1 in { def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>, - MADDS_FM<4, 1>, ISA_MIPS32R2, FGR_64; + MADDS_FM<4, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64; def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>, - MADDS_FM<5, 1>, ISA_MIPS32R2, FGR_64; + MADDS_FM<5, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64; } let AdditionalPredicates = [NoNaNsFPMath], isCodeGenOnly=1 in { def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>, - MADDS_FM<6, 1>, ISA_MIPS32R2, FGR_64; + MADDS_FM<6, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64; def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>, - MADDS_FM<7, 1>, ISA_MIPS32R2, FGR_64; + MADDS_FM<7, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64; } //===----------------------------------------------------------------------===// @@ -504,9 +525,9 @@ def MIPS_BRANCH_F : PatLeaf<(i32 0)>; def MIPS_BRANCH_T : PatLeaf<(i32 1)>; def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, IIBranch, MIPS_BRANCH_F>, - BC1F_FM<0, 0>; + BC1F_FM<0, 0>, ISA_MIPS1_NOT_32R6_64R6; def BC1T : MMRel, BC1F_FT<"bc1t", brtarget, IIBranch, MIPS_BRANCH_T>, - BC1F_FM<0, 1>; + BC1F_FM<0, 1>, ISA_MIPS1_NOT_32R6_64R6; //===----------------------------------------------------------------------===// // Floating Point Flag Conditions @@ -531,12 +552,13 @@ def MIPS_FCOND_LE : PatLeaf<(i32 14)>; def MIPS_FCOND_NGT : 
PatLeaf<(i32 15)>; /// Floating Point Compare -def FCMP_S32 : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM<16>; +def FCMP_S32 : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM<16>, + ISA_MIPS1_NOT_32R6_64R6; def FCMP_D32 : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>, - AdditionalRequires<[NotFP64bit]>; + ISA_MIPS1_NOT_32R6_64R6, AdditionalRequires<[NotFP64bit]>; let DecoderNamespace = "Mips64" in def FCMP_D64 : CEQS_FT<"d", FGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>, - AdditionalRequires<[IsFP64bit]>; + ISA_MIPS1_NOT_32R6_64R6, AdditionalRequires<[IsFP64bit]>; //===----------------------------------------------------------------------===// // Floating Point Pseudo-Instructions @@ -569,8 +591,10 @@ def ExtractElementF64_64 : ExtractElementF64Base<FGR64Opnd>, //===----------------------------------------------------------------------===// // InstAliases. //===----------------------------------------------------------------------===// -def : MipsInstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>; -def : MipsInstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>; +def : MipsInstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>, + ISA_MIPS1_NOT_32R6_64R6; +def : MipsInstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>, + ISA_MIPS1_NOT_32R6_64R6; //===----------------------------------------------------------------------===// // Floating Point Patterns diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index 0377eabed78d..6a01ae560f3a 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -844,6 +844,44 @@ class BARRIER_FM<bits<5> op> : StdArch { let Inst{5-0} = 0; // SLL } +class SDBBP_FM : StdArch { + bits<20> code_; + + bits<32> Inst; + + let Inst{31-26} = 0b011100; // SPECIAL2 + let Inst{25-6} = code_; + let Inst{5-0} = 0b111111; // SDBBP +} + +class JR_HB_FM<bits<6> op> : StdArch{ + bits<5> rs; + + bits<32> Inst; + + let Inst{31-26} = 0; // SPECIAL + let Inst{25-21} = rs; + let Inst{20-11} = 0; + let Inst{10} = 1; + let Inst{9-6} = 0; + let Inst{5-0} = op; +} + +class JALR_HB_FM<bits<6> op> : StdArch { + bits<5> rd; + bits<5> rs; + + bits<32> Inst; + + let Inst{31-26} = 0; // SPECIAL + let Inst{25-21} = rs; + let Inst{20-16} = 0; + let Inst{15-11} = rd; + let Inst{10} = 1; + let Inst{9-6} = 0; + let Inst{5-0} = op; +} + class COP0_TLB_FM<bits<6> op> : StdArch { bits<32> Inst; @@ -852,3 +890,17 @@ class COP0_TLB_FM<bits<6> op> : StdArch { let Inst{24-6} = 0; let Inst{5-0} = op; // Operation } + +class CACHEOP_FM<bits<6> op> : StdArch { + bits<21> addr; + bits<5> hint; + bits<5> base = addr{20-16}; + bits<16> offset = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = base; + let Inst{20-16} = hint; + let Inst{15-0} = offset; +} diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 0d3cb7578e25..8e9472c2d9d8 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -146,61 +146,61 @@ def MipsSDR : SDNode<"MipsISD::SDR", SDTStore, //===----------------------------------------------------------------------===// // Mips Instruction Predicate Definitions. 
//===----------------------------------------------------------------------===// -def HasMips2 : Predicate<"Subtarget.hasMips2()">, +def HasMips2 : Predicate<"Subtarget->hasMips2()">, AssemblerPredicate<"FeatureMips2">; -def HasMips3_32 : Predicate<"Subtarget.hasMips3_32()">, +def HasMips3_32 : Predicate<"Subtarget->hasMips3_32()">, AssemblerPredicate<"FeatureMips3_32">; -def HasMips3_32r2 : Predicate<"Subtarget.hasMips3_32r2()">, +def HasMips3_32r2 : Predicate<"Subtarget->hasMips3_32r2()">, AssemblerPredicate<"FeatureMips3_32r2">; -def HasMips3 : Predicate<"Subtarget.hasMips3()">, +def HasMips3 : Predicate<"Subtarget->hasMips3()">, AssemblerPredicate<"FeatureMips3">; -def HasMips4_32 : Predicate<"Subtarget.hasMips4_32()">, +def HasMips4_32 : Predicate<"Subtarget->hasMips4_32()">, AssemblerPredicate<"FeatureMips4_32">; -def HasMips4_32r2 : Predicate<"Subtarget.hasMips4_32r2()">, +def HasMips4_32r2 : Predicate<"Subtarget->hasMips4_32r2()">, AssemblerPredicate<"FeatureMips4_32r2">; -def HasMips5_32r2 : Predicate<"Subtarget.hasMips5_32r2()">, +def HasMips5_32r2 : Predicate<"Subtarget->hasMips5_32r2()">, AssemblerPredicate<"FeatureMips5_32r2">; -def HasMips32 : Predicate<"Subtarget.hasMips32()">, +def HasMips32 : Predicate<"Subtarget->hasMips32()">, AssemblerPredicate<"FeatureMips32">; -def HasMips32r2 : Predicate<"Subtarget.hasMips32r2()">, +def HasMips32r2 : Predicate<"Subtarget->hasMips32r2()">, AssemblerPredicate<"FeatureMips32r2">; -def HasMips32r6 : Predicate<"Subtarget.hasMips32r6()">, +def HasMips32r6 : Predicate<"Subtarget->hasMips32r6()">, AssemblerPredicate<"FeatureMips32r6">; -def NotMips32r6 : Predicate<"!Subtarget.hasMips32r6()">, +def NotMips32r6 : Predicate<"!Subtarget->hasMips32r6()">, AssemblerPredicate<"!FeatureMips32r6">; -def IsGP64bit : Predicate<"Subtarget.isGP64bit()">, +def IsGP64bit : Predicate<"Subtarget->isGP64bit()">, AssemblerPredicate<"FeatureGP64Bit">; -def IsGP32bit : Predicate<"!Subtarget.isGP64bit()">, +def IsGP32bit : Predicate<"!Subtarget->isGP64bit()">, AssemblerPredicate<"!FeatureGP64Bit">; -def HasMips64 : Predicate<"Subtarget.hasMips64()">, +def HasMips64 : Predicate<"Subtarget->hasMips64()">, AssemblerPredicate<"FeatureMips64">; -def HasMips64r2 : Predicate<"Subtarget.hasMips64r2()">, +def HasMips64r2 : Predicate<"Subtarget->hasMips64r2()">, AssemblerPredicate<"FeatureMips64r2">; -def HasMips64r6 : Predicate<"Subtarget.hasMips64r6()">, +def HasMips64r6 : Predicate<"Subtarget->hasMips64r6()">, AssemblerPredicate<"FeatureMips64r6">; -def NotMips64r6 : Predicate<"!Subtarget.hasMips64r6()">, +def NotMips64r6 : Predicate<"!Subtarget->hasMips64r6()">, AssemblerPredicate<"!FeatureMips64r6">; -def IsN64 : Predicate<"Subtarget.isABI_N64()">, +def IsN64 : Predicate<"Subtarget->isABI_N64()">, AssemblerPredicate<"FeatureN64">; -def InMips16Mode : Predicate<"Subtarget.inMips16Mode()">, +def InMips16Mode : Predicate<"Subtarget->inMips16Mode()">, AssemblerPredicate<"FeatureMips16">; -def HasCnMips : Predicate<"Subtarget.hasCnMips()">, +def HasCnMips : Predicate<"Subtarget->hasCnMips()">, AssemblerPredicate<"FeatureCnMips">; def RelocStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">, AssemblerPredicate<"FeatureMips32">; def RelocPIC : Predicate<"TM.getRelocationModel() == Reloc::PIC_">, AssemblerPredicate<"FeatureMips32">; def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">; -def HasStdEnc : Predicate<"Subtarget.hasStandardEncoding()">, +def HasStdEnc : Predicate<"Subtarget->hasStandardEncoding()">, AssemblerPredicate<"!FeatureMips16">; -def NotDSP : 
Predicate<"!Subtarget.hasDSP()">; -def InMicroMips : Predicate<"Subtarget.inMicroMipsMode()">, +def NotDSP : Predicate<"!Subtarget->hasDSP()">; +def InMicroMips : Predicate<"Subtarget->inMicroMipsMode()">, AssemblerPredicate<"FeatureMicroMips">; -def NotInMicroMips : Predicate<"!Subtarget.inMicroMipsMode()">, +def NotInMicroMips : Predicate<"!Subtarget->inMicroMipsMode()">, AssemblerPredicate<"!FeatureMicroMips">; -def IsLE : Predicate<"Subtarget.isLittle()">; -def IsBE : Predicate<"!Subtarget.isLittle()">; -def IsNotNaCl : Predicate<"!Subtarget.isTargetNaCl()">; +def IsLE : Predicate<"Subtarget->isLittle()">; +def IsBE : Predicate<"!Subtarget->isLittle()">; +def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; //===----------------------------------------------------------------------===// // Mips GPR size adjectives. @@ -232,8 +232,17 @@ class ISA_MIPS3_NOT_32R6_64R6 { list<Predicate> InsnPredicates = [HasMips3, NotMips32r6, NotMips64r6]; } class ISA_MIPS32 { list<Predicate> InsnPredicates = [HasMips32]; } +class ISA_MIPS32_NOT_32R6_64R6 { + list<Predicate> InsnPredicates = [HasMips32, NotMips32r6, NotMips64r6]; +} class ISA_MIPS32R2 { list<Predicate> InsnPredicates = [HasMips32r2]; } +class ISA_MIPS32R2_NOT_32R6_64R6 { + list<Predicate> InsnPredicates = [HasMips32r2, NotMips32r6, NotMips64r6]; +} class ISA_MIPS64 { list<Predicate> InsnPredicates = [HasMips64]; } +class ISA_MIPS64_NOT_64R6 { + list<Predicate> InsnPredicates = [HasMips64, NotMips64r6]; +} class ISA_MIPS64R2 { list<Predicate> InsnPredicates = [HasMips64r2]; } class ISA_MIPS32R6 { list<Predicate> InsnPredicates = [HasMips32r6]; } class ISA_MIPS64R6 { list<Predicate> InsnPredicates = [HasMips64r6]; } @@ -241,17 +250,32 @@ class ISA_MIPS64R6 { list<Predicate> InsnPredicates = [HasMips64r6]; } // The portions of MIPS-III that were also added to MIPS32 class INSN_MIPS3_32 { list<Predicate> InsnPredicates = [HasMips3_32]; } +// The portions of MIPS-III that were also added to MIPS32 but were removed in +// MIPS32r6 and MIPS64r6. +class INSN_MIPS3_32_NOT_32R6_64R6 { + list<Predicate> InsnPredicates = [HasMips3_32, NotMips32r6, NotMips64r6]; +} + // The portions of MIPS-III that were also added to MIPS32 class INSN_MIPS3_32R2 { list<Predicate> InsnPredicates = [HasMips3_32r2]; } -// The portions of MIPS-IV that were also added to MIPS32 -class INSN_MIPS4_32 { list<Predicate> InsnPredicates = [HasMips4_32]; } +// The portions of MIPS-IV that were also added to MIPS32 but were removed in +// MIPS32r6 and MIPS64r6. +class INSN_MIPS4_32_NOT_32R6_64R6 { + list<Predicate> InsnPredicates = [HasMips4_32, NotMips32r6, NotMips64r6]; +} -// The portions of MIPS-IV that were also added to MIPS32R2 -class INSN_MIPS4_32R2 { list<Predicate> InsnPredicates = [HasMips4_32r2]; } +// The portions of MIPS-IV that were also added to MIPS32r2 but were removed in +// MIPS32r6 and MIPS64r6. +class INSN_MIPS4_32R2_NOT_32R6_64R6 { + list<Predicate> InsnPredicates = [HasMips4_32r2, NotMips32r6, NotMips64r6]; +} -// The portions of MIPS-V that were also added to MIPS32R2 -class INSN_MIPS5_32R2 { list<Predicate> InsnPredicates = [HasMips5_32r2]; } +// The portions of MIPS-V that were also added to MIPS32r2 but were removed in +// MIPS32r6 and MIPS64r6. 
+class INSN_MIPS5_32R2_NOT_32R6_64R6 { + list<Predicate> InsnPredicates = [HasMips5_32r2, NotMips32r6, NotMips64r6]; +} //===----------------------------------------------------------------------===// @@ -328,7 +352,9 @@ def calltarget : Operand<iPTR> { let ParserMatchClass = MipsJumpTargetAsmOperand; } +def simm9 : Operand<i32>; def simm10 : Operand<i32>; +def simm11 : Operand<i32>; def simm16 : Operand<i32> { let DecoderMethod= "DecodeSimm16"; @@ -337,6 +363,13 @@ def simm16 : Operand<i32> { def simm19_lsl2 : Operand<i32> { let EncoderMethod = "getSimm19Lsl2Encoding"; let DecoderMethod = "DecodeSimm19Lsl2"; + let ParserMatchClass = MipsJumpTargetAsmOperand; +} + +def simm18_lsl3 : Operand<i32> { + let EncoderMethod = "getSimm18Lsl3Encoding"; + let DecoderMethod = "DecodeSimm18Lsl3"; + let ParserMatchClass = MipsJumpTargetAsmOperand; } def simm20 : Operand<i32> { @@ -386,6 +419,15 @@ def MipsMemAsmOperand : AsmOperandClass { let ParserMethod = "parseMemOperand"; } +def MipsMemSimm11AsmOperand : AsmOperandClass { + let Name = "MemOffsetSimm11"; + let SuperClasses = [MipsMemAsmOperand]; + let RenderMethod = "addMemOperands"; + let ParserMethod = "parseMemOperand"; + let PredicateMethod = "isMemWithSimmOffset<11>"; + //let DiagnosticType = "Simm11"; +} + def MipsInvertedImmoperand : AsmOperandClass { let Name = "InvNum"; let RenderMethod = "addImmOperands"; @@ -417,6 +459,17 @@ def mem_msa : mem_generic { let EncoderMethod = "getMSAMemEncoding"; } +def mem_simm9 : mem_generic { + let MIOperandInfo = (ops ptr_rc, simm9); + let EncoderMethod = "getMemEncoding"; +} + +def mem_simm11 : mem_generic { + let MIOperandInfo = (ops ptr_rc, simm11); + let EncoderMethod = "getMemEncoding"; + let ParserMatchClass = MipsMemSimm11AsmOperand; +} + def mem_ea : Operand<iPTR> { let PrintMethod = "printMemOperandEA"; let MIOperandInfo = (ops ptr_rc, simm16); @@ -690,20 +743,11 @@ class JumpFR<string opstr, RegisterOperand RO, FrmR, opstr>; // Indirect branch -class IndirectBranch<string opstr, RegisterOperand RO> : - JumpFR<opstr, RO, brind> { +class IndirectBranch<string opstr, RegisterOperand RO> : JumpFR<opstr, RO> { let isBranch = 1; let isIndirectBranch = 1; } -// Return instruction -class RetBase<string opstr, RegisterOperand RO>: JumpFR<opstr, RO> { - let isReturn = 1; - let isCodeGenOnly = 1; - let hasCtrlDep = 1; - let hasExtraSrcRegAllocReq = 1; -} - // Jump and Link (Call) let isCall=1, hasDelaySlot=1, Defs = [RA] in { class JumpLink<string opstr, DAGOperand opnd> : @@ -1042,7 +1086,7 @@ def SUBu : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>, ADD_FM<0, 0x23>; let Defs = [HI0, LO0] in def MUL : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>, - ADD_FM<0x1c, 2>, ISA_MIPS32; + ADD_FM<0x1c, 2>, ISA_MIPS32_NOT_32R6_64R6; def ADD : MMRel, ArithLogicR<"add", GPR32Opnd>, ADD_FM<0, 0x20>; def SUB : MMRel, ArithLogicR<"sub", GPR32Opnd>, ADD_FM<0, 0x22>; def SLT : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>; @@ -1103,7 +1147,7 @@ def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>, ISA_MIPS1_NOT_32R6_64R6; } -def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM; +def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32; def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>; def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>; def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>; @@ -1127,6 +1171,7 @@ def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>, def BREAK : MMRel, BRK_FT<"break">, BRK_FM<0xd>; def SYSCALL : MMRel, SYS_FT<"syscall">, SYS_FM<0xc>; 
 def TRAP : TrapBase<BREAK>;
+def SDBBP : SYS_FT<"sdbbp">, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6;
 
 def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>, INSN_MIPS3_32;
 def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>, ISA_MIPS32;
 
@@ -1139,8 +1184,8 @@ let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably
 def WAIT : WAIT_FT<"wait">, WAIT_FM;
 
 /// Load-linked, Store-conditional
-def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, ISA_MIPS2;
-def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, ISA_MIPS2;
+def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, ISA_MIPS2_NOT_32R6_64R6;
+def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, ISA_MIPS2_NOT_32R6_64R6;
 }
 
 /// Jump and Branch Instructions
@@ -1161,17 +1206,49 @@ def B : UncondBranch<BEQ>;
 def JAL : MMRel, JumpLink<"jal", calltarget>, FJ<3>;
 let AdditionalPredicates = [NotInMicroMips] in {
-def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
-def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
+  def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
+  def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
 }
-def JALX : JumpLink<"jalx", calltarget>, FJ<0x1D>;
-def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>;
-def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>;
+
+// FIXME: JALX really requires either MIPS16 or microMIPS in addition to MIPS32.
+def JALX : JumpLink<"jalx", calltarget>, FJ<0x1D>, ISA_MIPS32_NOT_32R6_64R6;
+def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>,
+             ISA_MIPS1_NOT_32R6_64R6;
+def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>,
+             ISA_MIPS1_NOT_32R6_64R6;
 def BAL_BR : BAL_BR_Pseudo<BGEZAL>;
 def TAILCALL : TailCall<J>;
 def TAILCALL_R : TailCallReg<GPR32Opnd, JR>;
-def RET : MMRel, RetBase<"ret", GPR32Opnd>, MTLO_FM<8>;
+// Indirect branches are matched as PseudoIndirectBranch/PseudoIndirectBranch64
+// then are expanded to JR, JR64, JALR, or JALR64 depending on the ISA.
+class PseudoIndirectBranchBase<RegisterOperand RO> :
+    MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)], IIBranch> {
+  let isTerminator=1;
+  let isBarrier=1;
+  let hasDelaySlot = 1;
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+}
+
+def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>;
+
+// Return instructions are matched as a RetRA instruction, then are expanded
+// into PseudoReturn/PseudoReturn64 after register allocation. Finally,
+// MipsAsmPrinter expands this into JR, JR64, JALR, or JALR64 depending on the
+// ISA.
+class PseudoReturnBase<RegisterOperand RO> : MipsPseudo<(outs), (ins RO:$rs),
+                                                        [], IIBranch> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let hasDelaySlot = 1;
+  let isReturn = 1;
+  let isCodeGenOnly = 1;
+  let hasCtrlDep = 1;
+  let hasExtraSrcRegAllocReq = 1;
+}
+
+def PseudoReturn : PseudoReturnBase<GPR32Opnd>;
 
 // Exception handling related node and instructions.
 // The conversion sequence is:
@@ -1196,20 +1273,24 @@ let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1 in {
 
 /// Multiply and Divide Instructions.
def MULT : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>, - MULT_FM<0, 0x18>; + MULT_FM<0, 0x18>, ISA_MIPS1_NOT_32R6_64R6; def MULTu : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>, - MULT_FM<0, 0x19>; + MULT_FM<0, 0x19>, ISA_MIPS1_NOT_32R6_64R6; def SDIV : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>, - MULT_FM<0, 0x1a>; + MULT_FM<0, 0x1a>, ISA_MIPS1_NOT_32R6_64R6; def UDIV : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>, - MULT_FM<0, 0x1b>; + MULT_FM<0, 0x1b>, ISA_MIPS1_NOT_32R6_64R6; -def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>; -def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>; +def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>, + ISA_MIPS1_NOT_32R6_64R6; +def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>, + ISA_MIPS1_NOT_32R6_64R6; let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug AdditionalPredicates = [NotInMicroMips] in { -def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>; -def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>; +def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>, + ISA_MIPS1_NOT_32R6_64R6; +def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>, + ISA_MIPS1_NOT_32R6_64R6; } /// Sign Ext In Register Instructions. @@ -1219,8 +1300,10 @@ def SEH : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>, SEB_FM<0x18, 0x20>, ISA_MIPS32R2; /// Count Leading -def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>, ISA_MIPS32; -def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>, ISA_MIPS32; +def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>, + ISA_MIPS32_NOT_32R6_64R6; +def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>, + ISA_MIPS32_NOT_32R6_64R6; /// Word Swap Bytes Within Halfwords def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>, ISA_MIPS32R2; @@ -1235,27 +1318,37 @@ def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>; def LEA_ADDiu : MMRel, EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>; // MADD*/MSUB* -def MADD : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>, ISA_MIPS32; -def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>, ISA_MIPS32; -def MSUB : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>, ISA_MIPS32; -def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>, ISA_MIPS32; +def MADD : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>, + ISA_MIPS32_NOT_32R6_64R6; +def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>, + ISA_MIPS32_NOT_32R6_64R6; +def MSUB : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>, + ISA_MIPS32_NOT_32R6_64R6; +def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>, + ISA_MIPS32_NOT_32R6_64R6; let AdditionalPredicates = [NotDSP] in { -def PseudoMULT : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, II_MULT>; -def PseudoMULTu : MultDivPseudo<MULTu, ACC64, GPR32Opnd, MipsMultu, II_MULTU>; -def PseudoMFHI : PseudoMFLOHI<GPR32, ACC64, MipsMFHI>; -def PseudoMFLO : PseudoMFLOHI<GPR32, ACC64, MipsMFLO>; -def PseudoMTLOHI : PseudoMTLOHI<ACC64, GPR32>; -def PseudoMADD : MAddSubPseudo<MADD, MipsMAdd, II_MADD>; -def PseudoMADDU : MAddSubPseudo<MADDU, MipsMAddu, II_MADDU>; -def PseudoMSUB : MAddSubPseudo<MSUB, MipsMSub, II_MSUB>; -def PseudoMSUBU : MAddSubPseudo<MSUBU, MipsMSubu, II_MSUBU>; +def PseudoMULT : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, II_MULT>, + ISA_MIPS1_NOT_32R6_64R6; +def 
PseudoMULTu : MultDivPseudo<MULTu, ACC64, GPR32Opnd, MipsMultu, II_MULTU>, + ISA_MIPS1_NOT_32R6_64R6; +def PseudoMFHI : PseudoMFLOHI<GPR32, ACC64, MipsMFHI>, ISA_MIPS1_NOT_32R6_64R6; +def PseudoMFLO : PseudoMFLOHI<GPR32, ACC64, MipsMFLO>, ISA_MIPS1_NOT_32R6_64R6; +def PseudoMTLOHI : PseudoMTLOHI<ACC64, GPR32>, ISA_MIPS1_NOT_32R6_64R6; +def PseudoMADD : MAddSubPseudo<MADD, MipsMAdd, II_MADD>, + ISA_MIPS32_NOT_32R6_64R6; +def PseudoMADDU : MAddSubPseudo<MADDU, MipsMAddu, II_MADDU>, + ISA_MIPS32_NOT_32R6_64R6; +def PseudoMSUB : MAddSubPseudo<MSUB, MipsMSub, II_MSUB>, + ISA_MIPS32_NOT_32R6_64R6; +def PseudoMSUBU : MAddSubPseudo<MSUBU, MipsMSubu, II_MSUBU>, + ISA_MIPS32_NOT_32R6_64R6; } def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, II_DIV, - 0, 1, 1>; + 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6; def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU, - 0, 1, 1>; + 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6; def RDHWR : ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM; @@ -1274,6 +1367,46 @@ def SSNOP : Barrier<"ssnop">, BARRIER_FM<1>; def EHB : Barrier<"ehb">, BARRIER_FM<3>; def PAUSE : Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2; +// JR_HB and JALR_HB are defined here using the new style naming +// scheme because some of this code is shared with Mips32r6InstrInfo.td +// and because of that it doesn't follow the naming convention of the +// rest of the file. To avoid a mixture of old vs new style, the new +// style was chosen. +class JR_HB_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs); + dag InOperandList = (ins GPROpnd:$rs); + string AsmString = !strconcat(instr_asm, "\t$rs"); + list<dag> Pattern = []; +} + +class JALR_HB_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$rd); + dag InOperandList = (ins GPROpnd:$rs); + string AsmString = !strconcat(instr_asm, "\t$rd, $rs"); + list<dag> Pattern = []; +} + +class JR_HB_DESC : InstSE<(outs), (ins), "", [], NoItinerary, FrmJ>, + JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> { + let isBranch=1; + let isIndirectBranch=1; + let hasDelaySlot=1; + let isTerminator=1; + let isBarrier=1; +} + +class JALR_HB_DESC : InstSE<(outs), (ins), "", [], NoItinerary, FrmJ>, + JALR_HB_DESC_BASE<"jalr.hb", GPR32Opnd> { + let isIndirectBranch=1; + let hasDelaySlot=1; +} + +class JR_HB_ENC : JR_HB_FM<8>; +class JALR_HB_ENC : JALR_HB_FM<9>; + +def JR_HB : JR_HB_DESC, JR_HB_ENC, ISA_MIPS32_NOT_32R6_64R6; +def JALR_HB : JALR_HB_DESC, JALR_HB_ENC, ISA_MIPS32; + class TLB<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary, FrmOther>; def TLBP : TLB<"tlbp">, COP0_TLB_FM<0x08>; @@ -1281,6 +1414,15 @@ def TLBR : TLB<"tlbr">, COP0_TLB_FM<0x01>; def TLBWI : TLB<"tlbwi">, COP0_TLB_FM<0x02>; def TLBWR : TLB<"tlbwr">, COP0_TLB_FM<0x06>; +class CacheOp<string instr_asm, Operand MemOpnd, RegisterOperand GPROpnd> : + InstSE<(outs), (ins MemOpnd:$addr, uimm5:$hint), + !strconcat(instr_asm, "\t$hint, $addr"), [], NoItinerary, FrmOther>; + +def CACHE : CacheOp<"cache", mem, GPR32Opnd>, CACHEOP_FM<0b101111>, + INSN_MIPS3_32_NOT_32R6_64R6; +def PREF : CacheOp<"pref", mem, GPR32Opnd>, CACHEOP_FM<0b110011>, + INSN_MIPS3_32_NOT_32R6_64R6; + //===----------------------------------------------------------------------===// // Instruction aliases //===----------------------------------------------------------------------===// @@ -1289,19 +1431,23 @@ def : MipsInstAlias<"move $dst, $src", GPR_32 { let AdditionalPredicates = [NotInMicroMips]; } -def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, 
 brtarget:$offset), 0>;
+def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>,
+      ISA_MIPS1_NOT_32R6_64R6;
 def : MipsInstAlias<"addu $rs, $rt, $imm",
                     (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
 def : MipsInstAlias<"add $rs, $rt, $imm",
                     (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
 def : MipsInstAlias<"and $rs, $rt, $imm",
                     (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+def : MipsInstAlias<"and $rs, $imm",
+                    (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>;
 def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
 let Predicates = [NotInMicroMips] in {
 def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
 }
 def : MipsInstAlias<"jal $rs", (JALR RA, GPR32Opnd:$rs), 0>;
 def : MipsInstAlias<"jal $rd,$rs", (JALR GPR32Opnd:$rd, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"jalr.hb $rs", (JALR_HB RA, GPR32Opnd:$rs), 1>, ISA_MIPS32;
 def : MipsInstAlias<"not $rt, $rs",
                     (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
 def : MipsInstAlias<"neg $rt, $rs",
@@ -1318,6 +1464,8 @@ def : MipsInstAlias<"xor $rs, $rt, $imm",
                     (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
 def : MipsInstAlias<"or $rs, $rt, $imm",
                     (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
+def : MipsInstAlias<"or $rs, $imm",
+                    (ORi GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
 def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
 def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
 def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
@@ -1360,6 +1508,9 @@ def : MipsInstAlias<"sra $rd, $rt, $rs",
                     (SRAV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
 def : MipsInstAlias<"srl $rd, $rt, $rs",
                     (SRLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"sdbbp", (SDBBP 0)>, ISA_MIPS32_NOT_32R6_64R6;
+def : MipsInstAlias<"sync",
+                    (SYNC 0), 1>, ISA_MIPS2;
 //===----------------------------------------------------------------------===//
 // Assembler Pseudo Instructions
 //===----------------------------------------------------------------------===//
@@ -1412,6 +1563,10 @@ let AdditionalPredicates = [NotDSP] in {
                 (ADDiu GPR32:$src, imm:$imm)>;
 }
 
+// SYNC
+def : MipsPat<(MipsSync (i32 immz)),
+              (SYNC 0)>, ISA_MIPS2;
+
 // Call
 def : MipsPat<(MipsJmpLink (i32 tglobaladdr:$dst)),
               (JAL tglobaladdr:$dst)>;
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index acfe76e35cf5..c6838a37be28 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -15,6 +15,7 @@
 #include "Mips.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsMCNaCl.h"
 #include "MipsTargetMachine.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -64,7 +65,8 @@ namespace {
       : MachineFunctionPass(ID), TM(tm),
         IsPIC(TM.getRelocationModel() == Reloc::PIC_),
         ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
-        LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 10 : 9)) {}
+        LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 10 :
+            (!TM.getSubtarget<MipsSubtarget>().isTargetNaCl() ? 9 : 10))) {}
 
     const char *getPassName() const override {
       return "Mips Long Branch";
@@ -264,6 +266,13 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
   LongBrMBB->addSuccessor(BalTgtMBB);
   BalTgtMBB->addSuccessor(TgtMBB);
 
+  // We must select between the MIPS32r6/MIPS64r6 BAL (which is a normal
+  // instruction) and the pre-MIPS32r6/MIPS64r6 definition (which is a
+  // pseudo-instruction wrapping BGEZAL).
+
+  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+  unsigned BalOp = Subtarget.hasMips32r6() ? Mips::BAL : Mips::BAL_BR;
+
   if (ABI != MipsSubtarget::N64) {
     // $longbr:
     //  addiu $sp, $sp, -8
@@ -305,9 +314,11 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
     BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi), Mips::AT)
       .addMBB(TgtMBB).addMBB(BalTgtMBB);
     MIBundleBuilder(*LongBrMBB, Pos)
-      .append(BuildMI(*MF, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB))
-      .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT)
-              .addReg(Mips::AT).addMBB(TgtMBB).addMBB(BalTgtMBB));
+        .append(BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB))
+        .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT)
+                    .addReg(Mips::AT)
+                    .addMBB(TgtMBB)
+                    .addMBB(BalTgtMBB));
 
     Pos = BalTgtMBB->begin();
 
@@ -316,10 +327,23 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
     BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
       .addReg(Mips::SP).addImm(0);
 
-    MIBundleBuilder(*BalTgtMBB, Pos)
-      .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT))
-      .append(BuildMI(*MF, DL, TII->get(Mips::ADDiu), Mips::SP)
-              .addReg(Mips::SP).addImm(8));
+    if (!TM.getSubtarget<MipsSubtarget>().isTargetNaCl()) {
+      MIBundleBuilder(*BalTgtMBB, Pos)
+        .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT))
+        .append(BuildMI(*MF, DL, TII->get(Mips::ADDiu), Mips::SP)
+                .addReg(Mips::SP).addImm(8));
+    } else {
+      // In NaCl, modifying the sp is not allowed in a branch delay slot.
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+        .addReg(Mips::SP).addImm(8);
+
+      MIBundleBuilder(*BalTgtMBB, Pos)
+        .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT))
+        .append(BuildMI(*MF, DL, TII->get(Mips::NOP)));
+
+      // Bundle-align the target of indirect branch JR.
+      TgtMBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+    }
   } else {
     // $longbr:
     //  daddiu $sp, $sp, -16
@@ -364,11 +388,12 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
       .addReg(Mips::AT_64).addImm(16);
 
     MIBundleBuilder(*LongBrMBB, Pos)
-      .append(BuildMI(*MF, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB))
-      .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
-                      Mips::AT_64).addReg(Mips::AT_64)
-                                  .addMBB(TgtMBB, MipsII::MO_ABS_LO)
-                                  .addMBB(BalTgtMBB));
+        .append(BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB))
+        .append(
+            BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_DADDiu), Mips::AT_64)
+                .addReg(Mips::AT_64)
+                .addMBB(TgtMBB, MipsII::MO_ABS_LO)
+                .addMBB(BalTgtMBB));
 
     Pos = BalTgtMBB->begin();
 
@@ -450,9 +475,18 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
       continue;
 
     int ShVal = TM.getSubtarget<MipsSubtarget>().inMicroMipsMode() ? 2 : 4;
+    int64_t Offset = computeOffset(I->Br) / ShVal;
+
+    if (TM.getSubtarget<MipsSubtarget>().isTargetNaCl()) {
+      // The offset calculation does not include sandboxing instructions
+      // that will be added later in the MC layer. Since at this point we
+      // don't know the exact amount of code that "sandboxing" will add, we
+      // conservatively estimate that code will not grow more than 100%.
+      Offset *= 2;
+    }
 
     // Check if offset fits into 16-bit immediate field of branches.
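    // For illustration, not part of the patch: with ShVal == 4, the
    // isInt<16> test below accepts offsets of +/-32768 instruction words,
    // i.e. roughly +/-128 KiB of code. Doubling Offset for NaCl therefore
    // assumes, conservatively, that only about half of that range (~64 KiB)
    // remains reachable once the MC layer inserts the sandboxing code.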
- if (!ForceLongBranch && isInt<16>(computeOffset(I->Br) / ShVal)) + if (!ForceLongBranch && isInt<16>(Offset)) continue; I->HasLongBranch = true; diff --git a/lib/Target/Mips/MipsMSAInstrFormats.td b/lib/Target/Mips/MipsMSAInstrFormats.td index 6bd0366b52e7..bff2d0fab1ec 100644 --- a/lib/Target/Mips/MipsMSAInstrFormats.td +++ b/lib/Target/Mips/MipsMSAInstrFormats.td @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -def HasMSA : Predicate<"Subtarget.hasMSA()">, +def HasMSA : Predicate<"Subtarget->hasMSA()">, AssemblerPredicate<"FeatureMSA">; class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> { diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h index e9101cc77c0d..8c16f82bfb68 100644 --- a/lib/Target/Mips/MipsMachineFunction.h +++ b/lib/Target/Mips/MipsMachineFunction.h @@ -15,7 +15,6 @@ #define MIPS_MACHINE_FUNCTION_INFO_H #include "Mips16HardFloatInfo.h" -#include "MipsSubtarget.h" #include "llvm/ADT/StringMap.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index 83d25ab469c3..084449bba59c 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -93,6 +93,9 @@ MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (Subtarget.isFP64bit()) return CSR_O32_FP64_SaveList; + if (Subtarget.isFPXX()) + return CSR_O32_FPXX_SaveList; + return CSR_O32_SaveList; } @@ -110,6 +113,9 @@ MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const { if (Subtarget.isFP64bit()) return CSR_O32_FP64_RegMask; + if (Subtarget.isFPXX()) + return CSR_O32_FPXX_RegMask; + return CSR_O32_RegMask; } @@ -201,6 +207,11 @@ getReservedRegs(const MachineFunction &MF) const { Reserved.set(Mips::GP_64); } + if (Subtarget.isABI_O32() && !Subtarget.useOddSPReg()) { + for (const auto &Reg : Mips::OddSPRegClass) + Reserved.set(Reg); + } + return Reserved; } diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index 875a596eb4aa..6323da378850 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -340,6 +340,12 @@ def AFGR64 : RegisterClass<"Mips", [f64], 64, (add def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>; +// Used to reserve odd registers when given -mattr=+nooddspreg +def OddSP : RegisterClass<"Mips", [f32], 32, + (add (decimate (sequence "F%u", 1, 31), 2), + (decimate (sequence "F_HI%u", 1, 31), 2))>, + Unallocatable; + // FP control registers. def CCR : RegisterClass<"Mips", [i32], 32, (sequence "FCR%u", 0, 31)>, Unallocatable; @@ -348,6 +354,10 @@ def CCR : RegisterClass<"Mips", [i32], 32, (sequence "FCR%u", 0, 31)>, def FCC : RegisterClass<"Mips", [i32], 32, (sequence "FCC%u", 0, 7)>, Unallocatable; +// MIPS32r6/MIPS64r6 store FPU condition codes in normal FGR registers. +// This class allows us to represent this in codegen patterns. +def FGRCC : RegisterClass<"Mips", [i32], 32, (sequence "F%u", 0, 31)>; + def MSA128B: RegisterClass<"Mips", [v16i8], 128, (sequence "W%u", 0, 31)>; def MSA128H: RegisterClass<"Mips", [v8i16, v8f16], 128, @@ -512,6 +522,12 @@ def FGR32Opnd : RegisterOperand<FGR32> { let ParserMatchClass = FGR32AsmOperand; } +def FGRCCOpnd : RegisterOperand<FGRCC> { + // The assembler doesn't use register classes so we can re-use + // FGR32AsmOperand. 
+ let ParserMatchClass = FGR32AsmOperand; +} + def FGRH32Opnd : RegisterOperand<FGRH32> { let ParserMatchClass = FGRH32AsmOperand; } diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp index 6ad5821571d7..6573070f7031 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -16,6 +16,7 @@ #include "MipsAnalyzeImmediate.h" #include "MipsMachineFunction.h" #include "MipsSEInstrInfo.h" +#include "MipsSubtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -257,6 +258,9 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I, return true; } +MipsSEFrameLowering::MipsSEFrameLowering(const MipsSubtarget &STI) + : MipsFrameLowering(STI, STI.stackAlignment()) {} + unsigned MipsSEFrameLowering::ehDataReg(unsigned I) const { static const unsigned EhDataReg[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h index 5d2801ffb213..e832848754dc 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.h +++ b/lib/Target/Mips/MipsSEFrameLowering.h @@ -20,8 +20,7 @@ namespace llvm { class MipsSEFrameLowering : public MipsFrameLowering { public: - explicit MipsSEFrameLowering(const MipsSubtarget &STI) - : MipsFrameLowering(STI, STI.stackAlignment()) {} + explicit MipsSEFrameLowering(const MipsSubtarget &STI); /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index d5385be7ddb0..6f35947f8f01 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -37,7 +37,7 @@ using namespace llvm; #define DEBUG_TYPE "mips-isel" bool MipsSEDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - if (Subtarget.inMips16Mode()) + if (Subtarget->inMips16Mode()) return false; return MipsDAGToDAGISel::runOnMachineFunction(MF); } @@ -134,7 +134,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) { unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg(); const TargetRegisterClass *RC; - if (Subtarget.isABI_N64()) + if (Subtarget->isABI_N64()) RC = (const TargetRegisterClass*)&Mips::GPR64RegClass; else RC = (const TargetRegisterClass*)&Mips::GPR32RegClass; @@ -142,7 +142,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) { V0 = RegInfo.createVirtualRegister(RC); V1 = RegInfo.createVirtualRegister(RC); - if (Subtarget.isABI_N64()) { + if (Subtarget->isABI_N64()) { MF.getRegInfo().addLiveIn(Mips::T9_64); MBB.addLiveIn(Mips::T9_64); @@ -174,7 +174,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) { MF.getRegInfo().addLiveIn(Mips::T9); MBB.addLiveIn(Mips::T9); - if (Subtarget.isABI_N32()) { + if (Subtarget->isABI_N32()) { // lui $v0, %hi(%neg(%gp_rel(fname))) // addu $v1, $v0, $t9 // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname))) @@ -187,7 +187,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) { return; } - assert(Subtarget.isABI_O32()); + assert(Subtarget->isABI_O32()); // For O32 ABI, the following instruction sequence is emitted to initialize // the global base register: @@ -408,7 +408,7 @@ bool MipsSEDAGToDAGISel::selectIntAddrMSA(SDValue Addr, SDValue &Base, // * MSA is enabled // * N is a ISD::BUILD_VECTOR representing a constant splat bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const { - if 
(!Subtarget.hasMSA()) + if (!Subtarget->hasMSA()) return false; BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N); @@ -422,7 +422,7 @@ bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const { if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, 8, - !Subtarget.isLittle())) + !Subtarget->isLittle())) return false; Imm = SplatValue; @@ -648,7 +648,7 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) { } case ISD::ADDE: { - if (Subtarget.hasDSP()) // Select DSP instructions, ADDSC and ADDWC. + if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC. break; SDValue InFlag = Node->getOperand(2); Result = selectAddESubE(Mips::ADDu, InFlag, InFlag.getValue(0), DL, Node); @@ -658,11 +658,11 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) { case ISD::ConstantFP: { ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node); if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) { - if (Subtarget.isGP64bit()) { + if (Subtarget->isGP64bit()) { SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, Mips::ZERO_64, MVT::i64); Result = CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero); - } else if (Subtarget.isFP64bit()) { + } else if (Subtarget->isFP64bit()) { SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, Mips::ZERO, MVT::i32); Result = CurDAG->getMachineNode(Mips::BuildPairF64_64, DL, MVT::f64, @@ -813,12 +813,12 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) { EVT ResVecTy = BVN->getValueType(0); EVT ViaVecTy; - if (!Subtarget.hasMSA() || !BVN->getValueType(0).is128BitVector()) + if (!Subtarget->hasMSA() || !BVN->getValueType(0).is128BitVector()) return std::make_pair(false, nullptr); if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, 8, - !Subtarget.isLittle())) + !Subtarget->isLittle())) return std::make_pair(false, nullptr); switch (SplatBitSize) { diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index 969d73018953..be4ca862540e 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -39,7 +39,7 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM) // Set up the register classes addRegisterClass(MVT::i32, &Mips::GPR32RegClass); - if (isGP64bit()) + if (Subtarget->isGP64bit()) addRegisterClass(MVT::i64, &Mips::GPR64RegClass); if (Subtarget->hasDSP() || Subtarget->hasMSA()) { @@ -120,10 +120,10 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM) if (Subtarget->hasCnMips()) setOperationAction(ISD::MUL, MVT::i64, Legal); - else if (isGP64bit()) + else if (Subtarget->isGP64bit()) setOperationAction(ISD::MUL, MVT::i64, Custom); - if (isGP64bit()) { + if (Subtarget->isGP64bit()) { setOperationAction(ISD::MULHS, MVT::i64, Custom); setOperationAction(ISD::MULHU, MVT::i64, Custom); } @@ -152,6 +152,76 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::STORE, MVT::f64, Custom); } + if (Subtarget->hasMips32r6()) { + // MIPS32r6 replaces the accumulator-based multiplies with a three register + // instruction + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::MUL, MVT::i32, Legal); + setOperationAction(ISD::MULHS, MVT::i32, Legal); + setOperationAction(ISD::MULHU, MVT::i32, Legal); + + // MIPS32r6 replaces the accumulator-based division/remainder with separate + // 
+    // three register division and remainder instructions.
+    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+    setOperationAction(ISD::SDIV, MVT::i32, Legal);
+    setOperationAction(ISD::UDIV, MVT::i32, Legal);
+    setOperationAction(ISD::SREM, MVT::i32, Legal);
+    setOperationAction(ISD::UREM, MVT::i32, Legal);
+
+    // MIPS32r6 replaces conditional moves with an equivalent that removes the
+    // need for three GPR read ports.
+    setOperationAction(ISD::SETCC, MVT::i32, Legal);
+    setOperationAction(ISD::SELECT, MVT::i32, Legal);
+    setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+
+    setOperationAction(ISD::SETCC, MVT::f32, Legal);
+    setOperationAction(ISD::SELECT, MVT::f32, Legal);
+    setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+
+    assert(Subtarget->isFP64bit() && "FR=1 is required for MIPS32r6");
+    setOperationAction(ISD::SETCC, MVT::f64, Legal);
+    setOperationAction(ISD::SELECT, MVT::f64, Legal);
+    setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+
+    setOperationAction(ISD::BRCOND, MVT::Other, Legal);
+
+    // Floating point > and >= are supported via < and <=
+    setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+    setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
+    setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+    setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+
+    setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+    setCondCodeAction(ISD::SETOGT, MVT::f64, Expand);
+    setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
+    setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+  }
+
+  if (Subtarget->hasMips64r6()) {
+    // MIPS64r6 replaces the accumulator-based multiplies with a three register
+    // instruction
+    setOperationAction(ISD::MUL, MVT::i64, Legal);
+    setOperationAction(ISD::MULHS, MVT::i64, Legal);
+    setOperationAction(ISD::MULHU, MVT::i64, Legal);
+
+    // MIPS64r6 replaces the accumulator-based division/remainder with separate
+    // three register division and remainder instructions.
+    setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+    setOperationAction(ISD::SDIV, MVT::i64, Legal);
+    setOperationAction(ISD::UDIV, MVT::i64, Legal);
+    setOperationAction(ISD::SREM, MVT::i64, Legal);
+    setOperationAction(ISD::UREM, MVT::i64, Legal);
+
+    // MIPS64r6 replaces conditional moves with an equivalent that removes the
+    // need for three GPR read ports.
+    setOperationAction(ISD::SETCC, MVT::i64, Legal);
+    setOperationAction(ISD::SELECT, MVT::i64, Legal);
+    setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+  }
+
   computeRegisterProperties();
 }
 
@@ -160,6 +230,14 @@ llvm::createMipsSETargetLowering(MipsTargetMachine &TM) {
   return new MipsSETargetLowering(TM);
 }
 
+const TargetRegisterClass *
+MipsSETargetLowering::getRepRegClassFor(MVT VT) const {
+  if (VT == MVT::Untyped)
+    return Subtarget->hasDSP() ? &Mips::ACC64DSPRegClass : &Mips::ACC64RegClass;
+
+  return TargetLowering::getRepRegClassFor(VT);
+}
+
 // Enable MSA support for the given integer type and Register class.
void MipsSETargetLowering:: addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) { @@ -449,8 +527,8 @@ static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalize()) return SDValue(); - if (Subtarget->hasMips32() && N->getValueType(0) == MVT::i32 && - selectMADD(N, &DAG)) + if (Subtarget->hasMips32() && !Subtarget->hasMips32r6() && + N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG)) return SDValue(N, 0); return SDValue(); @@ -1178,6 +1256,9 @@ SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi, SelectionDAG &DAG) const { + // MIPS32r6/MIPS64r6 removed accumulator based multiplies. + assert(!Subtarget->hasMips32r6()); + EVT Ty = Op.getOperand(0).getValueType(); SDLoc DL(Op); SDValue Mult = DAG.getNode(NewOpc, DL, MVT::Untyped, @@ -1651,7 +1732,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_copy_s_w: return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT); case Intrinsic::mips_copy_s_d: - if (hasMips64()) + if (Subtarget->hasMips64()) // Lower directly into VEXTRACT_SEXT_ELT since i64 is legal on Mips64. return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT); else { @@ -1666,7 +1747,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_copy_u_w: return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT); case Intrinsic::mips_copy_u_d: - if (hasMips64()) + if (Subtarget->hasMips64()) // Lower directly into VEXTRACT_ZEXT_ELT since i64 is legal on Mips64. return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT); else { @@ -2943,8 +3024,8 @@ MipsSETargetLowering::emitINSERT_DF_VIDX(MachineInstr *MI, unsigned SrcValReg = MI->getOperand(3).getReg(); const TargetRegisterClass *VecRC = nullptr; - const TargetRegisterClass *GPRRC = isGP64bit() ? &Mips::GPR64RegClass - : &Mips::GPR32RegClass; + const TargetRegisterClass *GPRRC = + Subtarget->isGP64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; unsigned EltLog2Size; unsigned InsertOp = 0; unsigned InsveOp = 0; diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h index 03a20ef6741c..13ef6fce8875 100644 --- a/lib/Target/Mips/MipsSEISelLowering.h +++ b/lib/Target/Mips/MipsSEISelLowering.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef MipsSEISELLOWERING_H -#define MipsSEISELLOWERING_H +#ifndef MIPSSEISELLOWERING_H +#define MIPSSEISELLOWERING_H #include "MipsISelLowering.h" #include "MipsRegisterInfo.h" @@ -46,13 +46,7 @@ namespace llvm { return false; } - const TargetRegisterClass *getRepRegClassFor(MVT VT) const override { - if (VT == MVT::Untyped) - return Subtarget->hasDSP() ? &Mips::ACC64DSPRegClass : - &Mips::ACC64RegClass; - - return TargetLowering::getRepRegClassFor(VT); - } + const TargetRegisterClass *getRepRegClassFor(MVT VT) const override; private: bool isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo, diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp index f6f364f1a361..32da7492dad4 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -272,7 +272,7 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { default: return false; case Mips::RetRA: - expandRetRA(MBB, MI, Mips::RET); + expandRetRA(MBB, MI); break; case Mips::PseudoMFHI: Opc = isMicroMips ? 
Mips::MFHI16_MM : Mips::MFHI; @@ -428,9 +428,14 @@ unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const { } void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned Opc) const { - BuildMI(MBB, I, I->getDebugLoc(), get(Opc)).addReg(Mips::RA); + MachineBasicBlock::iterator I) const { + const auto &Subtarget = TM.getSubtarget<MipsSubtarget>(); + + if (Subtarget.isGP64bit()) + BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn64)) + .addReg(Mips::RA_64); + else + BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn)).addReg(Mips::RA); } std::pair<bool, bool> @@ -542,20 +547,31 @@ void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB, const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1); DebugLoc dl = I->getDebugLoc(); const TargetRegisterInfo &TRI = getRegisterInfo(); + bool HasMTHC1 = TM.getSubtarget<MipsSubtarget>().hasMips32r2() || + TM.getSubtarget<MipsSubtarget>().hasMips32r6(); - // For FP32 mode: - // mtc1 Lo, $fp - // mtc1 Hi, $fp + 1 - // For FP64 mode: + // When mthc1 is available, use: // mtc1 Lo, $fp // mthc1 Hi, $fp + // + // Otherwise, for FP64: + // spill + reload via ldc1 + // This has not been implemented since FP64 on MIPS32 and earlier is not + // supported. + // + // Otherwise, for FP32: + // mtc1 Lo, $fp + // mtc1 Hi, $fp + 1 BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_lo)) .addReg(LoReg); - if (FP64) { - // FIXME: The .addReg(DstReg, RegState::Implicit) is a white lie used to - // temporarily work around a widespread bug in the -mfp64 support. + if (HasMTHC1 || FP64) { + assert(TM.getSubtarget<MipsSubtarget>().hasMips32r2() && + "MTHC1 requires MIPS32r2"); + + // FIXME: The .addReg(DstReg) is a white lie used to temporarily work + // around a widespread bug in the -mfp64 support. // The problem is that none of the 32-bit fpu ops mention the fact // that they clobber the upper 32-bits of the 64-bit FPR. Fixing that // requires a major overhaul of the FPU implementation which can't @@ -565,9 +581,9 @@ void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB, // We therefore pretend that it reads the bottom 32-bits to // artificially create a dependency and prevent the scheduler // changing the behaviour of the code. - BuildMI(MBB, I, dl, get(Mips::MTHC1), TRI.getSubReg(DstReg, Mips::sub_hi)) - .addReg(HiReg) - .addReg(DstReg, RegState::Implicit); + BuildMI(MBB, I, dl, get(FP64 ? Mips::MTHC1_D64 : Mips::MTHC1_D32), DstReg) + .addReg(DstReg) + .addReg(HiReg); } else BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_hi)) .addReg(HiReg); @@ -580,17 +596,16 @@ void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB, // indirect jump to TargetReg const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>(); unsigned ADDU = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu; - unsigned JR = STI.isABI_N64() ? Mips::JR64 : Mips::JR; - unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP; - unsigned RA = STI.isABI_N64() ? Mips::RA_64 : Mips::RA; - unsigned T9 = STI.isABI_N64() ? Mips::T9_64 : Mips::T9; - unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO; + unsigned SP = STI.isGP64bit() ? Mips::SP_64 : Mips::SP; + unsigned RA = STI.isGP64bit() ? Mips::RA_64 : Mips::RA; + unsigned T9 = STI.isGP64bit() ? Mips::T9_64 : Mips::T9; + unsigned ZERO = STI.isGP64bit() ? 
Mips::ZERO_64 : Mips::ZERO; unsigned OffsetReg = I->getOperand(0).getReg(); unsigned TargetReg = I->getOperand(1).getReg(); // addu $ra, $v0, $zero // addu $sp, $sp, $v1 - // jr $ra + // jr $ra (via RetRA) if (TM.getRelocationModel() == Reloc::PIC_) BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), T9) .addReg(TargetReg).addReg(ZERO); @@ -598,7 +613,7 @@ void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB, .addReg(TargetReg).addReg(ZERO); BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), SP) .addReg(SP).addReg(OffsetReg); - BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(JR)).addReg(RA); + expandRetRA(MBB, I); } const MipsInstrInfo *llvm::createMipsSEInstrInfo(MipsTargetMachine &TM) { diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h index aa68552066c3..9ac94ce38f66 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.h +++ b/lib/Target/Mips/MipsSEInstrInfo.h @@ -81,8 +81,7 @@ public: private: unsigned getAnalyzableBrOpc(unsigned Opc) const override; - void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned Opc) const; + void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; std::pair<bool, bool> compareOpndSize(unsigned Opc, const MachineFunction &MF) const; diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.cpp b/lib/Target/Mips/MipsSelectionDAGInfo.cpp index 0d4398e2fe95..edd8f670707f 100644 --- a/lib/Target/Mips/MipsSelectionDAGInfo.cpp +++ b/lib/Target/Mips/MipsSelectionDAGInfo.cpp @@ -16,9 +16,8 @@ using namespace llvm; #define DEBUG_TYPE "mips-selectiondag-info" -MipsSelectionDAGInfo::MipsSelectionDAGInfo(const MipsTargetMachine &TM) - : TargetSelectionDAGInfo(TM) { -} +MipsSelectionDAGInfo::MipsSelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) {} MipsSelectionDAGInfo::~MipsSelectionDAGInfo() { } diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.h b/lib/Target/Mips/MipsSelectionDAGInfo.h index 6cafb558b35a..2b3d527fe6ff 100644 --- a/lib/Target/Mips/MipsSelectionDAGInfo.h +++ b/lib/Target/Mips/MipsSelectionDAGInfo.h @@ -22,7 +22,7 @@ class MipsTargetMachine; class MipsSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit MipsSelectionDAGInfo(const MipsTargetMachine &TM); + explicit MipsSelectionDAGInfo(const DataLayout &DL); ~MipsSelectionDAGInfo(); }; diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index 74ec06479c4e..693daa3fde2e 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -60,11 +60,9 @@ Mips16ConstantIslands( /// Select the Mips CPU for the given triple and cpu name. /// FIXME: Merge with the copy in MipsMCTargetDesc.cpp -static inline StringRef selectMipsCPU(StringRef TT, StringRef CPU) { +static StringRef selectMipsCPU(Triple TT, StringRef CPU) { if (CPU.empty() || CPU == "generic") { - Triple TheTriple(TT); - if (TheTriple.getArch() == Triple::mips || - TheTriple.getArch() == Triple::mipsel) + if (TT.getArch() == Triple::mips || TT.getArch() == Triple::mipsel) CPU = "mips32"; else CPU = "mips64"; @@ -74,39 +72,56 @@ static inline StringRef selectMipsCPU(StringRef TT, StringRef CPU) { void MipsSubtarget::anchor() { } +static std::string computeDataLayout(const MipsSubtarget &ST) { + std::string Ret = ""; + + // There are both little and big endian mips. + if (ST.isLittle()) + Ret += "e"; + else + Ret += "E"; + + Ret += "-m:m"; + + // Pointers are 32 bit on some ABIs. 
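  // For illustration, not part of the patch: assuming the ABI predicates
  // behave as named, a little-endian O32 target works out to the string
  //   "e-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64"
  // while a big-endian N64 target works out to
  //   "E-m:m-i8:8:32-i16:16:32-i64:64-n32:64-S128".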
+  if (!ST.isABI_N64())
+    Ret += "-p:32:32";
+
+  // 8 and 16 bit integers only need to have natural alignment, but try to
+  // align them to 32 bits. 64 bit integers have natural alignment.
+  Ret += "-i8:8:32-i16:16:32-i64:64";
+
+  // 32 bit registers are always available and the stack is at least 64 bit
+  // aligned. On N64 64 bit registers are also available and the stack is
+  // 128 bit aligned.
+  if (ST.isABI_N64() || ST.isABI_N32())
+    Ret += "-n32:64-S128";
+  else
+    Ret += "-n32-S64";
+
+  return Ret;
+}
+
 MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
                              const std::string &FS, bool little,
                              Reloc::Model _RM, MipsTargetMachine *_TM)
     : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(Mips32),
       MipsABI(UnknownABI), IsLittle(little), IsSingleFloat(false),
-      IsFP64bit(false), IsNaN2008bit(false), IsGP64bit(false), HasVFPU(false),
-      HasCnMips(false), IsLinux(true), HasMips3_32(false), HasMips3_32r2(false),
-      HasMips4_32(false), HasMips4_32r2(false), HasMips5_32r2(false),
-      InMips16Mode(false), InMips16HardFloat(Mips16HardFloat),
-      InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
-      AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), HasMSA(false),
-      RM(_RM), OverrideMode(NoOverride), TM(_TM), TargetTriple(TT) {
-  std::string CPUName = CPU;
-  CPUName = selectMipsCPU(TT, CPUName);
-
-  // Parse features string.
-  ParseSubtargetFeatures(CPUName, FS);
-
-  if (InMips16Mode && !TM->Options.UseSoftFloat) {
-    // Hard float for mips16 means essentially to compile as soft float
-    // but to use a runtime library for soft float that is written with
-    // native mips32 floating point instructions (those runtime routines
-    // run in mips32 hard float mode).
-    TM->Options.UseSoftFloat = true;
-    TM->Options.FloatABIType = FloatABI::Soft;
-    InMips16HardFloat = true;
-  }
+      IsFPXX(false), IsFP64bit(false), UseOddSPReg(true), IsNaN2008bit(false),
+      IsGP64bit(false), HasVFPU(false), HasCnMips(false), IsLinux(true),
+      HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false),
+      HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false),
+      InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
+      HasDSPR2(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16),
+      HasMSA(false), RM(_RM), OverrideMode(NoOverride), TM(_TM),
+      TargetTriple(TT),
+      DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS, TM))),
+      TSInfo(DL), JITInfo(), InstrInfo(MipsInstrInfo::create(*TM)),
+      FrameLowering(MipsFrameLowering::create(*TM, *this)),
+      TLInfo(MipsTargetLowering::create(*TM)) {
 
   PreviousInMips16Mode = InMips16Mode;
 
-  // Initialize scheduling itinerary for the specified CPU.
-  InstrItins = getInstrItineraryForCPU(CPUName);
-
   // Don't even attempt to generate code for MIPS-I, MIPS-II, MIPS-III, and
   // MIPS-V. They have not been tested and currently exist for the integrated
   // assembler only.
@@ -137,6 +152,11 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
                        "See -mattr=+fp64.",
                        false);
 
+  if (!isABI_O32() && !useOddSPReg())
+    report_fatal_error("-mattr=+nooddspreg is not currently permitted for a "
+                       "non-O32 ABI.",
+                       false);
+
   if (hasMips32r6()) {
     StringRef ISA = hasMips64r6() ?
"MIPS64r6" : "MIPS32r6"; @@ -167,6 +187,29 @@ MipsSubtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel, return OptLevel >= CodeGenOpt::Aggressive; } +MipsSubtarget & +MipsSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS, + const TargetMachine *TM) { + std::string CPUName = selectMipsCPU(TargetTriple, CPU); + + // Parse features string. + ParseSubtargetFeatures(CPUName, FS); + // Initialize scheduling itinerary for the specified CPU. + InstrItins = getInstrItineraryForCPU(CPUName); + + if (InMips16Mode && !TM->Options.UseSoftFloat) { + // Hard float for mips16 means essentially to compile as soft float + // but to use a runtime library for soft float that is written with + // native mips32 floating point instructions (those runtime routines + // run in mips32 hard float mode). + TM->Options.UseSoftFloat = true; + TM->Options.FloatABIType = FloatABI::Soft; + InMips16HardFloat = true; + } + + return *this; +} + //FIXME: This logic for reseting the subtarget along with // the helper classes can probably be simplified but there are a lot of // cases so we will defer rewriting this to later. @@ -186,14 +229,14 @@ void MipsSubtarget::resetSubtarget(MachineFunction *MF) { return; OverrideMode = Mips16Override; PreviousInMips16Mode = true; - TM->setHelperClassesMips16(); + setHelperClassesMips16(); return; } else if (ChangeToNoMips16) { if (!PreviousInMips16Mode) return; OverrideMode = NoMips16Override; PreviousInMips16Mode = false; - TM->setHelperClassesMipsSE(); + setHelperClassesMipsSE(); return; } else { if (OverrideMode == NoOverride) @@ -201,16 +244,52 @@ void MipsSubtarget::resetSubtarget(MachineFunction *MF) { OverrideMode = NoOverride; DEBUG(dbgs() << "back to default" << "\n"); if (inMips16Mode() && !PreviousInMips16Mode) { - TM->setHelperClassesMips16(); + setHelperClassesMips16(); PreviousInMips16Mode = true; } else if (!inMips16Mode() && PreviousInMips16Mode) { - TM->setHelperClassesMipsSE(); + setHelperClassesMipsSE(); PreviousInMips16Mode = false; } return; } } +void MipsSubtarget::setHelperClassesMips16() { + InstrInfoSE.swap(InstrInfo); + FrameLoweringSE.swap(FrameLowering); + TLInfoSE.swap(TLInfo); + if (!InstrInfo16) { + InstrInfo.reset(MipsInstrInfo::create(*TM)); + FrameLowering.reset(MipsFrameLowering::create(*TM, *this)); + TLInfo.reset(MipsTargetLowering::create(*TM)); + } else { + InstrInfo16.swap(InstrInfo); + FrameLowering16.swap(FrameLowering); + TLInfo16.swap(TLInfo); + } + assert(TLInfo && "null target lowering 16"); + assert(InstrInfo && "null instr info 16"); + assert(FrameLowering && "null frame lowering 16"); +} + +void MipsSubtarget::setHelperClassesMipsSE() { + InstrInfo16.swap(InstrInfo); + FrameLowering16.swap(FrameLowering); + TLInfo16.swap(TLInfo); + if (!InstrInfoSE) { + InstrInfo.reset(MipsInstrInfo::create(*TM)); + FrameLowering.reset(MipsFrameLowering::create(*TM, *this)); + TLInfo.reset(MipsTargetLowering::create(*TM)); + } else { + InstrInfoSE.swap(InstrInfo); + FrameLoweringSE.swap(FrameLowering); + TLInfoSE.swap(TLInfo); + } + assert(TLInfo && "null target lowering in SE"); + assert(InstrInfo && "null instr info SE"); + assert(FrameLowering && "null frame lowering SE"); +} + bool MipsSubtarget::mipsSEUsesSoftFloat() const { return TM->Options.UseSoftFloat && !InMips16HardFloat; } diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index 373f48136211..a3dcf03c63a9 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -14,6 +14,12 @@ #ifndef MIPSSUBTARGET_H #define 
MIPSSUBTARGET_H +#include "MipsFrameLowering.h" +#include "MipsISelLowering.h" +#include "MipsInstrInfo.h" +#include "MipsJITInfo.h" +#include "MipsSelectionDAGInfo.h" +#include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -56,9 +62,16 @@ protected: // floating point registers instead of only using even ones. bool IsSingleFloat; + // IsFPXX - MIPS O32 modeless ABI. + bool IsFPXX; + // IsFP64bit - The target processor has 64-bit floating point registers. bool IsFP64bit; + /// Are odd single-precision registers permitted? + /// This corresponds to -modd-spreg and -mno-odd-spreg + bool UseOddSPReg; + // IsNan2008 - IEEE 754-2008 NaN encoding. bool IsNaN2008bit; @@ -132,6 +145,20 @@ protected: MipsTargetMachine *TM; Triple TargetTriple; + + const DataLayout DL; // Calculates type size & alignment + const MipsSelectionDAGInfo TSInfo; + MipsJITInfo JITInfo; + std::unique_ptr<const MipsInstrInfo> InstrInfo; + std::unique_ptr<const MipsFrameLowering> FrameLowering; + std::unique_ptr<const MipsTargetLowering> TLInfo; + std::unique_ptr<const MipsInstrInfo> InstrInfo16; + std::unique_ptr<const MipsFrameLowering> FrameLowering16; + std::unique_ptr<const MipsTargetLowering> TLInfo16; + std::unique_ptr<const MipsInstrInfo> InstrInfoSE; + std::unique_ptr<const MipsFrameLowering> FrameLoweringSE; + std::unique_ptr<const MipsTargetLowering> TLInfoSE; + public: bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, AntiDepBreakMode& Mode, @@ -142,6 +169,7 @@ public: bool isABI_N64() const { return MipsABI == N64; } bool isABI_N32() const { return MipsABI == N32; } bool isABI_O32() const { return MipsABI == O32; } + bool isABI_FPXX() const { return false; } // TODO: add check for FPXX unsigned getTargetABI() const { return MipsABI; } /// This constructor initializes the data members to match that @@ -154,23 +182,36 @@ public: /// subtarget options. Definition of function is auto generated by tblgen. 
void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + bool hasMips1() const { return MipsArchVersion >= Mips1; } bool hasMips2() const { return MipsArchVersion >= Mips2; } bool hasMips3() const { return MipsArchVersion >= Mips3; } + bool hasMips4() const { return MipsArchVersion >= Mips4; } + bool hasMips5() const { return MipsArchVersion >= Mips5; } bool hasMips4_32() const { return HasMips4_32; } bool hasMips4_32r2() const { return HasMips4_32r2; } - bool hasMips32() const { return MipsArchVersion >= Mips32; } - bool hasMips32r2() const { return MipsArchVersion == Mips32r2 || - MipsArchVersion == Mips64r2; } - bool hasMips32r6() const { return MipsArchVersion == Mips32r6 || - MipsArchVersion == Mips64r6; } + bool hasMips32() const { + return MipsArchVersion >= Mips32 && MipsArchVersion != Mips3 && + MipsArchVersion != Mips4 && MipsArchVersion != Mips5; + } + bool hasMips32r2() const { + return MipsArchVersion == Mips32r2 || MipsArchVersion == Mips32r6 || + MipsArchVersion == Mips64r2 || MipsArchVersion == Mips64r6; + } + bool hasMips32r6() const { + return MipsArchVersion == Mips32r6 || MipsArchVersion == Mips64r6; + } bool hasMips64() const { return MipsArchVersion >= Mips64; } - bool hasMips64r2() const { return MipsArchVersion == Mips64r2; } + bool hasMips64r2() const { + return MipsArchVersion == Mips64r2 || MipsArchVersion == Mips64r6; + } bool hasMips64r6() const { return MipsArchVersion == Mips64r6; } bool hasCnMips() const { return HasCnMips; } bool isLittle() const { return IsLittle; } + bool isFPXX() const { return IsFPXX; } bool isFP64bit() const { return IsFP64bit; } + bool useOddSPReg() const { return UseOddSPReg; } bool isNaN2008() const { return IsNaN2008bit; } bool isNotFP64bit() const { return !IsFP64bit; } bool isGP64bit() const { return IsGP64bit; } @@ -234,12 +275,31 @@ public: /// \brief Reset the subtarget for the Mips target. void resetSubtarget(MachineFunction *MF); + MipsSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS, + const TargetMachine *TM); + /// Does the system support unaligned memory access. /// /// MIPS32r6/MIPS64r6 require full unaligned access support but does not /// specify which component of the system provides it. Hardware, software, and /// hybrid implementations are all valid. bool systemSupportsUnalignedAccess() const { return hasMips32r6(); } + + // Set helper classes + void setHelperClassesMips16(); + void setHelperClassesMipsSE(); + + MipsJITInfo *getJITInfo() { return &JITInfo; } + const MipsSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + const DataLayout *getDataLayout() const { return &DL; } + const MipsInstrInfo *getInstrInfo() const { return InstrInfo.get(); } + const TargetFrameLowering *getFrameLowering() const { + return FrameLowering.get(); + } + const MipsRegisterInfo *getRegisterInfo() const { + return &InstrInfo->getRegisterInfo(); + } + const MipsTargetLowering *getTargetLowering() const { return TLInfo.get(); } }; } // End llvm namespace diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 984c58eb6c1e..425dbf1d89f6 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -45,93 +45,21 @@ extern "C" void LLVMInitializeMipsTarget() { RegisterTargetMachine<MipselTargetMachine> B(TheMips64elTarget); } -static std::string computeDataLayout(const MipsSubtarget &ST) { - std::string Ret = ""; - - // There are both little and big endian mips. 
- if (ST.isLittle()) - Ret += "e"; - else - Ret += "E"; - - Ret += "-m:m"; - - // Pointers are 32 bit on some ABIs. - if (!ST.isABI_N64()) - Ret += "-p:32:32"; - - // 8 and 16 bit integers only need no have natural alignment, but try to - // align them to 32 bits. 64 bit integers have natural alignment. - Ret += "-i8:8:32-i16:16:32-i64:64"; - - // 32 bit registers are always available and the stack is at least 64 bit - // aligned. On N64 64 bit registers are also available and the stack is - // 128 bit aligned. - if (ST.isABI_N64() || ST.isABI_N32()) - Ret += "-n32:64-S128"; - else - Ret += "-n32-S64"; - - return Ret; -} - // On function prologue, the stack is created by decrementing // its pointer. Once decremented, all references are done with positive // offset from the stack/frame pointer, using StackGrowsUp enables // an easier handling. // Using CodeModel::Large enables different CALL behavior. -MipsTargetMachine:: -MipsTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, isLittle, RM, this), - DL(computeDataLayout(Subtarget)), - InstrInfo(MipsInstrInfo::create(*this)), - FrameLowering(MipsFrameLowering::create(*this, Subtarget)), - TLInfo(MipsTargetLowering::create(*this)), TSInfo(*this), - InstrItins(Subtarget.getInstrItineraryData()), JITInfo() { +MipsTargetMachine::MipsTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool isLittle) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, isLittle, RM, this) { initAsmInfo(); } - -void MipsTargetMachine::setHelperClassesMips16() { - InstrInfoSE.swap(InstrInfo); - FrameLoweringSE.swap(FrameLowering); - TLInfoSE.swap(TLInfo); - if (!InstrInfo16) { - InstrInfo.reset(MipsInstrInfo::create(*this)); - FrameLowering.reset(MipsFrameLowering::create(*this, Subtarget)); - TLInfo.reset(MipsTargetLowering::create(*this)); - } else { - InstrInfo16.swap(InstrInfo); - FrameLowering16.swap(FrameLowering); - TLInfo16.swap(TLInfo); - } - assert(TLInfo && "null target lowering 16"); - assert(InstrInfo && "null instr info 16"); - assert(FrameLowering && "null frame lowering 16"); -} - -void MipsTargetMachine::setHelperClassesMipsSE() { - InstrInfo16.swap(InstrInfo); - FrameLowering16.swap(FrameLowering); - TLInfo16.swap(TLInfo); - if (!InstrInfoSE) { - InstrInfo.reset(MipsInstrInfo::create(*this)); - FrameLowering.reset(MipsFrameLowering::create(*this, Subtarget)); - TLInfo.reset(MipsTargetLowering::create(*this)); - } else { - InstrInfoSE.swap(InstrInfo); - FrameLoweringSE.swap(FrameLowering); - TLInfoSE.swap(TLInfo); - } - assert(TLInfo && "null target lowering in SE"); - assert(InstrInfo && "null instr info SE"); - assert(FrameLowering && "null frame lowering SE"); -} void MipsebTargetMachine::anchor() { } MipsebTargetMachine:: diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index a5aa39bc9f4d..a0e7d4355558 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -14,15 +14,9 @@ #ifndef MIPSTARGETMACHINE_H #define MIPSTARGETMACHINE_H -#include "MipsFrameLowering.h" -#include "MipsISelLowering.h" -#include "MipsInstrInfo.h" -#include "MipsJITInfo.h" -#include "MipsSelectionDAGInfo.h" #include "MipsSubtarget.h" 
#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" @@ -32,68 +26,47 @@ class MipsRegisterInfo; class MipsTargetMachine : public LLVMTargetMachine { MipsSubtarget Subtarget; - const DataLayout DL; // Calculates type size & alignment - std::unique_ptr<const MipsInstrInfo> InstrInfo; - std::unique_ptr<const MipsFrameLowering> FrameLowering; - std::unique_ptr<const MipsTargetLowering> TLInfo; - std::unique_ptr<const MipsInstrInfo> InstrInfo16; - std::unique_ptr<const MipsFrameLowering> FrameLowering16; - std::unique_ptr<const MipsTargetLowering> TLInfo16; - std::unique_ptr<const MipsInstrInfo> InstrInfoSE; - std::unique_ptr<const MipsFrameLowering> FrameLoweringSE; - std::unique_ptr<const MipsTargetLowering> TLInfoSE; - MipsSelectionDAGInfo TSInfo; - const InstrItineraryData &InstrItins; - MipsJITInfo JITInfo; public: - MipsTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool isLittle); + MipsTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle); virtual ~MipsTargetMachine() {} void addAnalysisPasses(PassManagerBase &PM) override; - const MipsInstrInfo *getInstrInfo() const override - { return InstrInfo.get(); } - const TargetFrameLowering *getFrameLowering() const override - { return FrameLowering.get(); } - const MipsSubtarget *getSubtargetImpl() const override - { return &Subtarget; } - const DataLayout *getDataLayout() const override - { return &DL;} - + const MipsInstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); + } + const TargetFrameLowering *getFrameLowering() const override { + return getSubtargetImpl()->getFrameLowering(); + } + const MipsSubtarget *getSubtargetImpl() const override { return &Subtarget; } const InstrItineraryData *getInstrItineraryData() const override { - return Subtarget.inMips16Mode() ? nullptr : &InstrItins; + return Subtarget.inMips16Mode() + ? nullptr + : &getSubtargetImpl()->getInstrItineraryData(); + } + MipsJITInfo *getJITInfo() override { + return Subtarget.getJITInfo(); } - - MipsJITInfo *getJITInfo() override { return &JITInfo; } - const MipsRegisterInfo *getRegisterInfo() const override { - return &InstrInfo->getRegisterInfo(); + return getSubtargetImpl()->getRegisterInfo(); } - const MipsTargetLowering *getTargetLowering() const override { - return TLInfo.get(); + return getSubtargetImpl()->getTargetLowering(); + } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); } - const MipsSelectionDAGInfo* getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override; - - // Set helper classes - void setHelperClassesMips16(); - - void setHelperClassesMipsSE(); - - }; /// MipsebTargetMachine - Mips32/64 big endian target machine. 
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h index 4ad37ac5b12c..99f7d4c92cfb 100644 --- a/lib/Target/Mips/MipsTargetStreamer.h +++ b/lib/Target/Mips/MipsTargetStreamer.h @@ -12,46 +12,83 @@ #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCStreamer.h" +#include "MCTargetDesc/MipsABIFlagsSection.h" namespace llvm { -class MipsTargetStreamer : public MCTargetStreamer { - virtual void anchor(); +struct MipsABIFlagsSection; + +class MipsTargetStreamer : public MCTargetStreamer { public: MipsTargetStreamer(MCStreamer &S); - virtual void emitDirectiveSetMicroMips() = 0; - virtual void emitDirectiveSetNoMicroMips() = 0; - virtual void emitDirectiveSetMips16() = 0; - virtual void emitDirectiveSetNoMips16() = 0; - - virtual void emitDirectiveSetReorder() = 0; - virtual void emitDirectiveSetNoReorder() = 0; - virtual void emitDirectiveSetMacro() = 0; - virtual void emitDirectiveSetNoMacro() = 0; - virtual void emitDirectiveSetAt() = 0; - virtual void emitDirectiveSetNoAt() = 0; - virtual void emitDirectiveEnd(StringRef Name) = 0; - - virtual void emitDirectiveEnt(const MCSymbol &Symbol) = 0; - virtual void emitDirectiveAbiCalls() = 0; - virtual void emitDirectiveNaN2008() = 0; - virtual void emitDirectiveNaNLegacy() = 0; - virtual void emitDirectiveOptionPic0() = 0; - virtual void emitDirectiveOptionPic2() = 0; + virtual void emitDirectiveSetMicroMips(); + virtual void emitDirectiveSetNoMicroMips(); + virtual void emitDirectiveSetMips16(); + virtual void emitDirectiveSetNoMips16(); + + virtual void emitDirectiveSetReorder(); + virtual void emitDirectiveSetNoReorder(); + virtual void emitDirectiveSetMacro(); + virtual void emitDirectiveSetNoMacro(); + virtual void emitDirectiveSetAt(); + virtual void emitDirectiveSetNoAt(); + virtual void emitDirectiveEnd(StringRef Name); + + virtual void emitDirectiveEnt(const MCSymbol &Symbol); + virtual void emitDirectiveAbiCalls(); + virtual void emitDirectiveNaN2008(); + virtual void emitDirectiveNaNLegacy(); + virtual void emitDirectiveOptionPic0(); + virtual void emitDirectiveOptionPic2(); virtual void emitFrame(unsigned StackReg, unsigned StackSize, - unsigned ReturnReg) = 0; - virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) = 0; - virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) = 0; + unsigned ReturnReg); + virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff); + virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff); - virtual void emitDirectiveSetMips32R2() = 0; - virtual void emitDirectiveSetMips64() = 0; - virtual void emitDirectiveSetMips64R2() = 0; - virtual void emitDirectiveSetDsp() = 0; + virtual void emitDirectiveSetMips32R2(); + virtual void emitDirectiveSetMips64(); + virtual void emitDirectiveSetMips64R2(); + virtual void emitDirectiveSetDsp(); // PIC support - virtual void emitDirectiveCpload(unsigned RegNo) = 0; + virtual void emitDirectiveCpload(unsigned RegNo); virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, - const MCSymbol &Sym, bool IsReg) = 0; + const MCSymbol &Sym, bool IsReg); + + /// Emit a '.module fp=value' directive using the given values. + /// Updates the .MIPS.abiflags section + virtual void emitDirectiveModuleFP(MipsABIFlagsSection::FpABIKind Value, + bool Is32BitABI) { + ABIFlagsSection.setFpABI(Value, Is32BitABI); + } + + /// Emit a '.module fp=value' directive using the current values of the + /// .MIPS.abiflags section. 
+ void emitDirectiveModuleFP() { + emitDirectiveModuleFP(ABIFlagsSection.getFpABI(), + ABIFlagsSection.Is32BitABI); + } + + virtual void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI); + virtual void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value){}; + virtual void emitMipsAbiFlags(){}; + void setCanHaveModuleDir(bool Can) { canHaveModuleDirective = Can; } + bool getCanHaveModuleDir() { return canHaveModuleDirective; } + + // This method enables template classes to set internal abi flags + // structure values. + template <class PredicateLibrary> + void updateABIInfo(const PredicateLibrary &P) { + ABIFlagsSection.setAllFromPredicates(P); + } + + MipsABIFlagsSection &getABIFlagsSection() { return ABIFlagsSection; } + +protected: + MipsABIFlagsSection ABIFlagsSection; + +private: + bool canHaveModuleDirective; }; // This part is for ascii assembly output @@ -93,6 +130,13 @@ public: virtual void emitDirectiveCpload(unsigned RegNo); void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, bool IsReg) override; + + // ABI Flags + void emitDirectiveModuleFP(MipsABIFlagsSection::FpABIKind Value, + bool Is32BitABI) override; + void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI) override; + void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value) override; + void emitMipsAbiFlags() override; }; // This part is for ELF object output @@ -144,6 +188,10 @@ public: void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, bool IsReg) override; + // ABI Flags + void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI) override; + void emitMipsAbiFlags() override; + protected: bool isO32() const { return STI.getFeatureBits() & Mips::FeatureO32; } bool isN32() const { return STI.getFeatureBits() & Mips::FeatureN32; } diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td index d78b4e81a3e5..93fabf615369 100644 --- a/lib/Target/NVPTX/NVPTX.td +++ b/lib/Target/NVPTX/NVPTX.td @@ -34,12 +34,18 @@ def SM30 : SubtargetFeature<"sm_30", "SmVersion", "30", "Target SM 3.0">; def SM35 : SubtargetFeature<"sm_35", "SmVersion", "35", "Target SM 3.5">; +def SM50 : SubtargetFeature<"sm_50", "SmVersion", "50", + "Target SM 5.0">; // PTX Versions def PTX30 : SubtargetFeature<"ptx30", "PTXVersion", "30", "Use PTX version 3.0">; def PTX31 : SubtargetFeature<"ptx31", "PTXVersion", "31", "Use PTX version 3.1">; +def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32", + "Use PTX version 3.2">; +def PTX40 : SubtargetFeature<"ptx40", "PTXVersion", "40", + "Use PTX version 4.0">; //===----------------------------------------------------------------------===// // NVPTX supported processors. 
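With the feature definitions this file introduces, the SM level and the PTX ISA level can be selected independently, presumably so a driver can request newer PTX output without changing the target processor. An illustrative llc invocation using these names (not taken from the patch):

    llc -march=nvptx64 -mcpu=sm_50 -mattr=+ptx40 kernel.ll -o kernel.ptx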
@@ -52,6 +58,7 @@ def : Proc<"sm_20", [SM20]>; def : Proc<"sm_21", [SM21]>; def : Proc<"sm_30", [SM30]>; def : Proc<"sm_35", [SM35]>; +def : Proc<"sm_50", [SM50]>; def NVPTXInstrInfo : InstrInfo { diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 4ec575ff34c9..decf02a0bedf 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -734,23 +734,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { << " func_retval0"; } else { if ((Ty->getTypeID() == Type::StructTyID) || isa<VectorType>(Ty)) { - SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*TLI, Ty, vtparts); - unsigned totalsz = 0; - for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { - unsigned elems = 1; - EVT elemtype = vtparts[i]; - if (vtparts[i].isVector()) { - elems = vtparts[i].getVectorNumElements(); - elemtype = vtparts[i].getVectorElementType(); - } - for (unsigned j = 0, je = elems; j != je; ++j) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - totalsz += sz / 8; - } - } + unsigned totalsz = TD->getTypeAllocSize(Ty); unsigned retAlignment = 0; if (!llvm::getAlign(*F, 0, retAlignment)) retAlignment = TD->getABITypeAlignment(Ty); @@ -1321,6 +1305,10 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) { // external global variable with init -> .visible // external without init -> .extern // appending -> not allowed, assert. +// for any linkage other than +// internal, private, linker_private, +// linker_private_weak, linker_private_weak_def_auto, +// we emit -> .weak. void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V, raw_ostream &O) { @@ -1346,6 +1334,9 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V, msg.append(V->getName().str()); msg.append("has unsupported appending linkage type"); llvm_unreachable(msg.c_str()); + } else if (!V->hasInternalLinkage() && + !V->hasPrivateLinkage()) { + O << ".weak "; } } } @@ -1356,10 +1347,15 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, // Skip meta data if (GVar->hasSection()) { - if (GVar->getSection() == "llvm.metadata") + if (GVar->getSection() == StringRef("llvm.metadata")) return; } + // Skip LLVM intrinsic global variables + if (GVar->getName().startswith("llvm.") || + GVar->getName().startswith("nvvm.")) + return; + const DataLayout *TD = TM.getDataLayout(); // GlobalVariables are always constant pointers themselves. 
@@ -1371,6 +1367,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << ".visible "; else O << ".extern "; + } else if (GVar->hasLinkOnceLinkage() || GVar->hasWeakLinkage() || + GVar->hasAvailableExternallyLinkage() || + GVar->hasCommonLinkage()) { + O << ".weak "; } if (llvm::isTexture(*GVar)) { @@ -1438,7 +1438,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << "linear"; break; case 2: - assert(0 && "Anisotropic filtering is not supported"); + llvm_unreachable("Anisotropic filtering is not supported"); default: O << "nearest"; break; @@ -1480,6 +1480,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << "."; emitPTXAddressSpace(PTy->getAddressSpace(), O); + + if (isManaged(*GVar)) { + O << " .attribute(.managed)"; + } + if (GVar->getAlignment() == 0) O << " .align " << (int) TD->getPrefTypeAlignment(ETy); else @@ -1497,13 +1502,24 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, // Ptx allows variable initialization only for constant and global state // spaces. - if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || - (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) && - GVar->hasInitializer()) { - const Constant *Initializer = GVar->getInitializer(); - if (!Initializer->isNullValue()) { - O << " = "; - printScalarConstant(Initializer, O); + if (GVar->hasInitializer()) { + if ((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || + (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) { + const Constant *Initializer = GVar->getInitializer(); + // 'undef' is treated as if no value was specified. + if (!Initializer->isNullValue() && !isa<UndefValue>(Initializer)) { + O << " = "; + printScalarConstant(Initializer, O); + } + } else { + // The frontend adds zero-initializer to variables that don't have an + // initial value, so skip warning for this case. 
+ if (!GVar->getInitializer()->isNullValue()) { + std::string warnMsg = "initial value of '" + GVar->getName().str() + + "' is not allowed in addrspace(" + + llvm::utostr_32(PTy->getAddressSpace()) + ")"; + report_fatal_error(warnMsg.c_str()); + } } } } else { @@ -1562,7 +1578,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, } break; default: - assert(0 && "type not supported yet"); + llvm_unreachable("type not supported yet"); } } @@ -1682,7 +1698,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, O << "]"; break; default: - assert(0 && "type not supported yet"); + llvm_unreachable("type not supported yet"); } return; } diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp index 9030584f06fc..8b088412dbba 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp +++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp @@ -26,6 +26,10 @@ using namespace llvm; +NVPTXFrameLowering::NVPTXFrameLowering(NVPTXSubtarget &STI) + : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0), + is64bit(STI.is64Bit()) {} + bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; } void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const { @@ -43,17 +47,21 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const { // cvta.local %SP, %SPL; if (is64bit) { unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass); - MachineInstr *MI = BuildMI( - MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes_64), - NVPTX::VRFrame).addReg(LocalReg); - BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64), + MachineInstr *MI = + BuildMI(MBB, MBBI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes_64), + NVPTX::VRFrame).addReg(LocalReg); + BuildMI(MBB, MI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64), LocalReg).addImm(MF.getFunctionNumber()); } else { unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass); - MachineInstr *MI = BuildMI( - MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes), - NVPTX::VRFrame).addReg(LocalReg); - BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR), + MachineInstr *MI = + BuildMI(MBB, MBBI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes), + NVPTX::VRFrame).addReg(LocalReg); + BuildMI(MBB, MI, dl, + MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR), LocalReg).addImm(MF.getFunctionNumber()); } } diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h index 2ae6d72720e2..56fb673de0eb 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.h +++ b/lib/Target/NVPTX/NVPTXFrameLowering.h @@ -17,16 +17,12 @@ #include "llvm/Target/TargetFrameLowering.h" namespace llvm { -class NVPTXTargetMachine; - +class NVPTXSubtarget; class NVPTXFrameLowering : public TargetFrameLowering { - NVPTXTargetMachine &tm; bool is64bit; public: - explicit NVPTXFrameLowering(NVPTXTargetMachine &_tm, bool _is64bit) - : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0), tm(_tm), - is64bit(_is64bit) {} + explicit NVPTXFrameLowering(NVPTXSubtarget &STI); bool hasFP(const MachineFunction &MF) const override; void emitPrologue(MachineFunction &MF) const override; diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index 023dd5e48c4a..faa9fdb424b6 100644 --- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -84,7 +84,7 @@ bool 
GenericToNVVM::runOnModule(Module &M) { GlobalVariable *GV = I++; if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC && !llvm::isTexture(*GV) && !llvm::isSurface(*GV) && - !GV->getName().startswith("llvm.")) { + !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) { GlobalVariable *NewGV = new GlobalVariable( M, GV->getType()->getElementType(), GV->isConstant(), GV->getLinkage(), diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index cd308806c36a..0dfbf10cd9d7 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -24,11 +24,14 @@ using namespace llvm; #define DEBUG_TYPE "nvptx-isel" -static cl::opt<int> -FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden, - cl::desc("NVPTX Specific: FMA contraction (0: don't do it" - " 1: do it 2: do it aggressively"), - cl::init(2)); +unsigned FMAContractLevel = 0; + +static cl::opt<unsigned, true> +FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden, + cl::desc("NVPTX Specific: FMA contraction (0: don't do it" + " 1: do it 2: do it aggressively"), + cl::location(FMAContractLevel), + cl::init(2)); static cl::opt<int> UsePrecDivF32( "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden, @@ -138,7 +141,7 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { case NVPTXISD::LDGV4: case NVPTXISD::LDUV2: case NVPTXISD::LDUV4: - ResNode = SelectLDGLDUVector(N); + ResNode = SelectLDGLDU(N); break; case NVPTXISD::StoreV2: case NVPTXISD::StoreV4: @@ -164,6 +167,9 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { case ISD::INTRINSIC_WO_CHAIN: ResNode = SelectIntrinsicNoChain(N); break; + case ISD::INTRINSIC_W_CHAIN: + ResNode = SelectIntrinsicChain(N); + break; case NVPTXISD::Tex1DFloatI32: case NVPTXISD::Tex1DFloatFloat: case NVPTXISD::Tex1DFloatFloatLevel: @@ -253,6 +259,12 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { case NVPTXISD::Suld3DV4I32Trap: ResNode = SelectSurfaceIntrinsic(N); break; + case ISD::AND: + case ISD::SRA: + case ISD::SRL: + // Try to select BFE + ResNode = SelectBFE(N); + break; case ISD::ADDRSPACECAST: ResNode = SelectAddrSpaceCast(N); break; @@ -264,6 +276,21 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { return SelectCode(N); } +SDNode *NVPTXDAGToDAGISel::SelectIntrinsicChain(SDNode *N) { + unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IID) { + default: + return NULL; + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_p: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_p: + return SelectLDGLDU(N); + } +} + static unsigned int getCodeAddrSpace(MemSDNode *N, const NVPTXSubtarget &Subtarget) { const Value *Src = N->getMemOperand()->getValue(); @@ -981,22 +1008,101 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { return LD; } -SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { +SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { SDValue Chain = N->getOperand(0); - SDValue Op1 = N->getOperand(1); + SDValue Op1; + MemSDNode *Mem; + bool IsLDG = true; + + // If this is an LDG intrinsic, the address is the third operand. 
If it's an + // LDG/LDU SD node (from custom vector handling), then it's the second operand + if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { + Op1 = N->getOperand(2); + Mem = cast<MemIntrinsicSDNode>(N); + unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IID) { + default: + return NULL; + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_p: + IsLDG = true; + break; + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_p: + IsLDG = false; + break; + } + } else { + Op1 = N->getOperand(1); + Mem = cast<MemSDNode>(N); + } + unsigned Opcode; SDLoc DL(N); SDNode *LD; - MemSDNode *Mem = cast<MemSDNode>(N); SDValue Base, Offset, Addr; - EVT EltVT = Mem->getMemoryVT().getVectorElementType(); + EVT EltVT = Mem->getMemoryVT(); + if (EltVT.isVector()) { + EltVT = EltVT.getVectorElementType(); + } if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16avar; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32avar; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64avar; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32avar; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64avar; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16avar; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32avar; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64avar; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32avar; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64avar; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1092,6 +1198,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari64; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari64; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1181,6 +1336,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { 
switch (N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1276,6 +1480,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg64; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg64; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg64; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg64; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg64; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg64; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1365,6 +1618,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::INTRINSIC_W_CHAIN: + if (IsLDG) { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg; + break; + } + } else { + switch (EltVT.getSimpleVT().SimpleTy) { + default: + return nullptr; + case MVT::i8: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg; + break; + case MVT::i16: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg; + break; + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg; + break; + case MVT::i64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg; + break; + 
case MVT::f32: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg; + break; + case MVT::f64: + Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg; + break; + } + } + break; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1457,7 +1759,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { } MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); - MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); + MemRefs0[0] = Mem->getMemOperand(); cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1); return LD; @@ -2959,6 +3261,214 @@ SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) { return Ret; } +/// SelectBFE - Look for instruction sequences that can be made more efficient +/// by using the 'bfe' (bit-field extract) PTX instruction +SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Len; + SDValue Start; + SDValue Val; + bool IsSigned = false; + + if (N->getOpcode() == ISD::AND) { + // Canonicalize the operands + // We want 'and %val, %mask' + if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) { + std::swap(LHS, RHS); + } + + ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS); + if (!Mask) { + // We need a constant mask on the RHS of the AND + return NULL; + } + + // Extract the mask bits + uint64_t MaskVal = Mask->getZExtValue(); + if (!isMask_64(MaskVal)) { + // We *could* handle shifted masks here, but doing so would require an + // 'and' operation to fix up the low-order bits so we would trade + // shr+and for bfe+and, which has the same throughput + return NULL; + } + + // How many bits are in our mask? + uint64_t NumBits = CountTrailingOnes_64(MaskVal); + Len = CurDAG->getTargetConstant(NumBits, MVT::i32); + + if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) { + // We have a 'srl/and' pair, extract the effective start bit and length + Val = LHS.getNode()->getOperand(0); + Start = LHS.getNode()->getOperand(1); + ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start); + if (StartConst) { + uint64_t StartVal = StartConst->getZExtValue(); + // How many "good" bits do we have left? "good" is defined here as bits + // that exist in the original value, not shifted in. + uint64_t GoodBits = Start.getValueType().getSizeInBits() - StartVal; + if (NumBits > GoodBits) { + // Do not handle the case where bits have been shifted in. In theory + // we could handle this, but the cost is likely higher than just + // emitting the srl/and pair. + return NULL; + } + Start = CurDAG->getTargetConstant(StartVal, MVT::i32); + } else { + // Do not handle the case where the shift amount (can be zero if no srl + // was found) is not constant. We could handle this case, but it would + // require run-time logic that would be more expensive than just + // emitting the srl/and pair. + return NULL; + } + } else { + // Do not handle the case where the LHS of the and is not a shift. While + // it would be trivial to handle this case, it would just transform + // 'and' -> 'bfe', but 'and' has higher-throughput. 
+ return NULL; + } + } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) { + if (LHS->getOpcode() == ISD::AND) { + ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS); + if (!ShiftCnst) { + // Shift amount must be constant + return NULL; + } + + uint64_t ShiftAmt = ShiftCnst->getZExtValue(); + + SDValue AndLHS = LHS->getOperand(0); + SDValue AndRHS = LHS->getOperand(1); + + // Canonicalize the AND to have the mask on the RHS + if (isa<ConstantSDNode>(AndLHS)) { + std::swap(AndLHS, AndRHS); + } + + ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS); + if (!MaskCnst) { + // Mask must be constant + return NULL; + } + + uint64_t MaskVal = MaskCnst->getZExtValue(); + uint64_t NumZeros; + uint64_t NumBits; + if (isMask_64(MaskVal)) { + NumZeros = 0; + // The number of bits in the result bitfield will be the number of + // trailing ones (the AND) minus the number of bits we shift off + NumBits = CountTrailingOnes_64(MaskVal) - ShiftAmt; + } else if (isShiftedMask_64(MaskVal)) { + NumZeros = countTrailingZeros(MaskVal); + unsigned NumOnes = CountTrailingOnes_64(MaskVal >> NumZeros); + // The number of bits in the result bitfield will be the number of + // trailing zeros plus the number of set bits in the mask minus the + // number of bits we shift off + NumBits = NumZeros + NumOnes - ShiftAmt; + } else { + // This is not a mask we can handle + return NULL; + } + + if (ShiftAmt < NumZeros) { + // Handling this case would require extra logic that would make this + // transformation non-profitable + return NULL; + } + + Val = AndLHS; + Start = CurDAG->getTargetConstant(ShiftAmt, MVT::i32); + Len = CurDAG->getTargetConstant(NumBits, MVT::i32); + } else if (LHS->getOpcode() == ISD::SHL) { + // Here, we have a pattern like: + // + // (sra (shl val, NN), MM) + // or + // (srl (shl val, NN), MM) + // + // If MM >= NN, we can efficiently optimize this with bfe + Val = LHS->getOperand(0); + + SDValue ShlRHS = LHS->getOperand(1); + ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS); + if (!ShlCnst) { + // Shift amount must be constant + return NULL; + } + uint64_t InnerShiftAmt = ShlCnst->getZExtValue(); + + SDValue ShrRHS = RHS; + ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS); + if (!ShrCnst) { + // Shift amount must be constant + return NULL; + } + uint64_t OuterShiftAmt = ShrCnst->getZExtValue(); + + // To avoid extra codegen and be profitable, we need Outer >= Inner + if (OuterShiftAmt < InnerShiftAmt) { + return NULL; + } + + // If the outer shift is more than the type size, we have no bitfield to + // extract (since we also check that the inner shift is <= the outer shift + // then this also implies that the inner shift is < the type size) + if (OuterShiftAmt >= Val.getValueType().getSizeInBits()) { + return NULL; + } + + Start = + CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, MVT::i32); + Len = + CurDAG->getTargetConstant(Val.getValueType().getSizeInBits() - + OuterShiftAmt, MVT::i32); + + if (N->getOpcode() == ISD::SRA) { + // If we have an arithmetic right shift, we need to use the signed bfe + // variant + IsSigned = true; + } + } else { + // No can do... + return NULL; + } + } else { + // No can do... + return NULL; + } + + + unsigned Opc; + // For the BFE operations we form here from "and" and "srl", always use the + // unsigned variants. 
+ if (Val.getValueType() == MVT::i32) { + if (IsSigned) { + Opc = NVPTX::BFE_S32rii; + } else { + Opc = NVPTX::BFE_U32rii; + } + } else if (Val.getValueType() == MVT::i64) { + if (IsSigned) { + Opc = NVPTX::BFE_S64rii; + } else { + Opc = NVPTX::BFE_U64rii; + } + } else { + // We cannot handle this type + return NULL; + } + + SDValue Ops[] = { + Val, Start, Len + }; + + SDNode *Ret = + CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops); + + return Ret; +} + // SelectDirectAddr - Match a direct address for DAG. // A direct address could be a globaladdress or externalsymbol. bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) { diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 11f92e79d99c..c44ccb20ed88 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -59,10 +59,11 @@ private: SDNode *Select(SDNode *N) override; SDNode *SelectIntrinsicNoChain(SDNode *N); + SDNode *SelectIntrinsicChain(SDNode *N); SDNode *SelectTexSurfHandle(SDNode *N); SDNode *SelectLoad(SDNode *N); SDNode *SelectLoadVector(SDNode *N); - SDNode *SelectLDGLDUVector(SDNode *N); + SDNode *SelectLDGLDU(SDNode *N); SDNode *SelectStore(SDNode *N); SDNode *SelectStoreVector(SDNode *N); SDNode *SelectLoadParam(SDNode *N); @@ -71,6 +72,7 @@ private: SDNode *SelectAddrSpaceCast(SDNode *N); SDNode *SelectTextureIntrinsic(SDNode *N); SDNode *SelectSurfaceIntrinsic(SDNode *N); + SDNode *SelectBFE(SDNode *N); inline SDValue getI32Imm(unsigned Imm) { return CurDAG->getTargetConstant(Imm, MVT::i32); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index b0943befa88f..cb452ff07254 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include <sstream> @@ -111,6 +112,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF; setBooleanContents(ZeroOrNegativeOneBooleanContent); + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // Jump is Expensive. Don't create extra control flow for 'and', 'or' // condition branches. @@ -130,7 +132,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass); // Operations not directly supported by NVPTX. 
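The shr+and rewriting done by SelectBFE above is easiest to sanity-check against a reference model of the PTX bfe.u32 semantics. A plain C++ sketch follows; bfe_u32 is an illustrative stand-in, not the backend's code, and it ignores PTX's clamping of oversized start/len operands:

    #include <cassert>
    #include <cstdint>

    // Extract 'len' bits of 'val' starting at bit 'start', zero-extended.
    static uint32_t bfe_u32(uint32_t val, unsigned start, unsigned len) {
      if (len == 0) return 0;
      uint32_t field = val >> start;
      return len >= 32 ? field : field & ((1u << len) - 1u);
    }

    int main() {
      uint32_t x = 0xDEADBEEFu;
      // The srl+and idiom the matcher folds: (x >> 4) & 0xff == bfe(x, 4, 8)
      assert(((x >> 4) & 0xffu) == bfe_u32(x, 4, 8));
      // The shl+srl idiom: start = 24 - 8 = 16, len = 32 - 24 = 8
      assert(((x << 8) >> 24) == bfe_u32(x, 16, 8));
      return 0;
    }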
- setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i8, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); setOperationAction(ISD::BR_CC, MVT::i1, Expand); @@ -146,6 +154,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom); + if (nvptxSubtarget.hasROT64()) { setOperationAction(ISD::ROTL, MVT::i64, Legal); setOperationAction(ISD::ROTR, MVT::i64, Legal); @@ -237,6 +252,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) setOperationAction(ISD::CTPOP, MVT::i32, Legal); setOperationAction(ISD::CTPOP, MVT::i64, Legal); + // We have some custom DAG combine patterns for these nodes + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SHL); + // Now deduce the information based on the above mentioned // actions computeRegisterProperties(); @@ -328,6 +350,16 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { return "NVPTXISD::StoreV2"; case NVPTXISD::StoreV4: return "NVPTXISD::StoreV4"; + case NVPTXISD::FUN_SHFL_CLAMP: + return "NVPTXISD::FUN_SHFL_CLAMP"; + case NVPTXISD::FUN_SHFR_CLAMP: + return "NVPTXISD::FUN_SHFR_CLAMP"; + case NVPTXISD::IMAD: + return "NVPTXISD::IMAD"; + case NVPTXISD::MUL_WIDE_SIGNED: + return "NVPTXISD::MUL_WIDE_SIGNED"; + case NVPTXISD::MUL_WIDE_UNSIGNED: + return "NVPTXISD::MUL_WIDE_UNSIGNED"; case NVPTXISD::Tex1DFloatI32: return "NVPTXISD::Tex1DFloatI32"; case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; case NVPTXISD::Tex1DFloatFloatLevel: @@ -441,8 +473,12 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { } } -bool NVPTXTargetLowering::shouldSplitVectorType(EVT VT) const { - return VT.getScalarType() == MVT::i1; +TargetLoweringBase::LegalizeTypeAction +NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const { + if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) + return TypeSplitVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); } SDValue @@ -487,26 +523,12 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, } else if (isa<PointerType>(retTy)) { O << ".param .b" << getPointerTy().getSizeInBits() << " _"; } else { - if ((retTy->getTypeID() == Type::StructTyID) || isa<VectorType>(retTy)) { - SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*this, retTy, vtparts); - unsigned totalsz = 0; - for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { - unsigned elems = 1; - EVT elemtype = vtparts[i]; - if (vtparts[i].isVector()) { - elems = vtparts[i].getVectorNumElements(); - elemtype = 
vtparts[i].getVectorElementType(); - } - // TODO: no need to loop - for (unsigned j = 0, je = elems; j != je; ++j) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - totalsz += sz / 8; - } - } - O << ".param .align " << retAlignment << " .b8 _[" << totalsz << "]"; + if((retTy->getTypeID() == Type::StructTyID) || + isa<VectorType>(retTy)) { + O << ".param .align " + << retAlignment + << " .b8 _[" + << getDataLayout()->getTypeAllocSize(retTy) << "]"; } else { assert(false && "Unknown return type"); } @@ -675,7 +697,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (Ty->isAggregateType()) { // aggregate SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*this, Ty, vtparts); + SmallVector<uint64_t, 16> Offsets; + ComputePTXValueVTs(*this, Ty, vtparts, &Offsets, 0); unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1); // declare .param .align <align> .b8 .param<n>[<size>]; @@ -687,34 +710,26 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, DeclareParamOps); InFlag = Chain.getValue(1); - unsigned curOffset = 0; for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { - unsigned elems = 1; EVT elemtype = vtparts[j]; - if (vtparts[j].isVector()) { - elems = vtparts[j].getVectorNumElements(); - elemtype = vtparts[j].getVectorElementType(); - } - for (unsigned k = 0, ke = elems; k != ke; ++k) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - SDValue StVal = OutVals[OIdx]; - if (elemtype.getSizeInBits() < 16) { - StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); - } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, MVT::i32), - DAG.getConstant(curOffset, MVT::i32), - StVal, InFlag }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, - CopyParamVTs, CopyParamOps, - elemtype, MachinePointerInfo()); - InFlag = Chain.getValue(1); - curOffset += sz / 8; - ++OIdx; + unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]); + if (elemtype.isInteger() && (sz < 8)) + sz = 8; + SDValue StVal = OutVals[OIdx]; + if (elemtype.getSizeInBits() < 16) { + StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); } + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, + DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(Offsets[j], MVT::i32), + StVal, InFlag }; + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, + CopyParamVTs, CopyParamOps, + elemtype, MachinePointerInfo(), + ArgAlign); + InFlag = Chain.getValue(1); + ++OIdx; } if (vtparts.size() > 0) --OIdx; @@ -899,13 +914,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } // struct or vector SmallVector<EVT, 16> vtparts; + SmallVector<uint64_t, 16> Offsets; const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty); assert(PTy && "Type of a byval parameter should be pointer"); - ComputeValueVTs(*this, PTy->getElementType(), vtparts); + ComputePTXValueVTs(*this, PTy->getElementType(), vtparts, &Offsets, 0); // declare .param .align <align> .b8 .param<n>[<size>]; unsigned sz = Outs[OIdx].Flags.getByValSize(); SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign(); // The ByValAlign in the Outs[OIdx].Flags is always set at this point, + // so we don't need to worry about 
natural alignment or not. // See TargetLowering::LowerCallTo(). @@ -917,38 +934,28 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, DeclareParamOps); InFlag = Chain.getValue(1); - unsigned curOffset = 0; for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { - unsigned elems = 1; EVT elemtype = vtparts[j]; - if (vtparts[j].isVector()) { - elems = vtparts[j].getVectorNumElements(); - elemtype = vtparts[j].getVectorElementType(); + int curOffset = Offsets[j]; + unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset); + SDValue srcAddr = + DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx], + DAG.getConstant(curOffset, getPointerTy())); + SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, + MachinePointerInfo(), false, false, false, + PartAlign); + if (elemtype.getSizeInBits() < 16) { + theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); } - for (unsigned k = 0, ke = elems; k != ke; ++k) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - SDValue srcAddr = - DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx], - DAG.getConstant(curOffset, getPointerTy())); - SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, - MachinePointerInfo(), false, false, false, - 0); - if (elemtype.getSizeInBits() < 16) { - theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); - } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), - DAG.getConstant(curOffset, MVT::i32), theVal, - InFlag }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, - CopyParamOps, elemtype, - MachinePointerInfo()); + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(curOffset, MVT::i32), theVal, + InFlag }; + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, + CopyParamOps, elemtype, + MachinePointerInfo()); - InFlag = Chain.getValue(1); - curOffset += sz / 8; - } + InFlag = Chain.getValue(1); } ++paramCount; } @@ -1057,7 +1064,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Generate loads from param memory/moves from registers for result if (Ins.size() > 0) { - unsigned resoffset = 0; if (retTy && retTy->isVectorTy()) { EVT ObjectVT = getValueType(retTy); unsigned NumElts = ObjectVT.getVectorNumElements(); @@ -1066,14 +1072,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ObjectVT) == NumElts && "Vector was not scalarized"); unsigned sz = EltVT.getSizeInBits(); - bool needTruncate = sz < 16 ? true : false; + bool needTruncate = sz < 8 ? 
true : false; if (NumElts == 1) { // Just a simple load SmallVector<EVT, 4> LoadRetVTs; - if (needTruncate) { - // If loading i1 result, generate - // load i16 + if (EltVT == MVT::i1 || EltVT == MVT::i8) { + // If loading i1/i8 result, generate + // load.b8 i16 + // if i1 // trunc i16 to i1 LoadRetVTs.push_back(MVT::i16); } else @@ -1097,9 +1104,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } else if (NumElts == 2) { // LoadV2 SmallVector<EVT, 4> LoadRetVTs; - if (needTruncate) { - // If loading i1 result, generate - // load i16 + if (EltVT == MVT::i1 || EltVT == MVT::i8) { + // If loading i1/i8 result, generate + // load.b8 i16 + // if i1 // trunc i16 to i1 LoadRetVTs.push_back(MVT::i16); LoadRetVTs.push_back(MVT::i16); @@ -1142,9 +1150,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize); for (unsigned i = 0; i < NumElts; i += VecSize) { SmallVector<EVT, 8> LoadRetVTs; - if (needTruncate) { - // If loading i1 result, generate - // load i16 + if (EltVT == MVT::i1 || EltVT == MVT::i8) { + // If loading i1/i8 result, generate + // load.b8 i16 + // if i1 // trunc i16 to i1 for (unsigned j = 0; j < VecSize; ++j) LoadRetVTs.push_back(MVT::i16); @@ -1183,10 +1192,13 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } else { SmallVector<EVT, 16> VTs; - ComputePTXValueVTs(*this, retTy, VTs); + SmallVector<uint64_t, 16> Offsets; + ComputePTXValueVTs(*this, retTy, VTs, &Offsets, 0); assert(VTs.size() == Ins.size() && "Bad value decomposition"); + unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0); for (unsigned i = 0, e = Ins.size(); i != e; ++i) { unsigned sz = VTs[i].getSizeInBits(); + unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]); bool needTruncate = sz < 8 ? true : false; if (VTs[i].isInteger() && (sz < 8)) sz = 8; @@ -1212,19 +1224,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<SDValue, 4> LoadRetOps; LoadRetOps.push_back(Chain); LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); - LoadRetOps.push_back(DAG.getConstant(resoffset, MVT::i32)); + LoadRetOps.push_back(DAG.getConstant(Offsets[i], MVT::i32)); LoadRetOps.push_back(InFlag); SDValue retval = DAG.getMemIntrinsicNode( NVPTXISD::LoadParam, dl, DAG.getVTList(LoadRetVTs), LoadRetOps, - TheLoadType, MachinePointerInfo()); + TheLoadType, MachinePointerInfo(), AlignI); Chain = retval.getValue(1); InFlag = retval.getValue(2); SDValue Ret0 = retval.getValue(0); if (needTruncate) Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0); InVals.push_back(Ret0); - resoffset += sz / 8; } } } @@ -1262,6 +1273,127 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops); } +/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which +/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift +/// amount, or +/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift +/// amount. 
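The doc comment above states the contract; the sketch below is a plain C++ model of the generic (non-shf, pre-sm_35) SRL_PARTS dataflow that LowerShiftRightParts implements right after this, checked against a native 64-bit shift. Illustrative only: the C version needs an explicit Amt >= 32 branch and an Amt == 0 guard where the PTX code can rely on clamped shift amounts:

    #include <cassert>
    #include <cstdint>

    static void srlParts(uint32_t Hi, uint32_t Lo, unsigned Amt,
                         uint32_t &OutHi, uint32_t &OutLo) {
      const unsigned VTBits = 32;
      if (Amt >= VTBits) {                    // the TrueVal path
        OutLo = Hi >> (Amt - VTBits);
        OutHi = 0;                            // logical shift: all zeros
      } else {                                // the FalseVal path
        OutLo = (Lo >> Amt) | (Amt ? Hi << (VTBits - Amt) : 0);
        OutHi = Hi >> Amt;
      }
    }

    int main() {
      uint64_t v = 0x0123456789ABCDEFull;
      for (unsigned amt = 0; amt < 64; ++amt) {
        uint32_t hi, lo;
        srlParts(uint32_t(v >> 32), uint32_t(v), amt, hi, lo);
        assert(((uint64_t(hi) << 32) | lo) == v >> amt);
      }
      return 0;
    }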
+SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); + + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; + + if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) { + + // For 32bit and sm35, we can use the funnel shift 'shf' instruction. + // {dHi, dLo} = {aHi, aLo} >> Amt + // dHi = aHi >> Amt + // dLo = shf.r.clamp aLo, aHi, Amt + + SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, + ShAmt); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } + else { + + // {dHi, dLo} = {aHi, aLo} >> Amt + // - if (Amt>=size) then + // dLo = aHi >> (Amt-size) + // dHi = aHi >> Amt (this is either all 0 or all 1) + // else + // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) + // dHi = aHi >> Amt + + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, MVT::i32), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + + SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, + DAG.getConstant(VTBits, MVT::i32), ISD::SETGE); + SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } +} + +/// LowerShiftLeftParts - Lower SHL_PARTS, which +/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift +/// amount, or +/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift +/// amount. +SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + assert(Op.getOpcode() == ISD::SHL_PARTS); + + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + + if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) { + + // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
+ // {dHi, dLo} = {aHi, aLo} << Amt + // dHi = shf.l.clamp aLo, aHi, Amt + // dLo = aLo << Amt + + SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi, + ShAmt); + SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } + else { + + // {dHi, dLo} = {aHi, aLo} << Amt + // - if (Amt>=size) then + // dLo = aLo << Amt (all 0) + // dHi = aLo << (Amt-size) + // else + // dLo = aLo << Amt + // dHi = (aHi << Amt) | (aLo >> (size-Amt)) + + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, MVT::i32), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + + SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, + DAG.getConstant(VTBits, MVT::i32), ISD::SETGE); + SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } +} + SDValue NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -1282,6 +1414,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerSTORE(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::SHL_PARTS: + return LowerShiftLeftParts(Op, DAG); + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: + return LowerShiftRightParts(Op, DAG); default: llvm_unreachable("Custom lowering not defined for operation"); } @@ -1495,7 +1632,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( const Function *F = MF.getFunction(); const AttributeSet &PAL = F->getAttributes(); - const TargetLowering *TLI = nvTM->getTargetLowering(); + const TargetLowering *TLI = DAG.getTarget().getTargetLowering(); SDValue Root = DAG.getRoot(); std::vector<SDValue> OutChains; @@ -1549,8 +1686,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( assert(vtparts.size() > 0 && "empty aggregate type not expected"); for (unsigned parti = 0, parte = vtparts.size(); parti != parte; ++parti) { - EVT partVT = vtparts[parti]; - InVals.push_back(DAG.getNode(ISD::UNDEF, dl, partVT)); + InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); ++InsIdx; } if (vtparts.size() > 0) @@ -1866,7 +2002,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, unsigned Offset = 0; EVT VecVT = - EVT::getVectorVT(F->getContext(), OutVals[0].getValueType(), VecSize); + EVT::getVectorVT(F->getContext(), EltVT, VecSize); unsigned PerStoreOffset = TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); @@ -1925,12 +2061,10 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, } } else { SmallVector<EVT, 16> ValVTs; - // const_cast is necessary since we are still using an LLVM version from - // before the type system re-write. 
- ComputePTXValueVTs(*this, RetTy, ValVTs); + SmallVector<uint64_t, 16> Offsets; + ComputePTXValueVTs(*this, RetTy, ValVTs, &Offsets, 0); assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition"); - unsigned SizeSoFar = 0; for (unsigned i = 0, e = Outs.size(); i != e; ++i) { SDValue theVal = OutVals[i]; EVT TheValType = theVal.getValueType(); @@ -1954,16 +2088,14 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, else if (TmpVal.getValueType().getSizeInBits() < 16) TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal); - SDValue Ops[] = { Chain, DAG.getConstant(SizeSoFar, MVT::i32), TmpVal }; + SDValue Ops[] = { + Chain, + DAG.getConstant(Offsets[i], MVT::i32), + TmpVal }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, DAG.getVTList(MVT::Other), Ops, TheStoreType, MachinePointerInfo()); - if(TheValType.isVector()) - SizeSoFar += - TheStoreType.getVectorElementType().getStoreSizeInBits() / 8; - else - SizeSoFar += TheStoreType.getStoreSizeInBits()/8; } } } @@ -2220,22 +2352,62 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_f: - case Intrinsic::nvvm_ldu_global_p: + case Intrinsic::nvvm_ldu_global_p: { Info.opc = ISD::INTRINSIC_W_CHAIN; if (Intrinsic == Intrinsic::nvvm_ldu_global_i) Info.memVT = getValueType(I.getType()); - else if (Intrinsic == Intrinsic::nvvm_ldu_global_p) + else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) + Info.memVT = getPointerTy(); + else Info.memVT = getValueType(I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + + // alignment is available as metadata. + // Grab it and set the alignment. + assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata"); + MDNode *AlignMD = I.getMetadata("align"); + assert(AlignMD && "Must have a non-null MDNode"); + assert(AlignMD->getNumOperands() == 1 && "Must have a single operand"); + Value *Align = AlignMD->getOperand(0); + int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue(); + Info.align = Alignment; + + return true; + } + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_p: { + + Info.opc = ISD::INTRINSIC_W_CHAIN; + if (Intrinsic == Intrinsic::nvvm_ldg_global_i) + Info.memVT = getValueType(I.getType()); + else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) + Info.memVT = getPointerTy(); else - Info.memVT = MVT::f32; + Info.memVT = getValueType(I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.vol = 0; Info.readMem = true; Info.writeMem = false; - Info.align = 0; + + // alignment is available as metadata. + // Grab it and set the alignment. 
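The alignment that getTgtMemIntrinsic reads here comes from an "align" metadata node with a single i32 operand, attached to the ldu/ldg call by the front end. A hypothetical helper showing what producing that metadata could look like against the LLVM 3.5-era API (where MDNode operands are still Values); the function name is invented, and this is a sketch rather than anything in this patch:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Metadata.h"

    // Attach the !align metadata that the lowering below asserts on.
    static void setAlignMetadata(llvm::CallInst *LD, unsigned Align) {
      llvm::LLVMContext &Ctx = LD->getContext();
      llvm::Value *AlignV =
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Align);
      LD->setMetadata("align", llvm::MDNode::get(Ctx, AlignV));
    }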
+ assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata"); + MDNode *AlignMD = I.getMetadata("align"); + assert(AlignMD && "Must have a non-null MDNode"); + assert(AlignMD->getNumOperands() == 1 && "Must have a single operand"); + Value *Align = AlignMD->getOperand(0); + int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue(); + Info.align = Alignment; + return true; + } case Intrinsic::nvvm_tex_1d_v4f32_i32: case Intrinsic::nvvm_tex_1d_v4f32_f32: @@ -2427,6 +2599,7 @@ NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const { switch (Constraint[0]) { default: break; + case 'b': case 'r': case 'h': case 'c': @@ -2446,6 +2619,8 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { + case 'b': + return std::make_pair(0U, &NVPTX::Int1RegsRegClass); case 'c': return std::make_pair(0U, &NVPTX::Int16RegsRegClass); case 'h': @@ -2469,6 +2644,406 @@ unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const { return 4; } +//===----------------------------------------------------------------------===// +// NVPTX DAG Combining +//===----------------------------------------------------------------------===// + +extern unsigned FMAContractLevel; + +/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with +/// operands N0 and N1. This is a helper for PerformADDCombine that is +/// called with the default operands, and if that fails, with commuted +/// operands. +static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const NVPTXSubtarget &Subtarget, + CodeGenOpt::Level OptLevel) { + SelectionDAG &DAG = DCI.DAG; + // Skip non-integer, non-scalar case + EVT VT=N0.getValueType(); + if (VT.isVector()) + return SDValue(); + + // fold (add (mul a, b), c) -> (mad a, b, c) + // + if (N0.getOpcode() == ISD::MUL) { + assert (VT.isInteger()); + // For integer: + // Since integer multiply-add costs the same as integer multiply + // but is more costly than integer add, do the fusion only when + // the mul is only used in the add. + if (OptLevel==CodeGenOpt::None || VT != MVT::i32 || + !N0.getNode()->hasOneUse()) + return SDValue(); + + // Do the folding + return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), N1); + } + else if (N0.getOpcode() == ISD::FMUL) { + if (VT == MVT::f32 || VT == MVT::f64) { + if (FMAContractLevel == 0) + return SDValue(); + + // For floating point: + // Do the fusion only when the mul has less than 5 uses and all + // are add. + // The heuristic is that if a use is not an add, then that use + // cannot be fused into fma, therefore mul is still needed anyway. + // If there are more than 4 uses, even if they are all add, fusing + // them will increase register pressure. + // + int numUses = 0; + int nonAddCount = 0; + for (SDNode::use_iterator UI = N0.getNode()->use_begin(), + UE = N0.getNode()->use_end(); + UI != UE; ++UI) { + numUses++; + SDNode *User = *UI; + if (User->getOpcode() != ISD::FADD) + ++nonAddCount; + } + if (numUses >= 5) + return SDValue(); + if (nonAddCount) { + int orderNo = N->getIROrder(); + int orderNo2 = N0.getNode()->getIROrder(); + // simple heuristics here for considering potential register + // pressure; the logic here is that the difference is used + // to measure the distance between def and use; the longer the distance, + // the more likely it is to cause register pressure. 
+ if (orderNo - orderNo2 < 500) + return SDValue(); + + // Now, check if at least one of the FMUL's operands is live beyond the node N, + // which guarantees that the FMA will not increase register pressure at node N. + bool opIsLive = false; + const SDNode *left = N0.getOperand(0).getNode(); + const SDNode *right = N0.getOperand(1).getNode(); + + if (dyn_cast<ConstantSDNode>(left) || dyn_cast<ConstantSDNode>(right)) + opIsLive = true; + + if (!opIsLive) + for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + int orderNo3 = User->getIROrder(); + if (orderNo3 > orderNo) { + opIsLive = true; + break; + } + } + + if (!opIsLive) + for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + int orderNo3 = User->getIROrder(); + if (orderNo3 > orderNo) { + opIsLive = true; + break; + } + } + + if (!opIsLive) + return SDValue(); + } + + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), N1); + } + } + + return SDValue(); +} + +/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. +/// +static SDValue PerformADDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const NVPTXSubtarget &Subtarget, + CodeGenOpt::Level OptLevel) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // First try with the default operand order. + SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, + OptLevel); + if (Result.getNode()) + return Result; + + // If that didn't work, try again with the operands commuted. + return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); +} + +static SDValue PerformANDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // The type legalizer turns a vector load of i8 values into a zextload to i16 + // registers, optionally ANY_EXTENDs it (if target type is integer), + // and ANDs off the high 8 bits. Since we turn this load into a + // target-specific DAG node, the DAG combiner fails to eliminate these AND + // nodes. Do that here. + SDValue Val = N->getOperand(0); + SDValue Mask = N->getOperand(1); + + if (isa<ConstantSDNode>(Val)) { + std::swap(Val, Mask); + } + + SDValue AExt; + // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and + if (Val.getOpcode() == ISD::ANY_EXTEND) { + AExt = Val; + Val = Val->getOperand(0); + } + + if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { + Val = Val->getOperand(0); + } + + if (Val->getOpcode() == NVPTXISD::LoadV2 || + Val->getOpcode() == NVPTXISD::LoadV4) { + ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); + if (!MaskCnst) { + // Not an AND with a constant + return SDValue(); + } + + uint64_t MaskVal = MaskCnst->getZExtValue(); + if (MaskVal != 0xff) { + // Not an AND that chops off top 8 bits + return SDValue(); + } + + MemSDNode *Mem = dyn_cast<MemSDNode>(Val); + if (!Mem) { + // Not a MemSDNode?!? + return SDValue(); + } + + EVT MemVT = Mem->getMemoryVT(); + if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { + // We only handle the i8 case + return SDValue(); + } + + unsigned ExtType = + cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))-> + getZExtValue(); + if (ExtType == ISD::SEXTLOAD) { + // If for some reason the load is a sextload, the and is needed to zero + // out the high 8 bits + return SDValue(); + } + + bool AddTo = false; + if (AExt.getNode() != 0) { + // Re-insert the ext as a zext. 
+ Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), + AExt.getValueType(), Val); + AddTo = true; + } + + // If we get here, the AND is unnecessary. Just replace it with the load + DCI.CombineTo(N, Val, AddTo); + } + + return SDValue(); +} + +enum OperandSignedness { + Signed = 0, + Unsigned, + Unknown +}; + +/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand +/// that can be demoted to \p OptSize bits without loss of information. The +/// signedness of the operand, if determinable, is placed in \p S. +static bool IsMulWideOperandDemotable(SDValue Op, + unsigned OptSize, + OperandSignedness &S) { + S = Unknown; + + if (Op.getOpcode() == ISD::SIGN_EXTEND || + Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { + EVT OrigVT = Op.getOperand(0).getValueType(); + if (OrigVT.getSizeInBits() == OptSize) { + S = Signed; + return true; + } + } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { + EVT OrigVT = Op.getOperand(0).getValueType(); + if (OrigVT.getSizeInBits() == OptSize) { + S = Unsigned; + return true; + } + } + + return false; +} + +/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can +/// be demoted to \p OptSize bits without loss of information. If the operands +/// contain a constant, it should appear as the RHS operand. The signedness of +/// the operands is placed in \p IsSigned. +static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, + unsigned OptSize, + bool &IsSigned) { + + OperandSignedness LHSSign; + + // The LHS operand must be a demotable op + if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) + return false; + + // We should have been able to determine the signedness from the LHS + if (LHSSign == Unknown) + return false; + + IsSigned = (LHSSign == Signed); + + // The RHS can be a demotable op or a constant + if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { + APInt Val = CI->getAPIntValue(); + if (LHSSign == Unsigned) { + if (Val.isIntN(OptSize)) { + return true; + } + return false; + } else { + if (Val.isSignedIntN(OptSize)) { + return true; + } + return false; + } + } else { + OperandSignedness RHSSign; + if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) + return false; + + if (LHSSign != RHSSign) + return false; + + return true; + } +} + +/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply +/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform +/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift +/// amount. 
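The demotion test that TryMULWIDECombine (below) applies to a constant operand is just N-bit representability, and its SHL handling treats x << k as x * 2^k. A small self-contained check of both facts; isSignedIntN here is an illustrative stand-in for APInt::isSignedIntN, not the backend's code:

    #include <cassert>
    #include <cstdint>

    // True iff v fits in N signed bits, i.e. may feed mul.wide.s16 when N==16.
    static bool isSignedIntN(int64_t v, unsigned N) {
      int64_t bound = int64_t(1) << (N - 1);
      return v >= -bound && v < bound;
    }

    int main() {
      assert(isSignedIntN(32767, 16));   // demotable: 16x16 -> 32 multiply
      assert(!isSignedIntN(40000, 16));  // not demotable: keep the 32-bit mul
      // The SHL form: a left shift by a constant amount is a multiply by a
      // power of two, so 'x << 3' can become mul.wide with RHS = 8.
      uint32_t x = 1234;
      assert((x << 3) == x * 8u);
      return 0;
    }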
+static SDValue TryMULWIDECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT MulType = N->getValueType(0); + if (MulType != MVT::i32 && MulType != MVT::i64) { + return SDValue(); + } + + unsigned OptSize = MulType.getSizeInBits() >> 1; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Canonicalize the multiply so the constant (if any) is on the right + if (N->getOpcode() == ISD::MUL) { + if (isa<ConstantSDNode>(LHS)) { + std::swap(LHS, RHS); + } + } + + // If we have a SHL, determine the actual multiply amount + if (N->getOpcode() == ISD::SHL) { + ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); + if (!ShlRHS) { + return SDValue(); + } + + APInt ShiftAmt = ShlRHS->getAPIntValue(); + unsigned BitWidth = MulType.getSizeInBits(); + if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { + APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; + RHS = DCI.DAG.getConstant(MulVal, MulType); + } else { + return SDValue(); + } + } + + bool Signed; + // Verify that our operands are demotable + if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { + return SDValue(); + } + + EVT DemotedVT; + if (MulType == MVT::i32) { + DemotedVT = MVT::i16; + } else { + DemotedVT = MVT::i32; + } + + // Truncate the operands to the correct size. Note that these are just for + // type consistency and will (likely) be eliminated in later phases. + SDValue TruncLHS = + DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, LHS); + SDValue TruncRHS = + DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, RHS); + + unsigned Opc; + if (Signed) { + Opc = NVPTXISD::MUL_WIDE_SIGNED; + } else { + Opc = NVPTXISD::MUL_WIDE_UNSIGNED; + } + + return DCI.DAG.getNode(Opc, SDLoc(N), MulType, TruncLHS, TruncRHS); +} + +/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. +static SDValue PerformMULCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + CodeGenOpt::Level OptLevel) { + if (OptLevel > 0) { + // Try mul.wide combining at OptLevel > 0 + SDValue Ret = TryMULWIDECombine(N, DCI); + if (Ret.getNode()) + return Ret; + } + + return SDValue(); +} + +/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes. +static SDValue PerformSHLCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + CodeGenOpt::Level OptLevel) { + if (OptLevel > 0) { + // Try mul.wide combining at OptLevel > 0 + SDValue Ret = TryMULWIDECombine(N, DCI); + if (Ret.getNode()) + return Ret; + } + + return SDValue(); +} + +SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + // FIXME: Get this from the DAG somehow + CodeGenOpt::Level OptLevel = CodeGenOpt::Aggressive; + switch (N->getOpcode()) { + default: break; + case ISD::ADD: + case ISD::FADD: + return PerformADDCombine(N, DCI, nvptxSubtarget, OptLevel); + case ISD::MUL: + return PerformMULCombine(N, DCI, OptLevel); + case ISD::SHL: + return PerformSHLCombine(N, DCI, OptLevel); + case ISD::AND: + return PerformANDCombine(N, DCI); + } + return SDValue(); +} + /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. 
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) { diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h index 7bad8a28f323..7b4026d8fba0 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -16,7 +16,6 @@ #define NVPTXISELLOWERING_H #include "NVPTX.h" -#include "NVPTXSubtarget.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" @@ -50,6 +49,11 @@ enum NodeType { CallSeqBegin, CallSeqEnd, CallPrototype, + FUN_SHFL_CLAMP, + FUN_SHFR_CLAMP, + MUL_WIDE_SIGNED, + MUL_WIDE_UNSIGNED, + IMAD, Dummy, LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE, @@ -167,6 +171,8 @@ enum NodeType { }; } +class NVPTXSubtarget; + //===--------------------------------------------------------------------===// // TargetLowering Implementation //===--------------------------------------------------------------------===// @@ -196,9 +202,9 @@ public: /// getFunctionAlignment - Return the Log2 alignment of this function. unsigned getFunctionAlignment(const Function *F) const; - EVT getSetCCResultType(LLVMContext &, EVT VT) const override { + EVT getSetCCResultType(LLVMContext &Ctx, EVT VT) const override { if (VT.isVector()) - return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); return MVT::i1; } @@ -236,7 +242,8 @@ public: // PTX always uses 32-bit shift amounts MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } - bool shouldSplitVectorType(EVT VT) const override; + TargetLoweringBase::LegalizeTypeAction + getPreferredVectorAction(EVT VT) const override; private: const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here @@ -255,8 +262,12 @@ private: SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS, Type *Ty, unsigned Idx) const; diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp index 397f4bccfe2b..a98fb37f6e25 100644 --- a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp +++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp @@ -146,7 +146,7 @@ bool NVPTXImageOptimizer::replaceIsTypePTexture(Instruction &I) { void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) { // We implement "poor man's DCE" here to make sure any code that is no longer // live is actually unreachable and can be trivially eliminated by the - // unreachable block elimiation pass. + // unreachable block elimination pass. for (CallInst::use_iterator UI = From->use_begin(), UE = From->use_end(); UI != UE; ++UI) { if (BranchInst *BI = dyn_cast<BranchInst>(*UI)) { diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index cdc80887dc28..b5b4fbed0799 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -29,8 +29,8 @@ using namespace llvm; void NVPTXInstrInfo::anchor() {} // FIXME: Add the subtarget support on this constructor. 
-NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm) - : NVPTXGenInstrInfo(), TM(tm), RegInfo(*TM.getSubtargetImpl()) {} +NVPTXInstrInfo::NVPTXInstrInfo(NVPTXSubtarget &STI) + : NVPTXGenInstrInfo(), RegInfo(STI) {} void NVPTXInstrInfo::copyPhysReg( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h index 88a9e45f25f5..2ac29748676a 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -24,11 +24,10 @@ namespace llvm { class NVPTXInstrInfo : public NVPTXGenInstrInfo { - NVPTXTargetMachine &TM; const NVPTXRegisterInfo RegInfo; virtual void anchor(); public: - explicit NVPTXInstrInfo(NVPTXTargetMachine &TM); + explicit NVPTXInstrInfo(NVPTXSubtarget &STI); const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; } diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index fbcd0e4a358f..d2c0373b9998 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -158,9 +158,12 @@ def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">; def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">; +def noHWROT32 : Predicate<"!Subtarget.hasHWROT32()">; def true : Predicate<"1">; +def hasPTX31 : Predicate<"Subtarget.getPTXVersion() >= 31">; + //===----------------------------------------------------------------------===// // Some Common Instruction Class Templates @@ -461,33 +464,45 @@ def SHL2MUL16 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(temp.shl(v), MVT::i16); }]>; -def MULWIDES64 : NVPTXInst<(outs Int64Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b), +def MULWIDES64 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + "mul.wide.s32 \t$dst, $a, $b;", []>; +def MULWIDES64Imm + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), "mul.wide.s32 \t$dst, $a, $b;", []>; -def MULWIDES64Imm : NVPTXInst<(outs Int64Regs:$dst), - (ins Int32Regs:$a, i64imm:$b), +def MULWIDES64Imm64 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), "mul.wide.s32 \t$dst, $a, $b;", []>; -def MULWIDEU64 : NVPTXInst<(outs Int64Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b), +def MULWIDEU64 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + "mul.wide.u32 \t$dst, $a, $b;", []>; +def MULWIDEU64Imm + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), "mul.wide.u32 \t$dst, $a, $b;", []>; -def MULWIDEU64Imm : NVPTXInst<(outs Int64Regs:$dst), - (ins Int32Regs:$a, i64imm:$b), +def MULWIDEU64Imm64 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), "mul.wide.u32 \t$dst, $a, $b;", []>; -def MULWIDES32 : NVPTXInst<(outs Int32Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b), +def MULWIDES32 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), "mul.wide.s16 \t$dst, $a, $b;", []>; -def MULWIDES32Imm : NVPTXInst<(outs Int32Regs:$dst), - (ins Int16Regs:$a, i32imm:$b), +def MULWIDES32Imm + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + "mul.wide.s16 \t$dst, $a, $b;", []>; +def MULWIDES32Imm32 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), "mul.wide.s16 \t$dst, $a, $b;", []>; -def MULWIDEU32 : NVPTXInst<(outs Int32Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b), - "mul.wide.u16 \t$dst, $a, $b;", []>; -def MULWIDEU32Imm : NVPTXInst<(outs Int32Regs:$dst), - (ins Int16Regs:$a, i32imm:$b), +def MULWIDEU32 + : NVPTXInst<(outs 
Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; +def MULWIDEU32Imm + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), "mul.wide.u16 \t$dst, $a, $b;", []>; +def MULWIDEU32Imm32 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)), (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>, @@ -507,25 +522,63 @@ def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)), (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>; def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)), - (MULWIDES64Imm Int32Regs:$a, (i64 SInt32Const:$b))>, + (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>, Requires<[doMulWide]>; def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)), - (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>; + (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)), - (MULWIDEU64Imm Int32Regs:$a, (i64 UInt32Const:$b))>, + (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>, Requires<[doMulWide]>; def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)), - (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; + (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)), - (MULWIDES32Imm Int16Regs:$a, (i32 SInt16Const:$b))>, + (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>, Requires<[doMulWide]>; def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)), - (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>; + (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)), - (MULWIDEU32Imm Int16Regs:$a, (i32 UInt16Const:$b))>, + (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>, + Requires<[doMulWide]>; + + +def SDTMulWide + : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>; +def mul_wide_signed + : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; +def mul_wide_unsigned + : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; + +def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)), + (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)), + (MULWIDES32Imm Int16Regs:$a, imm:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)), + (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)), + (MULWIDEU32Imm Int16Regs:$a, imm:$b)>, + Requires<[doMulWide]>; + + +def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)), + (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)), + (MULWIDES64Imm Int32Regs:$a, imm:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)), + (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)), + (MULWIDEU64Imm Int32Regs:$a, imm:$b)>, Requires<[doMulWide]>; defm MULT : I3<"mul.lo.s", mul>; @@ -541,69 +594,75 @@ defm SREM : I3<"rem.s", srem>; defm UREM : I3<"rem.u", urem>; // The ri version will not be selected as DAGCombiner::visitUREM will lower it. 
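As an aside on why the mul.wide demotion wired up above is lossless: the full product of two N-bit integers always fits in 2N bits, so a 2N-bit multiply whose operands are sign or zero extensions of N-bit values can keep N-bit operands and still produce every possible result. A minimal host-side C++ sketch of these semantics (a standalone illustration written for this note, not code from the patch; the helper names are invented):

#include <cassert>
#include <cstdint>

// mul.wide.s16-style semantics: multiply two 16-bit inputs and keep the full
// 32-bit product. For signed 16-bit inputs |a*b| <= 2^30, so nothing is lost,
// which is the property TryMULWIDECombine relies on.
int32_t mul_wide_s16(int16_t a, int16_t b) {
  return int32_t(a) * int32_t(b);
}

// mul.wide.u16-style semantics: 0xFFFF * 0xFFFF = 0xFFFE0001, which still
// fits in the unsigned 32-bit result.
uint32_t mul_wide_u16(uint16_t a, uint16_t b) {
  return uint32_t(a) * uint32_t(b);
}

int main() {
  assert(mul_wide_s16(INT16_MIN, INT16_MIN) == (int32_t(1) << 30));
  assert(mul_wide_u16(UINT16_MAX, UINT16_MAX) == 0xFFFE0001u);
  return 0;
}

The SHL path reduces to the same transform: TryMULWIDECombine first rewrites shl x, c as a multiply by 1 << c, so a left shift of an extended value can also be selected as mul.wide.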
+def SDTIMAD + : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, + SDTCisInt<2>, SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>; +def imad + : SDNode<"NVPTXISD::IMAD", SDTIMAD>; + def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add - (mul Int16Regs:$a, Int16Regs:$b), Int16Regs:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>; def MAD16rri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add - (mul Int16Regs:$a, Int16Regs:$b), imm:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>; def MAD16rir : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add - (mul Int16Regs:$a, imm:$b), Int16Regs:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>; def MAD16rii : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b, i16imm:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (add (mul Int16Regs:$a, imm:$b), - imm:$c))]>; + [(set Int16Regs:$dst, + (imad Int16Regs:$a, imm:$b, imm:$c))]>; def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, Int32Regs:$b), Int32Regs:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>; def MAD32rri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, Int32Regs:$b), imm:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>; def MAD32rir : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, imm:$b), Int32Regs:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>; def MAD32rii : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b, i32imm:$c), "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (add - (mul Int32Regs:$a, imm:$b), imm:$c))]>; + [(set Int32Regs:$dst, + (imad Int32Regs:$a, imm:$b, imm:$c))]>; def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, Int64Regs:$b), Int64Regs:$c))]>; + [(set Int64Regs:$dst, + (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>; def MAD64rri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, Int64Regs:$b), imm:$c))]>; + [(set Int64Regs:$dst, + (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>; def MAD64rir : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, imm:$b), Int64Regs:$c))]>; + [(set Int64Regs:$dst, + (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>; def MAD64rii : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b, i64imm:$c), "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (add - (mul Int64Regs:$a, imm:$b), imm:$c))]>; - + [(set Int64Regs:$dst, + (imad Int64Regs:$a, imm:$b, imm:$c))]>; def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins 
Int16Regs:$src), "neg.s16 \t$dst, $src;", @@ -809,36 +868,26 @@ multiclass FPCONTRACT32<string OpcStr, Predicate Pred> { def rrr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, Float32Regs:$b), - Float32Regs:$c))]>, Requires<[Pred]>; - // This is to WAR a weird bug in Tablegen that does not automatically - // generate the following permutated rule rrr2 from the above rrr. - // So we explicitly add it here. This happens to FMA32 only. - // See the comments at FMAD32 and FMA32 for more information. - def rrr2 : NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd Float32Regs:$c, - (fmul Float32Regs:$a, Float32Regs:$b)))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, Float32Regs:$b, Float32Regs:$c))]>, Requires<[Pred]>; def rri : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, Float32Regs:$b), fpimm:$c))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, Float32Regs:$b, fpimm:$c))]>, Requires<[Pred]>; def rir : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b, Float32Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, fpimm:$b), Float32Regs:$c))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, fpimm:$b, Float32Regs:$c))]>, Requires<[Pred]>; def rii : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b, f32imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float32Regs:$dst, (fadd - (fmul Float32Regs:$a, fpimm:$b), fpimm:$c))]>, + [(set Float32Regs:$dst, + (fma Float32Regs:$a, fpimm:$b, fpimm:$c))]>, Requires<[Pred]>; } @@ -846,73 +895,32 @@ multiclass FPCONTRACT64<string OpcStr, Predicate Pred> { def rrr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd - (fmul Float64Regs:$a, Float64Regs:$b), - Float64Regs:$c))]>, Requires<[Pred]>; + [(set Float64Regs:$dst, + (fma Float64Regs:$a, Float64Regs:$b, Float64Regs:$c))]>, + Requires<[Pred]>; def rri : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b, f64imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd (fmul Float64Regs:$a, - Float64Regs:$b), fpimm:$c))]>, Requires<[Pred]>; + [(set Float64Regs:$dst, + (fma Float64Regs:$a, Float64Regs:$b, fpimm:$c))]>, + Requires<[Pred]>; def rir : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b, Float64Regs:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd - (fmul Float64Regs:$a, fpimm:$b), Float64Regs:$c))]>, + [(set Float64Regs:$dst, + (fma Float64Regs:$a, fpimm:$b, Float64Regs:$c))]>, Requires<[Pred]>; def rii : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b, f64imm:$c), !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set Float64Regs:$dst, (fadd - (fmul Float64Regs:$a, fpimm:$b), fpimm:$c))]>, + [(set Float64Regs:$dst, + (fma Float64Regs:$a, fpimm:$b, fpimm:$c))]>, Requires<[Pred]>; } -// Due to a unknown reason (most likely a bug in tablegen), tablegen does not -// automatically generate the rrr2 rule from -// the rrr rule (see FPCONTRACT32) for FMA32, though it does for FMAD32. 
-// If we reverse the order of the following two lines, then rrr2 rule will be -// generated for FMA32, but not for rrr. -// Therefore, we manually write the rrr2 rule in FPCONTRACT32. -defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>; -defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>; -defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>; - -// b*c-a => fmad(b, c, -a) -multiclass FPCONTRACT32_SUB_PAT_MAD<NVPTXInst Inst, Predicate Pred> { - def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a), - (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>, - Requires<[Pred]>; -} - -// a-b*c => fmad(-b,c, a) -// - legal because a-b*c <=> a+(-b*c) <=> a+(-b)*c -// b*c-a => fmad(b, c, -a) -// - legal because b*c-a <=> b*c+(-a) -multiclass FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> { - def : Pat<(fsub Float32Regs:$a, (fmul Float32Regs:$b, Float32Regs:$c)), - (Inst (FNEGf32 Float32Regs:$b), Float32Regs:$c, Float32Regs:$a)>, - Requires<[Pred]>; - def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a), - (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>, - Requires<[Pred]>; -} - -// a-b*c => fmad(-b,c, a) -// b*c-a => fmad(b, c, -a) -multiclass FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> { - def : Pat<(fsub Float64Regs:$a, (fmul Float64Regs:$b, Float64Regs:$c)), - (Inst (FNEGf64 Float64Regs:$b), Float64Regs:$c, Float64Regs:$a)>, - Requires<[Pred]>; - - def : Pat<(fsub (fmul Float64Regs:$b, Float64Regs:$c), Float64Regs:$a), - (Inst Float64Regs:$b, Float64Regs:$c, (FNEGf64 Float64Regs:$a))>, - Requires<[Pred]>; -} - -defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>; -defm FMAF32ext : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>; -defm FMAF64ext : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>; +defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>; +defm FMA32 : FPCONTRACT32<"fma.rn.f32", doNoF32FTZ>; +defm FMA64 : FPCONTRACT64<"fma.rn.f64", doNoF32FTZ>; def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), "sin.approx.f32 \t$dst, $src;", @@ -1083,6 +1091,43 @@ multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode> { defm SRA : RSHIFT_FORMAT<"shr.s", sra>; defm SRL : RSHIFT_FORMAT<"shr.u", srl>; +// +// Rotate: use ptx shf instruction if available. 
+// + +// 32 bit r2 = rotl r1, n +// => +// r2 = shf.l r1, r1, n +def ROTL32imm_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]> ; + +def ROTL32reg_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]>; + +// 32 bit r2 = rotr r1, n +// => +// r2 = shf.r r1, r1, n +def ROTR32imm_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]>; + +def ROTR32reg_hw : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, Int32Regs:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]>; + +// +// Rotate: if ptx shf instruction is not available, then use shift+add +// // 32bit def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2), @@ -1100,9 +1145,11 @@ def SUB_FRM_32 : SDNodeXForm<imm, [{ }]>; def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>; + (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, + Requires<[noHWROT32]>; def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>; + (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>, + Requires<[noHWROT32]>; def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), @@ -1115,7 +1162,8 @@ def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, !strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t", !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", !strconcat("}}", ""))))))))), - [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>; + [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[noHWROT32]>; def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), @@ -1128,7 +1176,8 @@ def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, !strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t", !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t", !strconcat("}}", ""))))))))), - [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>; + [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[noHWROT32]>; // 64bit def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, @@ -1177,6 +1226,29 @@ def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, !strconcat("}}", ""))))))))), [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>; +// BFE - bit-field extract + +multiclass BFE<string TyStr, RegisterClass RC> { + // BFE supports both 32-bit and 64-bit values, but the start and length + // operands are always 32-bit + def rrr + : NVPTXInst<(outs RC:$d), + (ins RC:$a, Int32Regs:$b, Int32Regs:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; + def rri + : NVPTXInst<(outs RC:$d), + (ins RC:$a, Int32Regs:$b, i32imm:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; + def rii + : NVPTXInst<(outs RC:$d), + (ins RC:$a, i32imm:$b, i32imm:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; +} + +defm BFE_S32 : BFE<"s32", Int32Regs>; 
+defm BFE_U32 : BFE<"u32", Int32Regs>; +defm BFE_S64 : BFE<"s64", Int64Regs>; +defm BFE_U64 : BFE<"u64", Int64Regs>; //----------------------------------- // General Comparison //----------------------------------- @@ -1292,6 +1364,32 @@ def : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)), (ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a), (ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>; +// +// Funnel shift in clamp mode +// +// - SDNodes are created so they can be used in the DAG code, +// e.g. NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) +// +def SDTIntShiftDOp: SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisInt<3>]>; +def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; +def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; + +def FUNSHFLCLAMP : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (FUN_SHFL_CLAMP Int32Regs:$lo, + Int32Regs:$hi, Int32Regs:$amt))]>; + +def FUNSHFRCLAMP : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (FUN_SHFR_CLAMP Int32Regs:$lo, + Int32Regs:$hi, Int32Regs:$amt))]>; + //----------------------------------- // Data Movement (Load / Store, Move) //----------------------------------- diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 5e228fc396ca..0ad3dfaa4c8f 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1057,12 +1057,24 @@ def atomic_load_max_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_max_32 node:$a, node:$b)>; def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_max_32 node:$a, node:$b)>; +def atomic_load_max_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) + , (atomic_load_max_64 node:$a, node:$b)>; +def atomic_load_max_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_max_64 node:$a, node:$b)>; +def atomic_load_max_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_max_64 node:$a, node:$b)>; def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), (atomic_load_umax_32 node:$a, node:$b)>; def atomic_load_umax_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_umax_32 node:$a, node:$b)>; def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_umax_32 node:$a, node:$b)>; +def atomic_load_umax_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umax_64 node:$a, node:$b)>; +def atomic_load_umax_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umax_64 node:$a, node:$b)>; +def atomic_load_umax_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umax_64 node:$a, node:$b)>; defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".max", atomic_load_max_32_g, i32imm, imm, hasAtomRedG32>; @@ -1072,6 +1084,14 @@ defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64", + ".max", atomic_load_max_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64", + ".max",
atomic_load_max_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".max", + atomic_load_max_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", + ".s64", ".max", atomic_load_max_64_gen, i64imm, imm, useAtomRedG64forGen64>; defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".max", atomic_load_umax_32_g, i32imm, imm, hasAtomRedG32>; defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", @@ -1080,6 +1100,14 @@ defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", + ".max", atomic_load_umax_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", + ".max", atomic_load_umax_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".max", + atomic_load_umax_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", + ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm, useAtomRedG64forGen64>; // atom_min @@ -1089,12 +1117,24 @@ def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_min_32 node:$a, node:$b)>; def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_min_32 node:$a, node:$b)>; +def atomic_load_min_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_min_64 node:$a, node:$b)>; +def atomic_load_min_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_min_64 node:$a, node:$b)>; +def atomic_load_min_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_min_64 node:$a, node:$b)>; def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), (atomic_load_umin_32 node:$a, node:$b)>; def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_umin_32 node:$a, node:$b)>; def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_umin_32 node:$a, node:$b)>; +def atomic_load_umin_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_umin_64 node:$a, node:$b)>; +def atomic_load_umin_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_umin_64 node:$a, node:$b)>; +def atomic_load_umin_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_umin_64 node:$a, node:$b)>; defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".min", atomic_load_min_32_g, i32imm, imm, hasAtomRedG32>; @@ -1104,6 +1144,14 @@ defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64", + ".min", atomic_load_min_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64", + ".min", atomic_load_min_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".min", + atomic_load_min_64_gen, i64imm, imm, 
hasAtomRedGen64>; +defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", + ".s64", ".min", atomic_load_min_64_gen, i64imm, imm, useAtomRedG64forGen64>; defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".min", atomic_load_umin_32_g, i32imm, imm, hasAtomRedG32>; defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", @@ -1112,6 +1160,14 @@ defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", + ".min", atomic_load_umin_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", + ".min", atomic_load_umin_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".min", + atomic_load_umin_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", + ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm, useAtomRedG64forGen64>; // atom_inc atom_dec @@ -1153,6 +1209,12 @@ def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_and_32 node:$a, node:$b)>; def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_and_32 node:$a, node:$b)>; +def atomic_load_and_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_and_64 node:$a, node:$b)>; +def atomic_load_and_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_and_64 node:$a, node:$b)>; +def atomic_load_and_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_and_64 node:$a, node:$b)>; defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and", atomic_load_and_32_g, i32imm, imm, hasAtomRedG32>; @@ -1162,6 +1224,14 @@ defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".and", atomic_load_and_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and", atomic_load_and_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".and", + atomic_load_and_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".and", + atomic_load_and_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".and", + atomic_load_and_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64", + ".and", atomic_load_and_64_gen, i64imm, imm, useAtomRedG64forGen64>; // atom_or @@ -1171,6 +1241,12 @@ def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_or_32 node:$a, node:$b)>; def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_or_32 node:$a, node:$b)>; +def atomic_load_or_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_or_64 node:$a, node:$b)>; +def atomic_load_or_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_or_64 node:$a, node:$b)>; +def atomic_load_or_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_or_64 node:$a, node:$b)>; defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or", atomic_load_or_32_g, i32imm, imm, hasAtomRedG32>; @@ 
-1180,6 +1256,14 @@ defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or", atomic_load_or_32_gen, i32imm, imm, useAtomRedG32forGen32>; defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or", atomic_load_or_32_s, i32imm, imm, hasAtomRedS32>; +defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".or", + atomic_load_or_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".or", + atomic_load_or_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64", + ".or", atomic_load_or_64_gen, i64imm, imm, useAtomRedG64forGen64>; +defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".or", + atomic_load_or_64_s, i64imm, imm, hasAtomRedS64>; // atom_xor @@ -1189,6 +1273,12 @@ def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (atomic_load_xor_32 node:$a, node:$b)>; def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (atomic_load_xor_32 node:$a, node:$b)>; +def atomic_load_xor_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (atomic_load_xor_64 node:$a, node:$b)>; +def atomic_load_xor_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (atomic_load_xor_64 node:$a, node:$b)>; +def atomic_load_xor_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (atomic_load_xor_64 node:$a, node:$b)>; defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor", atomic_load_xor_32_g, i32imm, imm, hasAtomRedG32>; @@ -1198,6 +1288,14 @@ defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor", atomic_load_xor_32_gen, i32imm, imm, hasAtomRedGen32>; defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor", atomic_load_xor_32_gen, i32imm, imm, useAtomRedG32forGen32>; +defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".xor", + atomic_load_xor_64_g, i64imm, imm, hasAtomRedG64>; +defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".xor", + atomic_load_xor_64_s, i64imm, imm, hasAtomRedS64>; +defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".xor", + atomic_load_xor_64_gen, i64imm, imm, hasAtomRedGen64>; +defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64", + ".xor", atomic_load_xor_64_gen, i64imm, imm, useAtomRedG64forGen64>; // atom_cas @@ -1276,67 +1374,33 @@ def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs, // Support for ldu on sm_20 or later //----------------------------------- -def ldu_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldu_global_i node:$ptr), [{ - MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); - return M->getMemoryVT() == MVT::i8; -}]>; - // Scalar -// @TODO: Revisit this, Changed imemAny to imem -multiclass LDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> { +multiclass LDU_G<string TyStr, NVPTXRegClass regclass> { def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + []>, Requires<[hasLDU]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp (Wrapper 
tglobaladdr:$src)))]>, - Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>; + []>, Requires<[hasLDU]>; } -multiclass LDU_G_NOINTRIN<string TyStr, NVPTXRegClass regclass, PatFrag IntOp> { - def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>; - def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - Requires<[hasLDU]>; - def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>; - def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>; -} - -defm INT_PTX_LDU_GLOBAL_i8 : LDU_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs, - ldu_i8>; -defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs, -int_nvvm_ldu_global_i>; -defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs, -int_nvvm_ldu_global_i>; -defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs, -int_nvvm_ldu_global_i>; -defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs, -int_nvvm_ldu_global_f>; -defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs, -int_nvvm_ldu_global_f>; -defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs, -int_nvvm_ldu_global_p>; -defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs, -int_nvvm_ldu_global_p>; +defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>; +defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>; +defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>; +defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>; +defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>; +defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>; +defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>; +defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>; // vector @@ -1406,65 +1470,40 @@ defm INT_PTX_LDU_G_v4f32_ELE // Support for ldg on sm_35 or later //----------------------------------- -def ldg_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldg_global_i node:$ptr), [{ - MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); - return M->getMemoryVT() == MVT::i8; -}]>; - -multiclass LDG_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> { +multiclass LDG_G<string TyStr, NVPTXRegClass regclass> { def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; def areg64: NVPTXInst<(outs regclass:$result), (ins 
Int64Regs:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + []>, Requires<[hasLDG]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>; -} - -multiclass LDG_G_NOINTRIN<string TyStr, NVPTXRegClass regclass, PatFrag IntOp> { - def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>; - def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>; - def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, - Requires<[hasLDG]>; - def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>; - def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), - !strconcat("ld.global.nc.", TyStr), - [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>; + []>, Requires<[hasLDG]>; } defm INT_PTX_LDG_GLOBAL_i8 - : LDG_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs, ldg_i8>; + : LDG_G<"u8 \t$result, [$src];", Int16Regs>; defm INT_PTX_LDG_GLOBAL_i16 - : LDG_G<"u16 \t$result, [$src];", Int16Regs, int_nvvm_ldg_global_i>; + : LDG_G<"u16 \t$result, [$src];", Int16Regs>; defm INT_PTX_LDG_GLOBAL_i32 - : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_i>; + : LDG_G<"u32 \t$result, [$src];", Int32Regs>; defm INT_PTX_LDG_GLOBAL_i64 - : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_i>; + : LDG_G<"u64 \t$result, [$src];", Int64Regs>; defm INT_PTX_LDG_GLOBAL_f32 - : LDG_G<"f32 \t$result, [$src];", Float32Regs, int_nvvm_ldg_global_f>; + : LDG_G<"f32 \t$result, [$src];", Float32Regs>; defm INT_PTX_LDG_GLOBAL_f64 - : LDG_G<"f64 \t$result, [$src];", Float64Regs, int_nvvm_ldg_global_f>; + : LDG_G<"f64 \t$result, [$src];", Float64Regs>; defm INT_PTX_LDG_GLOBAL_p32 - : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_p>; + : LDG_G<"u32 \t$result, [$src];", Int32Regs>; defm INT_PTX_LDG_GLOBAL_p64 - : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_p>; + : LDG_G<"u64 \t$result, [$src];", Int64Regs>; // vector @@ -1689,6 +1728,207 @@ def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a), [(int_nvvm_compiler_error Int64Regs:$a)]>; +// isspacep + +def ISSPACEP_CONST_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.const \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_const Int32Regs:$a))]>, + Requires<[hasPTX31]>; +def ISSPACEP_CONST_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.const \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_const Int64Regs:$a))]>, + Requires<[hasPTX31]>; 
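The remaining isspacep variants (global, local, shared) follow below. As a rough model of what these predicates compute, a generic address belongs to a state space when it falls inside that space's address window; the C++ sketch here is purely illustrative, with window bases and sizes invented for the example (the real layout is fixed by the GPU architecture and driver, not by LLVM):

#include <cstdint>

// Hypothetical generic-address windows, for illustration only.
struct Window { uint64_t base; uint64_t size; };
constexpr Window kSharedWindow = {0x1000000ULL, 48 * 1024}; // assumed values
constexpr Window kLocalWindow  = {0x2000000ULL, 16 * 1024}; // assumed values

// isspacep.shared-style predicate: true when a generic address lies inside
// the shared-memory window. The unsigned subtraction folds the lower- and
// upper-bound checks into a single comparison.
bool isspacep_shared(uint64_t generic) {
  return generic - kSharedWindow.base < kSharedWindow.size;
}

// isspacep.local-style predicate, same idea for the local window.
bool isspacep_local(uint64_t generic) {
  return generic - kLocalWindow.base < kLocalWindow.size;
}

Note that only the isspacep.const forms above carry Requires<[hasPTX31]>; the global, local, and shared forms below are not gated on a PTX version.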
+def ISSPACEP_GLOBAL_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.global \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_global Int32Regs:$a))]>; +def ISSPACEP_GLOBAL_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.global \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_global Int64Regs:$a))]>; +def ISSPACEP_LOCAL_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.local \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_local Int32Regs:$a))]>; +def ISSPACEP_LOCAL_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.local \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_local Int64Regs:$a))]>; +def ISSPACEP_SHARED_32 + : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), + "isspacep.shared \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int32Regs:$a))]>; +def ISSPACEP_SHARED_64 + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "isspacep.shared \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int64Regs:$a))]>; + + +// Special register reads +def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d), + (ins SpecialRegs:$r), + "mov.b32\t$d, $r;", []>; + +def : Pat<(int_nvvm_read_ptx_sreg_envreg0), (MOV_SPECIAL ENVREG0)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg1), (MOV_SPECIAL ENVREG1)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg2), (MOV_SPECIAL ENVREG2)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg3), (MOV_SPECIAL ENVREG3)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg4), (MOV_SPECIAL ENVREG4)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg5), (MOV_SPECIAL ENVREG5)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg6), (MOV_SPECIAL ENVREG6)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg7), (MOV_SPECIAL ENVREG7)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg8), (MOV_SPECIAL ENVREG8)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg9), (MOV_SPECIAL ENVREG9)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg10), (MOV_SPECIAL ENVREG10)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg11), (MOV_SPECIAL ENVREG11)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg12), (MOV_SPECIAL ENVREG12)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg13), (MOV_SPECIAL ENVREG13)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg14), (MOV_SPECIAL ENVREG14)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg15), (MOV_SPECIAL ENVREG15)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg16), (MOV_SPECIAL ENVREG16)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg17), (MOV_SPECIAL ENVREG17)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg18), (MOV_SPECIAL ENVREG18)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg19), (MOV_SPECIAL ENVREG19)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg20), (MOV_SPECIAL ENVREG20)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg21), (MOV_SPECIAL ENVREG21)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg22), (MOV_SPECIAL ENVREG22)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg23), (MOV_SPECIAL ENVREG23)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg24), (MOV_SPECIAL ENVREG24)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg25), (MOV_SPECIAL ENVREG25)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg26), (MOV_SPECIAL ENVREG26)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg27), (MOV_SPECIAL ENVREG27)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg28), (MOV_SPECIAL ENVREG28)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg29), (MOV_SPECIAL ENVREG29)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>; +def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>; + + +// rotate builtin support + +def ROTATE_B32_HW_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, i32imm:$amt), 
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, + (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]> ; + +def ROTATE_B32_HW_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$src, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, + (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]> ; + +def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)), + (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, + Requires<[noHWROT32]> ; + +def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt), + (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]> ; + +def GET_LO_INT64 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %dummy;\n\t", + !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t", + !strconcat("}}", "")))), + []> ; + +def GET_HI_INT64 + : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), + !strconcat("{{\n\t", + !strconcat(".reg .b32 %dummy;\n\t", + !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t", + !strconcat("}}", "")))), + []> ; + +def PACK_TWO_INT32 + : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi), + "mov.b64 \t$dst, {{$lo, $hi}};", []> ; + +def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src), + (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src))> ; + +// funnel shift, requires >= sm_32 +def SHF_L_WRAP_B32_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +def SHF_L_WRAP_B32_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +def SHF_R_WRAP_B32_IMM + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), + "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +def SHF_R_WRAP_B32_REG + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, + Requires<[hasHWROT32]>; + +// HW version of rotate 64 +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)), + (PACK_TWO_INT32 + (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), imm:$amt), + (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), imm:$amt))>, + Requires<[hasHWROT32]>; + +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt), + (PACK_TWO_INT32 + (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt), + (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>, + Requires<[hasHWROT32]>; + + +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)), + (PACK_TWO_INT32 + (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), imm:$amt), + (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), imm:$amt))>, + Requires<[hasHWROT32]>; + +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt), + (PACK_TWO_INT32 + (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src), + (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt), + (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src), + (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>, + Requires<[hasHWROT32]>; + +// SW version of 
rotate 64 +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt), + (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)), + (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>, + Requires<[noHWROT32]>; +def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt), + (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>, + Requires<[noHWROT32]>; + + //----------------------------------- // Texture Intrinsics //----------------------------------- diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h index 0ee018cc7e5d..554764930a9e 100644 --- a/lib/Target/NVPTX/NVPTXMCExpr.h +++ b/lib/Target/NVPTX/NVPTXMCExpr.h @@ -66,7 +66,7 @@ public: const MCAsmLayout *Layout) const override { return false; } - void AddValueSymbols(MCAssembler *) const override {}; + void visitUsedExpr(MCStreamer &Streamer) const override {}; const MCSection *FindAssociatedSection() const override { return nullptr; } diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td index 7a38a66b9227..34822489481c 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -46,6 +46,10 @@ foreach i = 0-4 in { def da#i : NVPTXReg<"%da"#i>; } +foreach i = 0-31 in { + def ENVREG#i : NVPTXReg<"%envreg"#i>; +} + //===----------------------------------------------------------------------===// // Register classes //===----------------------------------------------------------------------===// @@ -61,4 +65,5 @@ def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>; def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>; // Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used. -def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>; +def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot, + (sequence "ENVREG%u", 0, 31))>; diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp index 8c7df52be344..d5cded218362 100644 --- a/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -25,10 +25,41 @@ using namespace llvm; // Pin the vtable to this file. void NVPTXSubtarget::anchor() {} +static std::string computeDataLayout(bool is64Bit) { + std::string Ret = "e"; + + if (!is64Bit) + Ret += "-p:32:32"; + + Ret += "-i64:64-v16:16-v32:32-n16:32:64"; + + return Ret; +} + +NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { + // Provide the default CPU if we don't have one. + if (CPU.empty() && FS.size()) + llvm_unreachable("we are not using FeatureStr"); + TargetName = CPU.empty() ? "sm_20" : CPU; + + ParseSubtargetFeatures(TargetName, FS); + + // Set default to PTX 3.2 (CUDA 5.5) + if (PTXVersion == 0) { + PTXVersion = 32; + } + + return *this; +} + NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit) + const std::string &FS, const TargetMachine &TM, + bool is64Bit) : NVPTXGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), PTXVersion(0), - SmVersion(20) { + SmVersion(20), DL(computeDataLayout(is64Bit)), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), + TLInfo((NVPTXTargetMachine &)TM), TSInfo(&DL), FrameLowering(*this) { Triple T(TT); @@ -36,26 +67,4 @@ NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU, drvInterface = NVPTX::NVCL; else drvInterface = NVPTX::CUDA; - - // Provide the default CPU if none - std::string defCPU = "sm_20"; - - ParseSubtargetFeatures((CPU.empty() ? defCPU : CPU), FS); - - // Get the TargetName from the FS if available - if (FS.empty() && CPU.empty()) - TargetName = defCPU; - else if (!CPU.empty()) - TargetName = CPU; - else - llvm_unreachable("we are not using FeatureStr"); - - // We default to PTX 3.1, but we cannot just default to it in the initializer - // since the attribute parser checks if the given option is >= the default. - // So if we set ptx31 as the default, the ptx30 attribute would never match. - // Instead, we use 0 as the default and manually set 31 if the default is - // used. - if (PTXVersion == 0) { - PTXVersion = 31; - } } diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h index 581e5edbcfb9..3ed5747b55f9 100644 --- a/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/lib/Target/NVPTX/NVPTXSubtarget.h @@ -15,6 +15,12 @@ #define NVPTXSUBTARGET_H #include "NVPTX.h" +#include "NVPTXFrameLowering.h" +#include "NVPTXISelLowering.h" +#include "NVPTXInstrInfo.h" +#include "NVPTXRegisterInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -35,12 +41,30 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31 unsigned int SmVersion; + const DataLayout DL; // Calculates type size & alignment + NVPTXInstrInfo InstrInfo; + NVPTXTargetLowering TLInfo; + TargetSelectionDAGInfo TSInfo; + + // NVPTX does not have any call stack frame, but needs an NVPTX-specific + // FrameLowering class because TargetFrameLowering is abstract. + NVPTXFrameLowering FrameLowering; + public: /// This constructor initializes the data members to match that /// of the specified module.
/// NVPTXSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit); + const std::string &FS, const TargetMachine &TM, bool is64Bit); + + const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; } + const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; } + const DataLayout *getDataLayout() const { return &DL; } + const NVPTXRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + const NVPTXTargetLowering *getTargetLowering() const { return &TLInfo; } + const TargetSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } bool hasBrkPt() const { return SmVersion >= 11; } bool hasAtomRedG32() const { return SmVersion >= 11; } @@ -57,10 +81,12 @@ public: bool hasFMAF32() const { return SmVersion >= 20; } bool hasFMAF64() const { return SmVersion >= 13; } bool hasLDG() const { return SmVersion >= 32; } - bool hasLDU() const { return SmVersion >= 20; } + bool hasLDU() const { return ((SmVersion >= 20) && (SmVersion < 30)); } bool hasGenericLdSt() const { return SmVersion >= 20; } - inline bool hasHWROT32() const { return false; } - inline bool hasSWROT32() const { return true; } + inline bool hasHWROT32() const { return SmVersion >= 32; } + inline bool hasSWROT32() const { + return ((SmVersion >= 20) && (SmVersion < 32)); + } inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); } inline bool hasROT64() const { return SmVersion >= 20; } @@ -76,6 +102,7 @@ public: unsigned getPTXVersion() const { return PTXVersion; } + NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void ParseSubtargetFeatures(StringRef CPU, StringRef FS); }; diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 26a4f840520e..069a1b9966f0 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -66,26 +66,13 @@ extern "C" void LLVMInitializeNVPTXTarget() { *PassRegistry::getPassRegistry()); } -static std::string computeDataLayout(const NVPTXSubtarget &ST) { - std::string Ret = "e"; - - if (!ST.is64Bit()) - Ret += "-p:32:32"; - - Ret += "-i64:64-v16:16-v32:32-n16:32:64"; - - return Ret; -} - -NVPTXTargetMachine::NVPTXTargetMachine( - const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, bool is64bit) +NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64bit) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, is64bit), DL(computeDataLayout(Subtarget)), - InstrInfo(*this), TLInfo(*this), TSInfo(*this), - FrameLowering( - *this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ { + Subtarget(TT, CPU, FS, *this, is64bit) { initAsmInfo(); } @@ -119,6 +106,7 @@ public: bool addInstSelector() override; bool addPreRegAlloc() override; bool addPostRegAlloc() override; + void addMachineSSAOptimization() override; FunctionPass *createTargetRegisterAllocator(bool) override; void addFastRegAlloc(FunctionPass *RegAllocPass) override; @@ -220,3 +208,43 @@ void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { printAndVerify("After StackSlotColoring"); } + +void NVPTXPassConfig::addMachineSSAOptimization() { + // Pre-ra tail duplication. 
+ if (addPass(&EarlyTailDuplicateID)) + printAndVerify("After Pre-RegAlloc TailDuplicate"); + + // Optimize PHIs before DCE: removing dead PHI cycles may make more + // instructions dead. + addPass(&OptimizePHIsID); + + // This pass merges large allocas. StackSlotColoring is a different pass + // which merges spill slots. + addPass(&StackColoringID); + + // If the target requests it, assign local variables to stack slots relative + // to one another and simplify frame index references where possible. + addPass(&LocalStackSlotAllocationID); + + // With optimization, dead code should already be eliminated. However + // there is one known exception: lowered code for arguments that are only + // used by tail calls, where the tail calls reuse the incoming stack + // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). + addPass(&DeadMachineInstructionElimID); + printAndVerify("After codegen DCE pass"); + + // Allow targets to insert passes that improve instruction level parallelism, + // like if-conversion. Such passes will typically need dominator trees and + // loop info, just like LICM and CSE below. + if (addILPOpts()) + printAndVerify("After ILP optimizations"); + + addPass(&MachineLICMID); + addPass(&MachineCSEID); + + addPass(&MachineSinkingID); + printAndVerify("After Machine LICM, CSE and Sinking passes"); + + addPass(&PeepholeOptimizerID); + printAndVerify("After codegen peephole optimization pass"); +} diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h index 2db7c1861761..a7a1c8f4e171 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -14,13 +14,8 @@ #ifndef NVPTX_TARGETMACHINE_H #define NVPTX_TARGETMACHINE_H -#include "ManagedStringPool.h" -#include "NVPTXFrameLowering.h" -#include "NVPTXISelLowering.h" -#include "NVPTXInstrInfo.h" -#include "NVPTXRegisterInfo.h" #include "NVPTXSubtarget.h" -#include "llvm/IR/DataLayout.h" +#include "ManagedStringPool.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSelectionDAGInfo.h" @@ -31,50 +26,37 @@ namespace llvm { /// class NVPTXTargetMachine : public LLVMTargetMachine { NVPTXSubtarget Subtarget; - const DataLayout DL; // Calculates type size & alignment - NVPTXInstrInfo InstrInfo; - NVPTXTargetLowering TLInfo; - TargetSelectionDAGInfo TSInfo; - - // NVPTX does not have any call stack frame, but need a NVPTX specific - // FrameLowering class because TargetFrameLowering is abstract. 
- NVPTXFrameLowering FrameLowering; // Hold Strings that can be free'd all together with NVPTXTargetMachine ManagedStringPool ManagedStrPool; - //bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level, - // bool DisableVerify, MCContext *&OutCtx); - public: NVPTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit); const TargetFrameLowering *getFrameLowering() const override { - return &FrameLowering; + return getSubtargetImpl()->getFrameLowering(); + } + const NVPTXInstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); + } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); } - const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; } - const DataLayout *getDataLayout() const override { return &DL; } const NVPTXSubtarget *getSubtargetImpl() const override { return &Subtarget; } - const NVPTXRegisterInfo *getRegisterInfo() const override { - return &(InstrInfo.getRegisterInfo()); + return getSubtargetImpl()->getRegisterInfo(); } - NVPTXTargetLowering *getTargetLowering() const override { - return const_cast<NVPTXTargetLowering *>(&TLInfo); + const NVPTXTargetLowering *getTargetLowering() const override { + return getSubtargetImpl()->getTargetLowering(); } const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } - //virtual bool addInstSelector(PassManagerBase &PM, - // CodeGenOpt::Level OptLevel); - - //virtual bool addPreRegAlloc(PassManagerBase &, CodeGenOpt::Level); - ManagedStringPool *getManagedStrPool() const { return const_cast<ManagedStringPool *>(&ManagedStrPool); } diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp index cb8bd7260256..a8d6b95ae4ec 100644 --- a/lib/Target/NVPTX/NVVMReflect.cpp +++ b/lib/Target/NVPTX/NVVMReflect.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" @@ -47,17 +48,16 @@ class NVVMReflect : public ModulePass { private: StringMap<int> VarMap; typedef DenseMap<std::string, int>::iterator VarMapIter; - Function *ReflectFunction; public: static char ID; - NVVMReflect() : ModulePass(ID), ReflectFunction(nullptr) { + NVVMReflect() : ModulePass(ID) { initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); VarMap.clear(); } NVVMReflect(const StringMap<int> &Mapping) - : ModulePass(ID), ReflectFunction(nullptr) { + : ModulePass(ID) { initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end(); I != E; ++I) { @@ -70,6 +70,8 @@ public: } bool runOnModule(Module &) override; +private: + bool handleFunction(Function *ReflectFunction); void setVarMap(); }; } @@ -120,19 +122,7 @@ void NVVMReflect::setVarMap() { } } -bool NVVMReflect::runOnModule(Module &M) { - if (!NVVMReflectEnabled) - return false; - - setVarMap(); - - ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION); - - // If reflect function is not used, then there will be - // no entry in the module. 
- if (!ReflectFunction) - return false; - +bool NVVMReflect::handleFunction(Function *ReflectFunction) { // Validate _reflect function assert(ReflectFunction->isDeclaration() && "_reflect function should not have a body"); @@ -155,13 +145,15 @@ bool NVVMReflect::runOnModule(Module &M) { "Only one operand expect for _reflect function"); // In cuda, we will have an extra constant-to-generic conversion of // the string. - const Value *conv = Reflect->getArgOperand(0); - assert(isa<CallInst>(conv) && "Expected a const-to-gen conversion"); - const CallInst *ConvCall = cast<CallInst>(conv); - const Value *str = ConvCall->getArgOperand(0); - assert(isa<ConstantExpr>(str) && + const Value *Str = Reflect->getArgOperand(0); + if (isa<CallInst>(Str)) { + // CUDA path + const CallInst *ConvCall = cast<CallInst>(Str); + Str = ConvCall->getArgOperand(0); + } + assert(isa<ConstantExpr>(Str) && "Format of _reflect function not recognized"); - const ConstantExpr *GEP = cast<ConstantExpr>(str); + const ConstantExpr *GEP = cast<ConstantExpr>(Str); const Value *Sym = GEP->getOperand(0); assert(isa<Constant>(Sym) && "Format of _reflect function not recognized"); @@ -195,3 +187,36 @@ bool NVVMReflect::runOnModule(Module &M) { ToRemove[i]->eraseFromParent(); return true; } + +bool NVVMReflect::runOnModule(Module &M) { + if (!NVVMReflectEnabled) + return false; + + setVarMap(); + + + bool Res = false; + std::string Name; + Type *Tys[1]; + Type *I8Ty = Type::getInt8Ty(M.getContext()); + Function *ReflectFunction; + + // Check for standard overloaded versions of llvm.nvvm.reflect + + for (unsigned i = 0; i != 5; ++i) { + Tys[0] = PointerType::get(I8Ty, i); + Name = Intrinsic::getName(Intrinsic::nvvm_reflect, Tys); + ReflectFunction = M.getFunction(Name); + if(ReflectFunction != 0) { + Res |= handleFunction(ReflectFunction); + } + } + + ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION); + // If reflect function is not used, then there will be + // no entry in the module. + if (ReflectFunction != 0) + Res |= handleFunction(ReflectFunction); + + return Res; +} diff --git a/lib/Target/NVPTX/cl_common_defines.h b/lib/Target/NVPTX/cl_common_defines.h index 45cc0b8b67f2..02c5a94c3d03 100644 --- a/lib/Target/NVPTX/cl_common_defines.h +++ b/lib/Target/NVPTX/cl_common_defines.h @@ -1,5 +1,5 @@ -#ifndef __CL_COMMON_DEFINES_H__ -#define __CL_COMMON_DEFINES_H__ +#ifndef CL_COMMON_DEFINES_H +#define CL_COMMON_DEFINES_H // This file includes defines that are common to both kernel code and // the NVPTX back-end. 
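A note on the guard rename above: identifiers that contain a double underscore, or that begin with an underscore followed by an uppercase letter (such as __CL_COMMON_DEFINES_H__), are reserved for the C and C++ implementation, which is why the guard loses its underscores. A minimal sketch of the resulting pattern (the macro name is simply the file's own choice, nothing mandated by LLVM):

// cl_common_defines.h -- include guard using a non-reserved identifier
#ifndef CL_COMMON_DEFINES_H
#define CL_COMMON_DEFINES_H

// ... definitions shared by kernel code and the NVPTX back-end ...

#endif // CL_COMMON_DEFINES_H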
@@ -119,4 +119,4 @@ typedef enum clk_sampler_type { #define CLK_LOCAL_MEM_FENCE (1 << 0) #define CLK_GLOBAL_MEM_FENCE (1 << 1) -#endif // __CL_COMMON_DEFINES_H__ +#endif // CL_COMMON_DEFINES_H diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 3ac037dc0fc4..2f562ca7891a 100644 --- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -238,7 +238,7 @@ class PPCAsmParser : public MCTargetAsmParser { bool ParseExpression(const MCExpr *&EVal); bool ParseDarwinExpression(const MCExpr *&EVal); - bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + bool ParseOperand(OperandVector &Operands); bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveTC(unsigned Size, SMLoc L); @@ -246,12 +246,11 @@ class PPCAsmParser : public MCTargetAsmParser { bool ParseDarwinDirectiveMachine(SMLoc L); bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, bool MatchingInlineAsm) override; - void ProcessInstruction(MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &Ops); + void ProcessInstruction(MCInst &Inst, const OperandVector &Ops); /// @name Auto-generated Match Functions /// { @@ -276,13 +275,12 @@ public: setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); } - bool ParseInstruction(ParseInstructionInfo &Info, - StringRef Name, SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand*> &Operands) override; + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; - unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; const MCExpr *applyModifierToExpr(const MCExpr *E, @@ -548,8 +546,9 @@ public: void print(raw_ostream &OS) const override; - static PPCOperand *CreateToken(StringRef Str, SMLoc S, bool IsPPC64) { - PPCOperand *Op = new PPCOperand(Token); + static std::unique_ptr<PPCOperand> CreateToken(StringRef Str, SMLoc S, + bool IsPPC64) { + auto Op = make_unique<PPCOperand>(Token); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -558,22 +557,27 @@ public: return Op; } - static PPCOperand *CreateTokenWithStringCopy(StringRef Str, SMLoc S, - bool IsPPC64) { + static std::unique_ptr<PPCOperand> + CreateTokenWithStringCopy(StringRef Str, SMLoc S, bool IsPPC64) { // Allocate extra memory for the string and copy it. + // FIXME: This is incorrect, Operands are owned by unique_ptr with a default + // deleter which will destroy them by simply using "delete", not correctly + // calling operator delete on this extra memory after calling the dtor + // explicitly. 
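+  // (The pointer handed to the default deleter comes from placement new into
+  // storage obtained with ::operator new, not from a plain new-expression,
+  // so "delete" on it is formally undefined; a custom deleter that runs the
+  // destructor and then calls ::operator delete would be correct.)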
void *Mem = ::operator new(sizeof(PPCOperand) + Str.size()); - PPCOperand *Op = new (Mem) PPCOperand(Token); - Op->Tok.Data = (const char *)(Op + 1); + std::unique_ptr<PPCOperand> Op(new (Mem) PPCOperand(Token)); + Op->Tok.Data = (const char *)(Op.get() + 1); Op->Tok.Length = Str.size(); - std::memcpy((char *)(Op + 1), Str.data(), Str.size()); + std::memcpy((void *)Op->Tok.Data, Str.data(), Str.size()); Op->StartLoc = S; Op->EndLoc = S; Op->IsPPC64 = IsPPC64; return Op; } - static PPCOperand *CreateImm(int64_t Val, SMLoc S, SMLoc E, bool IsPPC64) { - PPCOperand *Op = new PPCOperand(Immediate); + static std::unique_ptr<PPCOperand> CreateImm(int64_t Val, SMLoc S, SMLoc E, + bool IsPPC64) { + auto Op = make_unique<PPCOperand>(Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -581,9 +585,9 @@ public: return Op; } - static PPCOperand *CreateExpr(const MCExpr *Val, - SMLoc S, SMLoc E, bool IsPPC64) { - PPCOperand *Op = new PPCOperand(Expression); + static std::unique_ptr<PPCOperand> CreateExpr(const MCExpr *Val, SMLoc S, + SMLoc E, bool IsPPC64) { + auto Op = make_unique<PPCOperand>(Expression); Op->Expr.Val = Val; Op->Expr.CRVal = EvaluateCRExpr(Val); Op->StartLoc = S; @@ -592,9 +596,9 @@ public: return Op; } - static PPCOperand *CreateTLSReg(const MCSymbolRefExpr *Sym, - SMLoc S, SMLoc E, bool IsPPC64) { - PPCOperand *Op = new PPCOperand(TLSRegister); + static std::unique_ptr<PPCOperand> + CreateTLSReg(const MCSymbolRefExpr *Sym, SMLoc S, SMLoc E, bool IsPPC64) { + auto Op = make_unique<PPCOperand>(TLSRegister); Op->TLSReg.Sym = Sym; Op->StartLoc = S; Op->EndLoc = E; @@ -602,8 +606,8 @@ public: return Op; } - static PPCOperand *CreateFromMCExpr(const MCExpr *Val, - SMLoc S, SMLoc E, bool IsPPC64) { + static std::unique_ptr<PPCOperand> + CreateFromMCExpr(const MCExpr *Val, SMLoc S, SMLoc E, bool IsPPC64) { if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Val)) return CreateImm(CE->getValue(), S, E, IsPPC64); @@ -634,10 +638,8 @@ void PPCOperand::print(raw_ostream &OS) const { } } - -void PPCAsmParser:: -ProcessInstruction(MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +void PPCAsmParser::ProcessInstruction(MCInst &Inst, + const OperandVector &Operands) { int Opcode = Inst.getOpcode(); switch (Opcode) { case PPC::LAx: { @@ -917,11 +919,10 @@ ProcessInstruction(MCInst &Inst, } } -bool PPCAsmParser:: -MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) { +bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, unsigned &ErrorInfo, + bool MatchingInlineAsm) { MCInst Inst; switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) { @@ -942,7 +943,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); - ErrorLoc = ((PPCOperand*)Operands[ErrorInfo])->getStartLoc(); + ErrorLoc = ((PPCOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } @@ -1216,12 +1217,10 @@ ParseDarwinExpression(const MCExpr *&EVal) { /// ParseOperand /// This handles registers in the form 'NN', '%rNN' for ELF platforms and /// rNN for MachO. 
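/// For example, general-purpose register 3 may be written "3" or "%r3"
/// under the ELF syntax, and "r3" under the MachO syntax.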
-bool PPCAsmParser:: -ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +bool PPCAsmParser::ParseOperand(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); const MCExpr *EVal; - PPCOperand *Op; // Attempt to parse the next token as an immediate switch (getLexer().getKind()) { @@ -1233,8 +1232,7 @@ ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { int64_t IntVal; if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) { Parser.Lex(); // Eat the identifier token. - Op = PPCOperand::CreateImm(IntVal, S, E, isPPC64()); - Operands.push_back(Op); + Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64())); return false; } return Error(S, "invalid register name"); @@ -1249,8 +1247,7 @@ ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { int64_t IntVal; if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) { Parser.Lex(); // Eat the identifier token. - Op = PPCOperand::CreateImm(IntVal, S, E, isPPC64()); - Operands.push_back(Op); + Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64())); return false; } } @@ -1272,8 +1269,7 @@ ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { } // Push the parsed operand into the list of operands - Op = PPCOperand::CreateFromMCExpr(EVal, S, E, isPPC64()); - Operands.push_back(Op); + Operands.push_back(PPCOperand::CreateFromMCExpr(EVal, S, E, isPPC64())); // Check whether this is a TLS call expression bool TLSCall = false; @@ -1292,8 +1288,7 @@ ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { E = Parser.getTok().getLoc(); Parser.Lex(); // Eat the ')'. - Op = PPCOperand::CreateFromMCExpr(TLSSym, S, E, isPPC64()); - Operands.push_back(Op); + Operands.push_back(PPCOperand::CreateFromMCExpr(TLSSym, S, E, isPPC64())); } // Otherwise, check for D-form memory operands @@ -1340,17 +1335,15 @@ ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { E = Parser.getTok().getLoc(); Parser.Lex(); // Eat the ')'. - Op = PPCOperand::CreateImm(IntVal, S, E, isPPC64()); - Operands.push_back(Op); + Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64())); } return false; } /// Parse an instruction mnemonic followed by its operands. -bool PPCAsmParser:: -ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { // The first operand is the token for the instruction name. // If the next character is a '+' or '-', we need to add it to the // instruction name, to match what TableGen is doing. @@ -1554,7 +1547,7 @@ extern "C" void LLVMInitializePowerPCAsmParser() { // Define this matcher function after the auto-generated include so we // have the match class enum definitions. -unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, +unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, unsigned Kind) { // If the kind is a token for a literal immediate, check if our asm // operand matches. 
This is for InstAliases which have a fixed-value @@ -1568,8 +1561,8 @@ unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, default: return Match_InvalidOperand; } - PPCOperand *Op = static_cast<PPCOperand*>(AsmOp); - if (Op->isImm() && Op->getImm() == ImmVal) + PPCOperand &Op = static_cast<PPCOperand &>(AsmOp); + if (Op.isImm() && Op.getImm() == ImmVal) return Match_Success; return Match_InvalidOperand; diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index a4983ad67235..435a93f78c1d 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -102,17 +102,45 @@ public: // Output the constant in big/little endian byte order. unsigned Size = Desc.getSize(); - if (IsLittleEndian) { - for (unsigned i = 0; i != Size; ++i) { - OS << (char)Bits; - Bits >>= 8; + switch (Size) { + case 4: + if (IsLittleEndian) { + OS << (char)(Bits); + OS << (char)(Bits >> 8); + OS << (char)(Bits >> 16); + OS << (char)(Bits >> 24); + } else { + OS << (char)(Bits >> 24); + OS << (char)(Bits >> 16); + OS << (char)(Bits >> 8); + OS << (char)(Bits); } - } else { - int ShiftValue = (Size * 8) - 8; - for (unsigned i = 0; i != Size; ++i) { - OS << (char)(Bits >> ShiftValue); - Bits <<= 8; + break; + case 8: + // If we emit a pair of instructions, the first one is + // always in the top 32 bits, even on little-endian. + if (IsLittleEndian) { + OS << (char)(Bits >> 32); + OS << (char)(Bits >> 40); + OS << (char)(Bits >> 48); + OS << (char)(Bits >> 56); + OS << (char)(Bits); + OS << (char)(Bits >> 8); + OS << (char)(Bits >> 16); + OS << (char)(Bits >> 24); + } else { + OS << (char)(Bits >> 56); + OS << (char)(Bits >> 48); + OS << (char)(Bits >> 40); + OS << (char)(Bits >> 32); + OS << (char)(Bits >> 24); + OS << (char)(Bits >> 16); + OS << (char)(Bits >> 8); + OS << (char)(Bits); } + break; + default: + llvm_unreachable ("Invalid instruction size"); } ++MCNumEmitted; // Keep track of the # of mi's emitted. diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp index 10d068dc49e6..3ac0aca6b78c 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp @@ -11,6 +11,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectStreamer.h" using namespace llvm; @@ -127,33 +128,6 @@ PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res, return true; } -// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps -// that method should be made public? 
-static void AddValueSymbols_(const MCExpr *Value, MCAssembler *Asm) { - switch (Value->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expr!"); - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value); - AddValueSymbols_(BE->getLHS(), Asm); - AddValueSymbols_(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: - Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbols_(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm); - break; - } -} - -void PPCMCExpr::AddValueSymbols(MCAssembler *Asm) const { - AddValueSymbols_(getSubExpr(), Asm); +void PPCMCExpr::visitUsedExpr(MCStreamer &Streamer) const { + Streamer.visitUsedExpr(*getSubExpr()); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h index 3421b9157711..bca408507e72 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h @@ -79,7 +79,7 @@ public: void PrintImpl(raw_ostream &OS) const override; bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout) const override; - void AddValueSymbols(MCAssembler *) const override; + void visitUsedExpr(MCStreamer &Streamer) const override; const MCSection *FindAssociatedSection() const override { return getSubExpr()->FindAssociatedSection(); } diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td index bd58539c6fc2..a9842b287cbb 100644 --- a/lib/Target/PowerPC/PPC.td +++ b/lib/Target/PowerPC/PPC.td @@ -46,6 +46,7 @@ def DirectivePwr5x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5X", "" def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">; def DirectivePwr6x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6X", "">; def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">; +def DirectivePwr8: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR8", "">; def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", "Enable 64-bit instructions">; @@ -285,6 +286,15 @@ def : ProcessorModel<"pwr7", P7Model, FeaturePOPCNTD, FeatureLDBRX, Feature64Bit /*, Feature64BitRegs */, DeprecatedMFTB, DeprecatedDST]>; +def : ProcessorModel<"pwr8", P7Model /* FIXME: Update to P8Model when available */, + [DirectivePwr8, FeatureAltivec, + FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, + FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, + FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, + FeatureFPRND, FeatureFPCVT, FeatureISEL, + FeaturePOPCNTD, FeatureLDBRX, + Feature64Bit /*, Feature64BitRegs */, + DeprecatedMFTB, DeprecatedDST]>; def : Processor<"ppc", G3Itineraries, [Directive32]>; def : ProcessorModel<"ppc64", G5Model, [Directive64, FeatureAltivec, diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index e89fb2d58a1c..fd044d951fcc 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -365,8 +365,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { // Transform %Xd = ADDIStocHA %X2, <ga:@sym> LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); - // Change the opcode to ADDIS8. If the global address is external, - // has common linkage, is a function address, or is a jump table + // Change the opcode to ADDIS8. 
If the global address is external, has + // common linkage, is a non-local function address, or is a jump table // address, then generate a TOC entry and reference that. Otherwise // reference the symbol directly. TmpInst.setOpcode(PPC::ADDIS8); @@ -375,7 +375,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { "Invalid operand for ADDIStocHA!"); MCSymbol *MOSymbol = nullptr; bool IsExternal = false; - bool IsFunction = false; + bool IsNonLocalFunction = false; bool IsCommon = false; bool IsAvailExt = false; @@ -384,15 +384,16 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MOSymbol = getSymbol(GV); IsExternal = GV->isDeclaration(); IsCommon = GV->hasCommonLinkage(); - IsFunction = GV->getType()->getElementType()->isFunctionTy(); + IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && + (GV->isDeclaration() || GV->isWeakForLinker()); IsAvailExt = GV->hasAvailableExternallyLinkage(); } else if (MO.isCPI()) MOSymbol = GetCPISymbol(MO.getIndex()); else if (MO.isJTI()) MOSymbol = GetJTISymbol(MO.getIndex()); - if (IsExternal || IsFunction || IsCommon || IsAvailExt || MO.isJTI() || - TM.getCodeModel() == CodeModel::Large) + if (IsExternal || IsNonLocalFunction || IsCommon || IsAvailExt || + MO.isJTI() || TM.getCodeModel() == CodeModel::Large) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); const MCExpr *Exp = @@ -425,7 +426,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { else if (MO.isGlobal()) { const GlobalValue *GValue = MO.getGlobal(); MOSymbol = getSymbol(GValue); - if (GValue->isDeclaration() || GValue->hasCommonLinkage() || + if (GValue->getType()->getElementType()->isFunctionTy() || + GValue->isDeclaration() || GValue->hasCommonLinkage() || GValue->hasAvailableExternallyLinkage() || TM.getCodeModel() == CodeModel::Large) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); @@ -450,17 +452,19 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL"); MCSymbol *MOSymbol = nullptr; bool IsExternal = false; - bool IsFunction = false; + bool IsNonLocalFunction = false; if (MO.isGlobal()) { const GlobalValue *GV = MO.getGlobal(); MOSymbol = getSymbol(GV); IsExternal = GV->isDeclaration(); - IsFunction = GV->getType()->getElementType()->isFunctionTy(); + IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && + (GV->isDeclaration() || GV->isWeakForLinker()); } else if (MO.isCPI()) MOSymbol = GetCPISymbol(MO.getIndex()); - if (IsFunction || IsExternal || TM.getCodeModel() == CodeModel::Large) + if (IsNonLocalFunction || IsExternal || + TM.getCodeModel() == CodeModel::Large) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); const MCExpr *Exp = diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index ed3cb4d3293d..92a0ec1a9b33 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -1030,6 +1030,10 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) { if (DstVT != MVT::i32 && DstVT != MVT::i64) return false; + // If we don't have FCTIDUZ and we need it, punt to SelectionDAG. 
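+  // (fctiduz, the unsigned doubleword convert with round toward zero, was
+  // introduced in Power ISA 2.06 and is gated by the FPCVT feature, so
+  // pre-POWER7 cores must let SelectionDAG expand the conversion instead.)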
+  if (DstVT == MVT::i64 && !IsSigned && !PPCSubTarget->hasFPCVT())
+    return false;
+
   Value *Src = I->getOperand(0);
   Type *SrcTy = Src->getType();
   if (!isTypeLegal(SrcTy, SrcVT))
@@ -1197,6 +1201,11 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
                                   bool IsVarArg) {
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, ArgLocs, *Context);
+
+  // Reserve space for the linkage area on the stack.
+  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false);
+  CCInfo.AllocateStack(LinkageSize, 8);
+
   CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);

   // Bail out if we can't handle any of the arguments.
@@ -1218,6 +1227,13 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
   // Get a count of how many bytes are to be pushed onto the stack.
   NumBytes = CCInfo.getNextStackOffset();

+  // The prolog code of the callee may store up to 8 GPR argument registers to
+  // the stack, allowing va_start to index over them in memory if it is
+  // varargs. Because we cannot tell if this is needed on the caller side, we
+  // have to conservatively assume that it is needed. As such, make sure we
+  // have at least enough stack space for the caller to store the 8 GPRs.
+  NumBytes = std::max(NumBytes, LinkageSize + 64);
+
   // Issue CALLSEQ_START.
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
           TII.get(TII.getCallFrameSetupOpcode()))
@@ -1858,16 +1874,9 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
   // FIXME: Jump tables are not yet required because fast-isel doesn't
   // handle switches; if that changes, we need them as well. For now,
   // what follows assumes everything's a generic (or TLS) global address.
-  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
-  if (!GVar) {
-    // If GV is an alias, use the aliasee for determining thread-locality.
-    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
-      GVar = dyn_cast_or_null<GlobalVariable>(GA->getAliasee());
-  }

   // FIXME: We don't yet handle the complexity of TLS.
-  bool IsTLS = GVar && GVar->isThreadLocal();
-  if (IsTLS)
+  if (GV->isThreadLocal())
     return 0;

   // For small code model, generate a simple TOC load.
@@ -1877,8 +1886,8 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
       .addGlobalAddress(GV)
       .addReg(PPC::X2);
   else {
-    // If the address is an externally defined symbol, a symbol with
-    // common or externally available linkage, a function address, or a
+    // If the address is an externally defined symbol, a symbol with common
+    // or externally available linkage, a non-local function address, or a
     // jump table address (not yet needed), or if we are generating code
     // for large code model, we generate:
     //   LDtocL(GV, ADDIStocHA(%X2, GV))
@@ -1889,12 +1898,13 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
             HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);

-    // !GVar implies a function address. An external variable is one
-    // without an initializer.
     // If/when switches are implemented, jump tables should be handled
     // on the "if" path here.
- if (CModel == CodeModel::Large || !GVar || !GVar->hasInitializer() || - GVar->hasCommonLinkage() || GVar->hasAvailableExternallyLinkage()) + if (CModel == CodeModel::Large || + (GV->getType()->getElementType()->isFunctionTy() && + (GV->isDeclaration() || GV->isWeakForLinker())) || + GV->isDeclaration() || GV->hasCommonLinkage() || + GV->hasAvailableExternallyLinkage()) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL), DestReg).addGlobalAddress(GV).addReg(HighPartReg); else diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index e2941568f1e2..65e9cf2b43a6 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -15,6 +15,7 @@ #include "PPCInstrBuilder.h" #include "PPCInstrInfo.h" #include "PPCMachineFunctionInfo.h" +#include "PPCSubtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -35,6 +36,167 @@ static const uint16_t VRRegNo[] = { PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31 }; +PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI) + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, + (STI.hasQPX() || STI.isBGQ()) ? 32 : 16, 0), + Subtarget(STI) {} + +// With the SVR4 ABI, callee-saved registers have fixed offsets on the stack. +const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots( + unsigned &NumEntries) const { + if (Subtarget.isDarwinABI()) { + NumEntries = 1; + if (Subtarget.isPPC64()) { + static const SpillSlot darwin64Offsets = {PPC::X31, -8}; + return &darwin64Offsets; + } else { + static const SpillSlot darwinOffsets = {PPC::R31, -4}; + return &darwinOffsets; + } + } + + // Early exit if not using the SVR4 ABI. + if (!Subtarget.isSVR4ABI()) { + NumEntries = 0; + return nullptr; + } + + // Note that the offsets here overlap, but this is fixed up in + // processFunctionBeforeFrameFinalized. + + static const SpillSlot Offsets[] = { + // Floating-point register save area offsets. + {PPC::F31, -8}, + {PPC::F30, -16}, + {PPC::F29, -24}, + {PPC::F28, -32}, + {PPC::F27, -40}, + {PPC::F26, -48}, + {PPC::F25, -56}, + {PPC::F24, -64}, + {PPC::F23, -72}, + {PPC::F22, -80}, + {PPC::F21, -88}, + {PPC::F20, -96}, + {PPC::F19, -104}, + {PPC::F18, -112}, + {PPC::F17, -120}, + {PPC::F16, -128}, + {PPC::F15, -136}, + {PPC::F14, -144}, + + // General register save area offsets. + {PPC::R31, -4}, + {PPC::R30, -8}, + {PPC::R29, -12}, + {PPC::R28, -16}, + {PPC::R27, -20}, + {PPC::R26, -24}, + {PPC::R25, -28}, + {PPC::R24, -32}, + {PPC::R23, -36}, + {PPC::R22, -40}, + {PPC::R21, -44}, + {PPC::R20, -48}, + {PPC::R19, -52}, + {PPC::R18, -56}, + {PPC::R17, -60}, + {PPC::R16, -64}, + {PPC::R15, -68}, + {PPC::R14, -72}, + + // CR save area offset. We map each of the nonvolatile CR fields + // to the slot for CR2, which is the first of the nonvolatile CR + // fields to be assigned, so that we only allocate one save slot. + // See PPCRegisterInfo::hasReservedSpillSlot() for more information. + {PPC::CR2, -4}, + + // VRSAVE save area offset. + {PPC::VRSAVE, -4}, + + // Vector register save area + {PPC::V31, -16}, + {PPC::V30, -32}, + {PPC::V29, -48}, + {PPC::V28, -64}, + {PPC::V27, -80}, + {PPC::V26, -96}, + {PPC::V25, -112}, + {PPC::V24, -128}, + {PPC::V23, -144}, + {PPC::V22, -160}, + {PPC::V21, -176}, + {PPC::V20, -192}}; + + static const SpillSlot Offsets64[] = { + // Floating-point register save area offsets. 
+ {PPC::F31, -8}, + {PPC::F30, -16}, + {PPC::F29, -24}, + {PPC::F28, -32}, + {PPC::F27, -40}, + {PPC::F26, -48}, + {PPC::F25, -56}, + {PPC::F24, -64}, + {PPC::F23, -72}, + {PPC::F22, -80}, + {PPC::F21, -88}, + {PPC::F20, -96}, + {PPC::F19, -104}, + {PPC::F18, -112}, + {PPC::F17, -120}, + {PPC::F16, -128}, + {PPC::F15, -136}, + {PPC::F14, -144}, + + // General register save area offsets. + {PPC::X31, -8}, + {PPC::X30, -16}, + {PPC::X29, -24}, + {PPC::X28, -32}, + {PPC::X27, -40}, + {PPC::X26, -48}, + {PPC::X25, -56}, + {PPC::X24, -64}, + {PPC::X23, -72}, + {PPC::X22, -80}, + {PPC::X21, -88}, + {PPC::X20, -96}, + {PPC::X19, -104}, + {PPC::X18, -112}, + {PPC::X17, -120}, + {PPC::X16, -128}, + {PPC::X15, -136}, + {PPC::X14, -144}, + + // VRSAVE save area offset. + {PPC::VRSAVE, -4}, + + // Vector register save area + {PPC::V31, -16}, + {PPC::V30, -32}, + {PPC::V29, -48}, + {PPC::V28, -64}, + {PPC::V27, -80}, + {PPC::V26, -96}, + {PPC::V25, -112}, + {PPC::V24, -128}, + {PPC::V23, -144}, + {PPC::V22, -160}, + {PPC::V21, -176}, + {PPC::V20, -192}}; + + if (Subtarget.isPPC64()) { + NumEntries = array_lengthof(Offsets64); + + return Offsets64; + } else { + NumEntries = array_lengthof(Offsets); + + return Offsets; + } +} + /// RemoveVRSaveCode - We have found that this function does not need any code /// to manipulate the VRSAVE register, even though it uses vector registers. /// This can happen when the only registers used are known to be live in or out @@ -236,9 +398,9 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF, // Get the maximum call frame size of all the calls. unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); - // Maximum call frame needs to be at least big enough for linkage and 8 args. - unsigned minCallFrameSize = getMinCallFrameSize(Subtarget.isPPC64(), - Subtarget.isDarwinABI()); + // Maximum call frame needs to be at least big enough for linkage area. + unsigned minCallFrameSize = getLinkageSize(Subtarget.isPPC64(), + Subtarget.isDarwinABI()); maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize); // If we have dynamic alloca then maxCallFrameSize needs to be aligned so diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index 94e9b67338b4..7a226f74e080 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -14,23 +14,18 @@ #define POWERPC_FRAMEINFO_H #include "PPC.h" -#include "PPCSubtarget.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { - class PPCSubtarget; +class PPCSubtarget; class PPCFrameLowering: public TargetFrameLowering { const PPCSubtarget &Subtarget; public: - PPCFrameLowering(const PPCSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, - (sti.hasQPX() || sti.isBGQ()) ? 32 : 16, 0), - Subtarget(sti) { - } + PPCFrameLowering(const PPCSubtarget &STI); unsigned determineFrameLayout(MachineFunction &MF, bool UpdateMF = true, @@ -79,6 +74,12 @@ public: return isPPC64 ? 16 : 4; } + /// getTOCSaveOffset - Return the previous frame offset to save the + /// TOC register -- 64-bit SVR4 ABI only. + static unsigned getTOCSaveOffset(void) { + return 40; + } + /// getFramePointerSaveOffset - Return the previous frame offset to save the /// frame pointer. 
static unsigned getFramePointerSaveOffset(bool isPPC64, bool isDarwinABI) { @@ -114,190 +115,9 @@ public: return 8; } - /// getMinCallArgumentsSize - Return the size of the minium PowerPC ABI - /// argument area. - static unsigned getMinCallArgumentsSize(bool isPPC64, bool isDarwinABI) { - // For the Darwin ABI / 64-bit SVR4 ABI: - // The prolog code of the callee may store up to 8 GPR argument registers to - // the stack, allowing va_start to index over them in memory if its varargs. - // Because we cannot tell if this is needed on the caller side, we have to - // conservatively assume that it is needed. As such, make sure we have at - // least enough stack space for the caller to store the 8 GPRs. - if (isDarwinABI || isPPC64) - return 8 * (isPPC64 ? 8 : 4); - - // 32-bit SVR4 ABI: - // There is no default stack allocated for the 8 first GPR arguments. - return 0; - } - - /// getMinCallFrameSize - Return the minimum size a call frame can be using - /// the PowerPC ABI. - static unsigned getMinCallFrameSize(bool isPPC64, bool isDarwinABI) { - // The call frame needs to be at least big enough for linkage and 8 args. - return getLinkageSize(isPPC64, isDarwinABI) + - getMinCallArgumentsSize(isPPC64, isDarwinABI); - } - - // With the SVR4 ABI, callee-saved registers have fixed offsets on the stack. const SpillSlot * - getCalleeSavedSpillSlots(unsigned &NumEntries) const override { - if (Subtarget.isDarwinABI()) { - NumEntries = 1; - if (Subtarget.isPPC64()) { - static const SpillSlot darwin64Offsets = {PPC::X31, -8}; - return &darwin64Offsets; - } else { - static const SpillSlot darwinOffsets = {PPC::R31, -4}; - return &darwinOffsets; - } - } - - // Early exit if not using the SVR4 ABI. - if (!Subtarget.isSVR4ABI()) { - NumEntries = 0; - return nullptr; - } - - // Note that the offsets here overlap, but this is fixed up in - // processFunctionBeforeFrameFinalized. - - static const SpillSlot Offsets[] = { - // Floating-point register save area offsets. - {PPC::F31, -8}, - {PPC::F30, -16}, - {PPC::F29, -24}, - {PPC::F28, -32}, - {PPC::F27, -40}, - {PPC::F26, -48}, - {PPC::F25, -56}, - {PPC::F24, -64}, - {PPC::F23, -72}, - {PPC::F22, -80}, - {PPC::F21, -88}, - {PPC::F20, -96}, - {PPC::F19, -104}, - {PPC::F18, -112}, - {PPC::F17, -120}, - {PPC::F16, -128}, - {PPC::F15, -136}, - {PPC::F14, -144}, - - // General register save area offsets. - {PPC::R31, -4}, - {PPC::R30, -8}, - {PPC::R29, -12}, - {PPC::R28, -16}, - {PPC::R27, -20}, - {PPC::R26, -24}, - {PPC::R25, -28}, - {PPC::R24, -32}, - {PPC::R23, -36}, - {PPC::R22, -40}, - {PPC::R21, -44}, - {PPC::R20, -48}, - {PPC::R19, -52}, - {PPC::R18, -56}, - {PPC::R17, -60}, - {PPC::R16, -64}, - {PPC::R15, -68}, - {PPC::R14, -72}, - - // CR save area offset. We map each of the nonvolatile CR fields - // to the slot for CR2, which is the first of the nonvolatile CR - // fields to be assigned, so that we only allocate one save slot. - // See PPCRegisterInfo::hasReservedSpillSlot() for more information. - {PPC::CR2, -4}, - - // VRSAVE save area offset. - {PPC::VRSAVE, -4}, - - // Vector register save area - {PPC::V31, -16}, - {PPC::V30, -32}, - {PPC::V29, -48}, - {PPC::V28, -64}, - {PPC::V27, -80}, - {PPC::V26, -96}, - {PPC::V25, -112}, - {PPC::V24, -128}, - {PPC::V23, -144}, - {PPC::V22, -160}, - {PPC::V21, -176}, - {PPC::V20, -192} - }; - - static const SpillSlot Offsets64[] = { - // Floating-point register save area offsets. 
- {PPC::F31, -8}, - {PPC::F30, -16}, - {PPC::F29, -24}, - {PPC::F28, -32}, - {PPC::F27, -40}, - {PPC::F26, -48}, - {PPC::F25, -56}, - {PPC::F24, -64}, - {PPC::F23, -72}, - {PPC::F22, -80}, - {PPC::F21, -88}, - {PPC::F20, -96}, - {PPC::F19, -104}, - {PPC::F18, -112}, - {PPC::F17, -120}, - {PPC::F16, -128}, - {PPC::F15, -136}, - {PPC::F14, -144}, - - // General register save area offsets. - {PPC::X31, -8}, - {PPC::X30, -16}, - {PPC::X29, -24}, - {PPC::X28, -32}, - {PPC::X27, -40}, - {PPC::X26, -48}, - {PPC::X25, -56}, - {PPC::X24, -64}, - {PPC::X23, -72}, - {PPC::X22, -80}, - {PPC::X21, -88}, - {PPC::X20, -96}, - {PPC::X19, -104}, - {PPC::X18, -112}, - {PPC::X17, -120}, - {PPC::X16, -128}, - {PPC::X15, -136}, - {PPC::X14, -144}, - - // VRSAVE save area offset. - {PPC::VRSAVE, -4}, - - // Vector register save area - {PPC::V31, -16}, - {PPC::V30, -32}, - {PPC::V29, -48}, - {PPC::V28, -64}, - {PPC::V27, -80}, - {PPC::V26, -96}, - {PPC::V25, -112}, - {PPC::V24, -128}, - {PPC::V23, -144}, - {PPC::V22, -160}, - {PPC::V21, -176}, - {PPC::V20, -192} - }; - - if (Subtarget.isPPC64()) { - NumEntries = array_lengthof(Offsets64); - - return Offsets64; - } else { - NumEntries = array_lengthof(Offsets); - - return Offsets; - } - } + getCalleeSavedSpillSlots(unsigned &NumEntries) const override; }; - } // End llvm namespace #endif diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp index 7ca706b81c3e..d9b242cad265 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp +++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -162,7 +162,8 @@ unsigned PPCDispatchGroupSBHazardRecognizer::PreEmitNoops(SUnit *SU) { unsigned Directive = DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective(); // If we're using a special group-terminating nop, then we need only one. - if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7) + if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 || + Directive == PPC::DIR_PWR8 ) return 1; return 5 - CurSlots; @@ -223,7 +224,7 @@ void PPCDispatchGroupSBHazardRecognizer::EmitNoop() { // If the group has now filled all of its slots, or if we're using a special // group-terminating nop, the group is complete. if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 || - CurSlots == 6) { + Directive == PPC::DIR_PWR8 || CurSlots == 6) { CurGroup.clear(); CurSlots = CurBranches = 0; } else { @@ -258,8 +259,8 @@ void PPCDispatchGroupSBHazardRecognizer::EmitNoop() { // 3. Handling of the esoteric cases in "Resource-based Instruction Grouping". // -PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetMachine &TM) - : TM(TM) { +PPCHazardRecognizer970::PPCHazardRecognizer970(const ScheduleDAG &DAG) + : DAG(DAG) { EndDispatchGroup(); } @@ -278,7 +279,7 @@ PPCHazardRecognizer970::GetInstrType(unsigned Opcode, bool &isFirst, bool &isSingle, bool &isCracked, bool &isLoad, bool &isStore) { - const MCInstrDesc &MCID = TM.getInstrInfo()->get(Opcode); + const MCInstrDesc &MCID = DAG.TII->get(Opcode); isLoad = MCID.mayLoad(); isStore = MCID.mayStore(); diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h index cf4332cffa8c..23f76c16d138 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.h +++ b/lib/Target/PowerPC/PPCHazardRecognizers.h @@ -54,7 +54,7 @@ public: /// setting the CTR register then branching through it within a dispatch group), /// or storing then loading from the same address within a dispatch group. 
class PPCHazardRecognizer970 : public ScheduleHazardRecognizer { - const TargetMachine &TM; + const ScheduleDAG &DAG; unsigned NumIssued; // Number of insts issued, including advanced cycles. @@ -75,7 +75,7 @@ class PPCHazardRecognizer970 : public ScheduleHazardRecognizer { unsigned NumStores; public: - PPCHazardRecognizer970(const TargetMachine &TM); + PPCHazardRecognizer970(const ScheduleDAG &DAG); virtual HazardType getHazardType(SUnit *SU, int Stalls) override; virtual void EmitInstruction(SUnit *SU) override; virtual void AdvanceCycle() override; diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 251e8b6246f6..4881b3fbb7ac 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -1454,10 +1454,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { if (CModel != CodeModel::Medium && CModel != CodeModel::Large) break; - // The first source operand is a TargetGlobalAddress or a - // TargetJumpTable. If it is an externally defined symbol, a symbol - // with common linkage, a function address, or a jump table address, - // or if we are generating code for large code model, we generate: + // The first source operand is a TargetGlobalAddress or a TargetJumpTable. + // If it is an externally defined symbol, a symbol with common linkage, + // a non-local function address, or a jump table address, or if we are + // generating code for large code model, we generate: // LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>)) // Otherwise we generate: // ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>) @@ -1472,8 +1472,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) { const GlobalValue *GValue = G->getGlobal(); - if (GValue->isDeclaration() || GValue->hasCommonLinkage() || - GValue->hasAvailableExternallyLinkage()) + if ((GValue->getType()->getElementType()->isFunctionTy() && + (GValue->isDeclaration() || GValue->isWeakForLinker())) || + GValue->isDeclaration() || GValue->hasCommonLinkage() || + GValue->hasAvailableExternallyLinkage()) return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA, SDValue(Tmp, 0)); } diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index cf4c9e61a58d..bc057bfd3b62 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -19,6 +19,7 @@ #include "PPCTargetObjectFile.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -50,20 +51,18 @@ cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden); // FIXME: Remove this once the bug has been fixed! extern cl::opt<bool> ANDIGlueBug; -static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) { - if (TM.getSubtargetImpl()->isDarwin()) +static TargetLoweringObjectFile *createTLOF(const Triple &TT) { + // If it isn't a Mach-O file then it's going to be a linux ELF + // object file. 
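+  // (PPC64LinuxTargetObjectFile derives from the generic ELF lowering
+  // object and only layers PPC64-specific handling on top, so returning
+  // it for every non-Darwin triple also covers the 32-bit ELF case.)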
+ if (TT.isOSDarwin()) return new TargetLoweringObjectFileMachO(); - if (TM.getSubtargetImpl()->isSVR4ABI()) - return new PPC64LinuxTargetObjectFile(); - - return new TargetLoweringObjectFileELF(); + return new PPC64LinuxTargetObjectFile(); } PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) - : TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) { - const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>(); - + : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))), + Subtarget(*TM.getSubtargetImpl()) { setPow2DivIsCheap(); // Use _setjmp/_longjmp instead of setjmp/longjmp. @@ -72,7 +71,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all // arguments are at least 4/8 bytes aligned. - bool isPPC64 = Subtarget->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); setMinStackArgumentAlignment(isPPC64 ? 8:4); // Set up the register classes. @@ -98,10 +97,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal); setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal); - if (Subtarget->useCRBits()) { + if (Subtarget.useCRBits()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - if (isPPC64 || Subtarget->hasFPCVT()) { + if (isPPC64 || Subtarget.hasFPCVT()) { setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); AddPromotedToType (ISD::SINT_TO_FP, MVT::i1, isPPC64 ? MVT::i64 : MVT::i32); @@ -176,17 +175,17 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); // If we're enabling GP optimizations, use hardware square root - if (!Subtarget->hasFSQRT() && + if (!Subtarget.hasFSQRT() && !(TM.Options.UnsafeFPMath && - Subtarget->hasFRSQRTE() && Subtarget->hasFRE())) + Subtarget.hasFRSQRTE() && Subtarget.hasFRE())) setOperationAction(ISD::FSQRT, MVT::f64, Expand); - if (!Subtarget->hasFSQRT() && + if (!Subtarget.hasFSQRT() && !(TM.Options.UnsafeFPMath && - Subtarget->hasFRSQRTES() && Subtarget->hasFRES())) + Subtarget.hasFRSQRTES() && Subtarget.hasFRES())) setOperationAction(ISD::FSQRT, MVT::f32, Expand); - if (Subtarget->hasFCPSGN()) { + if (Subtarget.hasFCPSGN()) { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal); } else { @@ -194,7 +193,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); } - if (Subtarget->hasFPRND()) { + if (Subtarget.hasFPRND()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FTRUNC, MVT::f64, Legal); @@ -216,7 +215,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); - if (Subtarget->hasPOPCNTD()) { + if (Subtarget.hasPOPCNTD()) { setOperationAction(ISD::CTPOP, MVT::i32 , Legal); setOperationAction(ISD::CTPOP, MVT::i64 , Legal); } else { @@ -228,7 +227,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::ROTR, MVT::i32 , Expand); setOperationAction(ISD::ROTR, MVT::i64 , Expand); - if (!Subtarget->useCRBits()) { + if (!Subtarget.useCRBits()) { // PowerPC does not have Select setOperationAction(ISD::SELECT, MVT::i32, Expand); setOperationAction(ISD::SELECT, MVT::i64, Expand); @@ -241,11 +240,11 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) 
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); // PowerPC wants to optimize integer setcc a bit - if (!Subtarget->useCRBits()) + if (!Subtarget.useCRBits()) setOperationAction(ISD::SETCC, MVT::i32, Custom); // PowerPC does not have BRCOND which requires SetCC - if (!Subtarget->useCRBits()) + if (!Subtarget.useCRBits()) setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_JT, MVT::Other, Expand); @@ -297,7 +296,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); - if (Subtarget->isSVR4ABI()) { + if (Subtarget.isSVR4ABI()) { if (isPPC64) { // VAARG always uses double-word chunks, so promote anything smaller. setOperationAction(ISD::VAARG, MVT::i1, Promote); @@ -317,7 +316,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) } else setOperationAction(ISD::VAARG, MVT::Other, Expand); - if (Subtarget->isSVR4ABI() && !isPPC64) + if (Subtarget.isSVR4ABI() && !isPPC64) // VACOPY is custom lowered with the 32-bit SVR4 ABI. setOperationAction(ISD::VACOPY , MVT::Other, Custom); else @@ -350,7 +349,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setCondCodeAction(ISD::SETONE, MVT::f32, Expand); setCondCodeAction(ISD::SETONE, MVT::f64, Expand); - if (Subtarget->has64BitSupport()) { + if (Subtarget.has64BitSupport()) { // They also have instructions for converting between i64 and fp. setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); @@ -360,7 +359,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // We cannot do this with Promote because i64 is not a legal type. setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); - if (PPCSubTarget.hasLFIWAX() || Subtarget->isPPC64()) + if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); } else { // PowerPC does not have FP_TO_UINT on 32-bit implementations. @@ -368,8 +367,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) } // With the instructions enabled under FPCVT, we can do everything. - if (PPCSubTarget.hasFPCVT()) { - if (Subtarget->has64BitSupport()) { + if (Subtarget.hasFPCVT()) { + if (Subtarget.has64BitSupport()) { setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); @@ -382,7 +381,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); } - if (Subtarget->use64BitRegs()) { + if (Subtarget.use64BitRegs()) { // 64-bit PowerPC implementations can support i64 types directly addRegisterClass(MVT::i64, &PPC::G8RCRegClass); // BUILD_PAIR can't be handled natively, and should be expanded to shl/or @@ -398,7 +397,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } - if (Subtarget->hasAltivec()) { + if (Subtarget.hasAltivec()) { // First set operation action for all vector types to expand. Then we // will selectively turn on ones that can be effectively codegen'd. for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; @@ -488,7 +487,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::XOR , MVT::v4i32, Legal); setOperationAction(ISD::LOAD , MVT::v4i32, Legal); setOperationAction(ISD::SELECT, MVT::v4i32, - Subtarget->useCRBits() ? 
Legal : Expand); + Subtarget.useCRBits() ? Legal : Expand); setOperationAction(ISD::STORE , MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); @@ -507,7 +506,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::MUL, MVT::v4f32, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); - if (TM.Options.UnsafeFPMath || Subtarget->hasVSX()) { + if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) { setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); } @@ -535,7 +534,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setCondCodeAction(ISD::SETO, MVT::v4f32, Expand); setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand); - if (Subtarget->hasVSX()) { + if (Subtarget.hasVSX()) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); @@ -613,7 +612,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) } } - if (Subtarget->has64BitSupport()) { + if (Subtarget.has64BitSupport()) { setOperationAction(ISD::PREFETCH, MVT::Other, Legal); setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); } @@ -642,7 +641,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::BR_CC); - if (Subtarget->useCRBits()) + if (Subtarget.useCRBits()) setTargetDAGCombine(ISD::BRCOND); setTargetDAGCombine(ISD::BSWAP); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); @@ -651,7 +650,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); - if (Subtarget->useCRBits()) { + if (Subtarget.useCRBits()) { setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::SELECT_CC); @@ -664,7 +663,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) } // Darwin long double math library functions have $LDBL128 appended. - if (Subtarget->isDarwin()) { + if (Subtarget.isDarwin()) { setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128"); setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128"); setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128"); @@ -679,21 +678,21 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // With 32 condition bits, we don't need to sink (and duplicate) compares // aggressively in CodeGenPrep. - if (Subtarget->useCRBits()) + if (Subtarget.useCRBits()) setHasMultipleConditionRegisters(); setMinFunctionAlignment(2); - if (PPCSubTarget.isDarwin()) + if (Subtarget.isDarwin()) setPrefFunctionAlignment(4); - if (isPPC64 && Subtarget->isJITCodeModel()) + if (isPPC64 && Subtarget.isJITCodeModel()) // Temporary workaround for the inability of PPC64 JIT to handle jump // tables. setSupportJumpTables(false); setInsertFencesForAtomic(true); - if (Subtarget->enableMachineScheduler()) + if (Subtarget.enableMachineScheduler()) setSchedulingPreference(Sched::Source); else setSchedulingPreference(Sched::Hybrid); @@ -702,8 +701,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // The Freescale cores does better with aggressive inlining of memcpy and // friends. Gcc uses same threshold of 128 bytes (= 32 word stores). 
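The Freescale comment above refers to the memcpy/memset expansion thresholds raised in the next hunk. A minimal standalone model of how such a threshold gates inline expansion; the function name and parameters are illustrative, not the real SelectionDAG hooks:

#include <cstddef>

// A memcpy of sizeBytes is expanded inline only while the number of stores
// it needs stays within the per-core budget; with 4-byte stores,
// MaxStoresPerMemcpy = 32 corresponds to the 128-byte threshold cited above.
bool shouldExpandMemcpyInline(std::size_t sizeBytes, unsigned storeBytes,
                              unsigned maxStoresPerMemcpy) {
  std::size_t stores = (sizeBytes + storeBytes - 1) / storeBytes;
  return stores <= maxStoresPerMemcpy;
}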
- if (Subtarget->getDarwinDirective() == PPC::DIR_E500mc || - Subtarget->getDarwinDirective() == PPC::DIR_E5500) { + if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc || + Subtarget.getDarwinDirective() == PPC::DIR_E5500) { MaxStoresPerMemset = 32; MaxStoresPerMemsetOptSize = 16; MaxStoresPerMemcpy = 32; @@ -747,14 +746,14 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign, /// function arguments in the caller parameter area. unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const { // Darwin passes everything on 4 byte boundary. - if (PPCSubTarget.isDarwin()) + if (Subtarget.isDarwin()) return 4; // 16byte and wider vectors are passed on 16byte boundary. // The rest is 8 on PPC64 and 4 on PPC32 boundary. - unsigned Align = PPCSubTarget.isPPC64() ? 8 : 4; - if (PPCSubTarget.hasAltivec() || PPCSubTarget.hasQPX()) - getMaxByValAlign(Ty, Align, PPCSubTarget.hasQPX() ? 32 : 16); + unsigned Align = Subtarget.isPPC64() ? 8 : 4; + if (Subtarget.hasAltivec() || Subtarget.hasQPX()) + getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16); return Align; } @@ -774,7 +773,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::Hi: return "PPCISD::Hi"; case PPCISD::Lo: return "PPCISD::Lo"; case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; - case PPCISD::TOC_RESTORE: return "PPCISD::TOC_RESTORE"; case PPCISD::LOAD: return "PPCISD::LOAD"; case PPCISD::LOAD_TOC: return "PPCISD::LOAD_TOC"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; @@ -826,7 +824,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) - return PPCSubTarget.useCRBits() ? MVT::i1 : MVT::i32; + return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; return VT.changeVectorElementTypeToInteger(); } @@ -855,15 +853,17 @@ static bool isConstantOrUndef(int Op, int Val) { /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUHUM instruction. -bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) { +bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary, + SelectionDAG &DAG) { + unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 1; if (!isUnary) { for (unsigned i = 0; i != 16; ++i) - if (!isConstantOrUndef(N->getMaskElt(i), i*2+1)) + if (!isConstantOrUndef(N->getMaskElt(i), i*2+j)) return false; } else { for (unsigned i = 0; i != 8; ++i) - if (!isConstantOrUndef(N->getMaskElt(i), i*2+1) || - !isConstantOrUndef(N->getMaskElt(i+8), i*2+1)) + if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+8), i*2+j)) return false; } return true; @@ -871,18 +871,27 @@ bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) { /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUWUM instruction. 
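The isVPKUHUMShuffleMask change above makes the packed-halfword check endian-aware: vpkuhum keeps the odd byte of each halfword under big-endian element numbering and the even byte under little-endian numbering. A self-contained sketch of the unary form, using a plain array instead of ShuffleVectorSDNode, with -1 marking an undef element:

// Unary form: both halves of the mask must select the same source bytes.
bool isVPKUHUMMaskUnary(const int mask[16], bool isLittleEndian) {
  int j = isLittleEndian ? 0 : 1;       // which byte of each i16 survives
  for (int i = 0; i != 8; ++i) {
    if (mask[i]     != -1 && mask[i]     != i * 2 + j) return false;
    if (mask[i + 8] != -1 && mask[i + 8] != i * 2 + j) return false;
  }
  return true;
}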
-bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) { +bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary, + SelectionDAG &DAG) { + unsigned j, k; + if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + j = 0; + k = 1; + } else { + j = 2; + k = 3; + } if (!isUnary) { for (unsigned i = 0; i != 16; i += 2) - if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || - !isConstantOrUndef(N->getMaskElt(i+1), i*2+3)) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+k)) return false; } else { for (unsigned i = 0; i != 8; i += 2) - if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || - !isConstantOrUndef(N->getMaskElt(i+1), i*2+3) || - !isConstantOrUndef(N->getMaskElt(i+8), i*2+2) || - !isConstantOrUndef(N->getMaskElt(i+9), i*2+3)) + if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+1), i*2+k) || + !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || + !isConstantOrUndef(N->getMaskElt(i+9), i*2+k)) return false; } return true; @@ -909,27 +918,39 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, } /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for -/// a VRGL* instruction with the specified unit size (1,2 or 4 bytes). +/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes). bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, - bool isUnary) { - if (!isUnary) - return isVMerge(N, UnitSize, 8, 24); - return isVMerge(N, UnitSize, 8, 8); + bool isUnary, SelectionDAG &DAG) { + if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + if (!isUnary) + return isVMerge(N, UnitSize, 0, 16); + return isVMerge(N, UnitSize, 0, 0); + } else { + if (!isUnary) + return isVMerge(N, UnitSize, 8, 24); + return isVMerge(N, UnitSize, 8, 8); + } } /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for -/// a VRGH* instruction with the specified unit size (1,2 or 4 bytes). +/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes). bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, - bool isUnary) { - if (!isUnary) - return isVMerge(N, UnitSize, 0, 16); - return isVMerge(N, UnitSize, 0, 0); + bool isUnary, SelectionDAG &DAG) { + if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + if (!isUnary) + return isVMerge(N, UnitSize, 8, 24); + return isVMerge(N, UnitSize, 8, 8); + } else { + if (!isUnary) + return isVMerge(N, UnitSize, 0, 16); + return isVMerge(N, UnitSize, 0, 0); + } } /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift /// amount, otherwise return -1. -int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) { +int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG) { if (N->getValueType(0) != MVT::v16i8) return -1; @@ -946,18 +967,38 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) { // numbered from this value. unsigned ShiftAmt = SVOp->getMaskElt(i); if (ShiftAmt < i) return -1; - ShiftAmt -= i; - if (!isUnary) { - // Check the rest of the elements to see if they are consecutive. - for (++i; i != 16; ++i) - if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) - return -1; - } else { - // Check the rest of the elements to see if they are consecutive. 
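The vmrgl/vmrgh predicates above now swap their start offsets under little endian; both reduce to the same isVMerge helper. A standalone, array-based sketch of that helper (-1 for undef):

bool isVMergeMask(const int mask[16], unsigned unitSize,
                  unsigned lhsStart, unsigned rhsStart) {
  // Interleaves 8/unitSize units from each input, unitSize bytes at a time.
  for (unsigned i = 0; i != 8 / unitSize; ++i)
    for (unsigned j = 0; j != unitSize; ++j) {
      int l = mask[i * unitSize * 2 + j];
      int r = mask[i * unitSize * 2 + unitSize + j];
      if (l != -1 && unsigned(l) != lhsStart + i * unitSize + j) return false;
      if (r != -1 && unsigned(r) != rhsStart + i * unitSize + j) return false;
    }
  return true;
}
// BE: vmrgl checks starts (8, 24) and vmrgh (0, 16); LE exchanges the two,
// which is exactly the swap made in the hunk above.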
- for (++i; i != 16; ++i) - if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) - return -1; + if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + + ShiftAmt += i; + + if (!isUnary) { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt - i)) + return -1; + } else { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt - i) & 15)) + return -1; + } + + } else { // Big Endian + + ShiftAmt -= i; + + if (!isUnary) { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) + return -1; + } else { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) + return -1; + } } return ShiftAmt; } @@ -1010,10 +1051,14 @@ bool PPC::isAllNegativeZeroVector(SDNode *N) { /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. -unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize) { +unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, + SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); assert(isSplatShuffleMask(SVOp, EltSize)); - return SVOp->getMaskElt(0) / EltSize; + if (DAG.getTarget().getDataLayout()->isLittleEndian()) + return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); + else + return SVOp->getMaskElt(0) / EltSize; } /// get_VSPLTI_elt - If this is a build_vector of constants which can be formed @@ -1299,7 +1344,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, short Imm; if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { Disp = DAG.getTargetConstant(Imm, CN->getValueType(0)); - Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, + Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); return true; } @@ -1350,7 +1395,7 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, } // Otherwise, do it the hard way, using R0 as the base register. - Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, + Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, N.getValueType()); Index = N; return true; @@ -1497,7 +1542,7 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. - if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) { + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(CP), MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); @@ -1518,7 +1563,7 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. 
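The getVSPLTImmediate change above mirrors the splat lane for little endian, because the vsplt* immediate counts elements from the big-endian end of the register. In standalone form:

// eltSize is in bytes (1, 2, or 4); maskElt0 is the first shuffle index.
unsigned vspltImmediate(unsigned maskElt0, unsigned eltSize,
                        bool isLittleEndian) {
  unsigned elt = maskElt0 / eltSize;
  return isLittleEndian ? (16 / eltSize) - 1 - elt  // LE lane 0 == BE lane N-1
                        : elt;
}
// Example: splatting LE lane 0 of a v4i32 (eltSize 4) yields immediate 3.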
- if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) { + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); @@ -1555,7 +1600,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, SDLoc dl(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(); - bool is64bit = PPCSubTarget.isPPC64(); + bool is64bit = Subtarget.isPPC64(); TLSModel::Model Model = getTargetMachine().getTLSModel(GV); @@ -1646,7 +1691,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, // 64-bit SVR4 ABI code is always position-independent. // The actual address of the GlobalValue is stored in the TOC. - if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) { + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset()); return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i64, GA, DAG.getRegister(PPC::X2, MVT::i64)); @@ -1891,7 +1936,8 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("__trampoline_setup", PtrVT), &Args, 0); + DAG.getExternalSymbol("__trampoline_setup", PtrVT), + std::move(Args), 0); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.second; @@ -2086,6 +2132,43 @@ static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, return ArgSize; } +/// CalculateStackSlotAlignment - Calculates the alignment of this argument +/// on the stack. +static unsigned CalculateStackSlotAlignment(EVT ArgVT, ISD::ArgFlagsTy Flags, + unsigned PtrByteSize) { + unsigned Align = PtrByteSize; + + // Altivec parameters are padded to a 16 byte boundary. + if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || + ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || + ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) + Align = 16; + + // ByVal parameters are aligned as requested. + if (Flags.isByVal()) { + unsigned BVAlign = Flags.getByValAlign(); + if (BVAlign > PtrByteSize) { + if (BVAlign % PtrByteSize != 0) + llvm_unreachable( + "ByVal alignment is not a multiple of the pointer size"); + + Align = BVAlign; + } + } + + return Align; +} + +/// EnsureStackAlignment - Round stack frame size up from NumBytes to +/// ensure minimum alignment required for target. +static unsigned EnsureStackAlignment(const TargetMachine &Target, + unsigned NumBytes) { + unsigned TargetAlign = Target.getFrameLowering()->getStackAlignment(); + unsigned AlignMask = TargetAlign - 1; + NumBytes = (NumBytes + AlignMask) & ~AlignMask; + return NumBytes; +} + SDValue PPCTargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -2094,8 +2177,8 @@ PPCTargetLowering::LowerFormalArguments(SDValue Chain, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - if (PPCSubTarget.isSVR4ABI()) { - if (PPCSubTarget.isPPC64()) + if (Subtarget.isSVR4ABI()) { + if (Subtarget.isPPC64()) return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals); else @@ -2161,7 +2244,8 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( getTargetMachine(), ArgLocs, *DAG.getContext()); // Reserve space for the linkage area on the stack. 
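The CalculateStackSlotAlignment and EnsureStackAlignment helpers introduced above replace open-coded alignment logic at each use site. A standalone model, with boolean flags standing in for the EVT and ArgFlagsTy queries:

unsigned stackSlotAlignment(bool isAltivecType, bool isByVal,
                            unsigned byValAlign, unsigned ptrByteSize) {
  unsigned align = ptrByteSize;
  if (isAltivecType)
    align = 16;                          // vector params pad to 16 bytes
  if (isByVal && byValAlign > ptrByteSize && byValAlign % ptrByteSize == 0)
    align = byValAlign;                  // byval honors requested alignment
  return align;
}

unsigned ensureStackAlignment(unsigned numBytes, unsigned targetAlign) {
  unsigned mask = targetAlign - 1;       // targetAlign is a power of two
  return (numBytes + mask) & ~mask;
}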
- CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false); + CCInfo.AllocateStack(LinkageSize, PtrByteSize); CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); @@ -2184,7 +2268,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( RC = &PPC::F4RCRegClass; break; case MVT::f64: - if (PPCSubTarget.hasVSX()) + if (Subtarget.hasVSX()) RC = &PPC::VSFRCRegClass; else RC = &PPC::F8RCRegClass; @@ -2240,23 +2324,14 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // Area that is at least reserved in the caller of this function. unsigned MinReservedArea = CCByValInfo.getNextStackOffset(); + MinReservedArea = std::max(MinReservedArea, LinkageSize); // Set the size that is at least reserved in caller of this function. Tail // call optimized function's reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. - PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); - - MinReservedArea = - std::max(MinReservedArea, - PPCFrameLowering::getMinCallFrameSize(false, false)); - - unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameLowering()-> - getStackAlignment(); - unsigned AlignMask = TargetAlign-1; - MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask; - - FI->setMinReservedArea(MinReservedArea); + MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + FuncInfo->setMinReservedArea(MinReservedArea); SmallVector<SDValue, 8> MemOps; @@ -2352,32 +2427,6 @@ PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal); } -// Set the size that is at least reserved in caller of this function. Tail -// call optimized functions' reserved stack space needs to be aligned so that -// taking the difference between two stack areas will result in an aligned -// stack. -void -PPCTargetLowering::setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG, - unsigned nAltivecParamsAtEnd, - unsigned MinReservedArea, - bool isPPC64) const { - PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); - // Add the Altivec parameters at the end, if needed. - if (nAltivecParamsAtEnd) { - MinReservedArea = ((MinReservedArea+15)/16)*16; - MinReservedArea += 16*nAltivecParamsAtEnd; - } - MinReservedArea = - std::max(MinReservedArea, - PPCFrameLowering::getMinCallFrameSize(isPPC64, true)); - unsigned TargetAlign - = DAG.getMachineFunction().getTarget().getFrameLowering()-> - getStackAlignment(); - unsigned AlignMask = TargetAlign-1; - MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask; - FI->setMinReservedArea(MinReservedArea); -} - SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( SDValue Chain, @@ -2388,6 +2437,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( SmallVectorImpl<SDValue> &InVals) const { // TODO: add description of PPC stack frame format, or at least some docs. // + bool isLittleEndian = Subtarget.isLittleEndian(); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); @@ -2398,9 +2448,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( (CallConv == CallingConv::Fast)); unsigned PtrByteSize = 8; - unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true); - // Area that is at least reserved in caller of this function. 
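The 32-bit SVR4 path above now computes the reserved area as the maximum of the next stack offset and the linkage size, then rounds through EnsureStackAlignment. Modeled as one function, under the assumption that stackAlign is a power of two:

#include <algorithm>

unsigned minReservedArea32(unsigned nextStackOffset, unsigned linkageSize,
                           unsigned stackAlign) {
  unsigned area = std::max(nextStackOffset, linkageSize);
  unsigned mask = stackAlign - 1;
  return (area + mask) & ~mask;  // aligned so tail-call deltas stay aligned
}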
- unsigned MinReservedArea = ArgOffset; + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false); + unsigned ArgOffset = LinkageSize; static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, @@ -2422,14 +2471,13 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( const unsigned Num_FPR_Regs = 13; const unsigned Num_VR_Regs = array_lengthof(VR); - unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; + unsigned GPR_idx, FPR_idx = 0, VR_idx = 0; // Add DAG nodes to load the arguments or copy them out of registers. On // entry to a function on PPC, the arguments start after the linkage area, // although the first ones are often in registers. SmallVector<SDValue, 8> MemOps; - unsigned nAltivecParamsAtEnd = 0; Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { @@ -2442,24 +2490,15 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx); CurArgIdx = Ins[ArgNo].OrigArgIndex; + /* Respect alignment of argument on the stack. */ + unsigned Align = + CalculateStackSlotAlignment(ObjectVT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; unsigned CurArgOffset = ArgOffset; - // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary. - if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 || - ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8 || - ObjectVT==MVT::v2f64 || ObjectVT==MVT::v2i64) { - if (isVarArg) { - MinReservedArea = ((MinReservedArea+15)/16)*16; - MinReservedArea += CalculateStackSlotSize(ObjectVT, - Flags, - PtrByteSize); - } else - nAltivecParamsAtEnd++; - } else - // Calculate min reserved area. - MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT, - Flags, - PtrByteSize); + /* Compute GPR index associated with argument offset. */ + GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; + GPR_idx = std::min(GPR_idx, Num_GPR_Regs); // FIXME the codegen can be much improved in some cases. // We do not have to keep everything in memory. @@ -2481,14 +2520,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( continue; } - unsigned BVAlign = Flags.getByValAlign(); - if (BVAlign > 8) { - ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign; - CurArgOffset = ArgOffset; - } - // All aggregates smaller than 8 bytes must be passed right-justified. - if (ObjSize < PtrByteSize) + if (ObjSize < PtrByteSize && !isLittleEndian) CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize); // The value of the object is its address. int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true); @@ -2522,7 +2555,6 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( } MemOps.push_back(Store); - ++GPR_idx; } // Whether we copied from a register or not, advance the offset // into the parameter save area by a full doubleword. @@ -2567,8 +2599,6 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( // PPC64 passes i8, i16, and i32 values in i64 registers. Promote // value to MVT::i64 and then truncate to the correct register size. ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); - - ++GPR_idx; } else { needsLoad = true; ArgSize = PtrByteSize; @@ -2578,18 +2608,13 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::f32: case MVT::f64: - // Every 8 bytes of argument space consumes one of the GPRs available for - // argument passing. 
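In the 64-bit SVR4 ABI each doubleword of the parameter save area shadows one of the eight argument GPRs, so the hunk above derives the register index from the stack offset instead of counting it separately, and right-justifies sub-doubleword aggregates only on big endian. Both rules as standalone sketches:

#include <algorithm>

unsigned gprIndexForOffset(unsigned argOffset, unsigned linkageSize,
                           unsigned ptrByteSize, unsigned numGPRs) {
  unsigned idx = (argOffset - linkageSize) / ptrByteSize;
  return std::min(idx, numGPRs);       // == numGPRs means "no register left"
}

unsigned smallAggregateOffset(unsigned slotOffset, unsigned objSize,
                              unsigned ptrByteSize, bool isLittleEndian) {
  if (objSize < ptrByteSize && !isLittleEndian)
    return slotOffset + (ptrByteSize - objSize);   // BE: right-justify
  return slotOffset;                               // LE: start of slot
}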
- if (GPR_idx != Num_GPR_Regs) { - ++GPR_idx; - } if (FPR_idx != Num_FPR_Regs) { unsigned VReg; if (ObjectVT == MVT::f32) VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass); else - VReg = MF.addLiveIn(FPR[FPR_idx], PPCSubTarget.hasVSX() ? + VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() ? &PPC::VSFRCRegClass : &PPC::F8RCRegClass); @@ -2608,39 +2633,25 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: - // Note that vector arguments in registers don't reserve stack space, - // except in varargs functions. if (VR_idx != Num_VR_Regs) { unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ? MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) : MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); - if (isVarArg) { - while ((ArgOffset % 16) != 0) { - ArgOffset += PtrByteSize; - if (GPR_idx != Num_GPR_Regs) - GPR_idx++; - } - ArgOffset += 16; - GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64? - } ++VR_idx; } else { - // Vectors are aligned. - ArgOffset = ((ArgOffset+15)/16)*16; - CurArgOffset = ArgOffset; - ArgOffset += 16; needsLoad = true; } + ArgOffset += 16; break; } // We need to load the argument to a virtual register if we determined // above that we ran out of physical registers of the appropriate type. if (needsLoad) { - int FI = MFI->CreateFixedObject(ObjSize, - CurArgOffset + (ArgSize - ObjSize), - isImmutable); + if (ObjSize < ArgSize && !isLittleEndian) + CurArgOffset += ArgSize - ObjSize; + int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, isImmutable); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(), false, false, false, 0); @@ -2649,11 +2660,16 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( InVals.push_back(ArgVal); } + // Area that is at least reserved in the caller of this function. + unsigned MinReservedArea; + MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); + // Set the size that is at least reserved in caller of this function. Tail // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. - setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, true); + MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + FuncInfo->setMinReservedArea(MinReservedArea); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. @@ -2667,7 +2683,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( // If this function is vararg, store any remaining integer argument regs // to their spots on the stack so that they may be loaded by deferencing the // result of va_next. - for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) { + for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; + GPR_idx < Num_GPR_Regs; ++GPR_idx) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, @@ -2706,7 +2723,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin( (CallConv == CallingConv::Fast)); unsigned PtrByteSize = isPPC64 ? 
8 : 4; - unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true); + unsigned ArgOffset = LinkageSize; // Area that is at least reserved in caller of this function. unsigned MinReservedArea = ArgOffset; @@ -2997,11 +3015,21 @@ PPCTargetLowering::LowerFormalArguments_Darwin( InVals.push_back(ArgVal); } + // Allow for Altivec parameters at the end, if needed. + if (nAltivecParamsAtEnd) { + MinReservedArea = ((MinReservedArea+15)/16)*16; + MinReservedArea += 16*nAltivecParamsAtEnd; + } + + // Area that is at least reserved in the caller of this function. + MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize); + // Set the size that is at least reserved in caller of this function. Tail // call optimized functions' reserved stack space needs to be aligned so that // taking the difference between two stack areas will result in an aligned // stack. - setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, isPPC64); + MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + FuncInfo->setMinReservedArea(MinReservedArea); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. @@ -3040,75 +3068,6 @@ PPCTargetLowering::LowerFormalArguments_Darwin( return Chain; } -/// CalculateParameterAndLinkageAreaSize - Get the size of the parameter plus -/// linkage area for the Darwin ABI, or the 64-bit SVR4 ABI. -static unsigned -CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG, - bool isPPC64, - bool isVarArg, - unsigned CC, - const SmallVectorImpl<ISD::OutputArg> - &Outs, - const SmallVectorImpl<SDValue> &OutVals, - unsigned &nAltivecParamsAtEnd) { - // Count how many bytes are to be pushed on the stack, including the linkage - // area, and parameter passing area. We start with 24/48 bytes, which is - // prereserved space for [SP][CR][LR][3 x unused]. - unsigned NumBytes = PPCFrameLowering::getLinkageSize(isPPC64, true); - unsigned NumOps = Outs.size(); - unsigned PtrByteSize = isPPC64 ? 8 : 4; - - // Add up all the space actually used. - // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually - // they all go in registers, but we must reserve stack space for them for - // possible use by the caller. In varargs or 64-bit calls, parameters are - // assigned stack space in order, with padding so Altivec parameters are - // 16-byte aligned. - nAltivecParamsAtEnd = 0; - for (unsigned i = 0; i != NumOps; ++i) { - ISD::ArgFlagsTy Flags = Outs[i].Flags; - EVT ArgVT = Outs[i].VT; - // Varargs Altivec parameters are padded to a 16 byte boundary. - if (ArgVT==MVT::v4f32 || ArgVT==MVT::v4i32 || - ArgVT==MVT::v8i16 || ArgVT==MVT::v16i8 || - ArgVT==MVT::v2f64 || ArgVT==MVT::v2i64) { - if (!isVarArg && !isPPC64) { - // Non-varargs Altivec parameters go after all the non-Altivec - // parameters; handle those later so we know how much padding we need. - nAltivecParamsAtEnd++; - continue; - } - // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary. - NumBytes = ((NumBytes+15)/16)*16; - } - NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); - } - - // Allow for Altivec parameters at the end, if needed. 
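The Darwin path above inlines what the removed setMinReservedArea used to do: pad for trailing Altivec parameters, then guarantee at least the linkage area plus eight GPR slots before the final EnsureStackAlignment rounding. A standalone sketch:

#include <algorithm>

unsigned darwinMinReservedArea(unsigned reserved, unsigned nAltivecAtEnd,
                               unsigned linkageSize, unsigned ptrByteSize) {
  if (nAltivecAtEnd) {
    reserved = ((reserved + 15) / 16) * 16;   // 16-byte align the tail
    reserved += 16 * nAltivecAtEnd;           // one slot per Altivec param
  }
  return std::max(reserved, linkageSize + 8 * ptrByteSize);
}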
- if (nAltivecParamsAtEnd) { - NumBytes = ((NumBytes+15)/16)*16; - NumBytes += 16*nAltivecParamsAtEnd; - } - - // The prolog code of the callee may store up to 8 GPR argument registers to - // the stack, allowing va_start to index over them in memory if its varargs. - // Because we cannot tell if this is needed on the caller side, we have to - // conservatively assume that it is needed. As such, make sure we have at - // least enough stack space for the caller to store the 8 GPRs. - NumBytes = std::max(NumBytes, - PPCFrameLowering::getMinCallFrameSize(isPPC64, true)); - - // Tail call needs the stack to be aligned. - if (CC == CallingConv::Fast && DAG.getTarget().Options.GuaranteedTailCallOpt){ - unsigned TargetAlign = DAG.getMachineFunction().getTarget(). - getFrameLowering()->getStackAlignment(); - unsigned AlignMask = TargetAlign-1; - NumBytes = (NumBytes + AlignMask) & ~AlignMask; - } - - return NumBytes; -} - /// CalculateTailCallSPDiff - Get the amount the stack pointer has to be /// adjusted to accommodate the arguments for the tailcall. static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, @@ -3280,7 +3239,7 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG, SDLoc dl) const { if (SPDiff) { // Load the LR and FP stack slot for later adjusting. - EVT VT = PPCSubTarget.isPPC64() ? MVT::i64 : MVT::i32; + EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; LROpOut = getReturnAddrFrameIndex(DAG); LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo(), false, false, false, 0); @@ -3373,10 +3332,10 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, SDLoc dl, int SPDiff, bool isTailCall, SmallVectorImpl<std::pair<unsigned, SDValue> > &RegsToPass, SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys, - const PPCSubtarget &PPCSubTarget) { + const PPCSubtarget &Subtarget) { - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isSVR4ABI = PPCSubTarget.isSVR4ABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isSVR4ABI = Subtarget.isSVR4ABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); NodeTys.push_back(MVT::Other); // Returns a chain @@ -3385,11 +3344,12 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, unsigned CallOpc = PPCISD::CALL; bool needIndirectCall = true; - if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { - // If this is an absolute destination address, use the munged value. - Callee = SDValue(Dest, 0); - needIndirectCall = false; - } + if (!isSVR4ABI || !isPPC64) + if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { + // If this is an absolute destination address, use the munged value. 
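The guard added above keeps 64-bit SVR4 calls off the direct-BLA path even for absolute addresses, since an ELFv1 call must still set up the callee's TOC through the indirect sequence. Reduced to its decision (a sketch, not the real predicate):

bool mayUseDirectBLA(bool isSVR4ABI, bool isPPC64, bool isBLACompatible) {
  // ELFv1 (64-bit SVR4) always takes the indirect, TOC-aware sequence.
  return !(isSVR4ABI && isPPC64) && isBLACompatible;
}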
+ Callee = SDValue(Dest, 0); + needIndirectCall = false; + } if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { // XXX Work around for http://llvm.org/bugs/show_bug.cgi?id=5201 @@ -3398,8 +3358,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) { unsigned OpFlags = 0; if (DAG.getTarget().getRelocationModel() != Reloc::Static && - (PPCSubTarget.getTargetTriple().isMacOSX() && - PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && + (Subtarget.getTargetTriple().isMacOSX() && + Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && (G->getGlobal()->isDeclaration() || G->getGlobal()->isWeakForLinker())) { // PC-relative references to external symbols should go through $stub, @@ -3422,8 +3382,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, unsigned char OpFlags = 0; if (DAG.getTarget().getRelocationModel() != Reloc::Static && - (PPCSubTarget.getTargetTriple().isMacOSX() && - PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5))) { + (Subtarget.getTargetTriple().isMacOSX() && + Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -3497,8 +3457,10 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, // additional register being allocated and an unnecessary move instruction // being generated. VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue TOCOff = DAG.getIntPtrConstant(8); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); SDValue LoadTOCPtr = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, - Callee, InFlag); + AddTOC, InFlag); Chain = LoadTOCPtr.getValue(0); InFlag = LoadTOCPtr.getValue(1); @@ -3613,10 +3575,10 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, SmallVector<SDValue, 8> Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff, isTailCall, RegsToPass, Ops, NodeTys, - PPCSubTarget); + Subtarget); // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls - if (isVarArg && PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64()) + if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32)); // When performing tail call optimization the callee pops its arguments off @@ -3657,7 +3619,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, // same TOC), the NOP will remain unchanged. bool needsTOCRestore = false; - if (!isTailCall && PPCSubTarget.isSVR4ABI()&& PPCSubTarget.isPPC64()) { + if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64()) { if (CallOpc == PPCISD::BCTRL) { // This is a call through a function pointer. // Restore the caller TOC from the save area into R2. 
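The LOAD_TOC change above reads from Callee + 8 because an ELFv1 indirect callee is a function descriptor, not a code address. A sketch of that layout; field names are descriptive, not taken from any ABI header:

#include <cstdint>

struct FuncDescriptor {
  uint64_t entryPoint;   // +0:  address of the function's code
  uint64_t tocBase;      // +8:  callee's TOC pointer (r2), what is loaded
  uint64_t envPointer;   // +16: environment pointer (unused by C)
};

uint64_t calleeTOC(const FuncDescriptor *fd) { return fd->tocBase; }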
@@ -3682,7 +3644,12 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, if (needsTOCRestore) { SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(PPCISD::TOC_RESTORE, dl, VTs, Chain, InFlag); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); + unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(); + SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); + Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag); InFlag = Chain.getValue(1); } @@ -3718,8 +3685,8 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); - if (PPCSubTarget.isSVR4ABI()) { - if (PPCSubTarget.isPPC64()) + if (Subtarget.isSVR4ABI()) { + if (Subtarget.isPPC64()) return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, isTailCall, Outs, OutVals, Ins, dl, DAG, InVals); @@ -3981,6 +3948,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { + bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); @@ -3997,16 +3965,37 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, CallConv == CallingConv::Fast) MF.getInfo<PPCFunctionInfo>()->setHasFastCall(); - unsigned nAltivecParamsAtEnd = 0; - // Count how many bytes are to be pushed on the stack, including the linkage // area, and parameter passing area. We start with at least 48 bytes, which // is reserved space for [SP][CR][LR][3 x unused]. - // NOTE: For PPC64, nAltivecParamsAtEnd always remains zero as a result - // of this call. - unsigned NumBytes = - CalculateParameterAndLinkageAreaSize(DAG, true, isVarArg, CallConv, - Outs, OutVals, nAltivecParamsAtEnd); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false); + unsigned NumBytes = LinkageSize; + + // Add up all the space actually used. + for (unsigned i = 0; i != NumOps; ++i) { + ISD::ArgFlagsTy Flags = Outs[i].Flags; + EVT ArgVT = Outs[i].VT; + + /* Respect alignment of argument on the stack. */ + unsigned Align = CalculateStackSlotAlignment(ArgVT, Flags, PtrByteSize); + NumBytes = ((NumBytes + Align - 1) / Align) * Align; + + NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); + } + + unsigned NumBytesActuallyUsed = NumBytes; + + // The prolog code of the callee may store up to 8 GPR argument registers to + // the stack, allowing va_start to index over them in memory if its varargs. + // Because we cannot tell if this is needed on the caller side, we have to + // conservatively assume that it is needed. As such, make sure we have at + // least enough stack space for the caller to store the 8 GPRs. + NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); + + // Tail call needs the stack to be aligned. + if (getTargetMachine().Options.GuaranteedTailCallOpt && + CallConv == CallingConv::Fast) + NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes); // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. @@ -4038,8 +4027,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // memory. 
Also, if this is a vararg function, floating point operations // must be stored to our stack, and loaded into integer regs as well, if // any integer regs are available for argument passing. - unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true); - unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; + unsigned ArgOffset = LinkageSize; + unsigned GPR_idx, FPR_idx = 0, VR_idx = 0; static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, @@ -4068,6 +4057,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; + /* Respect alignment of argument on the stack. */ + unsigned Align = + CalculateStackSlotAlignment(Outs[i].VT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; + + /* Compute GPR index associated with argument offset. */ + GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; + GPR_idx = std::min(GPR_idx, NumGPRs); + // PtrOff will be used to store the current argument to the stack if a // register cannot be found for it. SDValue PtrOff; @@ -4099,15 +4097,6 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, if (Size == 0) continue; - unsigned BVAlign = Flags.getByValAlign(); - if (BVAlign > 8) { - if (BVAlign % PtrByteSize != 0) - llvm_unreachable( - "ByVal alignment is not a multiple of the pointer size"); - - ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign; - } - // All aggregates smaller than 8 bytes must be passed right-justified. if (Size==1 || Size==2 || Size==4) { EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32); @@ -4116,7 +4105,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, MachinePointerInfo(), VT, false, false, 0); MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); ArgOffset += PtrByteSize; continue; @@ -4124,9 +4113,12 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, } if (GPR_idx == NumGPRs && Size < 8) { - SDValue Const = DAG.getConstant(PtrByteSize - Size, - PtrOff.getValueType()); - SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + SDValue AddPtr = PtrOff; + if (!isLittleEndian) { + SDValue Const = DAG.getConstant(PtrByteSize - Size, + PtrOff.getValueType()); + AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + } Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); @@ -4161,8 +4153,11 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // small aggregates, particularly for packed ones. // FIXME: It would be preferable to use the slot in the // parameter save area instead of a new local variable. - SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType()); - SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + SDValue AddPtr = PtrOff; + if (!isLittleEndian) { + SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType()); + AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + } Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, CallSeqStart, Flags, DAG, dl); @@ -4172,7 +4167,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, MachinePointerInfo(), false, false, false, 0); MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); // Done with this argument. 
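LowerCall_64SVR4 above now sizes the parameter area directly: align each argument's slot, then add its size; afterwards the total is raised to linkage plus eight GPR slots and rounded to the stack alignment. The core loop in standalone form, assuming sizes[] and aligns[] were produced by the stack-slot helpers sketched earlier:

unsigned computeParameterArea(const unsigned sizes[], const unsigned aligns[],
                              unsigned numArgs, unsigned linkageSize) {
  unsigned numBytes = linkageSize;
  for (unsigned i = 0; i != numArgs; ++i) {
    unsigned a = aligns[i];                   // per-argument slot alignment
    numBytes = ((numBytes + a - 1) / a) * a;  // pad up to that alignment
    numBytes += sizes[i];
  }
  return numBytes;  // corresponds to NumBytesActuallyUsed in the code above
}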
ArgOffset += PtrByteSize; @@ -4205,7 +4200,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, case MVT::i32: case MVT::i64: if (GPR_idx != NumGPRs) { - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg)); } else { LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, false, MemOpChains, @@ -4223,7 +4218,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // must be passed right-justified in the stack doubleword, and // in the GPR, if one is available. SDValue StoreOff; - if (Arg.getSimpleValueType().SimpleTy == MVT::f32) { + if (Arg.getSimpleValueType().SimpleTy == MVT::f32 && + !isLittleEndian) { SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType()); StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); } else @@ -4239,15 +4235,13 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, MachinePointerInfo(), false, false, false, 0); MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); } - } else if (GPR_idx != NumGPRs) - // If we have any FPRs remaining, we may also have GPRs remaining. - ++GPR_idx; + } } else { // Single-precision floating-point values are mapped to the // second (rightmost) word of the stack doubleword. - if (Arg.getValueType() == MVT::f32) { + if (Arg.getValueType() == MVT::f32 && !isLittleEndian) { SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType()); PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); } @@ -4264,21 +4258,13 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, case MVT::v16i8: case MVT::v2f64: case MVT::v2i64: + // For a varargs call, named arguments go into VRs or on the stack as + // usual; unnamed arguments always go to the stack or the corresponding + // GPRs when within range. For now, we always put the value in both + // locations (or even all three). if (isVarArg) { - // These go aligned on the stack, or in the corresponding R registers - // when within range. The Darwin PPC ABI doc claims they also go in - // V registers; in fact gcc does this only for arguments that are - // prototyped, not for those that match the ... We do it for all - // arguments, seems to work. - while (ArgOffset % 16 !=0) { - ArgOffset += PtrByteSize; - if (GPR_idx != NumGPRs) - GPR_idx++; - } // We could elide this store in the case where the object fits // entirely in R registers. Maybe later. - PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, - DAG.getConstant(ArgOffset, PtrVT)); SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo(), false, false, 0); MemOpChains.push_back(Store); @@ -4309,10 +4295,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, break; } - // Non-varargs Altivec params generally go in registers, but have - // stack space allocated at the end. + // Non-varargs Altivec params go into VRs or on the stack. if (VR_idx != NumVRs) { - // Doesn't have GPR space allocated. unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 || Arg.getSimpleValueType() == MVT::v2i64) ? 
VSRH[VR_idx] : VR[VR_idx]; @@ -4323,12 +4307,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, true, isTailCall, true, MemOpChains, TailCallArguments, dl); - ArgOffset += 16; } + ArgOffset += 16; break; } } + assert(NumBytesActuallyUsed == ArgOffset); + (void)NumBytesActuallyUsed; + if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); @@ -4337,19 +4324,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, // pointers in the 64-bit SVR4 ABI. if (!isTailCall && !dyn_cast<GlobalAddressSDNode>(Callee) && - !dyn_cast<ExternalSymbolSDNode>(Callee) && - !isBLACompatibleAddress(Callee, DAG)) { + !dyn_cast<ExternalSymbolSDNode>(Callee)) { // Load r2 into a virtual register and store it to the TOC save area. SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64); // TOC save area offset. - SDValue PtrOff = DAG.getIntPtrConstant(40); + unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(); + SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(), false, false, 0); - // R12 must contain the address of an indirect callee. This does not - // mean the MTCTR instruction must use R12; it's easier to model this - // as an extra parameter, so do that. - RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee)); } // Build a sequence of copy-to-reg nodes chained together with token chain @@ -4397,15 +4380,55 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, CallConv == CallingConv::Fast) MF.getInfo<PPCFunctionInfo>()->setHasFastCall(); - unsigned nAltivecParamsAtEnd = 0; - // Count how many bytes are to be pushed on the stack, including the linkage // area, and parameter passing area. We start with 24/48 bytes, which is // prereserved space for [SP][CR][LR][3 x unused]. - unsigned NumBytes = - CalculateParameterAndLinkageAreaSize(DAG, isPPC64, isVarArg, CallConv, - Outs, OutVals, - nAltivecParamsAtEnd); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true); + unsigned NumBytes = LinkageSize; + + // Add up all the space actually used. + // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually + // they all go in registers, but we must reserve stack space for them for + // possible use by the caller. In varargs or 64-bit calls, parameters are + // assigned stack space in order, with padding so Altivec parameters are + // 16-byte aligned. + unsigned nAltivecParamsAtEnd = 0; + for (unsigned i = 0; i != NumOps; ++i) { + ISD::ArgFlagsTy Flags = Outs[i].Flags; + EVT ArgVT = Outs[i].VT; + // Varargs Altivec parameters are padded to a 16 byte boundary. + if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || + ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || + ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) { + if (!isVarArg && !isPPC64) { + // Non-varargs Altivec parameters go after all the non-Altivec + // parameters; handle those later so we know how much padding we need. + nAltivecParamsAtEnd++; + continue; + } + // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary. + NumBytes = ((NumBytes+15)/16)*16; + } + NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); + } + + // Allow for Altivec parameters at the end, if needed. 
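Both the r2 store above and the post-call reload now take their offset from PPCFrameLowering::getTOCSaveOffset() rather than a hard-coded 40. What the caller-side store amounts to, sketched at the C level:

#include <cstdint>

void saveCallerTOC(uint64_t *stackPtr /* r1 */, uint64_t toc /* r2 */,
                   unsigned tocSaveOffsetBytes) {
  // Before an indirect call, stash our TOC in the frame's TOC save slot;
  // the sequence after the bctrl reloads r2 from the same offset.
  stackPtr[tocSaveOffsetBytes / sizeof(uint64_t)] = toc;
}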
+ if (nAltivecParamsAtEnd) { + NumBytes = ((NumBytes+15)/16)*16; + NumBytes += 16*nAltivecParamsAtEnd; + } + + // The prolog code of the callee may store up to 8 GPR argument registers to + // the stack, allowing va_start to index over them in memory if its varargs. + // Because we cannot tell if this is needed on the caller side, we have to + // conservatively assume that it is needed. As such, make sure we have at + // least enough stack space for the caller to store the 8 GPRs. + NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); + + // Tail call needs the stack to be aligned. + if (getTargetMachine().Options.GuaranteedTailCallOpt && + CallConv == CallingConv::Fast) + NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes); // Calculate by how many bytes the stack has to be adjusted in case of tail // call optimization. @@ -4441,7 +4464,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, // memory. Also, if this is a vararg function, floating point operations // must be stored to our stack, and loaded into integer regs as well, if // any integer regs are available for argument passing. - unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true); + unsigned ArgOffset = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; static const MCPhysReg GPR_32[] = { // 32-bit registers. @@ -4818,8 +4841,8 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isDarwinABI = PPCSubTarget.isDarwinABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Get current frame pointer save index. The users of this index will be @@ -4842,8 +4865,8 @@ PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const { SDValue PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isDarwinABI = PPCSubTarget.isDarwinABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Get current frame pointer save index. The users of this index will be @@ -5063,12 +5086,12 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); case MVT::i32: Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ : - (PPCSubTarget.hasFPCVT() ? PPCISD::FCTIWUZ : + (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), dl, MVT::f64, Src); break; case MVT::i64: - assert((Op.getOpcode() == ISD::FP_TO_SINT || PPCSubTarget.hasFPCVT()) && + assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && "i64 FP_TO_UINT is supported only with FPCVT"); Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ, @@ -5077,8 +5100,8 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, } // Convert the FP value to an int value through memory. 
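The i32Stack test above pairs with the opcode selection just before it: without FPCVT, an unsigned 32-bit conversion has to go through fctidz and a truncating store. The selection reduces to the following; the enumerators simply mirror the PPCISD names:

enum class FCTOp { FCTIWZ, FCTIWUZ, FCTIDZ, FCTIDUZ };

FCTOp selectFPToIntOp(bool isSigned, bool toI32, bool hasFPCVT) {
  if (toI32)
    return isSigned ? FCTOp::FCTIWZ
                    : (hasFPCVT ? FCTOp::FCTIWUZ    // direct unsigned i32
                                : FCTOp::FCTIDZ);   // else via i64, truncate
  return isSigned ? FCTOp::FCTIDZ : FCTOp::FCTIDUZ; // unsigned i64 needs FPCVT
}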
- bool i32Stack = Op.getValueType() == MVT::i32 && PPCSubTarget.hasSTFIWX() && - (Op.getOpcode() == ISD::FP_TO_SINT || PPCSubTarget.hasFPCVT()); + bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() && + (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(FI); @@ -5120,17 +5143,17 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, DAG.getConstantFP(1.0, Op.getValueType()), DAG.getConstantFP(0.0, Op.getValueType())); - assert((Op.getOpcode() == ISD::SINT_TO_FP || PPCSubTarget.hasFPCVT()) && + assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && "UINT_TO_FP is supported only with FPCVT"); // If we have FCFIDS, then use it when converting to single-precision. // Otherwise, convert to double-precision and then round. - unsigned FCFOp = (PPCSubTarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? + unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS : PPCISD::FCFIDS) : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU : PPCISD::FCFID); - MVT FCFTy = (PPCSubTarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? + MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? MVT::f32 : MVT::f64; if (Op.getOperand(0).getValueType() == MVT::i64) { @@ -5146,7 +5169,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // However, if -enable-unsafe-fp-math is in effect, accept double // rounding to avoid the extra overhead. if (Op.getValueType() == MVT::f32 && - !PPCSubTarget.hasFPCVT() && + !Subtarget.hasFPCVT() && !DAG.getTarget().Options.UnsafeFPMath) { // Twiddle input to make sure the low 11 bits are zero. (If this @@ -5184,7 +5207,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); - if (Op.getValueType() == MVT::f32 && !PPCSubTarget.hasFPCVT()) + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0)); return FP; @@ -5201,7 +5224,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); SDValue Ld; - if (PPCSubTarget.hasLFIWAX() || PPCSubTarget.hasFPCVT()) { + if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -5220,7 +5243,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); } else { - assert(PPCSubTarget.isPPC64() && + assert(Subtarget.isPPC64() && "i32->FP without LFIWAX supported only on PPC64"); int FrameIdx = FrameInfo->CreateStackObject(8, 8, false); @@ -5242,7 +5265,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // FCFID it and return it. 
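The FCFOp/FCFTy selection above uses the single-precision converts only when FPCVT is available; otherwise it converts to f64 and rounds, with the extra twiddling above guarding against double rounding. As a small table, with enumerators mirroring the PPCISD names:

enum class FCFOp { FCFID, FCFIDU, FCFIDS, FCFIDUS };

FCFOp selectIntToFPOp(bool isUnsigned, bool destIsF32, bool hasFPCVT) {
  if (hasFPCVT && destIsF32)                         // direct i64 -> f32
    return isUnsigned ? FCFOp::FCFIDUS : FCFOp::FCFIDS;
  return isUnsigned ? FCFOp::FCFIDU : FCFOp::FCFID;  // i64 -> f64 (+ round)
}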
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld); - if (Op.getValueType() == MVT::f32 && !PPCSubTarget.hasFPCVT()) + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0)); return FP; } @@ -5557,6 +5580,22 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } + // The remaining cases assume either big endian element order or + // a splat-size that equates to the element size of the vector + // to be built. An example that doesn't work for little endian is + // {0, -1, 0, -1, 0, -1, 0, -1} which has a splat size of 32 bits + // and a vector element size of 16 bits. The code below will + // produce the vector in big endian element order, which for little + // endian is {-1, 0, -1, 0, -1, 0, -1, 0}. + + // For now, just avoid these optimizations in that case. + // FIXME: Develop correct optimizations for LE with mismatched + // splat and element sizes. + + if (Subtarget.isLittleEndian() && + SplatSize != Op.getValueType().getVectorElementType().getSizeInBits()) + return SDValue(); + // Check to see if this is a wide variety of vsplti*, binop self cases. static const signed char SplatCsts[] = { -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, @@ -5725,6 +5764,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SDValue V2 = Op.getOperand(1); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); EVT VT = Op.getValueType(); + bool isLittleEndian = Subtarget.isLittleEndian(); // Cases that are handled by instructions that take permute immediates // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be @@ -5733,15 +5773,15 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (PPC::isSplatShuffleMask(SVOp, 1) || PPC::isSplatShuffleMask(SVOp, 2) || PPC::isSplatShuffleMask(SVOp, 4) || - PPC::isVPKUWUMShuffleMask(SVOp, true) || - PPC::isVPKUHUMShuffleMask(SVOp, true) || - PPC::isVSLDOIShuffleMask(SVOp, true) != -1 || - PPC::isVMRGLShuffleMask(SVOp, 1, true) || - PPC::isVMRGLShuffleMask(SVOp, 2, true) || - PPC::isVMRGLShuffleMask(SVOp, 4, true) || - PPC::isVMRGHShuffleMask(SVOp, 1, true) || - PPC::isVMRGHShuffleMask(SVOp, 2, true) || - PPC::isVMRGHShuffleMask(SVOp, 4, true)) { + PPC::isVPKUWUMShuffleMask(SVOp, true, DAG) || + PPC::isVPKUHUMShuffleMask(SVOp, true, DAG) || + PPC::isVSLDOIShuffleMask(SVOp, true, DAG) != -1 || + PPC::isVMRGLShuffleMask(SVOp, 1, true, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 2, true, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 4, true, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 1, true, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 2, true, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 4, true, DAG)) { return Op; } } @@ -5749,15 +5789,15 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // Altivec has a variety of "shuffle immediates" that take two vector inputs // and produce a fixed permutation. If any of these match, do not lower to // VPERM. 
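The BUILD_VECTOR comment above gives the {0, -1, ...} example; worked out, the same splat unit read as i16 lanes comes out in opposite orders on BE and LE, which is why LE bails out when the splat unit is wider than the element. A hedged illustration, not the lowering code itself:

#include <cstdint>

// Expand a 32-bit splat unit into eight i16 lanes in the given lane order.
void splatAsI16Lanes(uint32_t splat, bool bigEndianLanes, int16_t out[8]) {
  int16_t hi = int16_t(splat >> 16), lo = int16_t(splat & 0xFFFF);
  for (int i = 0; i < 4; ++i) {
    out[2 * i]     = bigEndianLanes ? hi : lo;
    out[2 * i + 1] = bigEndianLanes ? lo : hi;
  }
}
// splat = 0x0000FFFF: BE lanes {0,-1,0,-1,...}, LE lanes {-1,0,-1,0,...}.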
- if (PPC::isVPKUWUMShuffleMask(SVOp, false) || - PPC::isVPKUHUMShuffleMask(SVOp, false) || - PPC::isVSLDOIShuffleMask(SVOp, false) != -1 || - PPC::isVMRGLShuffleMask(SVOp, 1, false) || - PPC::isVMRGLShuffleMask(SVOp, 2, false) || - PPC::isVMRGLShuffleMask(SVOp, 4, false) || - PPC::isVMRGHShuffleMask(SVOp, 1, false) || - PPC::isVMRGHShuffleMask(SVOp, 2, false) || - PPC::isVMRGHShuffleMask(SVOp, 4, false)) + if (PPC::isVPKUWUMShuffleMask(SVOp, false, DAG) || + PPC::isVPKUHUMShuffleMask(SVOp, false, DAG) || + PPC::isVSLDOIShuffleMask(SVOp, false, DAG) != -1 || + PPC::isVMRGLShuffleMask(SVOp, 1, false, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 2, false, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 4, false, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 1, false, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 2, false, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 4, false, DAG)) return Op; // Check to see if this is a shuffle of 4-byte values. If so, we can use our @@ -5791,7 +5831,9 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // If this shuffle can be expressed as a shuffle of 4-byte elements, use the // perfect shuffle vector to determine if it is cost effective to do this as // discrete instructions, or whether we should use a vperm. - if (isFourElementShuffle) { + // For now, we skip this for little endian until such time as we have a + // little-endian perfect shuffle table. + if (isFourElementShuffle && !isLittleEndian) { // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; @@ -5820,6 +5862,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except // that it is in input element units, not in bytes. Convert now. + + // For little endian, the order of the input vectors is reversed, and + // the permutation mask is complemented with respect to 31. This is + // necessary to produce proper semantics with the big-endian-biased vperm + // instruction. EVT EltVT = V1.getValueType().getVectorElementType(); unsigned BytesPerElement = EltVT.getSizeInBits()/8; @@ -5828,13 +5875,22 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i]; for (unsigned j = 0; j != BytesPerElement; ++j) - ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j, - MVT::i32)); + if (isLittleEndian) + ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement+j), + MVT::i32)); + else + ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j, + MVT::i32)); } SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, ResultMask); - return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V1, V2, VPermMask); + if (isLittleEndian) + return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), + V2, V1, VPermMask); + else + return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), + V1, V2, VPermMask); } /// getAltivecCompareInfo - Given an intrinsic, return false if it is not an @@ -6027,6 +6083,7 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { LHS, RHS, Zero, DAG, dl); } else if (Op.getValueType() == MVT::v16i8) { SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); + bool isLittleEndian = Subtarget.isLittleEndian(); // Multiply the even 8-bit parts, producing 16-bit sums. 
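[Note: the little endian mask rewrite used above (complement every byte index with respect to 31, then swap the two inputs) can be factored into a few lines. A standalone sketch mirroring the loop in LowerVECTOR_SHUFFLE, with a v4i32 interleave as the worked example.]

#include <cstdio>
#include <vector>

// Expand an element-level shuffle mask into the byte-level control vector
// vperm expects. vperm numbers bytes with a big-endian bias, so for little
// endian each index is complemented with respect to 31 (the caller also
// swaps the two input vectors, as the lowering above does).
std::vector<int> buildVPermMask(const std::vector<int> &PermMask,
                                unsigned BytesPerElement,
                                bool IsLittleEndian) {
  std::vector<int> ResultMask;
  for (int Elt : PermMask) {
    unsigned SrcElt = Elt < 0 ? 0 : Elt; // undef lanes just pick byte 0
    for (unsigned j = 0; j != BytesPerElement; ++j)
      ResultMask.push_back(IsLittleEndian
                               ? 31 - (SrcElt * BytesPerElement + j)
                               : SrcElt * BytesPerElement + j);
  }
  return ResultMask;
}

int main() {
  std::vector<int> Mask = {0, 4, 1, 5}; // v4i32 interleave of two inputs
  for (bool LE : {false, true}) {
    printf(LE ? "LE:" : "BE:");
    for (int B : buildVPermMask(Mask, 4, LE))
      printf(" %2d", B);
    printf("\n");
  }
}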
SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub, @@ -6038,13 +6095,24 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { LHS, RHS, DAG, dl, MVT::v8i16); OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts); - // Merge the results together. + // Merge the results together. Because vmuleub and vmuloub are + // instructions with a big-endian bias, we must reverse the + // element numbering and reverse the meaning of "odd" and "even" + // when generating little endian code. int Ops[16]; for (unsigned i = 0; i != 8; ++i) { - Ops[i*2 ] = 2*i+1; - Ops[i*2+1] = 2*i+1+16; + if (isLittleEndian) { + Ops[i*2 ] = 2*i; + Ops[i*2+1] = 2*i+16; + } else { + Ops[i*2 ] = 2*i+1; + Ops[i*2+1] = 2*i+1+16; + } } - return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops); + if (isLittleEndian) + return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops); + else + return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops); } else { llvm_unreachable("Unknown mul to lower!"); } @@ -6064,17 +6132,17 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::VASTART: - return LowerVASTART(Op, DAG, PPCSubTarget); + return LowerVASTART(Op, DAG, Subtarget); case ISD::VAARG: - return LowerVAARG(Op, DAG, PPCSubTarget); + return LowerVAARG(Op, DAG, Subtarget); case ISD::VACOPY: - return LowerVACOPY(Op, DAG, PPCSubTarget); + return LowerVACOPY(Op, DAG, Subtarget); - case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, PPCSubTarget); + case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget); case ISD::DYNAMIC_STACKALLOC: - return LowerDYNAMIC_STACKALLOC(Op, DAG, PPCSubTarget); + return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); @@ -6144,7 +6212,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, EVT VT = N->getValueType(0); if (VT == MVT::i64) { - SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG, PPCSubTarget); + SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG, Subtarget); Results.push_back(NewNode); Results.push_back(NewNode.getValue(1)); @@ -6255,7 +6323,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address // registers without caring whether they're 32 or 64, but here we're // doing actual arithmetic on the addresses. - bool is64bit = PPCSubTarget.isPPC64(); + bool is64bit = Subtarget.isPPC64(); unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; const BasicBlock *LLVM_BB = BB->getBasicBlock(); @@ -6450,7 +6518,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, unsigned LabelReg = MRI.createVirtualRegister(PtrRC); unsigned BufReg = MI->getOperand(1).getReg(); - if (PPCSubTarget.isPPC64() && PPCSubTarget.isSVR4ABI()) { + if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) { MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) .addReg(PPC::X2) .addImm(TOCOffset) @@ -6463,12 +6531,12 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, unsigned BaseReg; if (MF->getFunction()->getAttributes().hasAttribute( AttributeSet::FunctionIndex, Attribute::Naked)) - BaseReg = PPCSubTarget.isPPC64() ? PPC::X1 : PPC::R1; + BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1; else - BaseReg = PPCSubTarget.isPPC64() ? 
PPC::BP8 : PPC::BP; + BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP; MIB = BuildMI(*thisMBB, MI, DL, - TII->get(PPCSubTarget.isPPC64() ? PPC::STD : PPC::STW)) + TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW)) .addReg(BaseReg) .addImm(BPOffset) .addReg(BufReg); @@ -6492,10 +6560,10 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // mainMBB: // mainDstReg = 0 MIB = BuildMI(mainMBB, DL, - TII->get(PPCSubTarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); + TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); // Store IP - if (PPCSubTarget.isPPC64()) { + if (Subtarget.isPPC64()) { MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD)) .addReg(LabelReg) .addImm(LabelOffset) @@ -6607,7 +6675,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MIB.setMemRefs(MMOBegin, MMOEnd); // Reload TOC - if (PVT == MVT::i64 && PPCSubTarget.isSVR4ABI()) { + if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) { MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) .addImm(TOCOffset) .addReg(BufReg); @@ -6645,7 +6713,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineFunction *F = BB->getParent(); - if (PPCSubTarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 || + if (Subtarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 || MI->getOpcode() == PPC::SELECT_CC_I8 || MI->getOpcode() == PPC::SELECT_I4 || MI->getOpcode() == PPC::SELECT_I8)) { @@ -6765,13 +6833,13 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BB = EmitAtomicBinary(MI, BB, true, PPC::XOR8); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8) - BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ANDC); + BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16) - BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ANDC); + BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32) - BB = EmitAtomicBinary(MI, BB, false, PPC::ANDC); + BB = EmitAtomicBinary(MI, BB, false, PPC::NAND); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64) - BB = EmitAtomicBinary(MI, BB, true, PPC::ANDC8); + BB = EmitAtomicBinary(MI, BB, true, PPC::NAND8); else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF); @@ -6862,7 +6930,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // We must use 64-bit registers for addresses when targeting 64-bit, // since we're actually doing arithmetic on them. Other registers // can be 32-bit. - bool is64bit = PPCSubTarget.isPPC64(); + bool is64bit = Subtarget.isPPC64(); bool is8bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; unsigned dest = MI->getOperand(0).getReg(); @@ -7070,10 +7138,10 @@ SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op, EVT VT = Op.getValueType(); - if ((VT == MVT::f32 && PPCSubTarget.hasFRES()) || - (VT == MVT::f64 && PPCSubTarget.hasFRE()) || - (VT == MVT::v4f32 && PPCSubTarget.hasAltivec()) || - (VT == MVT::v2f64 && PPCSubTarget.hasVSX())) { + if ((VT == MVT::f32 && Subtarget.hasFRES()) || + (VT == MVT::f64 && Subtarget.hasFRE()) || + (VT == MVT::v4f32 && Subtarget.hasAltivec()) || + (VT == MVT::v2f64 && Subtarget.hasVSX())) { // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) // For the reciprocal, we need to find the zero of the function: @@ -7086,7 +7154,7 @@ SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op, // correct after every iteration. The minimum architected relative // accuracy is 2^-5. 
When hasRecipPrec(), this is 2^-14. IEEE float has // 23 digits and double has 52 digits. - int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3; + int Iterations = Subtarget.hasRecipPrec() ? 1 : 3; if (VT.getScalarType() == MVT::f64) ++Iterations; @@ -7133,10 +7201,10 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op, EVT VT = Op.getValueType(); - if ((VT == MVT::f32 && PPCSubTarget.hasFRSQRTES()) || - (VT == MVT::f64 && PPCSubTarget.hasFRSQRTE()) || - (VT == MVT::v4f32 && PPCSubTarget.hasAltivec()) || - (VT == MVT::v2f64 && PPCSubTarget.hasVSX())) { + if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || + (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || + (VT == MVT::v4f32 && Subtarget.hasAltivec()) || + (VT == MVT::v2f64 && Subtarget.hasVSX())) { // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) // For the reciprocal sqrt, we need to find the zero of the function: @@ -7149,7 +7217,7 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op, // correct after every iteration. The minimum architected relative // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has // 23 digits and double has 52 digits. - int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3; + int Iterations = Subtarget.hasRecipPrec() ? 1 : 3; if (VT.getScalarType() == MVT::f64) ++Iterations; @@ -7266,10 +7334,9 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { if (!Visited.count(ChainLD->getChain().getNode())) Queue.push_back(ChainLD->getChain().getNode()); } else if (ChainNext->getOpcode() == ISD::TokenFactor) { - for (SDNode::op_iterator O = ChainNext->op_begin(), - OE = ChainNext->op_end(); O != OE; ++O) - if (!Visited.count(O->getNode())) - Queue.push_back(O->getNode()); + for (const SDUse &O : ChainNext->ops()) + if (!Visited.count(O.getNode())) + Queue.push_back(O.getNode()); } else LoadRoots.insert(ChainNext); } @@ -7312,7 +7379,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); - assert(PPCSubTarget.useCRBits() && + assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits"); // If we're tracking CR bits, we need to be careful that we don't have: // trunc(binary-ops(zext(x), zext(y))) @@ -7610,9 +7677,9 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, return SDValue(); if (!((N->getOperand(0).getValueType() == MVT::i1 && - PPCSubTarget.useCRBits()) || + Subtarget.useCRBits()) || (N->getOperand(0).getValueType() == MVT::i32 && - PPCSubTarget.isPPC64()))) + Subtarget.isPPC64()))) return SDValue(); if (N->getOperand(0).getOpcode() != ISD::AND && @@ -7930,8 +7997,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DCI.AddToWorklist(RV.getNode()); RV = DAGCombineFastRecip(RV, DCI); if (RV.getNode()) { - // Unfortunately, RV is now NaN if the input was exactly 0. Select out - // this case and force the answer to 0. + // Unfortunately, RV is now NaN if the input was exactly 0. Select out + // this case and force the answer to 0. EVT VT = RV.getValueType(); @@ -8051,6 +8118,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // This is a type-legal unaligned Altivec load. SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); + bool isLittleEndian = Subtarget.isLittleEndian(); // This implements the loading of unaligned vectors as described in // the venerable Apple Velocity Engine overview. 
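[Note: the iteration counts chosen in the two combines above follow from the usual rule of thumb that each Newton-Raphson step roughly doubles the number of correct significand bits. A quick standalone check of the 1-vs-3 choice, and the extra iteration for f64, under that model.]

#include <cstdio>

// Doubling the correct bits per step, count the refinements needed to get
// from the estimate's accuracy to full precision (23 fraction bits for
// float, 52 for double, matching the comment above).
int iterationsNeeded(int EstimateBits, int TargetBits) {
  int Iters = 0;
  for (int Bits = EstimateBits; Bits < TargetBits; Bits *= 2)
    ++Iters;
  return Iters;
}

int main() {
  printf("f32 from 2^-5 : %d\n", iterationsNeeded(5, 23));  // 3
  printf("f32 from 2^-14: %d\n", iterationsNeeded(14, 23)); // 1
  printf("f64 from 2^-5 : %d\n", iterationsNeeded(5, 52));  // 4
  printf("f64 from 2^-14: %d\n", iterationsNeeded(14, 52)); // 2
}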
Specifically: @@ -8058,25 +8126,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html // // The general idea is to expand a sequence of one or more unaligned - // loads into a alignment-based permutation-control instruction (lvsl), - // a series of regular vector loads (which always truncate their - // input address to an aligned address), and a series of permutations. - // The results of these permutations are the requested loaded values. - // The trick is that the last "extra" load is not taken from the address - // you might suspect (sizeof(vector) bytes after the last requested - // load), but rather sizeof(vector) - 1 bytes after the last - // requested vector. The point of this is to avoid a page fault if the - // base address happened to be aligned. This works because if the base - // address is aligned, then adding less than a full vector length will - // cause the last vector in the sequence to be (re)loaded. Otherwise, - // the next vector will be fetched as you might suspect was necessary. + // loads into an alignment-based permutation-control instruction (lvsl + // or lvsr), a series of regular vector loads (which always truncate + // their input address to an aligned address), and a series of + // permutations. The results of these permutations are the requested + // loaded values. The trick is that the last "extra" load is not taken + // from the address you might suspect (sizeof(vector) bytes after the + // last requested load), but rather sizeof(vector) - 1 bytes after the + // last requested vector. The point of this is to avoid a page fault if + // the base address happened to be aligned. This works because if the + // base address is aligned, then adding less than a full vector length + // will cause the last vector in the sequence to be (re)loaded. + // Otherwise, the next vector will be fetched as you might suspect was + // necessary. // We might be able to reuse the permutation generation from // a different base address offset from this one by an aligned amount. // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this // optimization later. - SDValue PermCntl = BuildIntrinsicOp(Intrinsic::ppc_altivec_lvsl, Ptr, - DAG, dl, MVT::v16i8); + Intrinsic::ID Intr = (isLittleEndian ? + Intrinsic::ppc_altivec_lvsr : + Intrinsic::ppc_altivec_lvsl); + SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8); // Refine the alignment of the original load (a "new" load created here // which was identical to the first except for the alignment would be @@ -8125,8 +8196,18 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (ExtraLoad.getValueType() != MVT::v4i32) ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad); - SDValue Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm, - BaseLoad, ExtraLoad, PermCntl, DAG, dl); + // Because vperm has a big-endian bias, we must reverse the order + // of the input vectors and complement the permute control vector + // when generating little endian code. We have already handled the + // latter by using lvsr instead of lvsl, so just reverse BaseLoad + // and ExtraLoad here. 
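[Note: a scalar model of the expansion just described, with the "sizeof(vector) - 1" trick spelled out. Standalone C++: the two memcpy'd blocks stand in for the aligned lvx loads, and the final copy stands in for the lvsl/lvsr + vperm combination.]

#include <cstdint>
#include <cstdio>
#include <cstring>

// One unaligned 16-byte load, modeled with loads that truncate their
// address to 16-byte alignment, as lvx does.
void unalignedLoad16(const uint8_t *Addr, uint8_t Out[16]) {
  uintptr_t A = reinterpret_cast<uintptr_t>(Addr);
  const uint8_t *Lo = reinterpret_cast<const uint8_t *>(A & ~uintptr_t(15));
  // "+ 15", not "+ 16": if Addr is already aligned this re-reads the same
  // block instead of touching the next (possibly unmapped) page.
  const uint8_t *Hi =
      reinterpret_cast<const uint8_t *>((A + 15) & ~uintptr_t(15));

  uint8_t Both[32];
  memcpy(Both, Lo, 16);
  memcpy(Both + 16, Hi, 16);
  memcpy(Out, Both + (A & 15), 16); // the permute step
}

int main() {
  alignas(16) uint8_t Buf[48];
  for (int i = 0; i != 48; ++i)
    Buf[i] = static_cast<uint8_t>(i);
  uint8_t Out[16];
  unalignedLoad16(Buf + 5, Out); // prints 5 6 7 ... 20
  for (uint8_t B : Out)
    printf("%d ", B);
  printf("\n");
}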
+ SDValue Perm; + if (isLittleEndian) + Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm, + ExtraLoad, BaseLoad, PermCntl, DAG, dl); + else + Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm, + BaseLoad, ExtraLoad, PermCntl, DAG, dl); if (VT != MVT::v4i32) Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm); @@ -8151,12 +8232,11 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, ++UI; SmallVector<SDValue, 8> Ops; - for (SDNode::op_iterator O = User->op_begin(), - OE = User->op_end(); O != OE; ++O) { - if (*O == Use) + for (const SDUse &O : User->ops()) { + if (O == Use) Ops.push_back(To); else - Ops.push_back(*O); + Ops.push_back(O); } DAG.UpdateNodeOperands(User, Ops); @@ -8166,9 +8246,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } } break; - case ISD::INTRINSIC_WO_CHAIN: - if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() == - Intrinsic::ppc_altivec_lvsl && + case ISD::INTRINSIC_WO_CHAIN: { + bool isLittleEndian = Subtarget.isLittleEndian(); + Intrinsic::ID Intr = (isLittleEndian ? + Intrinsic::ppc_altivec_lvsr : + Intrinsic::ppc_altivec_lvsl); + if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) { SDValue Add = N->getOperand(1); @@ -8180,8 +8263,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, UE = BasePtr->use_end(); UI != UE; ++UI) { if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == - Intrinsic::ppc_altivec_lvsl) { - // We've found another LVSL, and this address if an aligned + Intr) { + // We've found another LVSL/LVSR, and this address is an aligned // multiple of that one. The results will be the same, so use the // one we've just found instead. @@ -8190,6 +8273,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } } } + } break; case ISD::BSWAP: @@ -8537,11 +8621,11 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // GCC RS6000 Constraint Letters switch (Constraint[0]) { case 'b': // R1-R31 - if (VT == MVT::i64 && PPCSubTarget.isPPC64()) + if (VT == MVT::i64 && Subtarget.isPPC64()) return std::make_pair(0U, &PPC::G8RC_NOX0RegClass); return std::make_pair(0U, &PPC::GPRC_NOR0RegClass); case 'r': // R0-R31 - if (VT == MVT::i64 && PPCSubTarget.isPPC64()) + if (VT == MVT::i64 && Subtarget.isPPC64()) return std::make_pair(0U, &PPC::G8RCRegClass); return std::make_pair(0U, &PPC::GPRCRegClass); case 'f': @@ -8573,7 +8657,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // register. // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use // the AsmName field from *RegisterInfo.td, then this would not be necessary. - if (R.first && VT == MVT::i64 && PPCSubTarget.isPPC64() && + if (R.first && VT == MVT::i64 && Subtarget.isPPC64() && PPC::GPRCRegClass.contains(R.first)) { const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); return std::make_pair(TRI->getMatchingSuperReg(R.first, @@ -8707,8 +8791,8 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, // the stack. 
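[Note: the INTRINSIC_WO_CHAIN combine above works because lvsl (and lvsr) depend only on the low four bits of the effective address, so two addresses that differ by a multiple of 16 yield the same control vector. A tiny model of that invariant.]

#include <cassert>
#include <cstdint>

// lvsl materializes the byte sequence { sh, sh+1, ..., sh+15 } with
// sh = EA & 15 (lvsr is the mirror-image variant), so the result is a
// function of the misalignment only.
void lvslModel(uintptr_t EA, uint8_t Ctl[16]) {
  uint8_t Sh = EA & 15;
  for (int i = 0; i != 16; ++i)
    Ctl[i] = Sh + i;
}

int main() {
  uint8_t A[16], B[16];
  lvslModel(0x1003, A);
  lvslModel(0x1003 + 32, B); // same address modulo 16
  for (int i = 0; i != 16; ++i)
    assert(A[i] == B[i]);    // identical permute control, safe to reuse
}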
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); FuncInfo->setLRStoreRequired(); - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isDarwinABI = PPCSubTarget.isDarwinABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); @@ -8762,8 +8846,8 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, // this table could be generated automatically from RegInfo. unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT) const { - bool isPPC64 = PPCSubTarget.isPPC64(); - bool isDarwinABI = PPCSubTarget.isDarwinABI(); + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) || (!isPPC64 && VT != MVT::i32)) @@ -8804,7 +8888,7 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { - if (this->PPCSubTarget.isPPC64()) { + if (Subtarget.isPPC64()) { return MVT::i64; } else { return MVT::i32; @@ -8863,7 +8947,7 @@ bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, return false; if (VT.getSimpleVT().isVector()) { - if (PPCSubTarget.hasVSX()) { + if (Subtarget.hasVSX()) { if (VT != MVT::v2f64 && VT != MVT::v2i64) return false; } else { @@ -8907,7 +8991,7 @@ PPCTargetLowering::shouldExpandBuildVectorWithShuffles( } Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const { - if (DisableILPPref || PPCSubTarget.enableMachineScheduler()) + if (DisableILPPref || Subtarget.enableMachineScheduler()) return TargetLowering::getSchedulingPreference(N); return Sched::ILP; diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 080ef5d0f76f..df05aa51ffa0 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -18,7 +18,6 @@ #include "PPC.h" #include "PPCInstrInfo.h" #include "PPCRegisterInfo.h" -#include "PPCSubtarget.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" @@ -71,19 +70,14 @@ namespace llvm { TOC_ENTRY, - /// The following three target-specific nodes are used for calls through + /// The following two target-specific nodes are used for calls through /// function pointers in the 64-bit SVR4 ABI. - /// Restore the TOC from the TOC save area of the current stack frame. - /// This is basically a hard coded load instruction which additionally - /// takes/produces a flag. - TOC_RESTORE, - /// Like a regular LOAD but additionally taking/producing a flag. LOAD, - /// LOAD into r2 (also taking/producing a flag). Like TOC_RESTORE, this is - /// a hard coded load instruction. + /// Like LOAD (taking/producing a flag), but using r2 as hard-coded + /// destination. LOAD_TOC, /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX) @@ -303,25 +297,27 @@ namespace llvm { namespace PPC { /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUHUM instruction. - bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary); + bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary, + SelectionDAG &DAG); /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a /// VPKUWUM instruction. 
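[Note: the bodies of these shuffle-mask predicates are outside this diff, but the new SelectionDAG parameter is presumably there so they can ask the target which endianness is in effect. A standalone sketch of the endian dependence for the VPKUHUM case; the function and parameter names here are invented for illustration, and the exact check lives in PPCISelLowering.cpp.]

#include <vector>

// vpkuhum keeps the low-order byte of each halfword of the 32-byte
// concatenated input. Which shuffle-mask values that matches depends on
// element numbering: byte 2*i+1 of halfword i in big endian order, byte
// 2*i in little endian order.
bool looksLikeVPKUHUM(const std::vector<int> &Mask, bool IsLittleEndian) {
  if (Mask.size() != 16)
    return false;
  int Off = IsLittleEndian ? 0 : 1;
  for (int i = 0; i != 16; ++i)
    if (Mask[i] >= 0 && Mask[i] != 2 * i + Off) // negative means undef lane
      return false;
  return true;
}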
- bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary); + bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary, + SelectionDAG &DAG); /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for /// a VRGL* instruction with the specified unit size (1,2 or 4 bytes). bool isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, - bool isUnary); + bool isUnary, SelectionDAG &DAG); /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for /// a VRGH* instruction with the specified unit size (1,2 or 4 bytes). bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, - bool isUnary); + bool isUnary, SelectionDAG &DAG); /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift /// amount, otherwise return -1. - int isVSLDOIShuffleMask(SDNode *N, bool isUnary); + int isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG); /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a splat of a single element that is suitable for input to @@ -334,7 +330,7 @@ namespace llvm { /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. - unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize); + unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG); /// get_VSPLTI_elt - If this is a build_vector of constants which can be /// formed by using a vspltis[bhw] instruction of the specified element @@ -343,8 +339,9 @@ namespace llvm { SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG); } + class PPCSubtarget; class PPCTargetLowering : public TargetLowering { - const PPCSubtarget &PPCSubTarget; + const PPCSubtarget &Subtarget; public: explicit PPCTargetLowering(PPCTargetMachine &TM); @@ -613,11 +610,6 @@ namespace llvm { extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG, SDValue ArgVal, SDLoc dl) const; - void - setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG, - unsigned nAltivecParamsAtEnd, - unsigned MinReservedArea, bool isPPC64) const; - SDValue LowerFormalArguments_Darwin(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index b71c09ea8e12..9318f70f4305 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -802,17 +802,11 @@ def LDtocCPT: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg), [(set i64:$rD, (PPCtoc_entry tconstpool:$disp, i64:$reg))]>, isPPC64; -let hasSideEffects = 1, isCodeGenOnly = 1 in { -let RST = 2, DS = 2 in -def LDinto_toc: DSForm_1a<58, 0, (outs), (ins g8rc:$reg), - "ld 2, 8($reg)", IIC_LdStLD, - [(PPCload_toc i64:$reg)]>, isPPC64; - -let RST = 2, DS = 10, RA = 1 in -def LDtoc_restore : DSForm_1a<58, 0, (outs), (ins), - "ld 2, 40(1)", IIC_LdStLD, - [(PPCtoc_restore)]>, isPPC64; -} +let hasSideEffects = 1, isCodeGenOnly = 1, RST = 2 in +def LDinto_toc: DSForm_1<58, 0, (outs), (ins memrix:$src), + "ld 2, $src", IIC_LdStLD, + [(PPCload_toc ixaddr:$src)]>, isPPC64; + def LDX : XForm_1<31, 21, (outs g8rc:$rD), (ins memrr:$src), "ldx $rD, $src", IIC_LdStLD, [(set i64:$rD, (load xaddr:$src))]>, isPPC64; diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index f3c2eab9746d..dce46d84e6e1 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -22,111 +22,127 @@ def vnot_ppc : PatFrag<(ops node:$in), def 
vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), false); + return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), false, + *CurDAG); }]>; def vpkuwum_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), false); + return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), false, + *CurDAG); }]>; def vpkuhum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), true); + return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), true, + *CurDAG); }]>; def vpkuwum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), true); + return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), true, + *CurDAG); }]>; def vmrglb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ - return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false); + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false, + *CurDAG); }]>; def vmrglh_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ - return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false); + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false, + *CurDAG); }]>; def vmrglw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ - return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false); + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false, + *CurDAG); }]>; def vmrghb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ - return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false); + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false, + *CurDAG); }]>; def vmrghh_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ - return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false); + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false, + *CurDAG); }]>; def vmrghw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ - return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false); + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false, + *CurDAG); }]>; def vmrglb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ - return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true); + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true, + *CurDAG); }]>; def vmrglh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true); + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true, + *CurDAG); }]>; def vmrglw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true); + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true, + *CurDAG); }]>; def vmrghb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), 
(vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true); + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true, + *CurDAG); }]>; def vmrghh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true); + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true, + *CurDAG); }]>; def vmrghw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true); + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true, + *CurDAG); }]>; def VSLDOI_get_imm : SDNodeXForm<vector_shuffle, [{ - return getI32Imm(PPC::isVSLDOIShuffleMask(N, false)); + return getI32Imm(PPC::isVSLDOIShuffleMask(N, false, *CurDAG)); }]>; def vsldoi_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVSLDOIShuffleMask(N, false) != -1; + return PPC::isVSLDOIShuffleMask(N, false, *CurDAG) != -1; }], VSLDOI_get_imm>; /// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into /// vector_shuffle(X,undef,mask) by the dag combiner. def VSLDOI_unary_get_imm : SDNodeXForm<vector_shuffle, [{ - return getI32Imm(PPC::isVSLDOIShuffleMask(N, true)); + return getI32Imm(PPC::isVSLDOIShuffleMask(N, true, *CurDAG)); }]>; def vsldoi_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return PPC::isVSLDOIShuffleMask(N, true) != -1; + return PPC::isVSLDOIShuffleMask(N, true, *CurDAG) != -1; }], VSLDOI_unary_get_imm>; // VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm. 
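[Note: for the VSPLT xforms that follow, the conversion itself is simple: the immediate is the index of the element being splatted, read off the byte-level shuffle mask. The real getVSPLTImmediate is outside this hunk; below is a plausible standalone shape, with the little endian mirroring that these DAG-parameter changes are presumably enabling. Treat it as a sketch, not the actual implementation.]

// FirstMaskElt is the first byte index of the splat mask; EltSize is the
// splat unit in bytes (1, 2 or 4 for vspltb/vsplth/vspltw).
unsigned vspltImmediate(unsigned FirstMaskElt, unsigned EltSize,
                        bool IsLittleEndian) {
  unsigned Idx = FirstMaskElt / EltSize; // element being splatted
  unsigned NumElts = 16 / EltSize;       // elements in a 16-byte vector
  // The instruction numbers elements big-endian-first, so a little endian
  // element index has to be mirrored across the vector.
  return IsLittleEndian ? NumElts - 1 - Idx : Idx;
}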
def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{ - return getI32Imm(PPC::getVSPLTImmediate(N, 1)); + return getI32Imm(PPC::getVSPLTImmediate(N, 1, *CurDAG)); }]>; def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1); }], VSPLTB_get_imm>; def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{ - return getI32Imm(PPC::getVSPLTImmediate(N, 2)); + return getI32Imm(PPC::getVSPLTImmediate(N, 2, *CurDAG)); }]>; def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2); }], VSPLTH_get_imm>; def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{ - return getI32Imm(PPC::getVSPLTImmediate(N, 4)); + return getI32Imm(PPC::getVSPLTImmediate(N, 4, *CurDAG)); }]>; def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index 7fed2c65da73..1e4396cd1017 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -360,20 +360,6 @@ class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr, let Inst{30-31} = xo; } -class DSForm_1a<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list<dag> pattern> - : I<opcode, OOL, IOL, asmstr, itin> { - bits<5> RST; - bits<14> DS; - bits<5> RA; - - let Pattern = pattern; - - let Inst{6-10} = RST; - let Inst{11-15} = RA; - let Inst{16-29} = DS; - let Inst{30-31} = xo; -} // 1.7.6 X-Form class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index fd7238401bb0..9bac91d7d412 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/CommandLine.h" @@ -60,23 +61,25 @@ cl::Hidden); // Pin the vtable to this file. void PPCInstrInfo::anchor() {} -PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm) - : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP), - TM(tm), RI(*TM.getSubtargetImpl()) {} +PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI) + : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP), + Subtarget(STI), RI(STI) {} /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for /// this target when scheduling the DAG. 
-ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetHazardRecognizer( - const TargetMachine *TM, - const ScheduleDAG *DAG) const { - unsigned Directive = TM->getSubtarget<PPCSubtarget>().getDarwinDirective(); +ScheduleHazardRecognizer * +PPCInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, + const ScheduleDAG *DAG) const { + unsigned Directive = + static_cast<const PPCSubtarget *>(STI)->getDarwinDirective(); if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 || Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) { - const InstrItineraryData *II = TM->getInstrItineraryData(); + const InstrItineraryData *II = + &static_cast<const PPCSubtarget *>(STI)->getInstrItineraryData(); return new ScoreboardHazardRecognizer(II, DAG); } - return TargetInstrInfo::CreateTargetHazardRecognizer(TM, DAG); + return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG); } /// CreateTargetPostRAHazardRecognizer - Return the postRA hazard recognizer @@ -84,17 +87,18 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetHazardRecognizer( ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer( const InstrItineraryData *II, const ScheduleDAG *DAG) const { - unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective(); + unsigned Directive = + DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective(); - if (Directive == PPC::DIR_PWR7) + if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8) return new PPCDispatchGroupSBHazardRecognizer(II, DAG); // Most subtargets use a PPC970 recognizer. if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 && Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) { - assert(TM.getInstrInfo() && "No InstrInfo?"); + assert(DAG->TII && "No InstrInfo?"); - return new PPCHazardRecognizer970(TM); + return new PPCHazardRecognizer970(*DAG); } return new ScoreboardHazardRecognizer(II, DAG); @@ -129,7 +133,7 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, // On some cores, there is an additional delay between writing to a condition // register, and using it from a branch. - unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective(); + unsigned Directive = Subtarget.getDarwinDirective(); switch (Directive) { default: break; case PPC::DIR_7400: @@ -142,6 +146,7 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, case PPC::DIR_PWR6: case PPC::DIR_PWR6X: case PPC::DIR_PWR7: + case PPC::DIR_PWR8: Latency += 2; break; } @@ -313,12 +318,13 @@ void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { // This function is used for scheduling, and the nop wanted here is the type // that terminates dispatch groups on the POWER cores. 
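[Note: one pattern worth flagging in the hazard-recognizer hook above: the generic TargetSubtargetInfo pointer is recovered with a static_cast rather than a checked cast, which is sound only because each target's hook is only ever handed its own subtarget. A minimal model of that contract; the types here are stand-ins, not the LLVM classes.]

struct TargetSubtargetInfoModel {
  virtual ~TargetSubtargetInfoModel() = default;
};

struct PPCSubtargetModel : TargetSubtargetInfoModel {
  unsigned getDarwinDirective() const { return 0; }
};

unsigned directiveOf(const TargetSubtargetInfoModel *STI) {
  // No runtime check: the PPC hook is only reachable with a PPC subtarget,
  // so the downcast is cheap and justified by the call path, not by RTTI.
  return static_cast<const PPCSubtargetModel *>(STI)->getDarwinDirective();
}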
- unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective(); + unsigned Directive = Subtarget.getDarwinDirective(); unsigned Opcode; switch (Directive) { default: Opcode = PPC::NOP; break; case PPC::DIR_PWR6: Opcode = PPC::NOP_GT_PWR6; break; case PPC::DIR_PWR7: Opcode = PPC::NOP_GT_PWR7; break; + case PPC::DIR_PWR8: Opcode = PPC::NOP_GT_PWR7; break; /* FIXME: Update when P8 InstrScheduling model is ready */ } DebugLoc DL; @@ -332,7 +338,7 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const { - bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); @@ -538,7 +544,7 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, assert((Cond.size() == 2 || Cond.size() == 0) && "PPC branch conditions have two components!"); - bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); // One-way branch. if (!FBB) { @@ -579,7 +585,7 @@ bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, const SmallVectorImpl<MachineOperand> &Cond, unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { - if (!TM.getSubtargetImpl()->hasISEL()) + if (!Subtarget.hasISEL()) return false; if (Cond.size() != 2) @@ -623,7 +629,7 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB, assert(Cond.size() == 2 && "PPC branch conditions have two components!"); - assert(TM.getSubtargetImpl()->hasISEL() && + assert(Subtarget.hasISEL() && "Cannot insert select on target without ISEL support"); // Get the register classes. @@ -826,7 +832,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, FrameIdx)); NonRI = true; } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) { - assert(TM.getSubtargetImpl()->isDarwin() && + assert(Subtarget.isDarwin() && "VRSAVE only needs spill/restore on Darwin"); NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_VRSAVE)) .addReg(SrcReg, @@ -921,7 +927,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, FrameIdx)); NonRI = true; } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) { - assert(TM.getSubtargetImpl()->isDarwin() && + assert(Subtarget.isDarwin() && "VRSAVE only needs spill/restore on Darwin"); NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::RESTORE_VRSAVE), @@ -1035,7 +1041,7 @@ bool PPCInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, unsigned ZeroReg; if (UseInfo->isLookupPtrRegClass()) { - bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); ZeroReg = isPPC64 ? PPC::ZERO8 : PPC::ZERO; } else { ZeroReg = UseInfo->RegClass == PPC::G8RC_NOX0RegClassID ? @@ -1102,7 +1108,7 @@ bool PPCInstrInfo::PredicateInstruction( unsigned OpC = MI->getOpcode(); if (OpC == PPC::BLR) { if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) { - bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); MI->setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR) : (isPPC64 ? 
PPC::BDZLR8 : PPC::BDZLR))); @@ -1124,7 +1130,7 @@ bool PPCInstrInfo::PredicateInstruction( return true; } else if (OpC == PPC::B) { if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) { - bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); MI->setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : (isPPC64 ? PPC::BDZ8 : PPC::BDZ))); @@ -1162,7 +1168,7 @@ bool PPCInstrInfo::PredicateInstruction( llvm_unreachable("Cannot predicate bctr[l] on the ctr register"); bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8; - bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); if (Pred[0].getImm() == PPC::PRED_BIT_SET) { MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8) : @@ -1323,7 +1329,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr, // for equality checks (as those don't depend on the sign). On PPC64, // we are restricted to equality for unsigned 64-bit comparisons and for // signed 32-bit comparisons the applicability is more restricted. - bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + bool isPPC64 = Subtarget.isPPC64(); bool is32BitSignedCompare = OpC == PPC::CMPWI || OpC == PPC::CMPW; bool is32BitUnsignedCompare = OpC == PPC::CMPLWI || OpC == PPC::CMPLW; bool is64BitUnsignedCompare = OpC == PPC::CMPLDI || OpC == PPC::CMPLD; diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index d9db3e1b1dc5..83f14c6cf214 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -65,7 +65,7 @@ enum PPC970_Unit { class PPCInstrInfo : public PPCGenInstrInfo { - PPCTargetMachine &TM; + PPCSubtarget &Subtarget; const PPCRegisterInfo RI; bool StoreRegToStackSlot(MachineFunction &MF, @@ -80,7 +80,7 @@ class PPCInstrInfo : public PPCGenInstrInfo { bool &NonRI, bool &SpillsVRS) const; virtual void anchor(); public: - explicit PPCInstrInfo(PPCTargetMachine &TM); + explicit PPCInstrInfo(PPCSubtarget &STI); /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. 
As /// such, whenever a client has an instance of instruction info, it should @@ -89,7 +89,7 @@ public: const PPCRegisterInfo &getRegisterInfo() const { return RI; } ScheduleHazardRecognizer * - CreateTargetHazardRecognizer(const TargetMachine *TM, + CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, const ScheduleDAG *DAG) const override; ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index e421f8e69e65..c2e3382b3e79 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -141,9 +141,6 @@ def PPCload : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>, def PPCload_toc : SDNode<"PPCISD::LOAD_TOC", SDTypeProfile<0, 1, []>, [SDNPHasChain, SDNPSideEffect, SDNPInGlue, SDNPOutGlue]>; -def PPCtoc_restore : SDNode<"PPCISD::TOC_RESTORE", SDTypeProfile<0, 0, []>, - [SDNPHasChain, SDNPSideEffect, - SDNPInGlue, SDNPOutGlue]>; def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone, diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp index 7bbc71bd1fa9..e5f113a0c030 100644 --- a/lib/Target/PowerPC/PPCJITInfo.cpp +++ b/lib/Target/PowerPC/PPCJITInfo.cpp @@ -13,7 +13,7 @@ #include "PPCJITInfo.h" #include "PPCRelocations.h" -#include "PPCTargetMachine.h" +#include "PPCSubtarget.h" #include "llvm/IR/Function.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -25,6 +25,11 @@ using namespace llvm; static TargetJITInfo::JITCompilerFn JITCompilerFunction; +PPCJITInfo::PPCJITInfo(PPCSubtarget &STI) + : Subtarget(STI), is64Bit(STI.isPPC64()) { + useGOT = 0; +} + #define BUILD_ADDIS(RD,RS,IMM16) \ ((15 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535)) #define BUILD_ORI(RD,RS,UIMM16) \ @@ -393,7 +398,7 @@ void *PPCJITInfo::emitFunctionStub(const Function* F, void *Fn, JCE.emitWordBE(0xf821ffb1); // stdu r1,-80(r1) JCE.emitWordBE(0x7d6802a6); // mflr r11 JCE.emitWordBE(0xf9610060); // std r11, 96(r1) - } else if (TM.getSubtargetImpl()->isDarwinABI()){ + } else if (Subtarget.isDarwinABI()){ JCE.emitWordBE(0x9421ffe0); // stwu r1,-32(r1) JCE.emitWordBE(0x7d6802a6); // mflr r11 JCE.emitWordBE(0x91610028); // stw r11, 40(r1) diff --git a/lib/Target/PowerPC/PPCJITInfo.h b/lib/Target/PowerPC/PPCJITInfo.h index 0693e3e86cd5..b6b37ffb852b 100644 --- a/lib/Target/PowerPC/PPCJITInfo.h +++ b/lib/Target/PowerPC/PPCJITInfo.h @@ -18,32 +18,29 @@ #include "llvm/Target/TargetJITInfo.h" namespace llvm { - class PPCTargetMachine; +class PPCSubtarget; +class PPCJITInfo : public TargetJITInfo { +protected: + PPCSubtarget &Subtarget; + bool is64Bit; - class PPCJITInfo : public TargetJITInfo { - protected: - PPCTargetMachine &TM; - bool is64Bit; - public: - PPCJITInfo(PPCTargetMachine &tm, bool tmIs64Bit) : TM(tm) { - useGOT = 0; - is64Bit = tmIs64Bit; - } +public: + PPCJITInfo(PPCSubtarget &STI); - StubLayout getStubLayout() override; - void *emitFunctionStub(const Function* F, void *Fn, - JITCodeEmitter &JCE) override; - LazyResolverFn getLazyResolverFunction(JITCompilerFn) override; - void relocate(void *Function, MachineRelocation *MR, - unsigned NumRelocs, unsigned char* GOTBase) override; + StubLayout getStubLayout() override; + void *emitFunctionStub(const Function *F, void *Fn, + JITCodeEmitter &JCE) override; + LazyResolverFn getLazyResolverFunction(JITCompilerFn) override; + void relocate(void *Function, 
MachineRelocation *MR, unsigned NumRelocs, + unsigned char *GOTBase) override; - /// replaceMachineCodeForFunction - Make it so that calling the function - /// whose machine code is at OLD turns into a call to NEW, perhaps by - /// overwriting OLD with a branch to NEW. This is used for self-modifying - /// code. - /// - void replaceMachineCodeForFunction(void *Old, void *New) override; - }; + /// replaceMachineCodeForFunction - Make it so that calling the function + /// whose machine code is at OLD turns into a call to NEW, perhaps by + /// overwriting OLD with a branch to NEW. This is used for self-modifying + /// code. + /// + void replaceMachineCodeForFunction(void *Old, void *New) override; +}; } #endif diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index e333b51b8774..eca774ead935 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -973,6 +973,14 @@ void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum); Offset += MI.getOperand(OffsetOperandNo).getImm(); MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); + + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const MCInstrDesc &MCID = MI.getDesc(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.constrainRegClass(BaseReg, + TII.getRegClass(MCID, FIOperandNum, this, MF)); } bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp index f742f726186b..dc1674214769 100644 --- a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp +++ b/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp @@ -16,9 +16,7 @@ using namespace llvm; #define DEBUG_TYPE "powerpc-selectiondag-info" -PPCSelectionDAGInfo::PPCSelectionDAGInfo(const PPCTargetMachine &TM) - : TargetSelectionDAGInfo(TM) { -} +PPCSelectionDAGInfo::PPCSelectionDAGInfo(const DataLayout *DL) + : TargetSelectionDAGInfo(DL) {} -PPCSelectionDAGInfo::~PPCSelectionDAGInfo() { -} +PPCSelectionDAGInfo::~PPCSelectionDAGInfo() {} diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/lib/Target/PowerPC/PPCSelectionDAGInfo.h index 341b69cdfb5f..b2e7f3b5f2ac 100644 --- a/lib/Target/PowerPC/PPCSelectionDAGInfo.h +++ b/lib/Target/PowerPC/PPCSelectionDAGInfo.h @@ -22,7 +22,7 @@ class PPCTargetMachine; class PPCSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit PPCSelectionDAGInfo(const PPCTargetMachine &TM); + explicit PPCSelectionDAGInfo(const DataLayout *DL); ~PPCSelectionDAGInfo(); }; diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index ea9daee4f8e8..2e1b74a13175 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -32,15 +32,57 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "PPCGenSubtargetInfo.inc" -PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit, - CodeGenOpt::Level OptLevel) - : PPCGenSubtargetInfo(TT, CPU, FS), IsPPC64(is64Bit), TargetTriple(TT), - OptLevel(OptLevel) { +/// Return the datalayout string of a subtarget. +static std::string getDataLayoutString(const PPCSubtarget &ST) { + const Triple &T = ST.getTargetTriple(); + + std::string Ret; + + // Most PPC* platforms are big endian, PPC64LE is little endian. 
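[Note: tracing the string that the getDataLayoutString helper below assembles, for a few representative triples. This assumes the ELF and Mach-O mangling components are "-m:e" and "-m:o" in this revision; treat the strings as illustrative rather than authoritative.]

  powerpc64le-unknown-linux-gnu -> "e-m:e-i64:64-n32:64"
  powerpc64-unknown-linux-gnu   -> "E-m:e-i64:64-n32:64"
  powerpc-unknown-linux-gnu     -> "E-m:e-p:32:32-i64:64-n32"
  powerpc-apple-darwin          -> "E-m:o-p:32:32-f64:32:64-n32"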
+ if (ST.isLittleEndian()) + Ret = "e"; + else + Ret = "E"; + + Ret += DataLayout::getManglingComponent(T); + + // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit + // pointers. + if (!ST.isPPC64() || T.getOS() == Triple::Lv2) + Ret += "-p:32:32"; + + // Note, the alignment values for f64 and i64 on ppc64 in Darwin + // documentation are wrong; these are correct (i.e. "what gcc does"). + if (ST.isPPC64() || ST.isSVR4ABI()) + Ret += "-i64:64"; + else + Ret += "-f64:32:64"; + + // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones. + if (ST.isPPC64()) + Ret += "-n32:64"; + else + Ret += "-n32"; + + return Ret; +} + +PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { initializeEnvironment(); resetSubtargetFeatures(CPU, FS); + return *this; } +PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, PPCTargetMachine &TM, + bool is64Bit, CodeGenOpt::Level OptLevel) + : PPCGenSubtargetInfo(TT, CPU, FS), IsPPC64(is64Bit), TargetTriple(TT), + OptLevel(OptLevel), + FrameLowering(initializeSubtargetDependencies(CPU, FS)), + DL(getDataLayoutString(*this)), InstrInfo(*this), JITInfo(*this), + TLInfo(TM), TSInfo(&DL) {} + /// SetJITMode - This is called to inform the subtarget info that we are /// producing code for the JIT. void PPCSubtarget::SetJITMode() { @@ -156,6 +198,11 @@ void PPCSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { // Determine endianness. IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le); + + // FIXME: For now, we disable VSX in little-endian mode until endian + // issues in those instructions can be addressed. + if (IsLittleEndian) + HasVSX = false; } /// hasLazyResolverStub - Return true if accesses to the specified global have @@ -200,6 +247,7 @@ static bool needsAggressiveScheduling(unsigned Directive) { case PPC::DIR_E500mc: case PPC::DIR_E5500: case PPC::DIR_PWR7: + case PPC::DIR_PWR8: return true; } } diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index ee43fd5f807f..2a166995d922 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -14,7 +14,13 @@ #ifndef POWERPCSUBTARGET_H #define POWERPCSUBTARGET_H +#include "PPCFrameLowering.h" +#include "PPCInstrInfo.h" +#include "PPCISelLowering.h" +#include "PPCJITInfo.h" +#include "PPCSelectionDAGInfo.h" #include "llvm/ADT/Triple.h" +#include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -50,6 +56,7 @@ namespace PPC { DIR_PWR6, DIR_PWR6X, DIR_PWR7, + DIR_PWR8, DIR_64 }; } @@ -102,12 +109,19 @@ protected: /// OptLevel - What default optimization level we're emitting code for. CodeGenOpt::Level OptLevel; + PPCFrameLowering FrameLowering; + const DataLayout DL; + PPCInstrInfo InstrInfo; + PPCJITInfo JITInfo; + PPCTargetLowering TLInfo; + PPCSelectionDAGInfo TSInfo; + public: /// This constructor initializes the data members to match that /// of the specified triple. 
/// PPCSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit, + const std::string &FS, PPCTargetMachine &TM, bool is64Bit, CodeGenOpt::Level OptLevel); /// ParseSubtargetFeatures - Parses features string setting specified @@ -127,10 +141,21 @@ public: /// unsigned getDarwinDirective() const { return DarwinDirective; } - /// getInstrItins - Return the instruction itineraies based on subtarget + /// getInstrItins - Return the instruction itineraries based on subtarget /// selection. const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + const PPCFrameLowering *getFrameLowering() const { return &FrameLowering; } + const DataLayout *getDataLayout() const { return &DL; } + const PPCInstrInfo *getInstrInfo() const { return &InstrInfo; } + PPCJITInfo *getJITInfo() { return &JITInfo; } + const PPCTargetLowering *getTargetLowering() const { return &TLInfo; } + const PPCSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + + /// initializeSubtargetDependencies - Initializes using a CPU and feature string + /// so that we can use initializer lists for subtarget initialization. + PPCSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); + /// \brief Reset the features for the PowerPC target. void resetSubtargetFeatures(const MachineFunction *MF) override; private: diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 2323addd21af..9563b9045c39 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -37,53 +37,12 @@ extern "C" void LLVMInitializePowerPCTarget() { RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget); } -/// Return the datalayout string of a subtarget. -static std::string getDataLayoutString(const PPCSubtarget &ST) { - const Triple &T = ST.getTargetTriple(); - - std::string Ret; - - // Most PPC* platforms are big endian, PPC64LE is little endian. - if (ST.isLittleEndian()) - Ret = "e"; - else - Ret = "E"; - - Ret += DataLayout::getManglingComponent(T); - - // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit - // pointers. - if (!ST.isPPC64() || T.getOS() == Triple::Lv2) - Ret += "-p:32:32"; - - // Note, the alignment values for f64 and i64 on ppc64 in Darwin - // documentation are wrong; these are correct (i.e. "what gcc does"). - if (ST.isPPC64() || ST.isSVR4ABI()) - Ret += "-i64:64"; - else - Ret += "-f64:32:64"; - - // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones. 
- if (ST.isPPC64()) - Ret += "-n32:64"; - else - Ret += "-n32"; - - return Ret; -} - -PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, +PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool is64Bit) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, is64Bit, OL), - DL(getDataLayoutString(Subtarget)), InstrInfo(*this), - FrameLowering(Subtarget), JITInfo(*this, is64Bit), - TLInfo(*this), TSInfo(*this), - InstrItins(Subtarget.getInstrItineraryData()) { + CodeGenOpt::Level OL, bool is64Bit) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this, is64Bit, OL) { initAsmInfo(); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 9e924945d8fa..4c7029ca7a36 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -14,11 +14,7 @@ #ifndef PPC_TARGETMACHINE_H #define PPC_TARGETMACHINE_H -#include "PPCFrameLowering.h" -#include "PPCISelLowering.h" #include "PPCInstrInfo.h" -#include "PPCJITInfo.h" -#include "PPCSelectionDAGInfo.h" #include "PPCSubtarget.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" @@ -29,13 +25,6 @@ namespace llvm { /// class PPCTargetMachine : public LLVMTargetMachine { PPCSubtarget Subtarget; - const DataLayout DL; // Calculates type size & alignment - PPCInstrInfo InstrInfo; - PPCFrameLowering FrameLowering; - PPCJITInfo JITInfo; - PPCTargetLowering TLInfo; - PPCSelectionDAGInfo TSInfo; - InstrItineraryData InstrItins; public: PPCTargetMachine(const Target &T, StringRef TT, @@ -43,25 +32,29 @@ public: Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool is64Bit); - const PPCInstrInfo *getInstrInfo() const override { return &InstrInfo; } - const PPCFrameLowering *getFrameLowering() const override { - return &FrameLowering; + const PPCInstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); } - PPCJITInfo *getJITInfo() override { return &JITInfo; } + const PPCFrameLowering *getFrameLowering() const override { + return getSubtargetImpl()->getFrameLowering(); + } + PPCJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); } const PPCTargetLowering *getTargetLowering() const override { - return &TLInfo; + return getSubtargetImpl()->getTargetLowering(); } const PPCSelectionDAGInfo* getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } - const PPCRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); + const PPCRegisterInfo *getRegisterInfo() const override { + return &getInstrInfo()->getRegisterInfo(); } - const DataLayout *getDataLayout() const override { return &DL; } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); + } const PPCSubtarget *getSubtargetImpl() const override { return &Subtarget; } const InstrItineraryData *getInstrItineraryData() const override { - return &InstrItins; + return &getSubtargetImpl()->getInstrItineraryData(); } // Pass Pipeline Configuration diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 949fdfb217ab..713fc4b8f329 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -17,6 +17,7 @@ namespace llvm { class AMDGPUInstrPrinter; 
+class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; class MCAsmInfo; @@ -40,6 +41,7 @@ FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSILowerI1CopiesPass(); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); +FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); @@ -47,14 +49,18 @@ void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; // Passes common to R600 and SI +FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); Pass *createAMDGPUStructurizeCFGPass(); -FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); FunctionPass *createAMDGPUISelDag(TargetMachine &tm); /// \brief Creates an AMDGPU-specific Target Transformation Info pass. ImmutablePass * createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM); +void initializeSIFixSGPRLiveRangesPass(PassRegistry&); +extern char &SIFixSGPRLiveRangesID; + + extern Target TheAMDGPUTarget; } // End namespace llvm diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index 2edc1150203e..6ff9ab7ab7d2 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -7,8 +7,7 @@ // //==-----------------------------------------------------------------------===// -// Include AMDIL TD files -include "AMDILBase.td" +include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// // Subtarget Features @@ -33,30 +32,25 @@ def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", "false", "Disable the if conversion pass">; -def FeatureFP64 : SubtargetFeature<"fp64", +def FeatureFP64 : SubtargetFeature<"fp64", "FP64", "true", - "Enable 64bit double precision operations">; + "Enable double precision operations">; def Feature64BitPtr : SubtargetFeature<"64BitPtr", "Is64bit", "true", - "Specify if 64bit addressing should be used.">; - -def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr", - "Is32on64bit", - "false", - "Specify if 64bit sized pointers with 32bit addressing should be used.">; + "Specify if 64-bit addressing should be used">; def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", "R600ALUInst", "false", - "Older version of ALU instructions encoding.">; + "Older version of ALU instructions encoding">; def FeatureVertexCache : SubtargetFeature<"HasVertexCache", "HasVertexCache", "true", - "Specify use of dedicated vertex cache.">; + "Specify use of dedicated vertex cache">; def FeatureCaymanISA : SubtargetFeature<"caymanISA", "CaymanISA", @@ -87,28 +81,40 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; +class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< + "localmemorysize"#Value, + "LocalMemorySize", + !cast<string>(Value), + "The size of local memory in bytes">; + class SubtargetFeatureGeneration <string Value, list<SubtargetFeature> Implies> : SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value, Value#" GPU generation", Implies>; +def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; +def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; +def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; + def FeatureR600 : SubtargetFeatureGeneration<"R600", - [FeatureR600ALUInst, 
FeatureFetchLimit8]>; + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>; def FeatureR700 : SubtargetFeatureGeneration<"R700", - [FeatureFetchLimit16]>; + [FeatureFetchLimit16, FeatureLocalMemorySize0]>; def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", - [FeatureFetchLimit16]>; + [FeatureFetchLimit16, FeatureLocalMemorySize32768]>; def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", - [FeatureFetchLimit16, FeatureWavefrontSize64]>; + [FeatureFetchLimit16, FeatureWavefrontSize64, + FeatureLocalMemorySize32768] +>; def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", - [Feature64BitPtr, FeatureFP64]>; + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768]>; def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", - [Feature64BitPtr, FeatureFP64]>; + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536]>; //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { @@ -120,6 +126,10 @@ def AMDGPU : Target { let InstructionSet = AMDGPUInstrInfo; } +// Dummy Instruction itineraries for pseudo instructions +def ALU_NULL : FuncUnit; +def NullALU : InstrItinClass; + //===----------------------------------------------------------------------===// // Predicate helper class //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index 170f47905744..a6e217b969ad 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -19,6 +19,7 @@ #include "AMDGPUAsmPrinter.h" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" @@ -35,6 +36,24 @@ using namespace llvm; +// TODO: This should get the default rounding mode from the kernel. We just set +// the default here, but this could change if the OpenCL rounding mode pragmas +// are used. +// +// The denormal mode here should match what is reported by the OpenCL runtime +// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but +// can also be overridden to flush with the -cl-denorms-are-zero compiler flag. +// +// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double +// precision, and leaves single precision to flush all and does not report +// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports +// CL_FP_DENORM for both.
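// Illustrative sketch, not part of the patch: getFPMode() below ORs four small
// fields into one image of the hardware MODE register. The FP_ROUND_MODE_* and
// FP_DENORM_MODE_* helpers act as shift-into-position packers; the bit positions
// used here are assumptions for illustration, not the documented register layout.
#include <cstdint>

namespace sketch {
constexpr uint32_t RoundToNearest = 0;  // stand-in for FP_ROUND_ROUND_TO_NEAREST
constexpr uint32_t DenormFlushNone = 3; // stand-in for FP_DENORM_FLUSH_NONE

constexpr uint32_t roundModeSP(uint32_t M) { return M << 0; }  // assumed bits [1:0]
constexpr uint32_t roundModeDP(uint32_t M) { return M << 2; }  // assumed bits [3:2]
constexpr uint32_t denormModeSP(uint32_t M) { return M << 4; } // assumed bits [5:4]
constexpr uint32_t denormModeDP(uint32_t M) { return M << 6; } // assumed bits [7:6]

constexpr uint32_t defaultFPMode() {
  return roundModeSP(RoundToNearest) | roundModeDP(RoundToNearest) |
         denormModeSP(DenormFlushNone) | denormModeDP(DenormFlushNone);
}
static_assert(defaultFPMode() == 0xF0,
              "round-to-nearest everywhere, no denormal flushing");
} // namespace sketch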
+static uint32_t getFPMode(MachineFunction &) { + return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | + FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | + FP_DENORM_MODE_SP(FP_DENORM_FLUSH_NONE) | + FP_DENORM_MODE_DP(FP_DENORM_FLUSH_NONE); +} static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm, MCStreamer &Streamer) { @@ -92,6 +111,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), false); + OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), + false); + OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), + false); } else { R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); OutStreamer.emitRawComment( @@ -279,16 +302,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (VCCUsed) MaxSGPR += 2; - ProgInfo.CodeLen = CodeSize; - ProgInfo.NumSGPR = MaxSGPR; ProgInfo.NumVGPR = MaxVGPR; + ProgInfo.NumSGPR = MaxSGPR; + + // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode + // register. + ProgInfo.FloatMode = getFPMode(MF); + + // XXX: Not quite sure what this does, but sc seems to unset this. + ProgInfo.IEEEMode = 0; + + // Do not clamp NAN to 0. + ProgInfo.DX10Clamp = 0; + + ProgInfo.CodeLen = CodeSize; } void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, const SIProgramInfo &KernelInfo) { const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + unsigned RsrcReg; switch (MFI->ShaderType) { default: // Fall through @@ -298,25 +332,41 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break; } - OutStreamer.EmitIntValue(RsrcReg, 4); - OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) | - S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4); - unsigned LDSAlignShift; if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { - // LDS is allocated in 64 dword blocks + // LDS is allocated in 64 dword blocks. LDSAlignShift = 8; } else { - // LDS is allocated in 128 dword blocks + // LDS is allocated in 128 dword blocks. 
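// Illustrative sketch, not part of the patch: 64 dwords are 256 bytes (shift 8)
// and 128 dwords are 512 bytes (the shift of 9 assigned just below), so the
// LDSBlocks expression that follows is a round-up-then-divide. The same
// arithmetic, standalone:
#include <cstdint>

// Round Value up to the next multiple of Align (Align must be a power of two).
static uint64_t roundUpToAlignment(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

// Number of LDS allocation blocks covering LDSSize bytes.
static unsigned ldsBlocks(uint64_t LDSSize, unsigned LDSAlignShift) {
  uint64_t BlockBytes = uint64_t(1) << LDSAlignShift;
  return unsigned(roundUpToAlignment(LDSSize, BlockBytes) >> LDSAlignShift);
}
// e.g. ldsBlocks(1, 9) == 1 and ldsBlocks(513, 9) == 2.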
LDSAlignShift = 9; } + unsigned LDSBlocks = - RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; if (MFI->ShaderType == ShaderType::COMPUTE) { + OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); + + const uint32_t ComputePGMRSrc1 = + S_00B848_VGPRS(KernelInfo.NumVGPR / 4) | + S_00B848_SGPRS(KernelInfo.NumSGPR / 8) | + S_00B848_PRIORITY(KernelInfo.Priority) | + S_00B848_FLOAT_MODE(KernelInfo.FloatMode) | + S_00B848_PRIV(KernelInfo.Priv) | + S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) | + S_00B848_IEEE_MODE(KernelInfo.DebugMode) | + S_00B848_IEEE_MODE(KernelInfo.IEEEMode); + + OutStreamer.EmitIntValue(ComputePGMRSrc1, 4); + OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4); + } else { + OutStreamer.EmitIntValue(RsrcReg, 4); + OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) | + S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4); } + if (MFI->ShaderType == ShaderType::PIXEL) { OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4); diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h index 71adc9a4d1f6..c1acb6ea0ff2 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -25,13 +25,28 @@ class AMDGPUAsmPrinter : public AsmPrinter { private: struct SIProgramInfo { SIProgramInfo() : - CodeLen(0), + NumVGPR(0), NumSGPR(0), - NumVGPR(0) {} + Priority(0), + FloatMode(0), + Priv(0), + DX10Clamp(0), + DebugMode(0), + IEEEMode(0), + CodeLen(0) {} + // Fields set in PGM_RSRC1 pm4 packet. + uint32_t NumVGPR; + uint32_t NumSGPR; + uint32_t Priority; + uint32_t FloatMode; + uint32_t Priv; + uint32_t DX10Clamp; + uint32_t DebugMode; + uint32_t IEEEMode; + + // Bonus information for debugging. uint64_t CodeLen; - unsigned NumSGPR; - unsigned NumVGPR; }; void getSIProgramInfo(SIProgramInfo &Out, MachineFunction &MF) const; diff --git a/lib/Target/R600/AMDGPUConvertToISA.cpp b/lib/Target/R600/AMDGPUConvertToISA.cpp deleted file mode 100644 index 91aeee2fc57b..000000000000 --- a/lib/Target/R600/AMDGPUConvertToISA.cpp +++ /dev/null @@ -1,62 +0,0 @@ -//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This pass lowers AMDIL machine instructions to the appropriate -/// hardware instructions. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUInstrInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" - -using namespace llvm; - -namespace { - -class AMDGPUConvertToISAPass : public MachineFunctionPass { - -private: - static char ID; - TargetMachine &TM; - -public: - AMDGPUConvertToISAPass(TargetMachine &tm) : - MachineFunctionPass(ID), TM(tm) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override {return "AMDGPU Convert to ISA";} - -}; - -} // End anonymous namespace - -char AMDGPUConvertToISAPass::ID = 0; - -FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) { - return new AMDGPUConvertToISAPass(tm); -} - -bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF) { - const AMDGPUInstrInfo * TII = - static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo()); - - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - TII->convertToISA(MI, MF, MBB.findDebugLoc(I)); - } - } - return false; -} diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp index e7e90d3184de..9e8302ec0a1b 100644 --- a/lib/Target/R600/AMDGPUFrameLowering.cpp +++ b/lib/Target/R600/AMDGPUFrameLowering.cpp @@ -83,7 +83,7 @@ int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i)); OffsetBytes += MFI->getObjectSize(i); - // Each regiter holds 4 bytes, so we must always align the offset to at + // Each register holds 4 bytes, so we must always align the offset to at // least 4 bytes, so that 2 frame objects won't share the same register. OffsetBytes = RoundUpToAlignment(OffsetBytes, 4); } diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index f1f0bfa89ac0..b4d79e5754e8 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -14,6 +14,7 @@ #include "AMDGPUInstrInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" #include "SIISelLowering.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -83,6 +84,11 @@ private: SDValue& Offset); bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); + bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset, + SDValue &ImmOffset) const; + + SDNode *SelectADD_SUB_I64(SDNode *N); + SDNode *SelectDIV_SCALE(SDNode *N); // Include the pieces autogenerated from the target description. #include "AMDGPUGenDAGISel.inc" @@ -211,51 +217,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { // We are selecting i64 ADD here instead of custom lower it during // DAG legalization, so we can fold some i64 ADDs used for address // calculation into the LOAD and STORE instructions. 
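// Illustrative sketch, not part of the patch: the expansion selected in this
// case (factored out into SelectADD_SUB_I64 further down) is the textbook
// split of a 64-bit add into two 32-bit halves chained through a carry; SUB
// has the same shape with a borrow (S_SUB_I32 / S_SUBB_U32). In plain C++:
#include <cstdint>

static uint64_t add64ViaHalves(uint64_t LHS, uint64_t RHS) {
  uint32_t Lo0 = uint32_t(LHS), Hi0 = uint32_t(LHS >> 32); // EXTRACT_SUBREG sub0/sub1
  uint32_t Lo1 = uint32_t(RHS), Hi1 = uint32_t(RHS >> 32);

  uint32_t Lo = Lo0 + Lo1;             // S_ADD_I32: low halves, produces a carry-out
  uint32_t Carry = (Lo < Lo0) ? 1 : 0; // the glued carry value
  uint32_t Hi = Hi0 + Hi1 + Carry;     // S_ADDC_U32: high halves plus carry-in

  return (uint64_t(Hi) << 32) | Lo;    // REG_SEQUENCE: reassemble the 64-bit pair
}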
- case ISD::ADD: { + case ISD::ADD: + case ISD::SUB: { if (N->getValueType(0) != MVT::i64 || ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) break; - SDLoc DL(N); - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32); - - SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, LHS, Sub0); - SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, LHS, Sub1); - - SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, RHS, Sub0); - SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, RHS, Sub1); - - SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); - - SmallVector<SDValue, 8> AddLoArgs; - AddLoArgs.push_back(SDValue(Lo0, 0)); - AddLoArgs.push_back(SDValue(Lo1, 0)); - - SDNode *AddLo = CurDAG->getMachineNode( - isCFDepth0() ? AMDGPU::S_ADD_I32 : AMDGPU::V_ADD_I32_e32, - DL, VTList, AddLoArgs); - SDValue Carry = SDValue(AddLo, 1); - SDNode *AddHi = CurDAG->getMachineNode( - isCFDepth0() ? AMDGPU::S_ADDC_U32 : AMDGPU::V_ADDC_U32_e32, - DL, MVT::i32, SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); - - SDValue Args[5] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), - SDValue(AddLo,0), - Sub0, - SDValue(AddHi,0), - Sub1, - }; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); + return SelectADD_SUB_I64(N); } + case ISD::SCALAR_TO_VECTOR: + case AMDGPUISD::BUILD_VERTICAL_VECTOR: case ISD::BUILD_VECTOR: { unsigned RegClassID; const AMDGPURegisterInfo *TRI = @@ -264,7 +235,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { static_cast<const SIRegisterInfo*>(TM.getRegisterInfo()); EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); - assert(VT.getVectorElementType().bitsEq(MVT::i32)); + EVT EltVT = VT.getVectorElementType(); + assert(EltVT.bitsEq(MVT::i32)); if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { bool UseVReg = true; for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); @@ -305,7 +277,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { // can't be bundled by our scheduler. 
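// Illustrative sketch, not part of the patch: the REG_SEQUENCE operand list
// assembled below is laid out as [regclass, value0, subreg0, value1, subreg1,
// ...], which is why it is sized NumVectorElts * 2 + 1; a SCALAR_TO_VECTOR
// node simply supplies fewer values, and the tail is padded with IMPLICIT_DEF.
// Shape only, with Operand as a stand-in for SDValue and hypothetical names:
#include <cstddef>
#include <vector>

struct Operand { int Kind; int Index; };
enum { RegClassKind, ValueKind, SubRegIdxKind, UndefKind };

static std::vector<Operand>
buildRegSequenceArgs(int RegClassID, const std::vector<Operand> &Vals,
                     std::size_t NumVectorElts) {
  std::vector<Operand> Args(NumVectorElts * 2 + 1);
  Args[0] = {RegClassKind, RegClassID};
  for (std::size_t i = 0; i < Vals.size(); ++i) {
    Args[1 + 2 * i] = Vals[i];                     // the value for this lane
    Args[1 + 2 * i + 1] = {SubRegIdxKind, int(i)}; // sub0, sub1, ...
  }
  for (std::size_t i = Vals.size(); i < NumVectorElts; ++i) {
    Args[1 + 2 * i] = {UndefKind, 0};              // IMPLICIT_DEF padding
    Args[1 + 2 * i + 1] = {SubRegIdxKind, int(i)};
  }
  return Args;
}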
switch(NumVectorElts) { case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; - case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break; + case 4: + if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) + RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; + else + RegClassID = AMDGPU::R600_Reg128RegClassID; + break; default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); } } @@ -313,8 +290,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32); if (NumVectorElts == 1) { - return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, - VT.getVectorElementType(), + return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0), RegClass); } @@ -323,11 +299,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { // 16 = Max Num Vector Elements // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) // 1 = Vector Register Class - SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(N->getNumOperands() * 2 + 1); + SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32); bool IsRegSeq = true; - for (unsigned i = 0; i < N->getNumOperands(); i++) { + unsigned NOps = N->getNumOperands(); + for (unsigned i = 0; i < NOps; i++) { // XXX: Why is this here? if (dyn_cast<RegisterSDNode>(N->getOperand(i))) { IsRegSeq = false; @@ -337,6 +314,20 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32); } + + if (NOps != NumVectorElts) { + // Fill in the missing undef elements if this was a scalar_to_vector. + assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); + + MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + SDLoc(N), EltVT); + for (unsigned i = NOps; i < NumVectorElts; ++i) { + RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32); + } + } + if (!IsRegSeq) break; return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), @@ -466,6 +457,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { PackedOffsetWidth); } + case AMDGPUISD::DIV_SCALE: { + return SelectDIV_SCALE(N); + } } return SelectCode(N); } @@ -659,6 +653,129 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, return true; } +SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { + SDLoc DL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + bool IsAdd = (N->getOpcode() == ISD::ADD); + + SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32); + SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32); + + SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub0); + SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub1); + + SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub0); + SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub1); + + SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); + SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; + + + unsigned Opc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32; + unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + + if (!isCFDepth0()) { + Opc = IsAdd ? AMDGPU::V_ADD_I32_e32 : AMDGPU::V_SUB_I32_e32; + CarryOpc = IsAdd ? 
AMDGPU::V_ADDC_U32_e32 : AMDGPU::V_SUBB_U32_e32; + } + + SDNode *AddLo = CurDAG->getMachineNode(Opc, DL, VTList, AddLoArgs); + SDValue Carry(AddLo, 1); + SDNode *AddHi + = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32, + SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); + + SDValue Args[5] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), + SDValue(AddLo,0), + Sub0, + SDValue(AddHi,0), + Sub1, + }; + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); +} + +SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { + SDLoc SL(N); + EVT VT = N->getValueType(0); + + assert(VT == MVT::f32 || VT == MVT::f64); + + unsigned Opc + = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; + + const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32); + + SDValue Ops[] = { + N->getOperand(0), + N->getOperand(1), + N->getOperand(2), + Zero, + Zero, + Zero, + Zero + }; + + return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); +} + +static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) { + return SDValue(DAG->getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::v4i32, + Ptr), 0); +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, + SDValue &Offset, + SDValue &ImmOffset) const { + SDLoc DL(Addr); + + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + + if (isUInt<12>(C1->getZExtValue())) { + + if (N0.getOpcode() == ISD::ADD) { + // (add (add N2, N3), C1) + SDValue N2 = N0.getOperand(0); + SDValue N3 = N0.getOperand(1); + Ptr = wrapAddr64Rsrc(CurDAG, DL, N2); + Offset = N3; + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + return true; + } + + // (add N0, C1) + Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getTargetConstant(0, MVT::i64)); + Offset = N0; + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + return true; + } + } + if (Addr.getOpcode() == ISD::ADD) { + // (add N0, N1) + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + Ptr = wrapAddr64Rsrc(CurDAG, DL, N0); + Offset = N1; + ImmOffset = CurDAG->getTargetConstant(0, MVT::i16); + return true; + } + + // default case + Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getConstant(0, MVT::i64)); + Offset = Addr; + ImmOffset = CurDAG->getTargetConstant(0, MVT::i16); + return true; +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 6c443ea828b7..0ada7a3ed827 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -16,9 +16,9 @@ #include "AMDGPUISelLowering.h" #include "AMDGPU.h" #include "AMDGPUFrameLowering.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" -#include "AMDILIntrinsicInfo.h" #include "R600MachineFunctionInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -84,13 +84,37 @@ static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, #include "AMDGPUGenCallingConv.inc" +// Find a larger type to do a load / store of a vector with.
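// Illustrative sketch, not part of the patch: the two helpers that follow
// reduce any value type to an i32-based one of the same store size. Modeled
// here on bit counts alone, which is a deliberate simplification of EVT:
#include <cassert>

struct SketchVT { unsigned EltBits; unsigned NumElts; }; // NumElts == 1 means scalar

// Type used for the memory access itself: iN for small values, <N x i32> otherwise.
static SketchVT equivalentMemType(unsigned StoreSizeInBits) {
  if (StoreSizeInBits <= 32)
    return {StoreSizeInBits, 1};
  assert(StoreSizeInBits % 32 == 0 && "store size not a multiple of 32");
  return {32, StoreSizeInBits / 32};
}

// Register type the loaded value lives in: small scalars widen to a full i32.
static SketchVT equivalentLoadRegType(unsigned StoreSizeInBits) {
  if (StoreSizeInBits <= 32)
    return {32, 1};
  return {32, StoreSizeInBits / 32};
}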
+EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { + unsigned StoreSize = VT.getStoreSizeInBits(); + if (StoreSize <= 32) + return EVT::getIntegerVT(Ctx, StoreSize); + + assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); +} + +// Type for a vector that will be loaded to. +EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { + unsigned StoreSize = VT.getStoreSizeInBits(); + if (StoreSize <= 32) + return EVT::getIntegerVT(Ctx, 32); + + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); +} + AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()) { Subtarget = &TM.getSubtarget<AMDGPUSubtarget>(); - // Initialize target lowering borrowed from AMDIL - InitAMDILLowering(); + setOperationAction(ISD::Constant, MVT::i32, Legal); + setOperationAction(ISD::Constant, MVT::i64, Legal); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); // We need to custom lower some of the intrinsics setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -107,9 +131,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FROUND, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - // The hardware supports ROTR, but not ROTL - setOperationAction(ISD::ROTL, MVT::i32, Expand); - // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. setOperationAction(ISD::STORE, MVT::f32, Promote); @@ -118,6 +139,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::v2f32, Promote); AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::i64, Promote); + AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v4f32, Promote); AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); @@ -161,6 +185,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::LOAD, MVT::v2f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); @@ -202,29 +229,63 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::BR_CC, MVT::i1, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::FCEIL, MVT::f64, Custom); + setOperationAction(ISD::FTRUNC, MVT::f64, Custom); + setOperationAction(ISD::FRINT, MVT::f64, Custom); + setOperationAction(ISD::FFLOOR, MVT::f64, Custom); + } - setOperationAction(ISD::FNEG, MVT::v2f32, Expand); - setOperationAction(ISD::FNEG, MVT::v4f32, Expand); + if (!Subtarget->hasBFI()) { + // fcopysign can be done in a single instruction with BFI. 
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + } - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); - setOperationAction(ISD::MUL, MVT::i64, Expand); - setOperationAction(ISD::SUB, MVT::i64, Expand); + // GPU does not have divrem function for signed or unsigned. + setOperationAction(ISD::SDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Custom); + + // GPU does not have [S|U]MUL_LOHI functions as a single instruction. + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + } + + if (!Subtarget->hasBCNT(32)) + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + + if (!Subtarget->hasBCNT(64)) + setOperationAction(ISD::CTPOP, MVT::i64, Expand); + + // The hardware supports 32-bit ROTR, but not ROTL. + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); + setOperationAction(ISD::ROTR, MVT::i64, Expand); + + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand); + setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i64, Expand); + setOperationAction(ISD::MULHS, MVT::i64, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); - setOperationAction(ISD::UDIVREM, MVT::i32, Custom); - setOperationAction(ISD::UDIVREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::VSELECT, MVT::v2f32, Expand); - setOperationAction(ISD::VSELECT, MVT::v4f32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - static const MVT::SimpleValueType IntTypes[] = { + static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v4i32 }; - for (MVT VT : IntTypes) { - //Expand the following operations for the current type by default + for (MVT VT : VectorIntTypes) { + // Expand the following operations for the current type by default. setOperationAction(ISD::ADD, VT, Expand); setOperationAction(ISD::AND, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); @@ -232,40 +293,93 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::MUL, VT, Expand); setOperationAction(ISD::OR, VT, Expand); setOperationAction(ISD::SHL, VT, Expand); - setOperationAction(ISD::SINT_TO_FP, VT, Expand); - setOperationAction(ISD::SRL, VT, Expand); setOperationAction(ISD::SRA, VT, Expand); + setOperationAction(ISD::SRL, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::SUB, VT, Expand); - setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); + // TODO: Implement custom UREM / SREM routines. 
+ setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Custom); + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::SUBE, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::XOR, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); } - static const MVT::SimpleValueType FloatTypes[] = { + static const MVT::SimpleValueType FloatVectorTypes[] = { MVT::v2f32, MVT::v4f32 }; - for (MVT VT : FloatTypes) { + for (MVT VT : FloatVectorTypes) { setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FADD, VT, Expand); + setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FMUL, VT, Expand); + setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSUB, VT, Expand); + setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); } + setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); + setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SELECT_CC); + + setSchedulingPreference(Sched::RegPressure); + setJumpIsExpensive(true); + + setSelectIsExpensive(false); + PredictableSelectIsExpensive = false; + + // There are no integer divide instructions, and these expand to a pretty + // large sequence of instructions. + setIntDivIsCheap(false); + setPow2DivIsCheap(false); + + // TODO: Investigate this when 64-bit divides are implemented. + addBypassSlowDiv(64, 32); + + // FIXME: Need to really handle these. + MaxStoresPerMemcpy = 4096; + MaxStoresPerMemmove = 4096; + MaxStoresPerMemset = 4096; } //===----------------------------------------------------------------------===// @@ -276,6 +390,23 @@ MVT AMDGPUTargetLowering::getVectorIdxTy() const { return MVT::i32; } +bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { + return true; +} + +// The backend supports 32 and 64 bit floating point immediates. +// FIXME: Why are we reporting vectors of FP immediates as legal? 
+bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + EVT ScalarVT = VT.getScalarType(); + return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); +} + +// We don't want to shrink f64 / f32 constants. +bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { + EVT ScalarVT = VT.getScalarType(); + return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); +} + bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy) const { if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) @@ -330,6 +461,10 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { return Src == MVT::i32 && Dest == MVT::i64; } +bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + return isZExtFree(Val.getValueType(), VT2); +} + bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { // There aren't really 64-bit registers, but pairs of 32-bit ones and only a // limited number of native 64-bit operations. Shrinking an operation to fit @@ -383,25 +518,28 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, return SDValue(); } -SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) - const { +SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: Op.getNode()->dump(); llvm_unreachable("Custom lowering code for this" "instruction is not implemented yet!"); break; - // AMDIL DAG lowering - case ISD::SDIV: return LowerSDIV(Op, DAG); - case ISD::SREM: return LowerSREM(Op, DAG); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); - case ISD::BRCOND: return LowerBRCOND(Op, DAG); - // AMDGPU DAG lowering case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::SDIV: return LowerSDIV(Op, DAG); + case ISD::SREM: return LowerSREM(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); + case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); + case ISD::FCEIL: return LowerFCEIL(Op, DAG); + case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); + case ISD::FRINT: return LowerFRINT(Op, DAG); + case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); + case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); } return Op; @@ -419,95 +557,23 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do // nothing here and let the illegal result integer be handled normally. 
return; - case ISD::UDIV: { - SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), - N->getOperand(0), N->getOperand(1)); - Results.push_back(UDIVREM); - break; - } - case ISD::UREM: { - SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), - N->getOperand(0), N->getOperand(1)); - Results.push_back(UDIVREM.getValue(1)); - break; - } - case ISD::UDIVREM: { - SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - - SDValue one = DAG.getConstant(1, HalfVT); - SDValue zero = DAG.getConstant(0, HalfVT); - - //HiLo split - SDValue LHS = N->getOperand(0); - SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); - SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); - - SDValue RHS = N->getOperand(1); - SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); - SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); - - // Get Speculative values - SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); - SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); - - SDValue REM_Hi = zero; - SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); - - SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); - SDValue DIV_Lo = zero; - - const unsigned halfBitWidth = HalfVT.getSizeInBits(); - - for (unsigned i = 0; i < halfBitWidth; ++i) { - SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); - // Get Value of high bit - SDValue HBit; - if (halfBitWidth == 32 && Subtarget->hasBFE()) { - HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); - } else { - HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); - HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); - } - - SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, - DAG.getConstant(halfBitWidth - 1, HalfVT)); - REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); - REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); - - REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); - REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); - - - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); - - SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); - SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE); - - DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); - - // Update REM - - SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); - - REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE); - REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); - REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); - } + case ISD::LOAD: { + SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); + if (!Node) + return; - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); - SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); - Results.push_back(DIV); - Results.push_back(REM); - break; + Results.push_back(SDValue(Node, 0)); + Results.push_back(SDValue(Node, 1)); + // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode + // function + DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); + return; + } + case ISD::STORE: { + SDValue Lowered = 
LowerSTORE(SDValue(N, 0), DAG); + if (Lowered.getNode()) + Results.push_back(Lowered); + return; } default: return; @@ -531,12 +597,14 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, SelectionDAG &DAG) const { const DataLayout *TD = getTargetMachine().getDataLayout(); SDLoc DL(InitPtr); + Type *InitTy = Init->getType(); + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) { - EVT VT = EVT::getEVT(CI->getType()); - PointerType *PtrTy = PointerType::get(CI->getType(), 0); - return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, false, - TD->getPrefTypeAlignment(CI->getType())); + EVT VT = EVT::getEVT(InitTy); + PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); + return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(InitTy)); } if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { @@ -547,7 +615,6 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, TD->getPrefTypeAlignment(CFP->getType())); } - Type *InitTy = Init->getType(); if (StructType *ST = dyn_cast<StructType>(InitTy)) { const StructLayout *SL = TD->getStructLayout(ST); @@ -589,6 +656,14 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } + if (isa<UndefValue>(Init)) { + EVT VT = EVT::getEVT(InitTy); + PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); + return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(InitTy)); + } + Init->dump(); llvm_unreachable("Unhandled constant initializer"); } @@ -628,11 +703,19 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, unsigned Size = TD->getTypeAllocSize(EltType); unsigned Alignment = TD->getPrefTypeAlignment(EltType); + MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS); + MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); + + int FI = FrameInfo->CreateStackObject(Size, Alignment, false); + SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); + const GlobalVariable *Var = cast<GlobalVariable>(GV); + if (!Var->hasInitializer()) { + // This has no use, but bugpoint will hit it. 
+ return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); + } + const Constant *Init = Var->getInitializer(); - int FI = FrameInfo->CreateStackObject(Size, Alignment, false); - SDValue InitPtr = DAG.getFrameIndex(FI, - getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); SmallVector<SDNode*, 8> WorkList; for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), @@ -651,8 +734,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, } DAG.UpdateNodeOperands(*I, Ops); } - return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), - getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); + return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); } } } @@ -688,8 +770,7 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering()); - FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op); - assert(FIN); + FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FIN->getIndex(); unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); @@ -705,26 +786,66 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { default: return Op; - case AMDGPUIntrinsic::AMDIL_abs: + case AMDGPUIntrinsic::AMDGPU_abs: + case AMDGPUIntrinsic::AMDIL_abs: // Legacy name. return LowerIntrinsicIABS(Op, DAG); - case AMDGPUIntrinsic::AMDIL_exp: - return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); case AMDGPUIntrinsic::AMDGPU_lrp: return LowerIntrinsicLRP(Op, DAG); - case AMDGPUIntrinsic::AMDIL_fraction: + case AMDGPUIntrinsic::AMDGPU_fract: + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDIL_max: - return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_clamp: + case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. + return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_div_scale: { + // 3rd parameter required to be a constant. + const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); + if (!Param) + return DAG.getUNDEF(VT); + + // Translate to the operands expected by the machine instruction. The + // first parameter must be the same as the first instruction. + SDValue Numerator = Op.getOperand(1); + SDValue Denominator = Op.getOperand(2); + SDValue Src0 = Param->isAllOnesValue() ? 
Numerator : Denominator; + + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT, + Src0, Denominator, Numerator); + } + + case Intrinsic::AMDGPU_div_fmas: + return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_div_fixup: + return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_trig_preop: + return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::AMDGPU_rcp: + return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq: + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_legacy_rsq: + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq_clamped: + return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_imax: return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), Op.getOperand(2)); case AMDGPUIntrinsic::AMDGPU_umax: return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1), Op.getOperand(2)); - case AMDGPUIntrinsic::AMDIL_min: - return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); case AMDGPUIntrinsic::AMDGPU_imin: return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -748,6 +869,18 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_bfe_i32: return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1), @@ -771,8 +904,16 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2)); - case AMDGPUIntrinsic::AMDIL_round_nearest: + case AMDGPUIntrinsic::AMDGPU_brev: + return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. + return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. 
+ return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); } } @@ -863,27 +1004,41 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const { LoadSDNode *Load = dyn_cast<LoadSDNode>(Op); EVT MemEltVT = Load->getMemoryVT().getVectorElementType(); + EVT LoadVT = Op.getValueType(); EVT EltVT = Op.getValueType().getVectorElementType(); EVT PtrVT = Load->getBasePtr().getValueType(); + unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); SmallVector<SDValue, 8> Loads; + SmallVector<SDValue, 8> Chains; + SDLoc SL(Op); for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT)); - Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, - Load->getChain(), Ptr, - MachinePointerInfo(Load->getMemOperand()->getValue()), - MemEltVT, Load->isVolatile(), Load->isNonTemporal(), - Load->getAlignment())); + + SDValue NewLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, + Load->getChain(), Ptr, + MachinePointerInfo(Load->getMemOperand()->getValue()), + MemEltVT, Load->isVolatile(), Load->isNonTemporal(), + Load->getAlignment()); + Loads.push_back(NewLoad.getValue(0)); + Chains.push_back(NewLoad.getValue(1)); } - return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads); + + SDValue Ops[] = { + DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads), + DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains) + }; + + return DAG.getMergeValues(Ops, SL); } SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const { - StoreSDNode *Store = dyn_cast<StoreSDNode>(Op); + StoreSDNode *Store = cast<StoreSDNode>(Op); EVT MemVT = Store->getMemoryVT(); unsigned MemBits = MemVT.getSizeInBits(); @@ -981,7 +1136,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { Load->getBasePtr(), MemVT, Load->getMemOperand()); - return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32); + + SDValue Ops[] = { + DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32), + ExtLoad32.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); } if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { @@ -995,7 +1156,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr, MVT::i8, MMO); - return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD); + + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD), + NewLD.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); } // Lower loads constant address space global variable loads @@ -1003,11 +1170,12 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { isa<GlobalVariable>( GetUnderlyingObject(Load->getMemOperand()->getValue()))) { + SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL, getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, MVT::i32)); - return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), + return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), Load->getChain(), Ptr, DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2)); } @@ -1034,10 +1202,21 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { EVT MemEltVT = MemVT.getScalarType(); if (ExtType == ISD::SEXTLOAD) { SDValue MemEltVTNode = DAG.getValueType(MemEltVT); - return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, 
MVT::i32, Ret, MemEltVTNode); + + SDValue Ops[] = { + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), + Load->getChain() + }; + + return DAG.getMergeValues(Ops, DL); } - return DAG.getZeroExtendInReg(Ret, DL, MemEltVT); + SDValue Ops[] = { + DAG.getZeroExtendInReg(Ret, DL, MemEltVT), + Load->getChain() + }; + + return DAG.getMergeValues(Ops, DL); } SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { @@ -1097,6 +1276,251 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + MVT INTTY; + MVT FLTTY; + if (!OVT.isVector()) { + INTTY = MVT::i32; + FLTTY = MVT::f32; + } else if (OVT.getVectorNumElements() == 2) { + INTTY = MVT::v2i32; + FLTTY = MVT::v2f32; + } else if (OVT.getVectorNumElements() == 4) { + INTTY = MVT::v4i32; + FLTTY = MVT::v4f32; + } + unsigned bitsize = OVT.getScalarType().getSizeInBits(); + // char|short jq = ia ^ ib; + SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS); + + // jq = jq >> (bitsize - 2) + jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); + + // jq = jq | 0x1 + jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT)); + + // jq = (int)jq + jq = DAG.getSExtOrTrunc(jq, DL, INTTY); + + // int ia = (int)LHS; + SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY); + + // int ib = (int)RHS; + SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY); + + // float fa = (float)ia; + SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia); + + // float fb = (float)ib; + SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); + + // float fq = native_divide(fa, fb); + SDValue fq = DAG.getNode(ISD::FMUL, DL, FLTTY, + fa, DAG.getNode(AMDGPUISD::RCP, DL, FLTTY, fb)); + + // fq = trunc(fq); + fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); + + // float fqneg = -fq; + SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); + + // float fr = mad(fqneg, fb, fa); + SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY, + DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa); + + // int iq = (int)fq; + SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); + + // fr = fabs(fr); + fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr); + + // fb = fabs(fb); + fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb); + + // int cv = fr >= fb; + SDValue cv; + if (INTTY == MVT::i32) { + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); + } else { + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); + } + // jq = (cv ? jq : 0); + jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq, + DAG.getConstant(0, OVT)); + // dst = iq + jq; + iq = DAG.getSExtOrTrunc(iq, DL, OVT); + iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq); + return iq; +} + +SDValue AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerSDIV32 function generates equivalent to the following IL.
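// Illustrative sketch, not part of the patch: LowerSDIV24 above is the classic
// "divide via float reciprocal, then correct the quotient by one" trick, exact
// when both operands fit in 24 bits and therefore in the f32 mantissa. The
// commented steps, rendered as plain C++ (exact division stands in for the
// hardware's approximate RCP):
#include <cmath>
#include <cstdint>

static int32_t sdiv24(int32_t ia, int32_t ib) { // requires |ia|, |ib| < 2^23, ib != 0
  int32_t jq = ((ia ^ ib) >> 30) | 1;      // +1 or -1, the sign the quotient should have
  float fa = float(ia);                    // exact: the value fits in the mantissa
  float fb = float(ib);
  float fq = std::trunc(fa * (1.0f / fb)); // native_divide + trunc
  float fr = std::fma(-fq, fb, fa);        // remainder estimate: fa - fq * fb
  int32_t iq = int32_t(fq);
  // If a whole extra divisor still fits in the remainder, nudge the quotient by one.
  return iq + ((std::fabs(fr) >= std::fabs(fb)) ? jq : 0);
}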
+ // mov r0, LHS + // mov r1, RHS + // ilt r10, r0, 0 + // ilt r11, r1, 0 + // iadd r0, r0, r10 + // iadd r1, r1, r11 + // ixor r0, r0, r10 + // ixor r1, r1, r11 + // udiv r0, r0, r1 + // ixor r10, r10, r11 + // iadd r0, r0, r10 + // ixor DST, r0, r10 + + // mov r0, LHS + SDValue r0 = LHS; + + // mov r1, RHS + SDValue r1 = RHS; + + // ilt r10, r0, 0 + SDValue r10 = DAG.getSelectCC(DL, + r0, DAG.getConstant(0, OVT), + DAG.getConstant(-1, OVT), + DAG.getConstant(0, OVT), + ISD::SETLT); + + // ilt r11, r1, 0 + SDValue r11 = DAG.getSelectCC(DL, + r1, DAG.getConstant(0, OVT), + DAG.getConstant(-1, OVT), + DAG.getConstant(0, OVT), + ISD::SETLT); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // iadd r1, r1, r11 + r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); + + // ixor r0, r0, r10 + r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + + // ixor r1, r1, r11 + r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + + // udiv r0, r0, r1 + r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); + + // ixor r10, r10, r11 + r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // ixor DST, r0, r10 + SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + return DST; +} + +SDValue AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const { + return SDValue(Op.getNode(), 0); +} + +SDValue AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { + EVT OVT = Op.getValueType().getScalarType(); + + if (OVT == MVT::i64) + return LowerSDIV64(Op, DAG); + + if (OVT.getScalarType() == MVT::i32) + return LowerSDIV32(Op, DAG); + + if (OVT == MVT::i16 || OVT == MVT::i8) { + // FIXME: We should be checking for the masked bits. This isn't reached + // because i8 and i16 are not legal types. + return LowerSDIV24(Op, DAG); + } + + return SDValue(Op.getNode(), 0); +} + +SDValue AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerSREM32 function generates equivalent to the following IL. 
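// Illustrative sketch, not part of the patch: the LowerSDIV32 sequence above
// (and the LowerSREM32 sequence that follows) reduces signed division to
// unsigned division with branchless sign fixups; each iadd/ixor pair is a
// two's-complement conditional negation. As ordinary C++, assuming arithmetic
// right shift on signed values:
#include <cstdint>

static int32_t sdivViaUdiv(int32_t LHS, int32_t RHS) {
  uint32_t m0 = uint32_t(LHS >> 31);      // ilt r10: all-ones if LHS < 0, else 0
  uint32_t m1 = uint32_t(RHS >> 31);      // ilt r11
  uint32_t a = (uint32_t(LHS) + m0) ^ m0; // iadd + ixor: (x + m) ^ m == |x| when m is all-ones
  uint32_t b = (uint32_t(RHS) + m1) ^ m1;
  uint32_t q = a / b;                     // udiv on the magnitudes
  uint32_t s = m0 ^ m1;                   // the quotient is negative iff the signs differ
  return int32_t((q + s) ^ s);            // conditionally negate the result
}
// The remainder variant is identical except that it computes a % b and uses m0
// (the dividend's sign) alone for the final fixup.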
+ // mov r0, LHS + // mov r1, RHS + // ilt r10, r0, 0 + // ilt r11, r1, 0 + // iadd r0, r0, r10 + // iadd r1, r1, r11 + // ixor r0, r0, r10 + // ixor r1, r1, r11 + // udiv r20, r0, r1 + // umul r20, r20, r1 + // sub r0, r0, r20 + // iadd r0, r0, r10 + // ixor DST, r0, r10 + + // mov r0, LHS + SDValue r0 = LHS; + + // mov r1, RHS + SDValue r1 = RHS; + + // ilt r10, r0, 0 + SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT); + + // ilt r11, r1, 0 + SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // iadd r1, r1, r11 + r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); + + // ixor r0, r0, r10 + r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + + // ixor r1, r1, r11 + r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + + // udiv r20, r0, r1 + SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1); + + // umul r20, r20, r1 + r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1); + + // sub r0, r0, r20 + r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // ixor DST, r0, r10 + SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + return DST; +} + +SDValue AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const { + return SDValue(Op.getNode(), 0); +} + +SDValue AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const { + EVT OVT = Op.getValueType(); + + if (OVT.getScalarType() == MVT::i64) + return LowerSREM64(Op, DAG); + + if (OVT.getScalarType() == MVT::i32) + return LowerSREM32(Op, DAG); + + return SDValue(Op.getNode(), 0); +} + SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); @@ -1201,6 +1625,177 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, return DAG.getMergeValues(Ops, DL); } +SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Zero = DAG.getConstant(0, VT); + SDValue NegOne = DAG.getConstant(-1, VT); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); + SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); + SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); + SDValue RSign = LHSign; // Remainder sign is the same as LHS + + LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); + RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); + + LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); + RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); + + SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); + SDValue Rem = Div.getValue(1); + + Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); + Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); + + Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); + Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); + + SDValue Res[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Res, DL); +} + +SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + // result = trunc(src) + // if (src > 0.0 && src != result) + // result += 1.0 + + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64); + const SDValue One = DAG.getConstantFP(1.0, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + + SDValue Lt0 = 
+SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  // result = trunc(src)
+  // if (src > 0.0 && src != result)
+  //   result += 1.0
+
+  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+
+  const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
+  const SDValue One = DAG.getConstantFP(1.0, MVT::f64);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+
+  SDValue Gt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
+  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
+  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Gt0, NeTrunc);
+
+  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
+  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
+}
+
+SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  assert(Op.getValueType() == MVT::f64);
+
+  const SDValue Zero = DAG.getConstant(0, MVT::i32);
+  const SDValue One = DAG.getConstant(1, MVT::i32);
+
+  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+  // Extract the upper half, since this is where we will find the sign and
+  // exponent.
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
+
+  const unsigned FractBits = 52;
+  const unsigned ExpBits = 11;
+
+  // Extract the exponent.
+  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_I32, SL, MVT::i32,
+                                Hi,
+                                DAG.getConstant(FractBits - 32, MVT::i32),
+                                DAG.getConstant(ExpBits, MVT::i32));
+  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
+                            DAG.getConstant(1023, MVT::i32));
+
+  // Extract the sign bit.
+  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
+  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
+
+  // Extend back to 64 bits.
+  SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                                  Zero, SignBit);
+  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
+
+  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
+  const SDValue FractMask
+    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, MVT::i64);
+
+  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
+  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
+  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+
+  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, MVT::i32);
+
+  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
+  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
+
+  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
+  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
+
+  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
+}
+
+SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  assert(Op.getValueType() == MVT::f64);
+
+  APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
+  SDValue C1 = DAG.getConstantFP(C1Val, MVT::f64);
+  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
+
+  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
+  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
+
+  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
+
+  APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
+  SDValue C2 = DAG.getConstantFP(C2Val, MVT::f64);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
+
+  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
+}
+
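LowerFRINT is the classic magic-number rounding trick: adding copysign(2^52, x) forces the hardware rounding step to discard the fraction, and anything at least 2^52 in magnitude is already integral, which is what the 0x1.fffffffffffffp+51 compare detects. A small C++ check of that reasoning, assuming round-to-nearest-even mode and no fast-math reassociation of (x + c) - c:

#include <cassert>
#include <cmath>

// rint(x) for f64 via the 2^52 trick used in LowerFRINT. Once
// |x| >= 2^52 a double has no fractional bits, so (x + c) - c with
// c = copysign(2^52, x) snaps x to the nearest integer (ties to even).
// Inputs already >= 2^52 in magnitude are returned unchanged; this
// matches the SETOGT compare against 0x1.fffffffffffffp+51.
double rint_via_magic(double x) {
  const double big = std::ldexp(1.0, 52);  // 0x1.0p+52
  const double c = std::copysign(big, x);
  double r = (x + c) - c;
  return std::fabs(x) >= big ? x : r;
}

int main() {
  assert(rint_via_magic(2.5) == 2.0);    // ties round to even
  assert(rint_via_magic(3.5) == 4.0);
  assert(rint_via_magic(-1.25) == -1.0);
  assert(rint_via_magic(std::ldexp(1.0, 53)) == std::ldexp(1.0, 53));
}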
+SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
+  // FNEARBYINT and FRINT are the same, except in their handling of FP
+  // exceptions. Those aren't really meaningful for us, and OpenCL only has
+  // rint, so just treat them as equivalent.
+  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
+}
+
+SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  // result = trunc(src)
+  // if (src < 0.0 && src != result)
+  //   result += -1.0
+
+  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+
+  const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
+  const SDValue NegOne = DAG.getConstantFP(-1.0, MVT::f64);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+
+  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
+  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
+  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
+
+  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
+  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
+}
+
 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
                                               SelectionDAG &DAG) const {
   SDValue S0 = Op.getOperand(0);
@@ -1218,7 +1813,6 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
   FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
                         DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
   return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
-
 }
 
 SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
@@ -1303,6 +1897,37 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
   return DAG.getConstant(Src0 >> Offset, MVT::i32);
 }
 
+SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+
+  if (VT.isVector() || VT.getSizeInBits() > 32)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue Mul;
+
+  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
+    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
+    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
+    Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
+  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
+    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
+    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
+    Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
+  } else {
+    return SDValue();
+  }
+
+  // We need to use sext even for MUL_U24, because MUL_U24 is used
+  // for signed multiply of 8 and 16-bit types.
+  return DAG.getSExtOrTrunc(Mul, DL, VT);
+}
+
 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -1310,34 +1935,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
   switch(N->getOpcode()) {
     default: break;
-    case ISD::MUL: {
-      EVT VT = N->getValueType(0);
-      SDValue N0 = N->getOperand(0);
-      SDValue N1 = N->getOperand(1);
-      SDValue Mul;
-
-      // FIXME: Add support for 24-bit multiply with 64-bit output on SI.
- if (VT.isVector() || VT.getSizeInBits() > 32) - break; - - if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { - N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); - N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); - } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { - N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); - N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); - } else { - break; - } - - // We need to use sext even for MUL_U24, because MUL_U24 is used - // for signed multiply of 8 and 16-bit types. - SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT); - - return Reg; - } + case ISD::MUL: + return performMulCombine(N, DCI); case AMDGPUISD::MUL_I24: case AMDGPUISD::MUL_U24: { SDValue N0 = N->getOperand(0); @@ -1511,29 +2110,38 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { // AMDIL DAG nodes NODE_NAME_CASE(CALL); NODE_NAME_CASE(UMUL); - NODE_NAME_CASE(DIV_INF); NODE_NAME_CASE(RET_FLAG); NODE_NAME_CASE(BRANCH_COND); // AMDGPU DAG nodes NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) + NODE_NAME_CASE(CLAMP) NODE_NAME_CASE(FMAX) NODE_NAME_CASE(SMAX) NODE_NAME_CASE(UMAX) NODE_NAME_CASE(FMIN) NODE_NAME_CASE(SMIN) NODE_NAME_CASE(UMIN) + NODE_NAME_CASE(URECIP) + NODE_NAME_CASE(DIV_SCALE) + NODE_NAME_CASE(DIV_FMAS) + NODE_NAME_CASE(DIV_FIXUP) + NODE_NAME_CASE(TRIG_PREOP) + NODE_NAME_CASE(RCP) + NODE_NAME_CASE(RSQ) + NODE_NAME_CASE(RSQ_LEGACY) + NODE_NAME_CASE(RSQ_CLAMPED) + NODE_NAME_CASE(DOT4) NODE_NAME_CASE(BFE_U32) NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) + NODE_NAME_CASE(BREV) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) NODE_NAME_CASE(MAD_I24) - NODE_NAME_CASE(URECIP) - NODE_NAME_CASE(DOT4) NODE_NAME_CASE(EXPORT) NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) @@ -1544,6 +2152,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(SAMPLEB) NODE_NAME_CASE(SAMPLED) NODE_NAME_CASE(SAMPLEL) + NODE_NAME_CASE(CVT_F32_UBYTE0) + NODE_NAME_CASE(CVT_F32_UBYTE1) + NODE_NAME_CASE(CVT_F32_UBYTE2) + NODE_NAME_CASE(CVT_F32_UBYTE3) + NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) } diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index d5d821d9364b..98a92ade115c 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -42,10 +42,33 @@ private: SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const; /// \brief Split a vector store into multiple scalar stores. /// \returns The resulting chain. 
+ + SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue ExpandSIGN_EXTEND_INREG(SDValue Op, + unsigned BitsDiff, + SelectionDAG &DAG) const; + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + + SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + protected: + static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); + static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT); /// \brief Helper function that adds Reg to the LiveIn list of the DAG's /// MachineFunction. @@ -61,6 +84,7 @@ protected: SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; bool isHWTrueValue(SDValue Op) const; bool isHWFalseValue(SDValue Op) const; @@ -87,10 +111,16 @@ public: bool isZExtFree(Type *Src, Type *Dest) const override; bool isZExtFree(EVT Src, EVT Dest) const override; + bool isZExtFree(SDValue Val, EVT VT2) const override; bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; MVT getVectorIdxTy() const override; + bool isSelectSupported(SelectSupportKind) const override; + + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool ShouldShrinkFPConstant(EVT VT) const override; + bool isLoadBitCastBeneficial(EVT, EVT) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -101,6 +131,7 @@ public: SmallVectorImpl<SDValue> &InVals) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; void ReplaceNodeResults(SDNode * N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; @@ -128,38 +159,6 @@ public: SDValue Op, const SelectionDAG &DAG, unsigned Depth = 0) const override; - -// Functions defined in AMDILISelLowering.cpp -public: - bool getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, unsigned Intrinsic) const override; - - /// We want to mark f32/f64 floating point values as legal. - bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - - /// We don't want to shrink f64/f32 constants. 
- bool ShouldShrinkFPConstant(EVT VT) const override; - - SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - -private: - void InitAMDILLowering(); - SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM8(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM16(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const; - - SDValue ExpandSIGN_EXTEND_INREG(SDValue Op, - unsigned BitsDiff, - SelectionDAG &DAG) const; - SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; - EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const; - SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; }; namespace AMDGPUISD { @@ -169,12 +168,15 @@ enum { FIRST_NUMBER = ISD::BUILTIN_OP_END, CALL, // Function call based on a single integer UMUL, // 32bit unsigned multiplication - DIV_INF, // Divide with infinity returned on zero divisor RET_FLAG, BRANCH_COND, // End AMDIL ISD Opcodes DWORDADDR, FRACT, + CLAMP, + + // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. + // Denormals handled on some parts. COS_HW, SIN_HW, FMAX, @@ -184,11 +186,23 @@ enum { SMIN, UMIN, URECIP, + DIV_SCALE, + DIV_FMAS, + DIV_FIXUP, + TRIG_PREOP, // 1 ULP max error for f64 + + // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. + // For f64, max error 2^29 ULP, handles denormals. + RCP, + RSQ, + RSQ_LEGACY, + RSQ_CLAMPED, DOT4, BFE_U32, // Extract range of bits with zero extension to 32-bits. BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. + BREV, // Reverse bits. MUL_U24, MUL_I24, MAD_U24, @@ -203,6 +217,21 @@ enum { SAMPLEB, SAMPLED, SAMPLEL, + + // These cvt_f32_ubyte* nodes need to remain consecutive and in order. + CVT_F32_UBYTE0, + CVT_F32_UBYTE1, + CVT_F32_UBYTE2, + CVT_F32_UBYTE3, + /// This node is for VLIW targets and it is used to represent a vector + /// that is stored in consecutive registers with the same channel. + /// For example: + /// |X |Y|Z|W| + /// T0|v.x| | | | + /// T1|v.y| | | | + /// T2|v.z| | | | + /// T3|v.w| | | | + BUILD_VERTICAL_VECTOR, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, STORE_MSKOR, LOAD_CONSTANT, diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp index 1c3361a26581..fef5b8cac5ba 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.cpp +++ b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -30,8 +30,8 @@ using namespace llvm; // Pin the vtable to this file. 
void AMDGPUInstrInfo::anchor() {} -AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm) - : AMDGPUGenInstrInfo(-1,-1), RI(tm), TM(tm) { } +AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st) + : AMDGPUGenInstrInfo(-1,-1), RI(st), ST(st) { } const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { return RI; @@ -320,33 +320,11 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { return -1; } - Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1); + Offset = MF.getTarget().getFrameLowering()->getFrameIndexOffset(MF, -1); return getIndirectIndexBegin(MF) + Offset; } - -void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF, - DebugLoc DL) const { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const AMDGPURegisterInfo & RI = getRegisterInfo(); - - for (unsigned i = 0; i < MI.getNumOperands(); i++) { - MachineOperand &MO = MI.getOperand(i); - // Convert dst regclass to one that is supported by the ISA - if (MO.isReg() && MO.isDef()) { - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { - const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg()); - const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass); - - assert(newRegClass); - - MRI.setRegClass(MO.getReg(), newRegClass); - } - } - } -} - int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { switch (Channels) { default: return Opcode; diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h index 74baf6b2a6f7..95dc8c14c291 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.h +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -33,7 +33,7 @@ namespace llvm { -class AMDGPUTargetMachine; +class AMDGPUSubtarget; class MachineFunction; class MachineInstr; class MachineInstrBuilder; @@ -45,9 +45,9 @@ private: MachineBasicBlock &MBB) const; virtual void anchor(); protected: - TargetMachine &TM; + const AMDGPUSubtarget &ST; public: - explicit AMDGPUInstrInfo(TargetMachine &tm); + explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; @@ -137,14 +137,6 @@ public: bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; // Helper functions that check the opcode for status information - bool isLoadInst(llvm::MachineInstr *MI) const; - bool isExtLoadInst(llvm::MachineInstr *MI) const; - bool isSWSExtLoadInst(llvm::MachineInstr *MI) const; - bool isSExtLoadInst(llvm::MachineInstr *MI) const; - bool isZExtLoadInst(llvm::MachineInstr *MI) const; - bool isAExtLoadInst(llvm::MachineInstr *MI) const; - bool isStoreInst(llvm::MachineInstr *MI) const; - bool isTruncStoreInst(llvm::MachineInstr *MI) const; bool isRegisterStore(const MachineInstr &MI) const; bool isRegisterLoad(const MachineInstr &MI) const; @@ -185,11 +177,6 @@ public: unsigned ValueReg, unsigned Address, unsigned OffsetReg) const = 0; - - /// \brief Convert the AMDIL MachineInstr to a supported ISA - /// MachineInstr - void convertToISA(MachineInstr & MI, MachineFunction &MF, DebugLoc DL) const; - /// \brief Build a MOV instruction. 
 virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
                                     MachineBasicBlock::iterator I,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index f96dbb4d8a1b..934d59d12699 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -19,6 +19,14 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
   SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
 ]>;
 
+def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
+  [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
+>;
+
+def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
+  [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
+>;
+
 //===----------------------------------------------------------------------===//
 // AMDGPU DAG Nodes
 //
@@ -29,11 +37,25 @@ def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
 // out = a - floor(a)
 def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
 
+// out = 1.0 / a
+def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a)
+def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a)
+def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a), result clamped to +/- max_float.
+def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>;
+
 // out = max(a, b), where a and b are floats
 def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
   [SDNPCommutative, SDNPAssociative]
 >;
 
+def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
+
 // out = max(a, b), where a and b are signed ints
 def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
   [SDNPCommutative, SDNPAssociative]
@@ -59,12 +81,38 @@ def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
   [SDNPCommutative, SDNPAssociative]
 >;
 
+
+def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
+  SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
+  SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2",
+  SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
+  SDTIntToFPOp, []>;
+
+
 // urecip - This operation is a helper for integer division; it returns the
 // result of 1 / a as a fractional unsigned integer.
 // out = (2^32 / a) + e
 // e is the rounding error
 def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
 
+// Special case divide preop and flags.
+def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
+
+// Special case divide FMA with scale and flags (src0 = Quotient,
+// src1 = Denominator, src2 = Numerator).
+def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>;
+
+// Single or double precision division fixup.
+// Special case divide fixup and flags (src0 = Quotient, src1 =
+// Denominator, src2 = Numerator).
+def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
+
+// Look up 2.0 / pi src0 with segment select src1[4:0].
+def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
+
 def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
                                  SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
                                  [SDNPHasChain, SDNPMayLoad]>;
@@ -92,6 +140,8 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
 def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
 def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
 
+def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>;
+
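AMDGPUbrev above maps to a single hardware bit-reverse instruction. For reference, a portable C++ equivalent of what the node computes, using the usual O(log n) swap network (purely illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

// Reverse the bits of a 32-bit word by swapping progressively larger
// fields: single bits, pairs, nibbles, bytes, then half-words.
uint32_t brev32(uint32_t v) {
  v = ((v >> 1) & 0x55555555u) | ((v & 0x55555555u) << 1);
  v = ((v >> 2) & 0x33333333u) | ((v & 0x33333333u) << 2);
  v = ((v >> 4) & 0x0F0F0F0Fu) | ((v & 0x0F0F0F0Fu) << 4);
  v = ((v >> 8) & 0x00FF00FFu) | ((v & 0x00FF00FFu) << 8);
  return (v >> 16) | (v << 16);
}

int main() {
  assert(brev32(0x00000001u) == 0x80000000u);
  assert(brev32(0x12345678u) == 0x1E6A2C48u);
}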
 // Signed and unsigned 24-bit multiply. The highest 8 bits are ignored when
 // performing the multiply. The result is a 32-bit value.
 def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
@@ -107,3 +157,22 @@ def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp,
 def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp,
   []
 >;
+
+//===----------------------------------------------------------------------===//
+// Flow Control Profile Types
+//===----------------------------------------------------------------------===//
+// Conditional branch; operand 0 is the target basic block.
+def SDTIL_BRCond : SDTypeProfile<0, 2, [
+    SDTCisVT<0, OtherVT>
+    ]>;
+
+//===----------------------------------------------------------------------===//
+// Flow Control DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// Call/Return DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+    [SDNPHasChain, SDNPOptInGlue]>;
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index 80bdf5b86c48..b86b7818fc1a 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -49,6 +49,11 @@ def u8imm : Operand<i8> {
   let PrintMethod = "printU8ImmOperand";
 }
 
+//===--------------------------------------------------------------------===//
+// Custom Operands
+//===--------------------------------------------------------------------===//
+def brtarget : Operand<OtherVT>;
+
 //===----------------------------------------------------------------------===//
 // PatLeafs for floating-point comparisons
 //===----------------------------------------------------------------------===//
@@ -127,6 +132,21 @@ def COND_NULL : PatLeaf <
 // Load/Store Pattern Fragments
 //===----------------------------------------------------------------------===//
 
+def global_store : PatFrag<(ops node:$val, node:$ptr),
+    (store node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+// Global address space loads
+def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+// Constant address space loads
+def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
 def az_extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
   LoadSDNode *L = cast<LoadSDNode>(N);
   return L->getExtensionType() == ISD::ZEXTLOAD ||
@@ -232,26 +252,55 @@ def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   return isLocalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
-def atomic_load_add_local : PatFrag<(ops node:$ptr, node:$value),
-                                    (atomic_load_add node:$ptr, node:$value), [{
-  return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
-}]>;
 
-def atomic_load_sub_local : PatFrag<(ops node:$ptr, node:$value),
-                                    (atomic_load_sub node:$ptr, node:$value), [{
-  return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+class local_binary_atomic_op<SDNode atomic_op> :
+  PatFrag<(ops node:$ptr, node:$value),
+    (atomic_op node:$ptr, node:$value), [{
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
 }]>;
+
+def atomic_swap_local : local_binary_atomic_op<atomic_swap>;
+def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>;
+def atomic_load_sub_local
: local_binary_atomic_op<atomic_load_sub>; +def atomic_load_and_local : local_binary_atomic_op<atomic_load_and>; +def atomic_load_or_local : local_binary_atomic_op<atomic_load_or>; +def atomic_load_xor_local : local_binary_atomic_op<atomic_load_xor>; +def atomic_load_nand_local : local_binary_atomic_op<atomic_load_nand>; +def atomic_load_min_local : local_binary_atomic_op<atomic_load_min>; +def atomic_load_max_local : local_binary_atomic_op<atomic_load_max>; +def atomic_load_umin_local : local_binary_atomic_op<atomic_load_umin>; +def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>; + def mskor_global : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; +def atomic_cmp_swap_32_local : + PatFrag<(ops node:$ptr, node:$cmp, node:$swap), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast<AtomicSDNode>(N); + return AN->getMemoryVT() == MVT::i32 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +}]>; + +def atomic_cmp_swap_64_local : + PatFrag<(ops node:$ptr, node:$cmp, node:$swap), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast<AtomicSDNode>(N); + return AN->getMemoryVT() == MVT::i64 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +}]>; + + class Constants { int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding +int FP32_NEG_ONE = 0xbf800000; +int FP32_ONE = 0x3f800000; } def CONST : Constants; @@ -273,7 +322,7 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst < (outs rc:$dst), (ins rc:$src0), "CLAMP $dst, $src0", - [(set f32:$dst, (int_AMDIL_clamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] + [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] >; class FABS <RegisterClass rc> : AMDGPUShaderInst < @@ -363,7 +412,7 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat < // BFI_INT patterns -multiclass BFIPatterns <Instruction BFI_INT> { +multiclass BFIPatterns <Instruction BFI_INT, Instruction LoadImm32> { // Definition from ISA doc: // (y & x) | (z & ~x) @@ -379,6 +428,19 @@ multiclass BFIPatterns <Instruction BFI_INT> { (BFI_INT $x, $y, $z) >; + def : Pat < + (fcopysign f32:$src0, f32:$src1), + (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1) + >; + + def : Pat < + (f64 (fcopysign f64:$src0, f64:$src1)), + (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (i32 (EXTRACT_SUBREG $src0, sub0)), sub0), + (BFI_INT (LoadImm32 0x7fffffff), + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) + >; } // SHA-256 Ma patterns @@ -457,6 +519,23 @@ multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> { >; } +class RcpPat<Instruction RcpInst, ValueType vt> : Pat < + (fdiv FP_ONE, vt:$src), + (RcpInst $src) +>; + +multiclass RsqPat<Instruction RsqInst, ValueType vt> { + def : Pat < + (fdiv FP_ONE, (fsqrt vt:$src)), + (RsqInst $src) + >; + + def : Pat < + (AMDGPUrcp (fsqrt vt:$src)), + (RsqInst $src) + >; +} + include "R600Instructions.td" include "R700Instructions.td" include "EvergreenInstructions.td" diff --git a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp index fab4a3b8961a..58916a995496 100644 --- a/lib/Target/R600/AMDILIntrinsicInfo.cpp +++ b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp @@ -1,4 +1,4 @@ -//===- AMDILIntrinsicInfo.cpp - AMDGPU Intrinsic Information ------*- C++ -*-===// +//===- 
AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -12,7 +12,7 @@ // //===-----------------------------------------------------------------------===// -#include "AMDILIntrinsicInfo.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Intrinsics.h" @@ -24,14 +24,12 @@ using namespace llvm; #include "AMDGPUGenIntrinsics.inc" #undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN -AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm) - : TargetIntrinsicInfo() { -} +AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm) + : TargetIntrinsicInfo() {} -std::string -AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys, - unsigned int numTys) const { - static const char* const names[] = { +std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned numTys) const { + static const char *const names[] = { #define GET_INTRINSIC_NAME_TABLE #include "AMDGPUGenIntrinsics.inc" #undef GET_INTRINSIC_NAME_TABLE @@ -40,23 +38,23 @@ AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys, if (IntrID < Intrinsic::num_intrinsics) { return nullptr; } - assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics - && "Invalid intrinsic ID"); + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && + "Invalid intrinsic ID"); std::string Result(names[IntrID - Intrinsic::num_intrinsics]); return Result; } -unsigned int -AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const { +unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name, + unsigned Len) const { if (!StringRef(Name, Len).startswith("llvm.")) return 0; // All intrinsics start with 'llvm.' #define GET_FUNCTION_RECOGNIZER #include "AMDGPUGenIntrinsics.inc" #undef GET_FUNCTION_RECOGNIZER - AMDGPUIntrinsic::ID IntrinsicID - = (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; + AMDGPUIntrinsic::ID IntrinsicID = + (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { @@ -65,17 +63,15 @@ AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const { return 0; } -bool -AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { - // Overload Table +bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { +// Overload Table #define GET_INTRINSIC_OVERLOAD_TABLE #include "AMDGPUGenIntrinsics.inc" #undef GET_INTRINSIC_OVERLOAD_TABLE } -Function* -AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, - Type **Tys, - unsigned numTys) const { +Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + Type **Tys, + unsigned numTys) const { llvm_unreachable("Not implemented"); } diff --git a/lib/Target/R600/AMDILIntrinsicInfo.h b/lib/Target/R600/AMDGPUIntrinsicInfo.h index 924275aec2da..5be68a217da5 100644 --- a/lib/Target/R600/AMDILIntrinsicInfo.h +++ b/lib/Target/R600/AMDGPUIntrinsicInfo.h @@ -1,4 +1,4 @@ -//===- AMDILIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// +//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -11,8 +11,8 @@ /// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. 
 //
 //===-----------------------------------------------------------------------===//
-#ifndef AMDIL_INTRINSICS_H
-#define AMDIL_INTRINSICS_H
+#ifndef AMDGPU_INTRINSICINFO_H
+#define AMDGPU_INTRINSICINFO_H
 
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
@@ -34,16 +34,15 @@ enum ID {
 class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
 public:
   AMDGPUIntrinsicInfo(TargetMachine *tm);
-  std::string getName(unsigned int IntrId, Type **Tys = nullptr,
-                      unsigned int numTys = 0) const override;
-  unsigned int lookupName(const char *Name, unsigned int Len) const override;
-  bool isOverloaded(unsigned int IID) const override;
-  Function *getDeclaration(Module *M, unsigned int ID,
+  std::string getName(unsigned IntrId, Type **Tys = nullptr,
+                      unsigned numTys = 0) const override;
+  unsigned lookupName(const char *Name, unsigned Len) const override;
+  bool isOverloaded(unsigned IID) const override;
+  Function *getDeclaration(Module *M, unsigned ID,
                            Type **Tys = nullptr,
-                           unsigned int numTys = 0) const override;
+                           unsigned numTys = 0) const override;
 };
 
 } // end namespace llvm
 
-#endif // AMDIL_INTRINSICS_H
-
+#endif // AMDGPU_INTRINSICINFO_H
diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td
index 9ad5e72d3f0c..d934676bbf65 100644
--- a/lib/Target/R600/AMDGPUIntrinsics.td
+++ b/lib/Target/R600/AMDGPUIntrinsics.td
@@ -18,18 +18,26 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
   def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
   def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
-
+
   def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
   def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
   def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+  def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+
+  // This is named backwards (instead of rsq_legacy) so we don't have
+  // to define it with the public builtin intrinsics. This is a
+  // workaround for how intrinsic names are parsed. If the name is
+  // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant
+  // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangles the name.
+ def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; def int_AMDGPU_kilp : Intrinsic<[], [], []>; def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; @@ -53,12 +61,27 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; + def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; +} + +// Legacy names for compatibility. 
+let TargetPrefix = "AMDIL", isTarget = 1 in { + def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; } let TargetPrefix = "TGSI", isTarget = 1 in { diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp index b759495ad8e2..ac82e88c9266 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.cpp +++ b/lib/Target/R600/AMDGPUMCInstLower.cpp @@ -15,6 +15,7 @@ #include "AMDGPUMCInstLower.h" #include "AMDGPUAsmPrinter.h" +#include "AMDGPUTargetMachine.h" #include "InstPrinter/AMDGPUInstPrinter.h" #include "R600InstrInfo.h" #include "SIInstrInfo.h" diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h index 2b7f1e3074fe..58fe34d32d31 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.h +++ b/lib/Target/R600/AMDGPUMCInstLower.h @@ -14,9 +14,9 @@ namespace llvm { class AMDGPUSubtarget; -class MCInst; -class MCContext; class MachineInstr; +class MCContext; +class MCInst; class AMDGPUMCInstLower { diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp new file mode 100644 index 000000000000..218750d445e6 --- /dev/null +++ b/lib/Target/R600/AMDGPUPromoteAlloca.cpp @@ -0,0 +1,387 @@ +//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass eliminates allocas by either converting them into vectors or +// by migrating them to local address space. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "amdgpu-promote-alloca" + +using namespace llvm; + +namespace { + +class AMDGPUPromoteAlloca : public FunctionPass, + public InstVisitor<AMDGPUPromoteAlloca> { + + static char ID; + Module *Mod; + const AMDGPUSubtarget &ST; + int LocalMemAvailable; + +public: + AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), + LocalMemAvailable(0) { } + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + virtual const char *getPassName() const { + return "AMDGPU Promote Alloca"; + } + void visitAlloca(AllocaInst &I); +}; + +} // End anonymous namespace + +char AMDGPUPromoteAlloca::ID = 0; + +bool AMDGPUPromoteAlloca::doInitialization(Module &M) { + Mod = &M; + return false; +} + +bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { + + const FunctionType *FTy = F.getFunctionType(); + + LocalMemAvailable = ST.getLocalMemorySize(); + + + // If the function has any arguments in the local address space, then it's + // possible these arguments require the entire local memory space, so + // we cannot use local memory in the pass. 
+  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
+    const Type *ParamTy = FTy->getParamType(i);
+    if (ParamTy->isPointerTy() &&
+        ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+      LocalMemAvailable = 0;
+      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+                      "local memory disabled.\n");
+      break;
+    }
+  }
+
+  if (LocalMemAvailable > 0) {
+    // Check how much local memory is being used by global objects
+    for (Module::global_iterator I = Mod->global_begin(),
+                                 E = Mod->global_end(); I != E; ++I) {
+      GlobalVariable *GV = I;
+      PointerType *GVTy = GV->getType();
+      if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+        continue;
+      for (Value::use_iterator U = GV->use_begin(),
+                               UE = GV->use_end(); U != UE; ++U) {
+        Instruction *Use = dyn_cast<Instruction>(*U);
+        if (!Use)
+          continue;
+        if (Use->getParent()->getParent() == &F)
+          LocalMemAvailable -=
+              Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType());
+      }
+    }
+  }
+
+  LocalMemAvailable = std::max(0, LocalMemAvailable);
+  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");
+
+  visit(F);
+
+  return false;
+}
+
+static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
+  return VectorType::get(ArrayTy->getArrayElementType(),
+                         ArrayTy->getArrayNumElements());
+}
+
+static Value* calculateVectorIndex(Value *Ptr,
+                                   std::map<GetElementPtrInst*, Value*> GEPIdx) {
+  if (isa<AllocaInst>(Ptr))
+    return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
+
+  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
+
+  return GEPIdx[GEP];
+}
+
+static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
+  // FIXME: We only support simple cases.
+  if (GEP->getNumOperands() != 3)
+    return nullptr;
+
+  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
+  if (!I0 || !I0->isZero())
+    return nullptr;
+
+  return GEP->getOperand(2);
+}
+
+// Returns true if this instruction can be handled when the alloca is
+// turned into a vector (loads, stores, and casts of the alloca pointer).
+//
+// TODO: Check isTriviallyVectorizable for calls and handle other
+// instructions.
+static bool canVectorizeInst(Instruction *Inst) {
+  switch (Inst->getOpcode()) {
+  case Instruction::Load:
+  case Instruction::Store:
+  case Instruction::BitCast:
+  case Instruction::AddrSpaceCast:
+    return true;
+  default:
+    return false;
+  }
+}
+
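tryPromoteAllocaToVector (next) rewrites a small private array into SSA vector values: each load becomes an element extract and each store becomes a whole-vector update. A scalar C++ model of the rewrite's semantics, with invented names, just to show what the pass-level transformation preserves:

#include <cassert>

// Scalar model of the alloca-to-vector rewrite: the "alloca" arr[4]
// disappears and lives on as a 4-lane value; indexed loads turn into
// lane extracts, indexed stores into read-modify-write of the value.
struct Vec4 { float lane[4]; };

float extract(const Vec4 &v, int idx) { return v.lane[idx]; }          // load -> extractelement
Vec4 insert(Vec4 v, float x, int idx) { v.lane[idx] = x; return v; }   // store -> insertelement

int main() {
  Vec4 v = {};                   // the promoted alloca, now a plain value
  v = insert(v, 3.0f, 2);        // was: arr[2] = 3.0f;
  assert(extract(v, 2) == 3.0f); // was: ... = arr[2];
}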
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
+  Type *AllocaTy = Alloca->getAllocatedType();
+
+  DEBUG(dbgs() << "Alloca candidate for vectorization\n");
+
+  // FIXME: There is no reason why we can't support larger arrays, we
+  // are just being conservative for now.
+  if (!AllocaTy->isArrayTy() ||
+      AllocaTy->getArrayElementType()->isVectorTy() ||
+      AllocaTy->getArrayNumElements() > 4) {
+
+    DEBUG(dbgs() << "  Cannot convert type to vector\n");
+    return false;
+  }
+
+  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
+  std::vector<Value*> WorkList;
+  for (User *AllocaUser : Alloca->users()) {
+    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
+    if (!GEP) {
+      if (!canVectorizeInst(cast<Instruction>(AllocaUser)))
+        return false;
+
+      WorkList.push_back(AllocaUser);
+      continue;
+    }
+
+    Value *Index = GEPToVectorIndex(GEP);
+
+    // If we can't compute a vector index from this GEP, then we can't
+    // promote this alloca to vector.
+    if (!Index) {
+      DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
+      return false;
+    }
+
+    GEPVectorIdx[GEP] = Index;
+    for (User *GEPUser : AllocaUser->users()) {
+      if (!canVectorizeInst(cast<Instruction>(GEPUser)))
+        return false;
+
+      WorkList.push_back(GEPUser);
+    }
+  }
+
+  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
+
+  DEBUG(dbgs() << "  Converting alloca to vector "
+               << *AllocaTy << " -> " << *VectorTy << '\n');
+
+  for (std::vector<Value*>::iterator I = WorkList.begin(),
+                                     E = WorkList.end(); I != E; ++I) {
+    Instruction *Inst = cast<Instruction>(*I);
+    IRBuilder<> Builder(Inst);
+    switch (Inst->getOpcode()) {
+    case Instruction::Load: {
+      Value *Ptr = Inst->getOperand(0);
+      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
+      Value *VecValue = Builder.CreateLoad(BitCast);
+      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+      Inst->replaceAllUsesWith(ExtractElement);
+      Inst->eraseFromParent();
+      break;
+    }
+    case Instruction::Store: {
+      Value *Ptr = Inst->getOperand(1);
+      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
+      Value *VecValue = Builder.CreateLoad(BitCast);
+      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
+                                                       Inst->getOperand(0),
+                                                       Index);
+      Builder.CreateStore(NewVecValue, BitCast);
+      Inst->eraseFromParent();
+      break;
+    }
+    case Instruction::BitCast:
+    case Instruction::AddrSpaceCast:
+      break;
+
+    default:
+      Inst->dump();
+      llvm_unreachable("Inconsistency in instructions promotable to vector");
+    }
+  }
+  return true;
+}
+
+static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
+  for (User *User : Val->users()) {
+    if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
+      continue;
+    if (isa<CallInst>(User)) {
+      WorkList.push_back(User);
+      continue;
+    }
+    if (!User->getType()->isPointerTy())
+      continue;
+    WorkList.push_back(User);
+    collectUsesWithPtrTypes(User, WorkList);
+  }
+}
+
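visitAlloca (below) gives each work-item a private slice of one shared LDS array, indexed by a flattened thread ID built from the tidig.{x,y,z} and local.size.{y,z} reads. A plain-C++ sketch of that indexing arithmetic, with made-up group dimensions, showing that the IDs are dense and collision-free:

#include <cassert>

// Flattened work-group thread ID as computed in visitAlloca:
//   tid = (tcntY * tcntZ) * tidX + tcntZ * tidY + tidZ
// Each work-item then indexes lds[tid] for its private slice.
unsigned flatTid(unsigned tidX, unsigned tidY, unsigned tidZ,
                 unsigned tcntY, unsigned tcntZ) {
  return tcntY * tcntZ * tidX + tcntZ * tidY + tidZ;
}

int main() {
  // A hypothetical 4x2x2 work-group: 16 threads, IDs dense in [0, 16).
  bool seen[16] = {};
  for (unsigned x = 0; x < 4; ++x)
    for (unsigned y = 0; y < 2; ++y)
      for (unsigned z = 0; z < 2; ++z) {
        unsigned tid = flatTid(x, y, z, /*tcntY=*/2, /*tcntZ=*/2);
        assert(tid < 16 && !seen[tid]);
        seen[tid] = true;
      }
}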
+void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
+  IRBuilder<> Builder(&I);
+
+  // First try to replace the alloca with a vector
+  Type *AllocaTy = I.getAllocatedType();
+
+  DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+  if (tryPromoteAllocaToVector(&I))
+    return;
+
+  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
+
+  // FIXME: This is the maximum work group size. We should try to get the
+  // value from the reqd_work_group_size function attribute if it is
+  // available.
+  unsigned WorkGroupSize = 256;
+  int AllocaSize = WorkGroupSize *
+      Mod->getDataLayout()->getTypeAllocSize(AllocaTy);
+
+  if (AllocaSize > LocalMemAvailable) {
+    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
+    return;
+  }
+
+  DEBUG(dbgs() << "Promoting alloca to local memory\n");
+  LocalMemAvailable -= AllocaSize;
+
+  GlobalVariable *GV = new GlobalVariable(
+      *Mod, ArrayType::get(I.getAllocatedType(), WorkGroupSize), false,
+      GlobalValue::ExternalLinkage, 0, I.getName(), 0,
+      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+
+  FunctionType *FTy = FunctionType::get(
+      Type::getInt32Ty(Mod->getContext()), false);
+  AttributeSet AttrSet;
+  AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);
+
+  Value *ReadLocalSizeY = Mod->getOrInsertFunction(
+      "llvm.r600.read.local.size.y", FTy, AttrSet);
+  Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
+      "llvm.r600.read.local.size.z", FTy, AttrSet);
+  Value *ReadTIDIGX = Mod->getOrInsertFunction(
+      "llvm.r600.read.tidig.x", FTy, AttrSet);
+  Value *ReadTIDIGY = Mod->getOrInsertFunction(
+      "llvm.r600.read.tidig.y", FTy, AttrSet);
+  Value *ReadTIDIGZ = Mod->getOrInsertFunction(
+      "llvm.r600.read.tidig.z", FTy, AttrSet);
+
+  Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
+  Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
+  Value *TIdX = Builder.CreateCall(ReadTIDIGX);
+  Value *TIdY = Builder.CreateCall(ReadTIDIGY);
+  Value *TIdZ = Builder.CreateCall(ReadTIDIGZ);
+
+  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
+  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
+  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
+  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
+  TID = Builder.CreateAdd(TID, TIdZ);
+
+  std::vector<Value*> Indices;
+  Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
+  Indices.push_back(TID);
+
+  Value *Offset = Builder.CreateGEP(GV, Indices);
+  I.mutateType(Offset->getType());
+  I.replaceAllUsesWith(Offset);
+  I.eraseFromParent();
+
+  std::vector<Value*> WorkList;
+
+  collectUsesWithPtrTypes(Offset, WorkList);
+
+  for (std::vector<Value*>::iterator i = WorkList.begin(),
+                                     e = WorkList.end(); i != e; ++i) {
+    Value *V = *i;
+    CallInst *Call = dyn_cast<CallInst>(V);
+    if (!Call) {
+      Type *EltTy = V->getType()->getPointerElementType();
+      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+      V->mutateType(NewTy);
+      continue;
+    }
+
+    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
+    if (!Intr) {
+      std::vector<Type*> ArgTypes;
+      for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
+           ArgIdx != ArgEnd; ++ArgIdx) {
+        ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
+      }
+      Function *F = Call->getCalledFunction();
+      FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
+                                                F->isVarArg());
+      Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType,
+                                             F->getAttributes());
+      Function *NewF = cast<Function>(C);
+      Call->setCalledFunction(NewF);
+      continue;
+    }
+
+    Builder.SetInsertPoint(Intr);
+    switch (Intr->getIntrinsicID()) {
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+      // These intrinsics are for address space 0 only
+      Intr->eraseFromParent();
+      continue;
+    case Intrinsic::memcpy: {
+      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
+      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
+                           MemCpy->getLength(), MemCpy->getAlignment(),
+                           MemCpy->isVolatile());
+      Intr->eraseFromParent();
+      continue;
+    }
+    case Intrinsic::memset: {
+      MemSetInst *MemSet = cast<MemSetInst>(Intr);
+
Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), + MemSet->getLength(), MemSet->getAlignment(), + MemSet->isVolatile()); + Intr->eraseFromParent(); + continue; + } + default: + Intr->dump(); + llvm_unreachable("Don't know how to promote alloca intrinsic use."); + } + } +} + +FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) { + return new AMDGPUPromoteAlloca(ST); +} diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp index 19927faaa488..34332808f865 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.cpp +++ b/lib/Target/R600/AMDGPURegisterInfo.cpp @@ -17,9 +17,9 @@ using namespace llvm; -AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm) +AMDGPURegisterInfo::AMDGPURegisterInfo(const AMDGPUSubtarget &st) : AMDGPUGenRegisterInfo(0), - TM(tm) + ST(st) { } //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h index a7cba0d25048..4731595d4f71 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.h +++ b/lib/Target/R600/AMDGPURegisterInfo.h @@ -25,27 +25,19 @@ namespace llvm { -class AMDGPUTargetMachine; +class AMDGPUSubtarget; class TargetInstrInfo; struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { - TargetMachine &TM; static const MCPhysReg CalleeSavedReg; + const AMDGPUSubtarget &ST; - AMDGPURegisterInfo(TargetMachine &tm); + AMDGPURegisterInfo(const AMDGPUSubtarget &st); BitVector getReservedRegs(const MachineFunction &MF) const override { assert(!"Unimplemented"); return BitVector(); } - /// \param RC is an AMDIL reg class. - /// - /// \returns The ISA reg class that is equivalent to \p RC. - virtual const TargetRegisterClass * getISARegClass( - const TargetRegisterClass * RC) const { - assert(!"Unimplemented"); return nullptr; - } - virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { assert(!"Unimplemented"); return nullptr; } diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index f3b993204a54..b83c290c1f03 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -13,6 +13,8 @@ //===----------------------------------------------------------------------===// #include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "SIInstrInfo.h" using namespace llvm; @@ -23,90 +25,42 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "AMDGPUGenSubtargetInfo.inc" -AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : - AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) { - InstrItins = getInstrItineraryForCPU(CPU); - - // Default card - StringRef GPU = CPU; - Is64bit = false; - HasVertexCache = false; - TexVTXClauseSize = 0; - Gen = AMDGPUSubtarget::R600; - FP64 = false; - CaymanISA = false; - EnableIRStructurizer = true; - EnableIfCvt = true; - WavefrontSize = 0; - CFALUBug = false; +AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS) : + AMDGPUGenSubtargetInfo(TT, GPU, FS), + DevName(GPU), + Is64bit(false), + DumpCode(false), + R600ALUInst(false), + HasVertexCache(false), + TexVTXClauseSize(0), + Gen(AMDGPUSubtarget::R600), + FP64(false), + CaymanISA(false), + EnableIRStructurizer(true), + EnableIfCvt(true), + WavefrontSize(0), + CFALUBug(false), + LocalMemorySize(0), + InstrItins(getInstrItineraryForCPU(GPU)) { ParseSubtargetFeatures(GPU, FS); - DevName = GPU; -} -bool -AMDGPUSubtarget::is64bit() const { - return Is64bit; -} -bool 
-AMDGPUSubtarget::hasVertexCache() const { - return HasVertexCache; -} -short -AMDGPUSubtarget::getTexVTXClauseSize() const { - return TexVTXClauseSize; -} -enum AMDGPUSubtarget::Generation -AMDGPUSubtarget::getGeneration() const { - return Gen; -} -bool -AMDGPUSubtarget::hasHWFP64() const { - return FP64; -} -bool -AMDGPUSubtarget::hasCaymanISA() const { - return CaymanISA; -} -bool -AMDGPUSubtarget::IsIRStructurizerEnabled() const { - return EnableIRStructurizer; -} -bool -AMDGPUSubtarget::isIfCvtEnabled() const { - return EnableIfCvt; -} -unsigned -AMDGPUSubtarget::getWavefrontSize() const { - return WavefrontSize; + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + InstrInfo.reset(new R600InstrInfo(*this)); + } else { + InstrInfo.reset(new SIInstrInfo(*this)); + } } -unsigned -AMDGPUSubtarget::getStackEntrySize() const { + +unsigned AMDGPUSubtarget::getStackEntrySize() const { assert(getGeneration() <= NORTHERN_ISLANDS); switch(getWavefrontSize()) { case 16: return 8; case 32: - if (hasCaymanISA()) - return 4; - else - return 8; + return hasCaymanISA() ? 4 : 8; case 64: return 4; default: llvm_unreachable("Illegal wavefront size."); } } -bool -AMDGPUSubtarget::hasCFAluBug() const { - assert(getGeneration() <= NORTHERN_ISLANDS); - return CFALUBug; -} -bool -AMDGPUSubtarget::isTargetELF() const { - return false; -} - -std::string -AMDGPUSubtarget::getDeviceName() const { - return DevName; -} diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 1b041d6bd2bd..0c388b33872a 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -15,6 +15,7 @@ #ifndef AMDGPUSUBTARGET_H #define AMDGPUSUBTARGET_H #include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -27,6 +28,9 @@ namespace llvm { class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { + + std::unique_ptr<AMDGPUInstrInfo> InstrInfo; + public: enum Generation { R600 = 0, @@ -40,42 +44,78 @@ public: private: std::string DevName; bool Is64bit; - bool Is32on64bit; bool DumpCode; bool R600ALUInst; bool HasVertexCache; short TexVTXClauseSize; - enum Generation Gen; + Generation Gen; bool FP64; bool CaymanISA; bool EnableIRStructurizer; bool EnableIfCvt; unsigned WavefrontSize; bool CFALUBug; + int LocalMemorySize; InstrItineraryData InstrItins; public: AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS); - const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + const AMDGPUInstrInfo *getInstrInfo() const { + return InstrInfo.get(); + } + + const InstrItineraryData &getInstrItineraryData() const { + return InstrItins; + } + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - bool is64bit() const; - bool hasVertexCache() const; - short getTexVTXClauseSize() const; - enum Generation getGeneration() const; - bool hasHWFP64() const; - bool hasCaymanISA() const; + bool is64bit() const { + return Is64bit; + } + + bool hasVertexCache() const { + return HasVertexCache; + } + + short getTexVTXClauseSize() const { + return TexVTXClauseSize; + } + + Generation getGeneration() const { + return Gen; + } + + bool hasHWFP64() const { + return FP64; + } + + bool hasCaymanISA() const { + return CaymanISA; + } bool hasBFE() const { return (getGeneration() >= EVERGREEN); } + bool hasBFI() const { + return (getGeneration() >= EVERGREEN); + } + bool hasBFM() const { return hasBFE(); } + bool hasBCNT(unsigned Size) const { + if (Size == 32) + 
return (getGeneration() >= EVERGREEN); + + assert(Size == 64); + return (getGeneration() >= SOUTHERN_ISLANDS); + } + bool hasMulU24() const { return (getGeneration() >= EVERGREEN); } @@ -85,22 +125,48 @@ public: hasCaymanISA()); } - bool IsIRStructurizerEnabled() const; - bool isIfCvtEnabled() const; - unsigned getWavefrontSize() const; + bool IsIRStructurizerEnabled() const { + return EnableIRStructurizer; + } + + bool isIfCvtEnabled() const { + return EnableIfCvt; + } + + unsigned getWavefrontSize() const { + return WavefrontSize; + } + unsigned getStackEntrySize() const; - bool hasCFAluBug() const; + + bool hasCFAluBug() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + return CFALUBug; + } + + int getLocalMemorySize() const { + return LocalMemorySize; + } bool enableMachineScheduler() const override { return getGeneration() <= NORTHERN_ISLANDS; } // Helper functions to simplify if statements - bool isTargetELF() const; - std::string getDeviceName() const; - bool dumpCode() const { return DumpCode; } - bool r600ALUEncoding() const { return R600ALUInst; } + bool isTargetELF() const { + return false; + } + StringRef getDeviceName() const { + return DevName; + } + + bool dumpCode() const { + return DumpCode; + } + bool r600ALUEncoding() const { + return R600ALUInst; + } }; } // End namespace llvm diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 174fdca3bd77..8aab94446b5d 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -80,10 +80,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, InstrItins(&Subtarget.getInstrItineraryData()) { // TLInfo uses InstrInfo so it must be initialized after. if (Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - InstrInfo.reset(new R600InstrInfo(*this)); TLInfo.reset(new R600TargetLowering(*this)); } else { - InstrInfo.reset(new SIInstrInfo(*this)); TLInfo.reset(new SITargetLowering(*this)); } setRequiresStructuredCFG(true); @@ -111,6 +109,7 @@ public: return nullptr; } + virtual void addCodeGenPrepare(); bool addPreISel() override; bool addInstSelector() override; bool addPreRegAlloc() override; @@ -136,6 +135,13 @@ void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) { PM.add(createAMDGPUTargetTransformInfoPass(this)); } +void AMDGPUPassConfig::addCodeGenPrepare() { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + addPass(createAMDGPUPromoteAlloca(ST)); + addPass(createSROAPass()); + TargetPassConfig::addCodeGenPrepare(); +} + bool AMDGPUPassConfig::addPreISel() { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); @@ -159,7 +165,6 @@ bool AMDGPUPassConfig::addInstSelector() { } bool AMDGPUPassConfig::addPreRegAlloc() { - addPass(createAMDGPUConvertToISAPass(*TM)); const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { @@ -169,6 +174,8 @@ bool AMDGPUPassConfig::addPreRegAlloc() { // SIFixSGPRCopies can generate a lot of duplicate instructions, // so we need to run MachineCSE afterwards. 
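   // (Hedged note: insertPass(&A, &B) asks TargetPassConfig to schedule
   // pass B immediately after the standard pass identified by A -- below,
   // the SGPR live-range fixup is slotted in right after the register
   // coalescer -- without editing the generic pipeline definition.)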
addPass(&MachineCSEID); + initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); + insertPass(&RegisterCoalescerID, &SIFixSGPRLiveRangesID); } return false; } diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h index 1287e134b59f..3bb15beb6bf1 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -17,8 +17,8 @@ #include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" -#include "AMDILIntrinsicInfo.h" #include "R600ISelLowering.h" #include "llvm/IR/DataLayout.h" @@ -30,7 +30,6 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { const DataLayout Layout; AMDGPUFrameLowering FrameLowering; AMDGPUIntrinsicInfo IntrinsicInfo; - std::unique_ptr<AMDGPUInstrInfo> InstrInfo; std::unique_ptr<AMDGPUTargetLowering> TLInfo; const InstrItineraryData *InstrItins; @@ -46,13 +45,13 @@ public: return &IntrinsicInfo; } const AMDGPUInstrInfo *getInstrInfo() const override { - return InstrInfo.get(); + return getSubtargetImpl()->getInstrInfo(); } const AMDGPUSubtarget *getSubtargetImpl() const override { return &Subtarget; } const AMDGPURegisterInfo *getRegisterInfo() const override { - return &InstrInfo->getRegisterInfo(); + return &getInstrInfo()->getRegisterInfo(); } AMDGPUTargetLowering *getTargetLowering() const override { return TLInfo.get(); diff --git a/lib/Target/R600/AMDILBase.td b/lib/Target/R600/AMDILBase.td deleted file mode 100644 index 5dcd478fe5bc..000000000000 --- a/lib/Target/R600/AMDILBase.td +++ /dev/null @@ -1,25 +0,0 @@ -//===- AMDIL.td - AMDIL Target Machine -------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// Target-independent interfaces which we are implementing -//===----------------------------------------------------------------------===// - -include "llvm/Target/Target.td" - -// Dummy Instruction itineraries for pseudo instructions -def ALU_NULL : FuncUnit; -def NullALU : InstrItinClass; - -//===----------------------------------------------------------------------===// -// Register File, Calling Conv, Instruction Descriptions -//===----------------------------------------------------------------------===// - - -include "AMDILRegisterInfo.td" -include "AMDILInstrInfo.td" - diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp deleted file mode 100644 index 7cea803f8987..000000000000 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ /dev/null @@ -1,560 +0,0 @@ -//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief TargetLowering functions borrowed from AMDIL. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPUISelLowering.h" -#include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "AMDILIntrinsicInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/SelectionDAGNodes.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetOptions.h" - -using namespace llvm; -//===----------------------------------------------------------------------===// -// TargetLowering Implementation Help Functions End -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// TargetLowering Class Implementation Begins -//===----------------------------------------------------------------------===// -void AMDGPUTargetLowering::InitAMDILLowering() { - static const MVT::SimpleValueType types[] = { - MVT::i8, - MVT::i16, - MVT::i32, - MVT::f32, - MVT::f64, - MVT::i64, - MVT::v2i8, - MVT::v4i8, - MVT::v2i16, - MVT::v4i16, - MVT::v4f32, - MVT::v4i32, - MVT::v2f32, - MVT::v2i32, - MVT::v2f64, - MVT::v2i64 - }; - - static const MVT::SimpleValueType IntTypes[] = { - MVT::i8, - MVT::i16, - MVT::i32, - MVT::i64 - }; - - static const MVT::SimpleValueType FloatTypes[] = { - MVT::f32, - MVT::f64 - }; - - static const MVT::SimpleValueType VectorTypes[] = { - MVT::v2i8, - MVT::v4i8, - MVT::v2i16, - MVT::v4i16, - MVT::v4f32, - MVT::v4i32, - MVT::v2f32, - MVT::v2i32, - MVT::v2f64, - MVT::v2i64 - }; - - const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>(); - // These are the current register classes that are - // supported - - for (MVT VT : types) { - setOperationAction(ISD::SUBE, VT, Expand); - setOperationAction(ISD::SUBC, VT, Expand); - setOperationAction(ISD::ADDE, VT, Expand); - setOperationAction(ISD::ADDC, VT, Expand); - setOperationAction(ISD::BRCOND, VT, Custom); - setOperationAction(ISD::BR_JT, VT, Expand); - setOperationAction(ISD::BRIND, VT, Expand); - // TODO: Implement custom UREM/SREM routines - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - if (VT != MVT::i64 && VT != MVT::v2i64) { - setOperationAction(ISD::SDIV, VT, Custom); - } - } - for (MVT VT : FloatTypes) { - // IL does not have these operations for floating point types - setOperationAction(ISD::FP_ROUND_INREG, VT, Expand); - setOperationAction(ISD::SETOLT, VT, Expand); - setOperationAction(ISD::SETOGE, VT, Expand); - setOperationAction(ISD::SETOGT, VT, Expand); - setOperationAction(ISD::SETOLE, VT, Expand); - setOperationAction(ISD::SETULT, VT, Expand); - setOperationAction(ISD::SETUGE, VT, Expand); - setOperationAction(ISD::SETUGT, VT, Expand); - setOperationAction(ISD::SETULE, VT, Expand); - } - - for (MVT VT : IntTypes) { - // GPU also does not have divrem function for signed or unsigned - setOperationAction(ISD::SDIVREM, VT, Expand); - - // GPU does not have [S|U]MUL_LOHI functions as a single instruction - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - - 
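-    // (Background, hedged: setOperationAction picks a legalization
-    //  strategy per opcode/type. Legal keeps the node as-is, Expand has
-    //  the legalizer rewrite it into simpler nodes, and Custom routes it
-    //  to this target's LowerOperation hook.)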
setOperationAction(ISD::BSWAP, VT, Expand); - - // GPU doesn't have any counting operators - setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); - } - - for (MVT VT : VectorTypes) { - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Expand); - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - // setOperationAction(ISD::VSETCC, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - - } - setOperationAction(ISD::MULHU, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::v2i64, Expand); - setOperationAction(ISD::MULHS, MVT::i64, Expand); - setOperationAction(ISD::MULHS, MVT::v2i64, Expand); - setOperationAction(ISD::ADD, MVT::v2i64, Expand); - setOperationAction(ISD::SREM, MVT::v2i64, Expand); - setOperationAction(ISD::Constant , MVT::i64 , Legal); - setOperationAction(ISD::SDIV, MVT::v2i64, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand); - if (STM.hasHWFP64()) { - // we support loading/storing v2f64 but not operations on the type - setOperationAction(ISD::FADD, MVT::v2f64, Expand); - setOperationAction(ISD::FSUB, MVT::v2f64, Expand); - setOperationAction(ISD::FMUL, MVT::v2f64, Expand); - setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); - setOperationAction(ISD::ConstantFP , MVT::f64 , Legal); - // We want to expand vector conversions into their scalar - // counterparts. - setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand); - setOperationAction(ISD::FABS, MVT::f64, Expand); - setOperationAction(ISD::FABS, MVT::v2f64, Expand); - } - // TODO: Fix the UDIV24 algorithm so it works for these - // types correctly. This needs vector comparisons - // for this to work correctly. - setOperationAction(ISD::UDIV, MVT::v2i8, Expand); - setOperationAction(ISD::UDIV, MVT::v4i8, Expand); - setOperationAction(ISD::UDIV, MVT::v2i16, Expand); - setOperationAction(ISD::UDIV, MVT::v4i16, Expand); - setOperationAction(ISD::SUBC, MVT::Other, Expand); - setOperationAction(ISD::ADDE, MVT::Other, Expand); - setOperationAction(ISD::ADDC, MVT::Other, Expand); - setOperationAction(ISD::BRCOND, MVT::Other, Custom); - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::BRIND, MVT::Other, Expand); - - - // Use the default implementation. 
- setOperationAction(ISD::ConstantFP , MVT::f32 , Legal); - setOperationAction(ISD::Constant , MVT::i32 , Legal); - - setSchedulingPreference(Sched::RegPressure); - setPow2DivIsCheap(false); - setSelectIsExpensive(true); - setJumpIsExpensive(true); - - MaxStoresPerMemcpy = 4096; - MaxStoresPerMemmove = 4096; - MaxStoresPerMemset = 4096; - -} - -bool -AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, unsigned Intrinsic) const { - return false; -} - -// The backend supports 32 and 64 bit floating point immediates -bool -AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32 - || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) { - return true; - } else { - return false; - } -} - -bool -AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { - if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32 - || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) { - return false; - } else { - return true; - } -} - - -// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to -// be zero. Op is expected to be a target specific node. Used by DAG -// combiner. - -//===----------------------------------------------------------------------===// -// Other Lowering Hooks -//===----------------------------------------------------------------------===// - -SDValue -AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { - EVT OVT = Op.getValueType(); - SDValue DST; - if (OVT.getScalarType() == MVT::i64) { - DST = LowerSDIV64(Op, DAG); - } else if (OVT.getScalarType() == MVT::i32) { - DST = LowerSDIV32(Op, DAG); - } else if (OVT.getScalarType() == MVT::i16 - || OVT.getScalarType() == MVT::i8) { - DST = LowerSDIV24(Op, DAG); - } else { - DST = SDValue(Op.getNode(), 0); - } - return DST; -} - -SDValue -AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const { - EVT OVT = Op.getValueType(); - SDValue DST; - if (OVT.getScalarType() == MVT::i64) { - DST = LowerSREM64(Op, DAG); - } else if (OVT.getScalarType() == MVT::i32) { - DST = LowerSREM32(Op, DAG); - } else if (OVT.getScalarType() == MVT::i16) { - DST = LowerSREM16(Op, DAG); - } else if (OVT.getScalarType() == MVT::i8) { - DST = LowerSREM8(Op, DAG); - } else { - DST = SDValue(Op.getNode(), 0); - } - return DST; -} - -EVT -AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const { - int iSize = (size * numEle); - int vEle = (iSize >> ((size == 64) ? 
6 : 5)); - if (!vEle) { - vEle = 1; - } - if (size == 64) { - if (vEle == 1) { - return EVT(MVT::i64); - } else { - return EVT(MVT::getVectorVT(MVT::i64, vEle)); - } - } else { - if (vEle == 1) { - return EVT(MVT::i32); - } else { - return EVT(MVT::getVectorVT(MVT::i32, vEle)); - } - } -} - -SDValue -AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - SDValue Cond = Op.getOperand(1); - SDValue Jump = Op.getOperand(2); - SDValue Result; - Result = DAG.getNode( - AMDGPUISD::BRANCH_COND, - SDLoc(Op), - Op.getValueType(), - Chain, Jump, Cond); - return Result; -} - -SDValue -AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - MVT INTTY; - MVT FLTTY; - if (!OVT.isVector()) { - INTTY = MVT::i32; - FLTTY = MVT::f32; - } else if (OVT.getVectorNumElements() == 2) { - INTTY = MVT::v2i32; - FLTTY = MVT::v2f32; - } else if (OVT.getVectorNumElements() == 4) { - INTTY = MVT::v4i32; - FLTTY = MVT::v4f32; - } - unsigned bitsize = OVT.getScalarType().getSizeInBits(); - // char|short jq = ia ^ ib; - SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS); - - // jq = jq >> (bitsize - 2) - jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); - - // jq = jq | 0x1 - jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT)); - - // jq = (int)jq - jq = DAG.getSExtOrTrunc(jq, DL, INTTY); - - // int ia = (int)LHS; - SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY); - - // int ib, (int)RHS; - SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY); - - // float fa = (float)ia; - SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia); - - // float fb = (float)ib; - SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); - - // float fq = native_divide(fa, fb); - SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb); - - // fq = trunc(fq); - fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); - - // float fqneg = -fq; - SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); - - // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY, - DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa); - - // int iq = (int)fq; - SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); - - // fr = fabs(fr); - fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr); - - // fb = fabs(fb); - fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb); - - // int cv = fr >= fb; - SDValue cv; - if (INTTY == MVT::i32) { - cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); - } else { - cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); - } - // jq = (cv ? jq : 0); - jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq, - DAG.getConstant(0, OVT)); - // dst = iq + jq; - iq = DAG.getSExtOrTrunc(iq, DL, OVT); - iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq); - return iq; -} - -SDValue -AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - // The LowerSDIV32 function generates equivalent to the following IL. 
- // mov r0, LHS - // mov r1, RHS - // ilt r10, r0, 0 - // ilt r11, r1, 0 - // iadd r0, r0, r10 - // iadd r1, r1, r11 - // ixor r0, r0, r10 - // ixor r1, r1, r11 - // udiv r0, r0, r1 - // ixor r10, r10, r11 - // iadd r0, r0, r10 - // ixor DST, r0, r10 - - // mov r0, LHS - SDValue r0 = LHS; - - // mov r1, RHS - SDValue r1 = RHS; - - // ilt r10, r0, 0 - SDValue r10 = DAG.getSelectCC(DL, - r0, DAG.getConstant(0, OVT), - DAG.getConstant(-1, MVT::i32), - DAG.getConstant(0, MVT::i32), - ISD::SETLT); - - // ilt r11, r1, 0 - SDValue r11 = DAG.getSelectCC(DL, - r1, DAG.getConstant(0, OVT), - DAG.getConstant(-1, MVT::i32), - DAG.getConstant(0, MVT::i32), - ISD::SETLT); - - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); - - // iadd r1, r1, r11 - r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); - - // ixor r0, r0, r10 - r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - - // ixor r1, r1, r11 - r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); - - // udiv r0, r0, r1 - r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); - - // ixor r10, r10, r11 - r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11); - - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); - - // ixor DST, r0, r10 - SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - return DST; -} - -SDValue -AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const { - return SDValue(Op.getNode(), 0); -} - -SDValue -AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - MVT INTTY = MVT::i32; - if (OVT == MVT::v2i8) { - INTTY = MVT::v2i32; - } else if (OVT == MVT::v4i8) { - INTTY = MVT::v4i32; - } - SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY); - SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY); - LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS); - LHS = DAG.getSExtOrTrunc(LHS, DL, OVT); - return LHS; -} - -SDValue -AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - MVT INTTY = MVT::i32; - if (OVT == MVT::v2i16) { - INTTY = MVT::v2i32; - } else if (OVT == MVT::v4i16) { - INTTY = MVT::v4i32; - } - SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY); - SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY); - LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS); - LHS = DAG.getSExtOrTrunc(LHS, DL, OVT); - return LHS; -} - -SDValue -AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - // The LowerSREM32 function generates equivalent to the following IL. 
- // mov r0, LHS - // mov r1, RHS - // ilt r10, r0, 0 - // ilt r11, r1, 0 - // iadd r0, r0, r10 - // iadd r1, r1, r11 - // ixor r0, r0, r10 - // ixor r1, r1, r11 - // udiv r20, r0, r1 - // umul r20, r20, r1 - // sub r0, r0, r20 - // iadd r0, r0, r10 - // ixor DST, r0, r10 - - // mov r0, LHS - SDValue r0 = LHS; - - // mov r1, RHS - SDValue r1 = RHS; - - // ilt r10, r0, 0 - SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT); - - // ilt r11, r1, 0 - SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT); - - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); - - // iadd r1, r1, r11 - r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); - - // ixor r0, r0, r10 - r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - - // ixor r1, r1, r11 - r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); - - // udiv r20, r0, r1 - SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1); - - // umul r20, r20, r1 - r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1); - - // sub r0, r0, r20 - r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20); - - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); - - // ixor DST, r0, r10 - SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - return DST; -} - -SDValue -AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const { - return SDValue(Op.getNode(), 0); -} diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td deleted file mode 100644 index 0f0c88db935a..000000000000 --- a/lib/Target/R600/AMDILInstrInfo.td +++ /dev/null @@ -1,150 +0,0 @@ -//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -// This file describes the AMDIL instructions in TableGen format. 
-// -//===----------------------------------------------------------------------===// -//===--------------------------------------------------------------------===// -// Custom Operands -//===--------------------------------------------------------------------===// -def brtarget : Operand<OtherVT>; - -//===--------------------------------------------------------------------===// -// Custom Selection DAG Type Profiles -//===--------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// Generic Profile Types -//===----------------------------------------------------------------------===// - -def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [ - SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> - ]>; -def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [ - SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3> - ]>; -def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [ - SDTCisEltOfVec<1, 0> - ]>; - -//===----------------------------------------------------------------------===// -// Flow Control Profile Types -//===----------------------------------------------------------------------===// -// Branch instruction where second and third are basic blocks -def SDTIL_BRCond : SDTypeProfile<0, 2, [ - SDTCisVT<0, OtherVT> - ]>; - -//===--------------------------------------------------------------------===// -// Custom Selection DAG Nodes -//===--------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// Flow Control DAG Nodes -//===----------------------------------------------------------------------===// -def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; - -//===----------------------------------------------------------------------===// -// Call/Return DAG Nodes -//===----------------------------------------------------------------------===// -def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; - -//===--------------------------------------------------------------------===// -// Instructions -//===--------------------------------------------------------------------===// -// Floating point math functions -def IL_div_inf : SDNode<"AMDGPUISD::DIV_INF", SDTIL_GenBinaryOp>; - -//===----------------------------------------------------------------------===// -// Integer functions -//===----------------------------------------------------------------------===// -def IL_umul : SDNode<"AMDGPUISD::UMUL" , SDTIntBinOp, - [SDNPCommutative, SDNPAssociative]>; - -//===--------------------------------------------------------------------===// -// Custom Pattern DAG Nodes -//===--------------------------------------------------------------------===// -def global_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast<StoreSDNode>(N)); -}]>; - -//===----------------------------------------------------------------------===// -// Load pattern fragments -//===----------------------------------------------------------------------===// -// Global address space loads -def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isGlobalLoad(dyn_cast<LoadSDNode>(N)); -}]>; -// Constant address space loads -def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); -}]>; - -//===----------------------------------------------------------------------===// -// 
Complex addressing mode patterns -//===----------------------------------------------------------------------===// -def ADDR : ComplexPattern<i32, 2, "SelectADDR", [], []>; -def ADDRF : ComplexPattern<i32, 2, "SelectADDR", [frameindex], []>; -def ADDR64 : ComplexPattern<i64, 2, "SelectADDR64", [], []>; -def ADDR64F : ComplexPattern<i64, 2, "SelectADDR64", [frameindex], []>; - -//===----------------------------------------------------------------------===// -// Instruction format classes -//===----------------------------------------------------------------------===// -class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> -: Instruction { - - let Namespace = "AMDGPU"; - dag OutOperandList = outs; - dag InOperandList = ins; - let Pattern = pattern; - let AsmString = !strconcat(asmstr, "\n"); - let isPseudo = 1; - let Itinerary = NullALU; - bit hasIEEEFlag = 0; - bit hasZeroOpFlag = 0; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; -} - -//===--------------------------------------------------------------------===// -// Multiclass Instruction formats -//===--------------------------------------------------------------------===// -// Multiclass that handles branch instructions -multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> { - def _i32 : ILFormat<(outs), - (ins brtarget:$target, rci:$src0), - "; i32 Pseudo branch instruction", - [(Op bb:$target, (i32 rci:$src0))]>; - def _f32 : ILFormat<(outs), - (ins brtarget:$target, rcf:$src0), - "; f32 Pseudo branch instruction", - [(Op bb:$target, (f32 rcf:$src0))]>; -} - -// Only scalar types should generate flow control -multiclass BranchInstr<string name> { - def _i32 : ILFormat<(outs), (ins GPRI32:$src), - !strconcat(name, " $src"), []>; - def _f32 : ILFormat<(outs), (ins GPRF32:$src), - !strconcat(name, " $src"), []>; -} -// Only scalar types should generate flow control -multiclass BranchInstr2<string name> { - def _i32 : ILFormat<(outs), (ins GPRI32:$src0, GPRI32:$src1), - !strconcat(name, " $src0, $src1"), []>; - def _f32 : ILFormat<(outs), (ins GPRF32:$src0, GPRF32:$src1), - !strconcat(name, " $src0, $src1"), []>; -} - -//===--------------------------------------------------------------------===// -// Intrinsics support -//===--------------------------------------------------------------------===// -include "AMDILIntrinsics.td" diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td deleted file mode 100644 index 4a3e02e202bb..000000000000 --- a/lib/Target/R600/AMDILIntrinsics.td +++ /dev/null @@ -1,224 +0,0 @@ -//===- AMDILIntrinsics.td - Defines AMDIL Intrinscs -*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -// This file defines all of the amdil-specific intrinsics -// -//===---------------------------------------------------------------===// -//===--------------------------------------------------------------------===// -// Intrinsic classes -// Generic versions of the above classes but for Target specific intrinsics -// instead of SDNode patterns. 
-//===--------------------------------------------------------------------===// -let TargetPrefix = "AMDIL", isTarget = 1 in { - class VoidIntLong : - Intrinsic<[llvm_i64_ty], [], []>; - class VoidIntInt : - Intrinsic<[llvm_i32_ty], [], []>; - class VoidIntBool : - Intrinsic<[llvm_i32_ty], [], []>; - class UnaryIntInt : - Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class UnaryIntFloat : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class ConvertIntFTOI : - Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; - class ConvertIntITOF : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], [IntrNoMem]>; - class UnaryIntNoRetInt : - Intrinsic<[], [llvm_anyint_ty], []>; - class UnaryIntNoRetFloat : - Intrinsic<[], [llvm_anyfloat_ty], []>; - class BinaryIntInt : - Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class BinaryIntFloat : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class BinaryIntNoRetInt : - Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>; - class BinaryIntNoRetFloat : - Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>; - class TernaryIntInt : - Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class TernaryIntFloat : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class QuaternaryIntInt : - Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class UnaryAtomicInt : - Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - class BinaryAtomicInt : - Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - class TernaryAtomicInt : - Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; - class UnaryAtomicIntNoRet : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - class BinaryAtomicIntNoRet : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - class TernaryAtomicIntNoRet : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; -} - -let TargetPrefix = "AMDIL", isTarget = 1 in { - def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt; - - def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">, - UnaryIntInt; - def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">, - UnaryIntInt; - def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">, - UnaryIntInt; - def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">, - UnaryIntInt; - def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">, - UnaryIntInt; - def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">, - TernaryIntInt; - def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">, - TernaryIntInt; - def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">, - QuaternaryIntInt; - def int_AMDIL_bfi : GCCBuiltin<"__amdil_bfi">, - TernaryIntInt; - def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">, - BinaryIntInt; - def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">, - BinaryIntInt; - def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">, - BinaryIntInt; - def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">, - BinaryIntInt; - def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">, - BinaryIntInt; - def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">, - BinaryIntInt; - 
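-  // (Hedged note: each def below pairs a GCCBuiltin name such as
-  //  "__amdil_borrow" with one of the arity classes declared above, so a
-  //  single line both declares the intrinsic and binds its frontend
-  //  builtin spelling. The whole AMDIL intrinsic set is deleted here.)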
def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">, - BinaryIntInt; - def int_AMDIL_min_i32 : GCCBuiltin<"__amdil_imin">, - BinaryIntInt; - def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">, - BinaryIntInt; - def int_AMDIL_min : GCCBuiltin<"__amdil_min">, - BinaryIntFloat; - def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">, - BinaryIntInt; - def int_AMDIL_max_u32 : GCCBuiltin<"__amdil_umax">, - BinaryIntInt; - def int_AMDIL_max : GCCBuiltin<"__amdil_max">, - BinaryIntFloat; - def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">, - TernaryIntInt; - def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">, - TernaryIntInt; - def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">, - TernaryIntInt; - def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">, - UnaryIntFloat; - def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">, - TernaryIntFloat; - def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">, - UnaryIntFloat; - def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">, - UnaryIntFloat; - def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">, - UnaryIntFloat; - def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">, - UnaryIntFloat; - def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">, - UnaryIntFloat; - def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">, - UnaryIntFloat; - def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">, - UnaryIntFloat; - def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">, - UnaryIntFloat; - def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">, - UnaryIntFloat; - def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">, - UnaryIntFloat; - def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">, - UnaryIntFloat; - def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">, - UnaryIntFloat; - def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat; - def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat; - def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt; - def int_AMDIL_sqrt: GCCBuiltin<"__amdil_sqrt">, - UnaryIntFloat; - def int_AMDIL_sqrt_vec: GCCBuiltin<"__amdil_sqrt_vec">, - UnaryIntFloat; - def int_AMDIL_exp : GCCBuiltin<"__amdil_exp">, - UnaryIntFloat; - def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">, - UnaryIntFloat; - def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">, - UnaryIntFloat; - def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">, - UnaryIntFloat; - def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">, - UnaryIntFloat; - def int_AMDIL_sign: GCCBuiltin<"__amdil_sign">, - UnaryIntFloat; - def int_AMDIL_fma: GCCBuiltin<"__amdil_fma">, - TernaryIntFloat; - def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">, - UnaryIntFloat; - def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">, - UnaryIntFloat; - def int_AMDIL_length : GCCBuiltin<"__amdil_length">, - UnaryIntFloat; - def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">, - TernaryIntFloat; - def int_AMDIL_media_sad4 : GCCBuiltin<"__amdil_sad4">, - Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty, - llvm_v4i32_ty, llvm_i32_ty], []>; - - def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">, - Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>; - def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">, - Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>; - def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">, - Intrinsic<[llvm_double_ty], [llvm_double_ty], []>; - def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">, - ConvertIntITOF; - def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_i32_rpi : 
GCCBuiltin<"__amdil_float_to_int_rpi">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">, - ConvertIntFTOI; - def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">, - ConvertIntFTOI; - def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">, - Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], []>; - def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">, - ConvertIntITOF; - def int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">, - ConvertIntITOF; - def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">, - ConvertIntITOF; - def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">, - ConvertIntITOF; - def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">, - Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, - llvm_v2f32_ty, llvm_float_ty], []>; - def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">, - Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, - llvm_v2f32_ty], []>; - def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">, - Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, - llvm_v4f32_ty], []>; - def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">, - Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, - llvm_v4f32_ty], []>; -} diff --git a/lib/Target/R600/AMDILRegisterInfo.td b/lib/Target/R600/AMDILRegisterInfo.td deleted file mode 100644 index b9d033432e8c..000000000000 --- a/lib/Target/R600/AMDILRegisterInfo.td +++ /dev/null @@ -1,107 +0,0 @@ -//===- AMDILRegisterInfo.td - AMDIL Register defs ----------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//==-----------------------------------------------------------------------===// -// -// Declarations that describe the AMDIL register file -// -//===----------------------------------------------------------------------===// - -class AMDILReg<bits<16> num, string n> : Register<n> { - field bits<16> Value; - let Value = num; - let Namespace = "AMDGPU"; -} - -// We will start with 8 registers for each class before expanding to more -// Since the swizzle is added based on the register class, we can leave it -// off here and just specify different registers for different register classes -def R1 : AMDILReg<1, "r1">, DwarfRegNum<[1]>; -def R2 : AMDILReg<2, "r2">, DwarfRegNum<[2]>; -def R3 : AMDILReg<3, "r3">, DwarfRegNum<[3]>; -def R4 : AMDILReg<4, "r4">, DwarfRegNum<[4]>; -def R5 : AMDILReg<5, "r5">, DwarfRegNum<[5]>; -def R6 : AMDILReg<6, "r6">, DwarfRegNum<[6]>; -def R7 : AMDILReg<7, "r7">, DwarfRegNum<[7]>; -def R8 : AMDILReg<8, "r8">, DwarfRegNum<[8]>; -def R9 : AMDILReg<9, "r9">, DwarfRegNum<[9]>; -def R10 : AMDILReg<10, "r10">, DwarfRegNum<[10]>; -def R11 : AMDILReg<11, "r11">, DwarfRegNum<[11]>; -def R12 : AMDILReg<12, "r12">, DwarfRegNum<[12]>; -def R13 : AMDILReg<13, "r13">, DwarfRegNum<[13]>; -def R14 : AMDILReg<14, "r14">, DwarfRegNum<[14]>; -def R15 : AMDILReg<15, "r15">, DwarfRegNum<[15]>; -def R16 : AMDILReg<16, "r16">, DwarfRegNum<[16]>; -def R17 : AMDILReg<17, "r17">, DwarfRegNum<[17]>; -def R18 : AMDILReg<18, "r18">, DwarfRegNum<[18]>; -def R19 : AMDILReg<19, "r19">, DwarfRegNum<[19]>; -def R20 : AMDILReg<20, "r20">, DwarfRegNum<[20]>; - -// All registers between 1000 and 1024 are reserved and cannot be used -// unless commented in this section -// r1021-r1025 are used to dynamically calculate the local/group/thread/region/region_local ID's -// r1020 is used to hold the frame index for local arrays -// r1019 is used to hold the dynamic stack allocation pointer -// r1018 is used as a temporary register for handwritten code -// r1017 is used as a temporary register for handwritten code -// r1016 is used as a temporary register for load/store code -// r1015 is used as a temporary register for data segment offset -// r1014 is used as a temporary register for store code -// r1013 is used as the section data pointer register -// r1012-r1010 and r1001-r1008 are used for temporary I/O registers -// r1009 is used as the frame pointer register -// r999 is used as the mem register. -// r998 is used as the return address register. 
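-// (Illustrative, hedged: AMDILReg<1019, "r1019"> binds the 16-bit
-//  hardware encoding 1019 to the asm name "r1019"; names such as SP, FP,
-//  and RA below are those same reserved registers under their ABI roles.)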
-//def R1025 : AMDILReg<1025, "r1025">, DwarfRegNum<[1025]>; -//def R1024 : AMDILReg<1024, "r1024">, DwarfRegNum<[1024]>; -//def R1023 : AMDILReg<1023, "r1023">, DwarfRegNum<[1023]>; -//def R1022 : AMDILReg<1022, "r1022">, DwarfRegNum<[1022]>; -//def R1021 : AMDILReg<1021, "r1021">, DwarfRegNum<[1021]>; -//def R1020 : AMDILReg<1020, "r1020">, DwarfRegNum<[1020]>; -def SP : AMDILReg<1019, "r1019">, DwarfRegNum<[1019]>; -def T1 : AMDILReg<1018, "r1018">, DwarfRegNum<[1018]>; -def T2 : AMDILReg<1017, "r1017">, DwarfRegNum<[1017]>; -def T3 : AMDILReg<1016, "r1016">, DwarfRegNum<[1016]>; -def T4 : AMDILReg<1015, "r1015">, DwarfRegNum<[1015]>; -def T5 : AMDILReg<1014, "r1014">, DwarfRegNum<[1014]>; -def SDP : AMDILReg<1013, "r1013">, DwarfRegNum<[1013]>; -def R1012: AMDILReg<1012, "r1012">, DwarfRegNum<[1012]>; -def R1011: AMDILReg<1011, "r1011">, DwarfRegNum<[1011]>; -def R1010: AMDILReg<1010, "r1010">, DwarfRegNum<[1010]>; -def DFP : AMDILReg<1009, "r1009">, DwarfRegNum<[1009]>; -def R1008: AMDILReg<1008, "r1008">, DwarfRegNum<[1008]>; -def R1007: AMDILReg<1007, "r1007">, DwarfRegNum<[1007]>; -def R1006: AMDILReg<1006, "r1006">, DwarfRegNum<[1006]>; -def R1005: AMDILReg<1005, "r1005">, DwarfRegNum<[1005]>; -def R1004: AMDILReg<1004, "r1004">, DwarfRegNum<[1004]>; -def R1003: AMDILReg<1003, "r1003">, DwarfRegNum<[1003]>; -def R1002: AMDILReg<1002, "r1002">, DwarfRegNum<[1002]>; -def R1001: AMDILReg<1001, "r1001">, DwarfRegNum<[1001]>; -def MEM : AMDILReg<999, "mem">, DwarfRegNum<[999]>; -def RA : AMDILReg<998, "r998">, DwarfRegNum<[998]>; -def FP : AMDILReg<997, "r997">, DwarfRegNum<[997]>; -def GPRI16 : RegisterClass<"AMDGPU", [i16], 16, - (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { - let AltOrders = [(add (sequence "R%u", 1, 20))]; - let AltOrderSelect = [{ - return 1; - }]; - } -def GPRI32 : RegisterClass<"AMDGPU", [i32], 32, - (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { - let AltOrders = [(add (sequence "R%u", 1, 20))]; - let AltOrderSelect = [{ - return 1; - }]; - } -def GPRF32 : RegisterClass<"AMDGPU", [f32], 32, - (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { - let AltOrders = [(add (sequence "R%u", 1, 20))]; - let AltOrderSelect = [{ - return 1; - }]; - } diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 3c6fa5a7ae46..4d1608260048 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -13,10 +13,9 @@ add_public_tablegen_target(AMDGPUCommonTableGen) add_llvm_target(R600CodeGen AMDILCFGStructurizer.cpp - AMDILIntrinsicInfo.cpp - AMDILISelLowering.cpp AMDGPUAsmPrinter.cpp AMDGPUFrameLowering.cpp + AMDGPUIntrinsicInfo.cpp AMDGPUISelDAGToDAG.cpp AMDGPUMCInstLower.cpp AMDGPUMachineFunction.cpp @@ -24,8 +23,8 @@ add_llvm_target(R600CodeGen AMDGPUTargetMachine.cpp AMDGPUTargetTransformInfo.cpp AMDGPUISelLowering.cpp - AMDGPUConvertToISA.cpp AMDGPUInstrInfo.cpp + AMDGPUPromoteAlloca.cpp AMDGPURegisterInfo.cpp R600ClauseMergePass.cpp R600ControlFlowFinalizer.cpp @@ -41,6 +40,7 @@ add_llvm_target(R600CodeGen R600TextureIntrinsicsReplacer.cpp SIAnnotateControlFlow.cpp SIFixSGPRCopies.cpp + SIFixSGPRLiveRanges.cpp SIInsertWaits.cpp SIInstrInfo.cpp SIISelLowering.cpp diff --git a/lib/Target/R600/EvergreenInstructions.td 
b/lib/Target/R600/EvergreenInstructions.td index 20654419a8f4..dcb7e982c7fc 100644 --- a/lib/Target/R600/EvergreenInstructions.td +++ b/lib/Target/R600/EvergreenInstructions.td @@ -295,7 +295,7 @@ def : Pat<(i32 (sext_inreg i32:$src, i8)), def : Pat<(i32 (sext_inreg i32:$src, i16)), (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; -defm : BFIPatterns <BFI_INT_eg>; +defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32>; def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))], @@ -326,6 +326,8 @@ def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", def DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; +def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; + let hasSideEffects = 1 in { def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; } @@ -346,7 +348,7 @@ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; def GROUP_BARRIER : InstR600 < - (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local)], AnyALU>, + (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>, R600ALU_Word0, R600ALU_Word1_OP2 <0x54> { @@ -375,6 +377,11 @@ def GROUP_BARRIER : InstR600 < let ALUInst = 1; } +def : Pat < + (int_AMDGPU_barrier_global), + (GROUP_BARRIER) +>; + //===----------------------------------------------------------------------===// // LDS Instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index 11ae09102188..0927040cb5bc 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -99,9 +99,9 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { return; } - // The low 8 bits encoding value is the register index, for both VGPRs and - // SGPRs. - unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + // The low 8 bits of the encoding value is the register index, for both VGPRs + // and SGPRs. 
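+  // (Illustrative, hedged: with the 0xFF mask an encoding value of
+  //  0x105 gives RegIdx == 5, so a single register prints as "v5" or
+  //  "s5" depending on Type.)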
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); if (NumRegs == 1) { O << Type << RegIdx; return; @@ -216,13 +216,8 @@ void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - union Literal { - float f; - int32_t i; - } L; - - L.i = MI->getOperand(OpNo).getImm(); - O << L.i << "(" << L.f << ")"; + int32_t Imm = MI->getOperand(OpNo).getImm(); + O << Imm << '(' << BitsToFloat(Imm) << ')'; } void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 5e7cefed0ace..dc1344fb8d3f 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -172,17 +172,13 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixup, const MCSubtargetInfo &STI) const { if (MO.isReg()) { - if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) { + if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) return MRI.getEncodingValue(MO.getReg()); - } else { - return getHWReg(MO.getReg()); - } - } else if (MO.isImm()) { - return MO.getImm(); - } else { - assert(0); - return 0; + return getHWReg(MO.getReg()); } + + assert(MO.isImm()); + return MO.getImm(); } #include "AMDGPUGenMCCodeEmitter.inc" diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index d255e9690524..d98a6dbb37bd 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -14,6 +14,7 @@ #include "llvm/Support/Debug.h" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index d6c68305da76..7f3560a4eba8 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -13,6 +13,9 @@ //===----------------------------------------------------------------------===// #include "R600ISelLowering.h" +#include "AMDGPUFrameLowering.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" @@ -65,6 +68,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::BR_CC, MVT::i32, Expand); setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::FSUB, MVT::f32, Expand); @@ -133,19 +137,47 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + setTargetDAGCombine(ISD::FP_ROUND); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 
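  // (Hedged note: setTargetDAGCombine only registers interest; nodes with
  //  these opcodes are then fed to this target's PerformDAGCombine
  //  callback during the generic DAG-combiner phases. Signature in this
  //  revision, for reference:
  //    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;)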
setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setOperationAction(ISD::SUB, MVT::i64, Expand); + // These should be replaced by UDVIREM, but it does not happen automatically // during Type Legalization setOperationAction(ISD::UDIV, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i64, Custom); + setOperationAction(ISD::SDIV, MVT::i64, Custom); + setOperationAction(ISD::SREM, MVT::i64, Custom); + + // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 + // to be Legal/Custom in order to avoid library calls. + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::SUBE, VT, Expand); + } + setBooleanContents(ZeroOrNegativeOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); setSchedulingPreference(Sched::Source); @@ -537,11 +569,24 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); case ISD::FCOS: case ISD::FSIN: return LowerTrig(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::LOAD: { + SDValue Result = LowerLOAD(Op, DAG); + assert((!Result.getNode() || + Result.getNode()->getNumValues() == 2) && + "Load should return a value and a chain"); + return Result; + } + + case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); @@ -776,6 +821,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, AMDGPU::T0_Z, VT); + case Intrinsic::AMDGPU_rsq: + // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. 
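+      // (Hedged: rsq(x) is the reciprocal square root, roughly
+      //  1.0f / sqrtf(x); RSQ_LEGACY keeps R600-style edge-case
+      //  semantics, per the assumption in the note above.)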
+ return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); } // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) break; @@ -793,20 +841,172 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); return; - case ISD::LOAD: { - SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); - Results.push_back(SDValue(Node, 0)); - Results.push_back(SDValue(Node, 1)); - // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode - // function - DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); - return; + case ISD::UDIV: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(UDIVREM); + break; } - case ISD::STORE: - SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode(); - Results.push_back(SDValue(Node, 0)); - return; + case ISD::UREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(UDIVREM.getValue(1)); + break; + } + case ISD::SDIV: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(SDIVREM); + break; + } + case ISD::SREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(SDIVREM.getValue(1)); + break; + } + case ISD::SDIVREM: { + SDValue Op = SDValue(N, 1); + SDValue RES = LowerSDIVREM(Op, DAG); + Results.push_back(RES); + Results.push_back(RES.getValue(1)); + break; + } + case ISD::UDIVREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + SDValue one = DAG.getConstant(1, HalfVT); + SDValue zero = DAG.getConstant(0, HalfVT); + + //HiLo split + SDValue LHS = N->getOperand(0); + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); + + SDValue RHS = N->getOperand(1); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + + // Get Speculative values + SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); + SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); + + SDValue REM_Hi = zero; + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); + + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); + SDValue DIV_Lo = zero; + + const unsigned halfBitWidth = HalfVT.getSizeInBits(); + + for (unsigned i = 0; i < halfBitWidth; ++i) { + SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); + // Get Value of high bit + SDValue HBit; + if (halfBitWidth == 32 && Subtarget->hasBFE()) { + HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); + } else { + HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); + } + + SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, + 
DAG.getConstant(halfBitWidth - 1, HalfVT)); + REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); + REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); + + REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); + REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); + + + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); + + SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); + SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE); + + DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); + + // Update REM + + SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); + + REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE); + REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); + REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); + } + + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); + Results.push_back(DIV); + Results.push_back(REM); + break; } + } +} + +SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, + SDValue Vector) const { + + SDLoc DL(Vector); + EVT VecVT = Vector.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + SmallVector<SDValue, 8> Args; + + for (unsigned i = 0, e = VecVT.getVectorNumElements(); + i != e; ++i) { + Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, + Vector, DAG.getConstant(i, getVectorIdxTy()))); + } + + return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); +} + +SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Index = Op.getOperand(1); + + if (isa<ConstantSDNode>(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Index); +} + +SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue Index = Op.getOperand(2); + + if (isa<ConstantSDNode>(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Value, Index); + return vectorToVerticalVector(DAG, Insert); } SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { @@ -840,6 +1040,80 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { DAG.getConstantFP(3.14159265359, MVT::f32)); } +SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shift = Op.getOperand(2); + SDValue Zero = DAG.getConstant(0, VT); + SDValue One = DAG.getConstant(1, VT); + + SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT); + SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); + SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); + + // The dance around Width1 is necessary for the 0 special case. + // Without it the CompShift might be 32, producing incorrect results in + // Overflow.
So we do the shift in two steps; the alternative is to + // add a conditional to filter the special case. + + SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); + Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); + + SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); + HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); + SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); + + SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); + SDValue LoBig = Zero; + + Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); + Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +} + +SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shift = Op.getOperand(2); + SDValue Zero = DAG.getConstant(0, VT); + SDValue One = DAG.getConstant(1, VT); + + const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; + + SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT); + SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); + SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); + + // The dance around Width1 is necessary for the 0 special case. + // Without it the CompShift might be 32, producing incorrect results in + // Overflow. So we do the shift in two steps; the alternative is to + // add a conditional to filter the special case. + + SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); + Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); + + SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); + SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); + LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); + + SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); + SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; + + Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); + Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +} + SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode( ISD::SETCC, @@ -1369,6 +1643,15 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const return DAG.getMergeValues(Ops, DL); } +SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Jump = Op.getOperand(2); + + return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), + Chain, Jump, Cond); +} + /// XXX Only kernel functions are supported, so we can assume for now that /// every function is a kernel function, but in the future we should use /// separate calling conventions for kernel and non-kernel functions. 
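The two-step shift in LowerSHLParts/LowerSRXParts above keeps every individual shift amount inside [0, 31], so a shift amount of 0 never produces an undefined width-sized shift. A minimal scalar sketch of the same trick in plain C++ rather than SelectionDAG nodes (illustrative only; the function and variable names are made up, not from the patch):

    #include <cstdint>

    // High half of a 64-bit left shift built from 32-bit operations,
    // mirroring the select structure of LowerSHLParts. Assumes 0 <= shift < 64.
    uint32_t shl64_hi(uint32_t lo, uint32_t hi, uint32_t shift) {
      if (shift >= 32)                                // the "BigShift" path
        return lo << (shift - 32);
      // CompShift is 31 - shift, followed by one more shift by one: together
      // that is lo >> (32 - shift), but no single shift amount ever reaches 32.
      uint32_t overflow = (lo >> (31 - shift)) >> 1;  // "Overflow"
      return (hi << shift) | overflow;                // "HiSmall"
    }

The DAG version cannot branch, so it instead computes both the small-shift and big-shift results and picks one with getSelectCC on Shift < Width.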
@@ -1902,9 +2185,8 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SDValue FakeOp; std::vector<SDValue> Ops; - for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end(); - I != E; ++I) - Ops.push_back(*I); + for (const SDUse &I : Node->ops()) + Ops.push_back(I); if (Opcode == AMDGPU::DOT_4) { int OperandIdx[] = { diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index a8a464f338f7..d22c8c98a542 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -51,15 +51,18 @@ private: void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, MachineRegisterInfo & MRI, unsigned dword_offset) const; SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const; + SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; - /// \brief Lower ROTL opcode to BITALIGN - SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const; - + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const; SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, SelectionDAG &DAG) const; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index b0d9ae3e70e3..3972e2f03730 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -28,10 +28,9 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AMDGPUGenDFAPacketizer.inc" -R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm) - : AMDGPUInstrInfo(tm), - RI(tm), - ST(tm.getSubtarget<AMDGPUSubtarget>()) +R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st) + : AMDGPUInstrInfo(st), + RI(st) { } const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { @@ -52,11 +51,15 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { unsigned VectorComponents = 0; - if (AMDGPU::R600_Reg128RegClass.contains(DestReg) && - AMDGPU::R600_Reg128RegClass.contains(SrcReg)) { + if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg128RegClass.contains(SrcReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) { VectorComponents = 4; - } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) && - AMDGPU::R600_Reg64RegClass.contains(SrcReg)) { + } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg64RegClass.contains(SrcReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) { VectorComponents = 2; } @@ -768,16 +771,6 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return true; } -int R600InstrInfo::getBranchInstr(const MachineOperand &op) const { - const MachineInstr *MI = op.getParent(); - - switch (MI->getDesc().OpInfo->RegClass) { - default: // FIXME: fallthrough?? 
- case AMDGPU::GPRI32RegClassID: return AMDGPU::BRANCH_COND_i32; - case AMDGPU::GPRF32RegClassID: return AMDGPU::BRANCH_COND_f32; - }; -} - static MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); @@ -1064,10 +1057,34 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, return 2; } +bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + + switch(MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + case AMDGPU::R600_EXTRACT_ELT_V2: + case AMDGPU::R600_EXTRACT_ELT_V4: + buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(), + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(2).getReg(), + RI.getHWRegChan(MI->getOperand(1).getReg())); + break; + case AMDGPU::R600_INSERT_ELT_V2: + case AMDGPU::R600_INSERT_ELT_V4: + buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(3).getReg(), // Offset + RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel + break; + } + MI->eraseFromParent(); + return true; +} + void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const { const AMDGPUFrameLowering *TFL = - static_cast<const AMDGPUFrameLowering*>(TM.getFrameLowering()); + static_cast<const AMDGPUFrameLowering*>( + MF.getTarget().getFrameLowering()); unsigned StackWidth = TFL->getStackWidth(MF); int End = getIndirectIndexEnd(MF); @@ -1100,7 +1117,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned ValueReg, unsigned Address, unsigned OffsetReg) const { - unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, AMDGPU::AR_X, OffsetReg); setImmOperand(MOVA, AMDGPU::OpName::write, 0); @@ -1117,7 +1149,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned ValueReg, unsigned Address, unsigned OffsetReg) const { - unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: 
AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, AMDGPU::AR_X, OffsetReg); @@ -1220,7 +1267,6 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( const { assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); unsigned Opcode; - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); if (ST.getGeneration() <= AMDGPUSubtarget::R700) Opcode = AMDGPU::DOT4_r600; else diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index b5304a0edfd5..45a57d367b8b 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -32,12 +32,22 @@ namespace llvm { class R600InstrInfo : public AMDGPUInstrInfo { private: const R600RegisterInfo RI; - const AMDGPUSubtarget &ST; - int getBranchInstr(const MachineOperand &op) const; std::vector<std::pair<int, unsigned> > ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const; + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; public: enum BankSwizzle { ALU_VEC_012_SCL_210 = 0, @@ -48,7 +58,7 @@ namespace llvm { ALU_VEC_210 }; - explicit R600InstrInfo(AMDGPUTargetMachine &tm); + explicit R600InstrInfo(const AMDGPUSubtarget &st); const R600RegisterInfo &getRegisterInfo() const override; void copyPhysReg(MachineBasicBlock &MBB, @@ -197,6 +207,8 @@ namespace llvm { int getInstrLatency(const InstrItineraryData *ItinData, SDNode *Node) const override { return 1;} + virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + /// \brief Reserve the registers that may be accessed using indirect addressing. 
void reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const; diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 590fde20293e..73fa345ac0f2 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -125,7 +125,7 @@ class R600_1OP <bits<11> inst, string opName, list<dag> pattern, class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node, InstrItinClass itin = AnyALU> : R600_1OP <inst, opName, - [(set R600_Reg32:$dst, (node R600_Reg32:$src0))] + [(set R600_Reg32:$dst, (node R600_Reg32:$src0))], itin >; // If you add or change the operands for R600_2OP instructions, you must @@ -161,10 +161,10 @@ } class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node, - InstrItinClass itim = AnyALU> : + InstrItinClass itin = AnyALU> : R600_2OP <inst, opName, [(set R600_Reg32:$dst, (node R600_Reg32:$src0, - R600_Reg32:$src1))] + R600_Reg32:$src1))], itin >; // If you add or change the operands for R600_3OP instructions, you must @@ -721,14 +721,11 @@ def SETNE_DX10 : R600_2OP < >; def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; -def TRUNC : R600_1OP_Helper <0x11, "TRUNC", int_AMDGPU_trunc>; +def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>; def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>; def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>; -// Add also ftrunc intrinsic pattern -def : Pat<(ftrunc f32:$src0), (TRUNC $src0)>; - def MOV : R600_1OP <0x19, "MOV", []>; let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { @@ -1082,18 +1079,21 @@ class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper < let Itinerary = TransALU; } +// Clamped to maximum. class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper < - inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq + inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped > { let Itinerary = TransALU; } -class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP < - inst, "RECIPSQRT_IEEE", [] +class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper < + inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy > { let Itinerary = TransALU; } +// TODO: There is also RECIPSQRT_FF which clamps to zero. 
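As a rough scalar model of what the two selectors above distinguish (an assumption drawn from the node names AMDGPUrsq_clamped / AMDGPUrsq_legacy and the "Clamped to maximum" comment, not something this patch spells out): the IEEE flavor returns a plain 1/sqrt(x), while the clamped flavor saturates infinite results to the largest finite float.

    #include <cmath>
    #include <limits>

    // Illustrative-only models; actual hardware semantics may differ.
    float rsq_ieee(float x) { return 1.0f / std::sqrt(x); }

    float rsq_clamped(float x) {
      float r = rsq_ieee(x);
      if (std::isinf(r))  // e.g. x == 0.0f
        return std::copysign(std::numeric_limits<float>::max(), r);
      return r;
    }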
+ class SIN_Common <bits<11> inst> : R600_1OP < inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{ let Trig = 1; @@ -1266,13 +1266,6 @@ defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>; //===----------------------------------------------------------------------===// -// Branch Instructions -//===----------------------------------------------------------------------===// - -def IF_PREDICATE_SET : ILFormat<(outs), (ins GPRI32:$src), - "IF_PREDICATE_SET $src", []>; - -//===----------------------------------------------------------------------===// // Pseudo instructions //===----------------------------------------------------------------------===// @@ -1345,15 +1338,6 @@ def TXD_SHADOW: InstR600 < } // End isPseudo = 1 } // End usesCustomInserter = 1 -//===---------------------------------------------------------------------===// -// Return instruction -//===---------------------------------------------------------------------===// -let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, - usesCustomInserter = 1 in { - def RETURN : ILFormat<(outs), (ins variable_ops), - "RETURN", [(IL_retflag)]>; -} - //===----------------------------------------------------------------------===// // Constant Buffer Addressing Support @@ -1480,11 +1464,52 @@ let Inst{63-32} = Word1; let VTXInst = 1; } +//===---------------------------------------------------------------------===// +// Flow and Program control Instructions +//===---------------------------------------------------------------------===// +class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> +: Instruction { + + let Namespace = "AMDGPU"; + dag OutOperandList = outs; + dag InOperandList = ins; + let Pattern = pattern; + let AsmString = !strconcat(asmstr, "\n"); + let isPseudo = 1; + let Itinerary = NullALU; + bit hasIEEEFlag = 0; + bit hasZeroOpFlag = 0; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> { + def _i32 : ILFormat<(outs), + (ins brtarget:$target, rci:$src0), + "; i32 Pseudo branch instruction", + [(Op bb:$target, (i32 rci:$src0))]>; + def _f32 : ILFormat<(outs), + (ins brtarget:$target, rcf:$src0), + "; f32 Pseudo branch instruction", + [(Op bb:$target, (f32 rcf:$src0))]>; +} + +// Only scalar types should generate flow control +multiclass BranchInstr<string name> { + def _i32 : ILFormat<(outs), (ins R600_Reg32:$src), + !strconcat(name, " $src"), []>; + def _f32 : ILFormat<(outs), (ins R600_Reg32:$src), + !strconcat(name, " $src"), []>; +} +// Only scalar types should generate flow control +multiclass BranchInstr2<string name> { + def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), + !strconcat(name, " $src0, $src1"), []>; + def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), + !strconcat(name, " $src0, $src1"), []>; +} - -//===--------------------------------------------------------------------===// -// Instructions support -//===--------------------------------------------------------------------===// //===---------------------------------------------------------------------===// // Custom Inserter for Branches and returns, this eventually will be a // separate pass @@ -1497,13 +1522,22 @@ let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { } //===---------------------------------------------------------------------===// -// Flow and Program control Instructions +// Return instruction 
//===---------------------------------------------------------------------===// +let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, + usesCustomInserter = 1 in { + def RETURN : ILFormat<(outs), (ins variable_ops), + "RETURN", [(IL_retflag)]>; +} + +//===----------------------------------------------------------------------===// +// Branch Instructions +//===----------------------------------------------------------------------===// + +def IF_PREDICATE_SET : ILFormat<(outs), (ins R600_Reg32:$src), + "IF_PREDICATE_SET $src", []>; + let isTerminator=1 in { - def SWITCH : ILFormat< (outs), (ins GPRI32:$src), - !strconcat("SWITCH", " $src"), []>; - def CASE : ILFormat< (outs), (ins GPRI32:$src), - !strconcat("CASE", " $src"), []>; def BREAK : ILFormat< (outs), (ins), "BREAK", []>; def CONTINUE : ILFormat< (outs), (ins), @@ -1548,6 +1582,60 @@ let isTerminator=1 in { } //===----------------------------------------------------------------------===// +// Indirect addressing pseudo instructions +//===----------------------------------------------------------------------===// + +let isPseudo = 1 in { + +class ExtractVertical <RegisterClass vec_rc> : InstR600 < + (outs R600_Reg32:$dst), + (ins vec_rc:$vec, R600_Reg32:$index), "", + [], + AnyALU +>; + +let Constraints = "$dst = $vec" in { + +class InsertVertical <RegisterClass vec_rc> : InstR600 < + (outs vec_rc:$dst), + (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "", + [], + AnyALU +>; + +} // End Constraints = "$dst = $vec" + +} // End isPseudo = 1 + +def R600_EXTRACT_ELT_V2 : ExtractVertical <R600_Reg64Vertical>; +def R600_EXTRACT_ELT_V4 : ExtractVertical <R600_Reg128Vertical>; + +def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>; +def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>; + +class ExtractVerticalPat <Instruction inst, ValueType vec_ty, + ValueType scalar_ty> : Pat < + (scalar_ty (extractelt vec_ty:$vec, i32:$index)), + (inst $vec, $index) +>; + +def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2i32, i32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2f32, f32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>; + +class InsertVerticalPat <Instruction inst, ValueType vec_ty, + ValueType scalar_ty> : Pat < + (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)), + (inst $vec, $value, $index) +>; + +def : InsertVerticalPat <R600_INSERT_ELT_V2, v2i32, i32>; +def : InsertVerticalPat <R600_INSERT_ELT_V2, v2f32, f32>; +def : InsertVerticalPat <R600_INSERT_ELT_V4, v4i32, i32>; +def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>; + +//===----------------------------------------------------------------------===// // ISel Patterns //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index d1655d1ddc3c..7ea654cb14cd 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "R600MachineScheduler.h" +#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Pass.h" diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp index c2f6c03320d1..74cf30974d58 100644 --- a/lib/Target/R600/R600Packetizer.cpp +++ b/lib/Target/R600/R600Packetizer.cpp @@ -16,6 
+16,7 @@ #include "llvm/Support/Debug.h" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineDominators.h" diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp index f3bb88b3eefc..dc9567505082 100644 --- a/lib/Target/R600/R600RegisterInfo.cpp +++ b/lib/Target/R600/R600RegisterInfo.cpp @@ -20,15 +20,14 @@ using namespace llvm; -R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm) -: AMDGPURegisterInfo(tm), - TM(tm) +R600RegisterInfo::R600RegisterInfo(const AMDGPUSubtarget &st) +: AMDGPURegisterInfo(st) { RCW.RegWeight = 0; RCW.WeightLimit = 0;} BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo()); + const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(ST.getInstrInfo()); Reserved.set(AMDGPU::ZERO); Reserved.set(AMDGPU::HALF); @@ -55,16 +54,6 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } -const TargetRegisterClass * -R600RegisterInfo::getISARegClass(const TargetRegisterClass * rc) const { - switch (rc->getID()) { - case AMDGPU::GPRF32RegClassID: - case AMDGPU::GPRI32RegClassID: - return &AMDGPU::R600_Reg32RegClass; - default: return rc; - } -} - unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; } diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h index 52e1a4bed948..247808b6e7b6 100644 --- a/lib/Target/R600/R600RegisterInfo.h +++ b/lib/Target/R600/R600RegisterInfo.h @@ -16,26 +16,18 @@ #define R600REGISTERINFO_H_ #include "AMDGPURegisterInfo.h" -#include "AMDGPUTargetMachine.h" namespace llvm { -class R600TargetMachine; +class AMDGPUSubtarget; struct R600RegisterInfo : public AMDGPURegisterInfo { - AMDGPUTargetMachine &TM; RegClassWeight RCW; - R600RegisterInfo(AMDGPUTargetMachine &tm); + R600RegisterInfo(const AMDGPUSubtarget &st); BitVector getReservedRegs(const MachineFunction &MF) const override; - /// \param RC is an AMDIL reg class. - /// - /// \returns the R600 reg class that is equivalent to \p RC. - const TargetRegisterClass *getISARegClass( - const TargetRegisterClass *RC) const override; - /// \brief get the HW encoding for a register's channel. 
unsigned getHWRegChan(unsigned reg) const; diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index 68bcd207b42c..cc667d985a82 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -18,18 +18,28 @@ class R600RegWithChan <string name, bits<9> sel, string chan> : class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> : RegisterWithSubRegs<n, subregs> { + field bits<2> chan_encoding = 0; let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1, sub2, sub3]; - let HWEncoding = encoding; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; } class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> : RegisterWithSubRegs<n, subregs> { + field bits<2> chan_encoding = 0; let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = encoding; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; } +class R600Reg_64Vertical<int lo, int hi, string chan> : R600Reg_64 < + "V"#lo#hi#"_"#chan, + [!cast<Register>("T"#lo#"_"#chan), !cast<Register>("T"#hi#"_"#chan)], + lo +>; foreach Index = 0-127 in { foreach Chan = [ "X", "Y", "Z", "W" ] in { @@ -54,6 +64,24 @@ foreach Index = 0-127 in { Index>; } +foreach Chan = [ "X", "Y", "Z", "W"] in { + + let chan_encoding = !if(!eq(Chan, "X"), 0, + !if(!eq(Chan, "Y"), 1, + !if(!eq(Chan, "Z"), 2, + !if(!eq(Chan, "W"), 3, 0)))) in { + def V0123_#Chan : R600Reg_128 <"V0123_"#Chan, + [!cast<Register>("T0_"#Chan), + !cast<Register>("T1_"#Chan), + !cast<Register>("T2_"#Chan), + !cast<Register>("T3_"#Chan)], + 0>; + def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>; + def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>; + } +} + + // KCACHE_BANK0 foreach Index = 159-128 in { foreach Chan = [ "X", "Y", "Z", "W" ] in { @@ -130,8 +158,14 @@ def ALU_PARAM : R600Reg<"Param", 0>; let isAllocatable = 0 in { -// XXX: Only use the X channel, until we support wider stack widths -def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>; +def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>; + +// We only use Addr_[YZW] for vertical vectors. +// FIXME if we add more vertical vector registers we will need to add more +// registers to these classes. 
+def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>; +def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>; +def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>; def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32, (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>; @@ -206,5 +240,13 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, let CopyCost = -1; } +def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, + (add V0123_W, V0123_Z, V0123_Y, V0123_X) +>; + def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, (add (sequence "T%u_XY", 0, 63))>; + +def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, + (add V01_X, V01_Y, V01_Z, V01_W, + V23_X, V23_Y, V23_Z, V23_W)>; diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp index d6e445136faf..91eb60beb6bb 100644 --- a/lib/Target/R600/SIAnnotateControlFlow.cpp +++ b/lib/Target/R600/SIAnnotateControlFlow.cpp @@ -65,7 +65,6 @@ class SIAnnotateControlFlow : public FunctionPass { DominatorTree *DT; StackVector Stack; - SSAUpdater PhiInserter; bool isTopOfStack(BasicBlock *BB); @@ -81,7 +80,7 @@ class SIAnnotateControlFlow : public FunctionPass { void insertElse(BranchInst *Term); - void handleLoopCondition(Value *Cond); + Value *handleLoopCondition(Value *Cond, PHINode *Broken); void handleLoop(BranchInst *Term); @@ -177,7 +176,7 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) { } else { if (Phi->getIncomingValue(i) != BoolFalse) return false; - + } } return true; @@ -204,20 +203,26 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) { } /// \brief Recursively handle the condition leading to a loop -void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) { +Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken) { if (PHINode *Phi = dyn_cast<PHINode>(Cond)) { + BasicBlock *Parent = Phi->getParent(); + PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front()); + Value *Ret = NewPhi; // Handle all non-constant incoming values first for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { Value *Incoming = Phi->getIncomingValue(i); - if (isa<ConstantInt>(Incoming)) + BasicBlock *From = Phi->getIncomingBlock(i); + if (isa<ConstantInt>(Incoming)) { + NewPhi->addIncoming(Broken, From); continue; + } Phi->setIncomingValue(i, BoolFalse); - handleLoopCondition(Incoming); + Value *PhiArg = handleLoopCondition(Incoming, Broken); + NewPhi->addIncoming(PhiArg, From); } - BasicBlock *Parent = Phi->getParent(); BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { @@ -230,33 +235,28 @@ void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) { if (From == IDom) { CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt()); if (OldEnd && OldEnd->getCalledFunction() == EndCf) { - Value *Args[] = { - OldEnd->getArgOperand(0), - PhiInserter.GetValueAtEndOfBlock(Parent) - }; - Value *Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); - PhiInserter.AddAvailableValue(Parent, Ret); + Value *Args[] = { OldEnd->getArgOperand(0), NewPhi }; + Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); continue; } } - TerminatorInst *Insert = From->getTerminator(); - Value *Arg = PhiInserter.GetValueAtEndOfBlock(From); - Value *Ret = CallInst::Create(Break, Arg, "", Insert); - PhiInserter.AddAvailableValue(From, Ret); + Value *PhiArg = CallInst::Create(Break, Broken, "", 
Insert); + NewPhi->setIncomingValue(i, PhiArg); } eraseIfUnused(Phi); + return Ret; } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) { BasicBlock *Parent = Inst->getParent(); TerminatorInst *Insert = Parent->getTerminator(); - Value *Args[] = { Cond, PhiInserter.GetValueAtEndOfBlock(Parent) }; - Value *Ret = CallInst::Create(IfBreak, Args, "", Insert); - PhiInserter.AddAvailableValue(Parent, Ret); + Value *Args[] = { Cond, Broken }; + return CallInst::Create(IfBreak, Args, "", Insert); } else { llvm_unreachable("Unhandled loop condition!"); } + return 0; } /// \brief Handle a back edge (loop) @@ -264,15 +264,11 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { BasicBlock *Target = Term->getSuccessor(1); PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); - PhiInserter.Initialize(Int64, ""); - PhiInserter.AddAvailableValue(Target, Broken); - Value *Cond = Term->getCondition(); Term->setCondition(BoolTrue); - handleLoopCondition(Cond); + Value *Arg = handleLoopCondition(Cond, Broken); BasicBlock *BB = Term->getParent(); - Value *Arg = PhiInserter.GetValueAtEndOfBlock(BB); for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); PI != PE; ++PI) { diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h index 2cbce282cbe7..4d31a1185933 100644 --- a/lib/Target/R600/SIDefines.h +++ b/lib/Target/R600/SIDefines.h @@ -35,4 +35,54 @@ enum { #define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC + +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F) +#define C_00B848_VGPRS 0xFFFFFFC0 +#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B848_SGPRS 0xFFFFFC3F +#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B848_PRIORITY 0xFFFFF3FF +#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12) +#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF) +#define C_00B848_FLOAT_MODE 0xFFF00FFF +#define S_00B848_PRIV(x) (((x) & 0x1) << 20) +#define G_00B848_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B848_PRIV 0xFFEFFFFF +#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B848_DX10_CLAMP 0xFFDFFFFF +#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B848_DEBUG_MODE 0xFFBFFFFF +#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define C_00B848_IEEE_MODE 0xFF7FFFFF + + +// Helpers for setting FLOAT_MODE +#define FP_ROUND_ROUND_TO_NEAREST 0 +#define FP_ROUND_ROUND_TO_INF 1 +#define FP_ROUND_ROUND_TO_NEGINF 2 +#define FP_ROUND_ROUND_TO_ZERO 3 + +// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double +// precision. +#define FP_ROUND_MODE_SP(x) ((x) & 0x3) +#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2) + +#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0 +#define FP_DENORM_FLUSH_OUT 1 +#define FP_DENORM_FLUSH_IN 2 +#define FP_DENORM_FLUSH_NONE 3 + + +// Bits 7:4 control denormal handling. 5:4 control single precision, 7:6 double +// precision. 
+#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4) +#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) + #endif // SIDEFINES_H_ diff --git a/lib/Target/R600/SIFixSGPRLiveRanges.cpp b/lib/Target/R600/SIFixSGPRLiveRanges.cpp new file mode 100644 index 000000000000..7d116eef396c --- /dev/null +++ b/lib/Target/R600/SIFixSGPRLiveRanges.cpp @@ -0,0 +1,110 @@ +//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// SALU instructions ignore control flow, so we need to modify the live ranges +/// of the registers they define. +/// +/// The strategy is to view the entire program as if it were a single basic +/// block and calculate the intervals accordingly. We implement this +/// by walking this list of segments for each LiveRange and setting the +/// end of each segment equal to the start of the segment that immediately +/// follows it. + +#include "AMDGPU.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-sgpr-live-ranges" + +namespace { + +class SIFixSGPRLiveRanges : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixSGPRLiveRanges() : MachineFunctionPass(ID) { + initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnMachineFunction(MachineFunction &MF) override; + + virtual const char *getPassName() const override { + return "SI Fix SGPR live ranges"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.addPreserved<LiveIntervals>(); + AU.addPreserved<SlotIndexes>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. 
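The flattening strategy described in the file comment reduces to one pass over each live range's segment list. A standalone sketch of just that step (types simplified; the struct is a made-up stand-in for LiveRange::Segment, and the loop mirrors what runOnMachineFunction below does for each SGPR LiveInterval):

    #include <cstddef>
    #include <vector>

    struct Segment { unsigned start, end; };  // simplified stand-in

    // Stretch every segment to meet the start of the one that follows it,
    // as if the whole function were a single basic block.
    void flattenSegments(std::vector<Segment> &segs) {
      for (std::size_t i = 0; i + 1 < segs.size(); ++i)
        segs[i].end = segs[i + 1].start;
    }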
+ +INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, + "SI Fix SGPR Live Ranges", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, + "SI Fix SGPR Live Ranges", false, false) + +char SIFixSGPRLiveRanges::ID = 0; + +char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID; + +FunctionPass *llvm::createSIFixSGPRLiveRangesPass() { + return new SIFixSGPRLiveRanges(); +} + +bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( + MF.getTarget().getRegisterInfo()); + LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + MachineOperand *ExecUse = MI.findRegisterUseOperand(AMDGPU::EXEC); + if (ExecUse) + continue; + + for (const MachineOperand &Def : MI.operands()) { + if (!Def.isReg() || !Def.isDef() ||!TargetRegisterInfo::isVirtualRegister(Def.getReg())) + continue; + + const TargetRegisterClass *RC = MRI.getRegClass(Def.getReg()); + + if (!TRI->isSGPRClass(RC)) + continue; + LiveInterval &LI = LIS->getInterval(Def.getReg()); + for (unsigned i = 0, e = LI.size() - 1; i != e; ++i) { + LiveRange::Segment &Seg = LI.segments[i]; + LiveRange::Segment &Next = LI.segments[i + 1]; + Seg.end = Next.start; + } + } + } + } + + return false; +} diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index c9e247c16656..b13c3b8dee2e 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -14,8 +14,8 @@ #include "SIISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" -#include "AMDILIntrinsicInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/Function.h" +#include "llvm/ADT/SmallString.h" using namespace llvm; @@ -76,6 +77,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::ADD, MVT::i32, Legal); setOperationAction(ISD::ADDC, MVT::i32, Legal); setOperationAction(ISD::ADDE, MVT::i32, Legal); + setOperationAction(ISD::SUBC, MVT::i32, Legal); + setOperationAction(ISD::SUBE, MVT::i32, Legal); // We need to custom lower vector stores from local memory setOperationAction(ISD::LOAD, MVT::v2i32, Custom); @@ -88,14 +91,12 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : // We need to custom lower loads/stores from private memory setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::i64, Custom); setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v8i32, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); - setOperationAction(ISD::STORE, MVT::i64, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); @@ -105,18 +106,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - 
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); - - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); setOperationAction(ISD::SETCC, MVT::v2i1, Expand); setOperationAction(ISD::SETCC, MVT::v4i1, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); @@ -139,6 +136,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); @@ -215,9 +213,16 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::FRINT, MVT::f64, Legal); } + // FIXME: These should be removed and handled the same was as f32 fneg. Source + // modifiers also work for the double instructions. + setOperationAction(ISD::FNEG, MVT::f64, Expand); + setOperationAction(ISD::FABS, MVT::f64, Expand); + setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::UINT_TO_FP); + setSchedulingPreference(Sched::RegPressure); } @@ -265,8 +270,12 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT, return VT.bitsGT(MVT::i32); } -bool SITargetLowering::shouldSplitVectorType(EVT VT) const { - return VT.getScalarType().bitsLE(MVT::i16); +TargetLoweringBase::LegalizeTypeAction +SITargetLowering::getPreferredVectorAction(EVT VT) const { + if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) + return TypeSplitVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); } bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, @@ -482,19 +491,20 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MI->eraseFromParent(); break; } - case AMDGPU::V_SUB_F64: - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), - MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()) - .addReg(MI->getOperand(2).getReg()) - .addImm(0) /* src2 */ - .addImm(0) /* ABS */ - .addImm(0) /* CLAMP */ - .addImm(0) /* OMOD */ - .addImm(2); /* NEG */ + case AMDGPU::V_SUB_F64: { + unsigned DestReg = MI->getOperand(0).getReg(); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg) + .addImm(0) // SRC0 modifiers + .addReg(MI->getOperand(1).getReg()) + .addImm(1) // SRC1 modifiers + .addReg(MI->getOperand(2).getReg()) + .addImm(0) // SRC2 modifiers + .addImm(0) // src2 + .addImm(0) // CLAMP + .addImm(0); // OMOD MI->eraseFromParent(); break; - + } case AMDGPU::SI_RegisterStorePseudo: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); @@ -595,27 +605,31 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::LOAD: { LoadSDNode *Load = dyn_cast<LoadSDNode>(Op); + EVT VT = Op.getValueType(); + + // These loads are legal. 
+ if (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + VT.isVector() && VT.getVectorNumElements() == 2 && + VT.getVectorElementType() == MVT::i32) + return SDValue(); + if (Op.getValueType().isVector() && (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS || (Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && Op.getValueType().getVectorNumElements() > 4))) { - SDValue MergedValues[2] = { - SplitVectorLoad(Op, DAG), - Load->getChain() - }; - return DAG.getMergeValues(MergedValues, SDLoc(Op)); + return SplitVectorLoad(Op, DAG); } else { - return LowerLOAD(Op, DAG); + SDValue Result = LowerLOAD(Op, DAG); + assert((!Result.getNode() || + Result.getNode()->getNumValues() == 2) && + "Load should return a value and a chain"); + return Result; } } case ISD::SELECT: return LowerSELECT(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); - case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::ANY_EXTEND: // Fall-through - case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); case ISD::INTRINSIC_WO_CHAIN: { unsigned IntrinsicID = @@ -827,13 +841,9 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *Load = cast<LoadSDNode>(Op); - SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG); - SDValue MergedValues[2]; - MergedValues[1] = Load->getChain(); - if (Ret.getNode()) { - MergedValues[0] = Ret; - return DAG.getMergeValues(MergedValues, DL); - } + SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG); + if (Lowered.getNode()) + return Lowered; if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { return SDValue(); @@ -846,25 +856,38 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), DAG.getConstant(2, MVT::i32)); - Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Load->getChain(), Ptr, - DAG.getTargetConstant(0, MVT::i32), - Op.getOperand(2)); + + // FIXME: REGISTER_LOAD should probably have a chain result. + SDValue Chain = Load->getChain(); + SDValue LoLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Chain, Ptr, + DAG.getTargetConstant(0, MVT::i32), + Op.getOperand(2)); + + SDValue Ret = LoLoad.getValue(0); if (MemVT.getSizeInBits() == 64) { + // TODO: This needs a test to make sure the right thing is happening with + // the chain. That is hard without general function support. 
+ SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, DAG.getConstant(1, MVT::i32)); - SDValue LoadUpper = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Load->getChain(), IncPtr, - DAG.getTargetConstant(0, MVT::i32), - Op.getOperand(2)); + SDValue HiLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Chain, IncPtr, + DAG.getTargetConstant(0, MVT::i32), + Op.getOperand(2)); - Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ret, LoadUpper); + Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, LoLoad, HiLoad); + // Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + // LoLoad.getValue(1), HiLoad.getValue(1)); } - MergedValues[0] = Ret; - return DAG.getMergeValues(MergedValues, DL); + SDValue Ops[] = { + Ret, + Chain + }; + return DAG.getMergeValues(Ops, DL); } SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, @@ -903,39 +926,17 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); } -SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue True = Op.getOperand(2); - SDValue False = Op.getOperand(3); - SDValue CC = Op.getOperand(4); - EVT VT = Op.getValueType(); - SDLoc DL(Op); - - SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC); - return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False); -} - -SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - - if (VT != MVT::i64) { - return SDValue(); - } - - SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0), - DAG.getConstant(31, MVT::i32)); - - return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi); -} - SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); StoreSDNode *Store = cast<StoreSDNode>(Op); EVT VT = Store->getMemoryVT(); + // These stores are legal. + if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + VT.isVector() && VT.getVectorNumElements() == 2 && + VT.getVectorElementType() == MVT::i32) + return SDValue(); + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); if (Ret.getNode()) return Ret; @@ -1011,27 +1012,99 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return Chain; } +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + EVT ScalarVT = VT.getScalarType(); + if (ScalarVT != MVT::f32) + return SDValue(); -SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); - if (VT != MVT::i64) { + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + + // TODO: We could try to match extracting the higher bytes, which would be + // easier if i8 vectors weren't promoted to i32 vectors, particularly after + // types are legalized. v4i8 -> v4f32 is probably the only case to worry + // about in practice. 
+ if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { + if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { + SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); + DCI.AddToWorklist(Cvt.getNode()); + return Cvt; + } + } + + // We are primarily trying to catch operations on illegal vector types + // before they are expanded. + // For scalars, we can use the more flexible method of checking masked bits + // after legalization. + if (!DCI.isBeforeLegalize() || + !SrcVT.isVector() || + SrcVT.getVectorElementType() != MVT::i8) { return SDValue(); } - SDValue Src = Op.getOperand(0); - if (Src.getValueType() != MVT::i32) - Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src); + assert(DCI.isBeforeLegalize() && "Unexpected legal type"); - SDValue Zero = DAG.getConstant(0, MVT::i32); - return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Src, Zero); -} + // Weird sized vectors are a pain to handle, but we know 3 is really the same + // size as 4. + unsigned NElts = SrcVT.getVectorNumElements(); + if (!SrcVT.isSimple() && NElts != 3) + return SDValue(); -//===----------------------------------------------------------------------===// -// Custom DAG optimizations -//===----------------------------------------------------------------------===// + // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to + // prevent a mess from expanding to v4i32 and repacking. + if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { + EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); + EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); + + LoadSDNode *Load = cast<LoadSDNode>(Src); + SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, + Load->getChain(), + Load->getBasePtr(), + LoadVT, + Load->getMemOperand()); + + // Make sure successors of the original load stay after it by updating + // them to use the new Chain. 
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); + + SmallVector<SDValue, 4> Elts; + if (RegVT.isVector()) + DAG.ExtractVectorElements(NewLoad, Elts); + else + Elts.push_back(NewLoad); + + SmallVector<SDValue, 4> Ops; + + unsigned EltIdx = 0; + for (SDValue Elt : Elts) { + unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); + for (unsigned I = 0; I < ComponentsInElt; ++I) { + unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; + SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); + DCI.AddToWorklist(Cvt.getNode()); + Ops.push_back(Cvt); + } + + ++EltIdx; + } + + assert(Ops.size() == NElts); + + return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); + } + + return SDValue(); +} SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -1074,6 +1147,31 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, } break; } + + case AMDGPUISD::CVT_F32_UBYTE0: + case AMDGPUISD::CVT_F32_UBYTE1: + case AMDGPUISD::CVT_F32_UBYTE2: + case AMDGPUISD::CVT_F32_UBYTE3: { + unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; + + SDValue Src = N->getOperand(0); + APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(Src, Demanded) || + TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + + break; + } + + case ISD::UINT_TO_FP: { + return performUCharToFloatCombine(N, DCI); + } } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); @@ -1297,7 +1395,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, int32_t Immediate = Desc->getSize() == 4 ? 0 : -1; bool HaveVSrc = false, HaveSSrc = false; - // First figure out what we alread have in this instruction + // First figure out what we already have in this instruction. for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; i != e && Op < NumOps; ++i, ++Op) { @@ -1316,7 +1414,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, } } - // If we neither have VSrc nor SSrc it makes no sense to continue + // If we neither have VSrc nor SSrc, it makes no sense to continue. if (!HaveVSrc && !HaveSSrc) return Node; @@ -1332,17 +1430,17 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, const SDValue &Operand = Node->getOperand(i); Ops.push_back(Operand); - // Already folded immediate ? + // Already folded immediate? if (isa<ConstantSDNode>(Operand.getNode()) || isa<ConstantFPSDNode>(Operand.getNode())) continue; - // Is this a VSrc or SSrc operand ? + // Is this a VSrc or SSrc operand? unsigned RegClass = Desc->OpInfo[Op].RegClass; if (isVSrc(RegClass) || isSSrc(RegClass)) { // Try to fold the immediates if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) { - // Folding didn't worked, make sure we don't hit the SReg limit + // Folding didn't work, make sure we don't hit the SReg limit. 
ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed); } continue; @@ -1371,7 +1469,6 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, continue; if (DescE64) { - // Test if it makes sense to switch to e64 encoding unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass; if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass)) @@ -1402,7 +1499,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, if (!DescE64) continue; Desc = DescE64; - DescE64 = 0; + DescE64 = nullptr; } else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) { Ops.pop_back(); @@ -1412,7 +1509,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, if (!DescE64) continue; Desc = DescE64; - DescE64 = 0; + DescE64 = nullptr; } } @@ -1535,7 +1632,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, } } -/// \brief Fold the instructions after slecting them +/// \brief Fold the instructions after selecting them. SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { const SIInstrInfo *TII = diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index c6eaa812e4d6..e25323ae87d5 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -27,10 +27,7 @@ class SITargetLowering : public AMDGPUTargetLowering { SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; bool foldImm(SDValue &Operand, int32_t &Immediate, @@ -46,11 +43,16 @@ class SITargetLowering : public AMDGPUTargetLowering { void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const; + static SDValue performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI); + public: SITargetLowering(TargetMachine &tm); bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, bool *IsFast) const override; - bool shouldSplitVectorType(EVT VT) const override; + + TargetLoweringBase::LegalizeTypeAction + getPreferredVectorAction(EVT VT) const override; bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index a17fed7e7ea7..173332674ff1 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -341,6 +341,8 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { return Result; } +// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" +// around other non-memory instructions. 
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
  bool Changes = false;
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 168eff25bb22..7cae9fc0d0eb 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -51,6 +51,16 @@ class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> :
  let Size = 8;
}

+class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64 <outs, ins, asm, pattern> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VOP3 = 1;
+}
+
//===----------------------------------------------------------------------===//
// Scalar operations
//===----------------------------------------------------------------------===//
@@ -207,7 +217,7 @@ class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
}

class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    Enc64 <outs, ins, asm, pattern> {
+    VOP3Common <outs, ins, asm, pattern> {

  bits<8> dst;
  bits<2> src0_modifiers;
@@ -233,16 +243,11 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
  let Inst{61} = src0_modifiers{0};
  let Inst{62} = src1_modifiers{0};
  let Inst{63} = src2_modifiers{0};
-
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
-  let VOP3 = 1;
+
}

class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    Enc64 <outs, ins, asm, pattern> {
+    VOP3Common <outs, ins, asm, pattern> {

  bits<8> dst;
  bits<2> src0_modifiers;
@@ -266,11 +271,6 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
  let Inst{62} = src1_modifiers{0};
  let Inst{63} = src2_modifiers{0};

-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
-  let VOP3 = 1;
}

class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 4a9e3467c5a6..455c8904122f 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -19,13 +19,14 @@
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
#include "llvm/MC/MCInstrDesc.h"

using namespace llvm;

-SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm)
-  : AMDGPUInstrInfo(tm),
-    RI(tm) { }
+SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
+  : AMDGPUInstrInfo(st),
+    RI(st) { }

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//
@@ -187,18 +188,25 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
-  SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+  MachineFunction *MF = MBB.getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned KillFlag = isKill ? RegState::Kill : 0;
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

-  if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) {
-    unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent());
+  if (RI.hasVGPRs(RC)) {
+    LLVMContext &Ctx = MF->getFunction()->getContext();
+    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Can't spill VGPR!");
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0)
+            .addReg(SrcReg);
+  } else if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) {
+    unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MF);
+    unsigned TgtReg = MFI->SpillTracker.LaneVGPR;

-    BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), MFI->SpillTracker.LaneVGPR)
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), TgtReg)
            .addReg(SrcReg, KillFlag)
            .addImm(Lane);
-    MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane);
+    MFI->SpillTracker.addSpilledReg(FrameIndex, TgtReg, Lane);
  } else if (RI.isSGPRClass(RC)) {
    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use pseudo instruction for vector
@@ -207,8 +215,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
    // Reserve a spot in the spill tracker for each sub-register of
    // the vector register.
    unsigned NumSubRegs = RC->getSize() / 4;
-    unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent(),
-                                                        NumSubRegs);
+    unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MF, NumSubRegs);
    MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR,
                                    FirstLane);

@@ -234,19 +241,19 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       unsigned DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
-  SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+  MachineFunction *MF = MBB.getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  DebugLoc DL = MBB.findDebugLoc(MI);
-  if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) {
-    SIMachineFunctionInfo::SpilledReg Spill =
-        MFI->SpillTracker.getSpilledReg(FrameIndex);
-    assert(Spill.VGPR);
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg)
-            .addReg(Spill.VGPR)
-            .addImm(Spill.Lane);
-    insertNOPs(MI, 3);
+
+  if (RI.hasVGPRs(RC)) {
+    LLVMContext &Ctx = MF->getFunction()->getContext();
+    Ctx.emitError("SIInstrInfo::loadRegToStackSlot - Can't retrieve spilled VGPR!");
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+            .addImm(0);
  } else if (RI.isSGPRClass(RC)){
    unsigned Opcode;
    switch(RC->getSize() * 8) {
+    case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break;
    case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break;
    case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
    case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
@@ -260,7 +267,6 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
            .addReg(Spill.VGPR)
            .addImm(FrameIndex);
-    insertNOPs(MI, 3);
  } else {
    llvm_unreachable("VGPR spilling not supported");
  }
@@ -281,6 +287,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
    return 2;
+  case AMDGPU::SI_SPILL_S32_RESTORE:
+    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}
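The spill code above never touches real stack memory for SGPRs: each spilled
32-bit SGPR is written into one lane of a reserved VGPR via V_WRITELANE_B32
and read back with V_READLANE_B32, keyed by frame index. A simplified model of
the bookkeeping (this struct is illustrative only; the in-tree type is
SIMachineFunctionInfo's SpillTracker, whose exact interface differs):

    #include <map>
    #include <utility>

    // Illustrative model: frame indexes map to (VGPR, lane) pairs, and lanes
    // are handed out sequentially; a multi-word SGPR tuple takes one lane per
    // 32-bit sub-register, which is why reserveLanes accepts a count.
    struct SpillTrackerModel {
      unsigned LaneVGPR = 0; // the VGPR currently acting as spill container
      unsigned NextLane = 0; // next unused 32-bit lane within LaneVGPR
      std::map<int, std::pair<unsigned, unsigned>> Spills; // FI -> (VGPR, lane)

      unsigned reserveLanes(unsigned Count = 1) {
        unsigned First = NextLane;
        NextLane += Count;
        return First;
      }
      void addSpilledReg(int FrameIndex, unsigned VGPR, unsigned Lane) {
        Spills[FrameIndex] = {VGPR, Lane};
      }
      std::pair<unsigned, unsigned> getSpilledReg(int FrameIndex) {
        return Spills[FrameIndex];
      }
    };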
@@ -334,7 +342,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
-  case AMDGPU::SI_SPILL_S64_RESTORE: {
+  case AMDGPU::SI_SPILL_S64_RESTORE:
+  case AMDGPU::SI_SPILL_S32_RESTORE: {
    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());

    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
@@ -348,6 +357,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
              .addReg(MI->getOperand(1).getReg())
              .addImm(Spill.Lane + i);
    }
+    insertNOPs(MI, 3);
    MI->eraseFromParent();
    break;
  }
@@ -514,6 +524,23 @@ bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const {
  return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO);
}

+static bool compareMachineOp(const MachineOperand &Op0,
+                             const MachineOperand &Op1) {
+  if (Op0.getType() != Op1.getType())
+    return false;
+
+  switch (Op0.getType()) {
+  case MachineOperand::MO_Register:
+    return Op0.getReg() == Op1.getReg();
+  case MachineOperand::MO_Immediate:
+    return Op0.getImm() == Op1.getImm();
+  case MachineOperand::MO_FPImmediate:
+    return Op0.getFPImm() == Op1.getFPImm();
+  default:
+    llvm_unreachable("Didn't expect to be comparing these operand types");
+  }
+}
+
bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
                                    StringRef &ErrInfo) const {
  uint16_t Opcode = MI->getOpcode();
@@ -532,7 +559,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
  // Make sure the register classes are correct
  for (unsigned i = 0, e = Desc.getNumOperands(); i != e; ++i) {
    switch (Desc.OpInfo[i].OperandType) {
-    case MCOI::OPERAND_REGISTER:
+    case MCOI::OPERAND_REGISTER: {
+      int RegClass = Desc.OpInfo[i].RegClass;
+      if (!RI.regClassCanUseImmediate(RegClass) &&
+          (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm())) {
+        ErrInfo = "Expected register, but got immediate";
+        return false;
+      }
+    }
      break;
    case MCOI::OPERAND_IMMEDIATE:
      if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm()) {
@@ -620,6 +654,24 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
      return false;
    }
  }
+
+  // Verify misc. restrictions on specific instructions.
+  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
+      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
+    MI->dump();
+
+    const MachineOperand &Src0 = MI->getOperand(2);
+    const MachineOperand &Src1 = MI->getOperand(3);
+    const MachineOperand &Src2 = MI->getOperand(4);
+    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
+      if (!compareMachineOp(Src0, Src1) &&
+          !compareMachineOp(Src0, Src2)) {
+        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
+        return false;
+      }
+    }
+  }
+
  return true;
}
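The new check encodes a hardware restriction on V_DIV_SCALE: when all three
sources are registers, src0 must repeat either src1 or src2. The same
predicate restated as standalone C++ (hypothetical types, same logic as the
verifier above):

    // Valid iff src0 aliases src1 or src2; only the all-register form is
    // constrained, matching the verifier's isReg() guard.
    struct Op { bool IsReg; unsigned Reg; };
    static bool divScaleSourcesValid(Op Src0, Op Src1, Op Src2) {
      if (!(Src0.IsReg && Src1.IsReg && Src2.IsReg))
        return true;
      return Src0.Reg == Src1.Reg || Src0.Reg == Src2.Reg;
    }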
@@ -654,7 +706,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
+  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
+  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
@@ -667,6 +721,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
  case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
+  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e32;
+  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
+  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  }
}
@@ -731,8 +788,8 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
  unsigned SubReg = MRI.createVirtualRegister(SubRC);

  // Just in case the super register is itself a sub-register, copy it to a new
-  // value so we don't need to wory about merging its subreg index with the
-  // SubIdx passed to this function. The register coalescer should be able to
+  // value so we don't need to worry about merging its subreg index with the
+  // SubIdx passed to this function. The register coalescer should be able to
  // eliminate this extra copy.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY),
          NewSuperReg)
@@ -1157,22 +1214,27 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
      continue;
    }
    case AMDGPU::S_AND_B64:
-      splitScalar64BitOp(Worklist, Inst, AMDGPU::S_AND_B32);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_OR_B64:
-      splitScalar64BitOp(Worklist, Inst, AMDGPU::S_OR_B32);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_XOR_B64:
-      splitScalar64BitOp(Worklist, Inst, AMDGPU::S_XOR_B32);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32);
      Inst->eraseFromParent();
      continue;

    case AMDGPU::S_NOT_B64:
-      splitScalar64BitOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+      Inst->eraseFromParent();
+      continue;
+
+    case AMDGPU::S_BCNT1_I32_B64:
+      splitScalar64BitBCNT(Worklist, Inst);
      Inst->eraseFromParent();
      continue;

@@ -1217,6 +1279,10 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
      // 3 to not hit an assertion later in MCInstLower.
      Inst->addOperand(MachineOperand::CreateImm(0));
      Inst->addOperand(MachineOperand::CreateImm(0));
+    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
+      // The VALU version adds the second operand to the result, so insert an
+      // extra 0 operand.
+      Inst->addOperand(MachineOperand::CreateImm(0));
    }

    addDescImplicitUseDef(NewDesc, Inst);
@@ -1297,9 +1363,62 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
  return &AMDGPU::VReg_32RegClass;
}

-void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
-                                     MachineInstr *Inst,
-                                     unsigned Opcode) const {
+void SIInstrInfo::splitScalar64BitUnaryOp(
+  SmallVectorImpl<MachineInstr *> &Worklist,
+  MachineInstr *Inst,
+  unsigned Opcode) const {
+  MachineBasicBlock &MBB = *Inst->getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  MachineOperand &Dest = Inst->getOperand(0);
+  MachineOperand &Src0 = Inst->getOperand(1);
+  DebugLoc DL = Inst->getDebugLoc();
+
+  MachineBasicBlock::iterator MII = Inst;
+
+  const MCInstrDesc &InstDesc = get(Opcode);
+  const TargetRegisterClass *Src0RC = Src0.isReg() ?
+    MRI.getRegClass(Src0.getReg()) :
+    &AMDGPU::SGPR_32RegClass;
+
+  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+
+  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub0, Src0SubRC);
+
+  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+  const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
+
+  unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
+  MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
+    .addOperand(SrcReg0Sub0);
+
+  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub1, Src0SubRC);
+
+  unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
+  MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
+    .addOperand(SrcReg0Sub1);
+
+  unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+    .addReg(DestSub0)
+    .addImm(AMDGPU::sub0)
+    .addReg(DestSub1)
+    .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // Try to legalize the operands in case we need to swap the order to keep it
+  // valid.
+  Worklist.push_back(LoHalf);
+  Worklist.push_back(HiHalf);
+}
+
+void SIInstrInfo::splitScalar64BitBinaryOp(
+  SmallVectorImpl<MachineInstr *> &Worklist,
+  MachineInstr *Inst,
+  unsigned Opcode) const {
  MachineBasicBlock &MBB = *Inst->getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

@@ -1360,6 +1479,46 @@ void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
  Worklist.push_back(HiHalf);
}
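splitScalar64BitUnaryOp and splitScalar64BitBinaryOp rely on the ops being
split (NOT, AND, OR, XOR) acting independently on each 32-bit half, so the
64-bit result is just the two 32-bit results glued back together; REG_SEQUENCE
plays the role of the final concatenation. The same decomposition in plain
C++, shown for AND:

    #include <cstdint>

    // sub0 is the low half, sub1 the high half; the final line is what
    // REG_SEQUENCE does when it reassembles the full 64-bit register.
    uint64_t and64ViaHalves(uint64_t Src0, uint64_t Src1) {
      uint32_t LoHalf = uint32_t(Src0) & uint32_t(Src1);
      uint32_t HiHalf = uint32_t(Src0 >> 32) & uint32_t(Src1 >> 32);
      return (uint64_t(HiHalf) << 32) | uint64_t(LoHalf);
    }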
+void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
+                                       MachineInstr *Inst) const {
+  MachineBasicBlock &MBB = *Inst->getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  MachineBasicBlock::iterator MII = Inst;
+  DebugLoc DL = Inst->getDebugLoc();
+
+  MachineOperand &Dest = Inst->getOperand(0);
+  MachineOperand &Src = Inst->getOperand(1);
+
+  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32);
+  const TargetRegisterClass *SrcRC = Src.isReg() ?
+    MRI.getRegClass(Src.getReg()) :
+    &AMDGPU::SGPR_32RegClass;
+
+  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
+
+  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+                                                      AMDGPU::sub0, SrcSubRC);
+  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+                                                      AMDGPU::sub1, SrcSubRC);
+
+  MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
+    .addOperand(SrcRegSub0)
+    .addImm(0);
+
+  MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
+    .addOperand(SrcRegSub1)
+    .addReg(MidReg);
+
+  MRI.replaceRegWith(Dest.getReg(), ResultReg);
+
+  Worklist.push_back(First);
+  Worklist.push_back(Second);
+}
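splitScalar64BitBCNT leans on V_BCNT_U32_B32 computing popcount(src0) + src1:
the first copy counts the low half into MidReg with accumulator 0, the second
counts the high half and adds MidReg. The equivalent arithmetic in C++:

    #include <cstdint>

    // Models V_BCNT_U32_B32: population count of Src0 plus the accumulator.
    static unsigned bcnt_u32_b32(uint32_t Src0, uint32_t Src1) {
      unsigned N = 0;
      for (uint32_t V = Src0; V != 0; V &= V - 1)
        ++N;
      return N + Src1;
    }

    unsigned popcount64(uint64_t Src) {
      uint32_t Sub0 = uint32_t(Src);        // AMDGPU::sub0 (low half)
      uint32_t Sub1 = uint32_t(Src >> 32);  // AMDGPU::sub1 (high half)
      unsigned Mid = bcnt_u32_b32(Sub0, 0); // First:  MidReg = bcnt(lo) + 0
      return bcnt_u32_b32(Sub1, Mid);       // Second: Result = bcnt(hi) + Mid
    }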
+
void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
                                        MachineInstr *Inst) const {
  // Add the implict and explicit register definitions.
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 7b31a81f0a87..4c204d877809 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -44,13 +44,19 @@ private:
                          const TargetRegisterClass *RC,
                          const MachineOperand &Op) const;

-  void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> & Worklist,
-                          MachineInstr *Inst, unsigned Opcode) const;
+  void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+                               MachineInstr *Inst, unsigned Opcode) const;
+
+  void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+                                MachineInstr *Inst, unsigned Opcode) const;
+
+  void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
+                            MachineInstr *Inst) const;

  void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;

public:
-  explicit SIInstrInfo(AMDGPUTargetMachine &tm);
+  explicit SIInstrInfo(const AMDGPUSubtarget &st);

  const SIRegisterInfo &getRegisterInfo() const override {
    return RI;
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 2242e6d8648b..774c9d13b17d 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -147,6 +147,12 @@ def FRAMEri32 : Operand<iPTR> {
}

//===----------------------------------------------------------------------===//
+// Complex patterns
+//===----------------------------------------------------------------------===//
+
+def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;
+
+//===----------------------------------------------------------------------===//
// SI assembler operands
//===----------------------------------------------------------------------===//
@@ -187,6 +193,12 @@ class SOP1_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
  opName#" $dst, $src0", pattern
>;

+// 64-bit input, 32-bit output.
+class SOP1_32_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
+  op, (outs SReg_32:$dst), (ins SSrc_64:$src0),
+  opName#" $dst, $src0", pattern
+>;
+
class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
  op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
  opName#" $dst, $src0, $src1", pattern
@@ -260,7 +272,7 @@ class SIMCInstr <string pseudo, int subtarget> {
multiclass VOP3_m <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern,
                   string opName> {

-  def "" : InstSI <outs, ins, "", pattern>, VOP <opName>,
+  def "" : VOP3Common <outs, ins, "", pattern>, VOP <opName>,
           SIMCInstr<OpName, SISubtarget.NONE> {
    let isPseudo = 1;
  }
@@ -357,12 +369,13 @@ multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern,
}

multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
-                        string opName, ValueType vt, PatLeaf cond> {
-
+                        string opName, ValueType vt, PatLeaf cond, bit defExec = 0> {
  def _e32 : VOPC <
    op, (ins arc:$src0, vrc:$src1),
    opName#"_e32 $dst, $src0, $src1", []
-  >, VOP <opName>;
+  >, VOP <opName> {
+    let Defs = !if(defExec, [VCC, EXEC], [VCC]);
+  }

  def _e64 : VOP3 <
    {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
@@ -375,6 +388,7 @@ multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
      [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))]
    )
  >, VOP <opName> {
+    let Defs = !if(defExec, [EXEC], []);
    let src2 = SIOperand.ZERO;
    let src2_modifiers = 0;
  }
@@ -388,6 +402,14 @@ multiclass VOPC_64 <bits<8> op, string opName, ValueType vt = untyped,
                    PatLeaf cond = COND_NULL>
  : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>;

+multiclass VOPCX_32 <bits<8> op, string opName,
+                     ValueType vt = untyped, PatLeaf cond = COND_NULL>
+  : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond, 1>;
+
+multiclass VOPCX_64 <bits<8> op, string opName,
+                     ValueType vt = untyped, PatLeaf cond = COND_NULL>
+  : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond, 1>;
+
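The defExec bit distinguishes the V_CMPX family from plain V_CMP: both produce
a per-lane condition mask, but the CMPX forms additionally copy that mask into
EXEC, the execution mask. A C++ model of the wave-level semantics (illustrative
only; it assumes a 64-lane wave with 64-bit mask registers):

    #include <cstdint>

    struct WaveState { uint64_t VCC; uint64_t EXEC; };

    // Models a VOPCX compare: lanes inactive in EXEC do not participate; the
    // resulting mask goes to VCC and, because of defExec, also to EXEC,
    // disabling the lanes whose compare failed.
    template <typename Pred>
    void vcmpx(WaveState &W, const float *Src0, const float *Src1, Pred P) {
      uint64_t Mask = 0;
      for (unsigned Lane = 0; Lane < 64; ++Lane)
        if (((W.EXEC >> Lane) & 1) && P(Src0[Lane], Src1[Lane]))
          Mask |= uint64_t(1) << Lane;
      W.VCC = Mask;
      W.EXEC = Mask; // the extra EXEC write that defExec = 1 declares
    }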
multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m <
  op, (outs VReg_32:$dst),
  (ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers,
@@ -396,7 +418,7 @@ multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m <
  opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, $omod", pattern, opName
>;

-class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 <
+class VOP3_64_32 <bits <9> op, string opName, list<dag> pattern> : VOP3 <
  op, (outs VReg_64:$dst),
  (ins VSrc_64:$src0, VSrc_32:$src1),
  opName#" $dst, $src0, $src1", pattern
@@ -410,11 +432,29 @@ class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 <

class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
  op, (outs VReg_64:$dst),
-  (ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2,
+  (ins InputMods:$src0_modifiers, VSrc_64:$src0,
+       InputMods:$src1_modifiers, VSrc_64:$src1,
+       InputMods:$src2_modifiers, VSrc_64:$src2,
+       InstFlag:$clamp, InstFlag:$omod),
+  opName#" $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers, $clamp, $omod", pattern
+>, VOP <opName>;
+
+
+class VOP3b_Helper <bits<9> op, RegisterClass vrc, RegisterClass arc,
+                    string opName, list<dag> pattern> : VOP3 <
+  op, (outs vrc:$dst0, SReg_64:$dst1),
+  (ins arc:$src0, arc:$src1, arc:$src2,
   InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
-  opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
+  opName#" $dst0, $dst1, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
>, VOP <opName>;

+class VOP3b_64 <bits<9> op, string opName, list<dag> pattern> :
+  VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>;
+
+class VOP3b_32 <bits<9> op, string opName, list<dag> pattern> :
+  VOP3b_Helper <op, VReg_32, VSrc_32, opName, pattern>;
+
//===----------------------------------------------------------------------===//
// Vector I/O classes
//===----------------------------------------------------------------------===//
@@ -475,10 +515,11 @@ class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A
  let vdst = 0;
}

+// 1 address, 1 data.
class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
  op,
  (outs rc:$vdst),
-  (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, u16imm:$offset),
+  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset),
  asm#" $vdst, $addr, $data0, $offset, [M0]",
  []> {

@@ -487,6 +528,41 @@ class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
  let mayLoad = 1;
}

+// 1 address, 2 data.
+class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+  op,
+  (outs rc:$vdst),
+  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset),
+  asm#" $vdst, $addr, $data0, $data1, $offset, [M0]",
+  []> {
+  let mayStore = 1;
+  let mayLoad = 1;
+}
+
+// 1 address, 2 data.
+class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+  op,
+  (outs),
+  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset),
+  asm#" $addr, $data0, $data1, $offset, [M0]",
+  []> {
+  let mayStore = 1;
+  let mayLoad = 1;
+}
+
+// 1 address, 1 data.
+class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+  op,
+  (outs),
+  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset),
+  asm#" $addr, $data0, $offset, [M0]",
+  []> {
+
+  let data1 = 0;
+  let mayStore = 1;
+  let mayLoad = 1;
+}
+
class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
  op,
  (outs),
@@ -500,7 +576,9 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU
  let mayLoad = 0;
}

-multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
+multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
+                              ValueType load_vt = i32,
+                              SDPatternOperator ld = null_frag> {

  let lds = 0, mayLoad = 1 in {

@@ -542,16 +620,19 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
    let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in {
      def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
                           (ins SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset),
-                           asm#" $vdata, $srsrc + $vaddr + $offset", []>;
+                           asm#" $vdata, $srsrc + $vaddr + $offset",
+                           [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc,
+                                                     i64:$vaddr, u16imm:$offset)))]>;
    }
  }
}

-class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
+class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
+                          ValueType store_vt, SDPatternOperator st> :
  MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
                          u16imm:$offset),
         name#" $vdata, $srsrc + $vaddr + $offset",
-         []> {
+         [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> {

  let mayLoad = 0;
  let mayStore = 1;
@@ -658,6 +739,53 @@ multiclass MIMG_Sampler <bits<7> op, string asm> {
  defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>;
}

+class MIMG_Gather_Helper <bits<7> op, string asm,
+                          RegisterClass dst_rc,
+                          RegisterClass src_rc> : MIMG <
+  op,
+  (outs dst_rc:$vdata),
+  (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
+       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
+       SReg_256:$srsrc, SReg_128:$ssamp),
+  asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
+     #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
+  []> {
+  let mayLoad = 1;
+  let mayStore = 0;
+
+  // DMASK was repurposed for GATHER4. 4 components are always
+  // returned and DMASK works like a swizzle - it selects
+  // the component to fetch. The only useful DMASK values are
+  // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
+  // (red,red,red,red) etc.) The ISA document doesn't mention
+  // this.
+  // Therefore, disable all code which updates DMASK by setting these two:
+  let MIMG = 0;
+  let hasPostISelHook = 0;
+}
+
+multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
+                                   RegisterClass dst_rc,
+                                   int channels> {
+  def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_32>,
+            MIMG_Mask<asm#"_V1", channels>;
+  def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64>,
+            MIMG_Mask<asm#"_V2", channels>;
+  def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128>,
+            MIMG_Mask<asm#"_V4", channels>;
+  def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256>,
+            MIMG_Mask<asm#"_V8", channels>;
+  def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512>,
+             MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Gather <bits<7> op, string asm> {
+  defm _V1 : MIMG_Gather_Src_Helper<op, asm, VReg_32, 1>;
+  defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2>;
+  defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3>;
+  defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4>;
+}
+
//===----------------------------------------------------------------------===//
// Vector instruction mappings
//===----------------------------------------------------------------------===//
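Per the comment in MIMG_Gather_Helper, GATHER4 always returns four texels and
DMASK degenerates into a one-hot channel selector. The decode in C++:

    // Maps the only meaningful GATHER4 DMASK values to the channel that is
    // fetched for all four texels; any other mask has no defined meaning here.
    static int gather4Channel(unsigned DMask) {
      switch (DMask) {
      case 1: return 0; // red
      case 2: return 1; // green
      case 4: return 2; // blue
      case 8: return 3; // alpha
      default: return -1; // not a valid GATHER4 swizzle
      }
    }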
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 500fa7894c29..b3b44e2b6801 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -96,22 +96,35 @@ def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32",
  [(set i32:$dst, (not i32:$src0))]
>;

-def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>;
+def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64",
+  [(set i64:$dst, (not i64:$src0))]
+>;
def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>;
def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>;
-def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>;
+def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32",
+  [(set i32:$dst, (AMDGPUbrev i32:$src0))]
+>;
def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>;
} // End neverHasSideEffects = 1

////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>;
////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>;
-////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>;
-////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>;
-////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>;
+def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "S_BCNT1_I32_B32",
+  [(set i32:$dst, (ctpop i32:$src0))]
+>;
+def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "S_BCNT1_I32_B64", []>;
+
+////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "S_FF0_I32_B32", []>;
////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>;
-////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>;
+def S_FF1_I32_B32 : SOP1_32 <0x00000013, "S_FF1_I32_B32",
+  [(set i32:$dst, (cttz_zero_undef i32:$src0))]
+>;
////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>;
-//def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", []>;
+
+def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32",
+  [(set i32:$dst, (ctlz_zero_undef i32:$src0))]
+>;
+
//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
@@ -320,7 +333,7 @@ def S_CMPK_EQ_I32 : SOPK <
>;
*/

-let isCompare = 1 in {
+let isCompare = 1, Defs = [SCC] in {
def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>;
def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>;
def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>;
@@ -332,7 +345,7 @@ def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>;
def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>;
def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
-} // End isCompare = 1
+} // End isCompare = 1, Defs = [SCC]

let Defs = [SCC], isCommutable = 1 in {
  def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
@@ -467,26 +480,26 @@ defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_UNE>;
defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">;
defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">;

-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {

-defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32">;
-defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32">;
-defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32">;
-defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32">;
-defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32">;
-defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32">;
-defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32">;
-defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32">;
-defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32">;
-defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32">;
-defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32">;
-defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32">;
-defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32">;
-defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32">;
-defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32">;
-defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32">;
+defm V_CMPX_F_F32 : VOPCX_32 <0x00000010, "V_CMPX_F_F32">;
+defm V_CMPX_LT_F32 : VOPCX_32 <0x00000011, "V_CMPX_LT_F32">;
+defm V_CMPX_EQ_F32 : VOPCX_32 <0x00000012, "V_CMPX_EQ_F32">;
+defm V_CMPX_LE_F32 : VOPCX_32 <0x00000013, "V_CMPX_LE_F32">;
+defm V_CMPX_GT_F32 : VOPCX_32 <0x00000014, "V_CMPX_GT_F32">;
+defm V_CMPX_LG_F32 : VOPCX_32 <0x00000015, "V_CMPX_LG_F32">;
+defm V_CMPX_GE_F32 : VOPCX_32 <0x00000016, "V_CMPX_GE_F32">;
+defm V_CMPX_O_F32 : VOPCX_32 <0x00000017, "V_CMPX_O_F32">;
+defm V_CMPX_U_F32 : VOPCX_32 <0x00000018, "V_CMPX_U_F32">;
+defm V_CMPX_NGE_F32 : VOPCX_32 <0x00000019, "V_CMPX_NGE_F32">;
+defm V_CMPX_NLG_F32 : VOPCX_32 <0x0000001a, "V_CMPX_NLG_F32">;
+defm V_CMPX_NGT_F32 : VOPCX_32 <0x0000001b, "V_CMPX_NGT_F32">;
+defm V_CMPX_NLE_F32 : VOPCX_32 <0x0000001c, "V_CMPX_NLE_F32">;
+defm V_CMPX_NEQ_F32 : VOPCX_32 <0x0000001d, "V_CMPX_NEQ_F32">;
+defm V_CMPX_NLT_F32 : VOPCX_32 <0x0000001e, "V_CMPX_NLT_F32">;
+defm V_CMPX_TRU_F32 : VOPCX_32 <0x0000001f, "V_CMPX_TRU_F32">;

-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1

defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">;
"V_CMP_LT_F64", f64, COND_OLT>; @@ -505,26 +518,26 @@ defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_UNE>; defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">; defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64">; -defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64">; -defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64">; -defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64">; -defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64">; -defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64">; -defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64">; -defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64">; -defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64">; -defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64">; -defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64">; -defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64">; -defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64">; -defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64">; -defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64">; -defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64">; +defm V_CMPX_F_F64 : VOPCX_64 <0x00000030, "V_CMPX_F_F64">; +defm V_CMPX_LT_F64 : VOPCX_64 <0x00000031, "V_CMPX_LT_F64">; +defm V_CMPX_EQ_F64 : VOPCX_64 <0x00000032, "V_CMPX_EQ_F64">; +defm V_CMPX_LE_F64 : VOPCX_64 <0x00000033, "V_CMPX_LE_F64">; +defm V_CMPX_GT_F64 : VOPCX_64 <0x00000034, "V_CMPX_GT_F64">; +defm V_CMPX_LG_F64 : VOPCX_64 <0x00000035, "V_CMPX_LG_F64">; +defm V_CMPX_GE_F64 : VOPCX_64 <0x00000036, "V_CMPX_GE_F64">; +defm V_CMPX_O_F64 : VOPCX_64 <0x00000037, "V_CMPX_O_F64">; +defm V_CMPX_U_F64 : VOPCX_64 <0x00000038, "V_CMPX_U_F64">; +defm V_CMPX_NGE_F64 : VOPCX_64 <0x00000039, "V_CMPX_NGE_F64">; +defm V_CMPX_NLG_F64 : VOPCX_64 <0x0000003a, "V_CMPX_NLG_F64">; +defm V_CMPX_NGT_F64 : VOPCX_64 <0x0000003b, "V_CMPX_NGT_F64">; +defm V_CMPX_NLE_F64 : VOPCX_64 <0x0000003c, "V_CMPX_NLE_F64">; +defm V_CMPX_NEQ_F64 : VOPCX_64 <0x0000003d, "V_CMPX_NEQ_F64">; +defm V_CMPX_NLT_F64 : VOPCX_64 <0x0000003e, "V_CMPX_NLT_F64">; +defm V_CMPX_TRU_F64 : VOPCX_64 <0x0000003f, "V_CMPX_TRU_F64">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32">; defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32">; @@ -543,26 +556,26 @@ defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32">; defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32">; defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32">; -let hasSideEffects = 1, Defs = [EXEC] in { +let hasSideEffects = 1 in { -defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32">; -defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32">; -defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32">; -defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32">; -defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32">; -defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32">; -defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32">; -defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32">; -defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32">; -defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32">; -defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32">; -defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32">; -defm 
-defm V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32">;
-defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32">;
-defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32">;
-defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32">;
+defm V_CMPSX_F_F32 : VOPCX_32 <0x00000050, "V_CMPSX_F_F32">;
+defm V_CMPSX_LT_F32 : VOPCX_32 <0x00000051, "V_CMPSX_LT_F32">;
+defm V_CMPSX_EQ_F32 : VOPCX_32 <0x00000052, "V_CMPSX_EQ_F32">;
+defm V_CMPSX_LE_F32 : VOPCX_32 <0x00000053, "V_CMPSX_LE_F32">;
+defm V_CMPSX_GT_F32 : VOPCX_32 <0x00000054, "V_CMPSX_GT_F32">;
+defm V_CMPSX_LG_F32 : VOPCX_32 <0x00000055, "V_CMPSX_LG_F32">;
+defm V_CMPSX_GE_F32 : VOPCX_32 <0x00000056, "V_CMPSX_GE_F32">;
+defm V_CMPSX_O_F32 : VOPCX_32 <0x00000057, "V_CMPSX_O_F32">;
+defm V_CMPSX_U_F32 : VOPCX_32 <0x00000058, "V_CMPSX_U_F32">;
+defm V_CMPSX_NGE_F32 : VOPCX_32 <0x00000059, "V_CMPSX_NGE_F32">;
+defm V_CMPSX_NLG_F32 : VOPCX_32 <0x0000005a, "V_CMPSX_NLG_F32">;
+defm V_CMPSX_NGT_F32 : VOPCX_32 <0x0000005b, "V_CMPSX_NGT_F32">;
+defm V_CMPSX_NLE_F32 : VOPCX_32 <0x0000005c, "V_CMPSX_NLE_F32">;
+defm V_CMPSX_NEQ_F32 : VOPCX_32 <0x0000005d, "V_CMPSX_NEQ_F32">;
+defm V_CMPSX_NLT_F32 : VOPCX_32 <0x0000005e, "V_CMPSX_NLT_F32">;
+defm V_CMPSX_TRU_F32 : VOPCX_32 <0x0000005f, "V_CMPSX_TRU_F32">;

-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1

defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64">;
defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64">;
@@ -611,18 +624,18 @@ defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>;
defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_SGE>;
defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">;

-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {

-defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32">;
-defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32">;
-defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32">;
-defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32">;
-defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32">;
-defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32">;
-defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32">;
-defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32">;
+defm V_CMPX_F_I32 : VOPCX_32 <0x00000090, "V_CMPX_F_I32">;
+defm V_CMPX_LT_I32 : VOPCX_32 <0x00000091, "V_CMPX_LT_I32">;
+defm V_CMPX_EQ_I32 : VOPCX_32 <0x00000092, "V_CMPX_EQ_I32">;
+defm V_CMPX_LE_I32 : VOPCX_32 <0x00000093, "V_CMPX_LE_I32">;
+defm V_CMPX_GT_I32 : VOPCX_32 <0x00000094, "V_CMPX_GT_I32">;
+defm V_CMPX_NE_I32 : VOPCX_32 <0x00000095, "V_CMPX_NE_I32">;
+defm V_CMPX_GE_I32 : VOPCX_32 <0x00000096, "V_CMPX_GE_I32">;
+defm V_CMPX_T_I32 : VOPCX_32 <0x00000097, "V_CMPX_T_I32">;

-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1

defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">;
defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", i64, COND_SLT>;
@@ -633,18 +646,18 @@ defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", i64, COND_NE>;
defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", i64, COND_SGE>;
defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">;

-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {

-defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64">;
-defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64">;
-defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64">;
-defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64">;
-defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64">;
-defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64">;
-defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64">;
-defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64">;
+defm V_CMPX_F_I64 : VOPCX_64 <0x000000b0, "V_CMPX_F_I64">;
+defm V_CMPX_LT_I64 : VOPCX_64 <0x000000b1, "V_CMPX_LT_I64">;
+defm V_CMPX_EQ_I64 : VOPCX_64 <0x000000b2, "V_CMPX_EQ_I64">;
+defm V_CMPX_LE_I64 : VOPCX_64 <0x000000b3, "V_CMPX_LE_I64">;
+defm V_CMPX_GT_I64 : VOPCX_64 <0x000000b4, "V_CMPX_GT_I64">;
+defm V_CMPX_NE_I64 : VOPCX_64 <0x000000b5, "V_CMPX_NE_I64">;
+defm V_CMPX_GE_I64 : VOPCX_64 <0x000000b6, "V_CMPX_GE_I64">;
+defm V_CMPX_T_I64 : VOPCX_64 <0x000000b7, "V_CMPX_T_I64">;

-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1

defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">;
defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", i32, COND_ULT>;
@@ -655,18 +668,18 @@ defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", i32, COND_NE>;
defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", i32, COND_UGE>;
defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">;

-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {

-defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32">;
-defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32">;
-defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32">;
-defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32">;
-defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32">;
-defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32">;
-defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32">;
-defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32">;
+defm V_CMPX_F_U32 : VOPCX_32 <0x000000d0, "V_CMPX_F_U32">;
+defm V_CMPX_LT_U32 : VOPCX_32 <0x000000d1, "V_CMPX_LT_U32">;
+defm V_CMPX_EQ_U32 : VOPCX_32 <0x000000d2, "V_CMPX_EQ_U32">;
+defm V_CMPX_LE_U32 : VOPCX_32 <0x000000d3, "V_CMPX_LE_U32">;
+defm V_CMPX_GT_U32 : VOPCX_32 <0x000000d4, "V_CMPX_GT_U32">;
+defm V_CMPX_NE_U32 : VOPCX_32 <0x000000d5, "V_CMPX_NE_U32">;
+defm V_CMPX_GE_U32 : VOPCX_32 <0x000000d6, "V_CMPX_GE_U32">;
+defm V_CMPX_T_U32 : VOPCX_32 <0x000000d7, "V_CMPX_T_U32">;

-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1

defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">;
defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", i64, COND_ULT>;
@@ -677,30 +690,30 @@ defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", i64, COND_NE>;
defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", i64, COND_UGE>;
defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">;

-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {

-defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64">;
-defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64">;
-defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64">;
-defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64">;
-defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64">;
-defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64">;
-defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64">;
-defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64">;
+defm V_CMPX_F_U64 : VOPCX_64 <0x000000f0, "V_CMPX_F_U64">;
+defm V_CMPX_LT_U64 : VOPCX_64 <0x000000f1, "V_CMPX_LT_U64">;
+defm V_CMPX_EQ_U64 : VOPCX_64 <0x000000f2, "V_CMPX_EQ_U64">;
+defm V_CMPX_LE_U64 : VOPCX_64 <0x000000f3, "V_CMPX_LE_U64">;
+defm V_CMPX_GT_U64 : VOPCX_64 <0x000000f4, "V_CMPX_GT_U64">;
+defm V_CMPX_NE_U64 : VOPCX_64 <0x000000f5, "V_CMPX_NE_U64">;
"V_CMPX_GE_U64">; +defm V_CMPX_T_U64 : VOPCX_64 <0x000000f7, "V_CMPX_T_U64">; -} // End hasSideEffects = 1, Defs = [EXEC] +} // End hasSideEffects = 1 defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32">; -let hasSideEffects = 1, Defs = [EXEC] in { -defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32">; -} // End hasSideEffects = 1, Defs = [EXEC] +let hasSideEffects = 1 in { +defm V_CMPX_CLASS_F32 : VOPCX_32 <0x00000098, "V_CMPX_CLASS_F32">; +} // End hasSideEffects = 1 defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64">; -let hasSideEffects = 1, Defs = [EXEC] in { -defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">; -} // End hasSideEffects = 1, Defs = [EXEC] +let hasSideEffects = 1 in { +defm V_CMPX_CLASS_F64 : VOPCX_64 <0x000000b8, "V_CMPX_CLASS_F64">; +} // End hasSideEffects = 1 } // End isCompare = 1 @@ -708,8 +721,97 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">; // DS Instructions //===----------------------------------------------------------------------===// -def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>; -def DS_SUB_U32_RTN : DS_1A1D_RET <0x21, "DS_SUB_U32_RTN", VReg_32>; + +def DS_ADD_U32 : DS_1A1D_NORET <0x0, "DS_ADD_U32", VReg_32>; +def DS_SUB_U32 : DS_1A1D_NORET <0x1, "DS_SUB_U32", VReg_32>; +def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "DS_RSUB_U32", VReg_32>; +def DS_INC_U32 : DS_1A1D_NORET <0x3, "DS_INC_U32", VReg_32>; +def DS_DEC_U32 : DS_1A1D_NORET <0x4, "DS_DEC_U32", VReg_32>; +def DS_MIN_I32 : DS_1A1D_NORET <0x5, "DS_MIN_I32", VReg_32>; +def DS_MAX_I32 : DS_1A1D_NORET <0x6, "DS_MAX_I32", VReg_32>; +def DS_MIN_U32 : DS_1A1D_NORET <0x7, "DS_MIN_U32", VReg_32>; +def DS_MAX_U32 : DS_1A1D_NORET <0x8, "DS_MAX_U32", VReg_32>; +def DS_AND_B32 : DS_1A1D_NORET <0x9, "DS_AND_B32", VReg_32>; +def DS_OR_B32 : DS_1A1D_NORET <0xa, "DS_OR_B32", VReg_32>; +def DS_XOR_B32 : DS_1A1D_NORET <0xb, "DS_XOR_B32", VReg_32>; +def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "DS_MSKOR_B32", VReg_32>; +def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "DS_CMPST_B32", VReg_32>; +def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "DS_CMPST_F32", VReg_32>; +def DS_MIN_F32 : DS_1A1D_NORET <0x12, "DS_MIN_F32", VReg_32>; +def DS_MAX_F32 : DS_1A1D_NORET <0x13, "DS_MAX_F32", VReg_32>; + +def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "DS_ADD_RTN_U32", VReg_32>; +def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "DS_SUB_RTN_U32", VReg_32>; +def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "DS_RSUB_RTN_U32", VReg_32>; +def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "DS_INC_RTN_U32", VReg_32>; +def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "DS_DEC_RTN_U32", VReg_32>; +def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "DS_MIN_RTN_I32", VReg_32>; +def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "DS_MAX_RTN_I32", VReg_32>; +def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "DS_MIN_RTN_U32", VReg_32>; +def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "DS_MAX_RTN_U32", VReg_32>; +def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "DS_AND_RTN_B32", VReg_32>; +def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "DS_OR_RTN_B32", VReg_32>; +def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "DS_XOR_RTN_B32", VReg_32>; +def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "DS_MSKOR_RTN_B32", VReg_32>; +def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "DS_WRXCHG_RTN_B32", VReg_32>; +//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "DS_WRXCHG2_RTN_B32", VReg_32>; +//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "DS_WRXCHG2_RTN_B32", VReg_32>; +def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "DS_CMPST_RTN_B32", VReg_32>; +def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "DS_CMPST_RTN_F32", VReg_32>; +def 
+def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "DS_MIN_RTN_F32", VReg_32>;
+def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "DS_MAX_RTN_F32", VReg_32>;
+
+let SubtargetPredicate = isCI in {
+def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "DS_WRAP_RTN_F32", VReg_32>;
+} // End isCI
+
+
+def DS_ADD_U64 : DS_1A1D_NORET <0x40, "DS_ADD_U64", VReg_32>;
+def DS_SUB_U64 : DS_1A1D_NORET <0x41, "DS_SUB_U64", VReg_32>;
+def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "DS_RSUB_U64", VReg_32>;
+def DS_INC_U64 : DS_1A1D_NORET <0x43, "DS_INC_U64", VReg_32>;
+def DS_DEC_U64 : DS_1A1D_NORET <0x44, "DS_DEC_U64", VReg_32>;
+def DS_MIN_I64 : DS_1A1D_NORET <0x45, "DS_MIN_I64", VReg_64>;
+def DS_MAX_I64 : DS_1A1D_NORET <0x46, "DS_MAX_I64", VReg_64>;
+def DS_MIN_U64 : DS_1A1D_NORET <0x47, "DS_MIN_U64", VReg_64>;
+def DS_MAX_U64 : DS_1A1D_NORET <0x48, "DS_MAX_U64", VReg_64>;
+def DS_AND_B64 : DS_1A1D_NORET <0x49, "DS_AND_B64", VReg_64>;
+def DS_OR_B64 : DS_1A1D_NORET <0x4a, "DS_OR_B64", VReg_64>;
+def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "DS_XOR_B64", VReg_64>;
+def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "DS_MSKOR_B64", VReg_64>;
+def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "DS_CMPST_B64", VReg_64>;
+def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "DS_CMPST_F64", VReg_64>;
+def DS_MIN_F64 : DS_1A1D_NORET <0x52, "DS_MIN_F64", VReg_64>;
+def DS_MAX_F64 : DS_1A1D_NORET <0x53, "DS_MAX_F64", VReg_64>;
+
+def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "DS_ADD_RTN_U64", VReg_64>;
+def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "DS_SUB_RTN_U64", VReg_64>;
+def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "DS_RSUB_RTN_U64", VReg_64>;
+def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "DS_INC_RTN_U64", VReg_64>;
+def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "DS_DEC_RTN_U64", VReg_64>;
+def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "DS_MIN_RTN_I64", VReg_64>;
+def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "DS_MAX_RTN_I64", VReg_64>;
+def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "DS_MIN_RTN_U64", VReg_64>;
+def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "DS_MAX_RTN_U64", VReg_64>;
+def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "DS_AND_RTN_B64", VReg_64>;
+def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "DS_OR_RTN_B64", VReg_64>;
+def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "DS_XOR_RTN_B64", VReg_64>;
+def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "DS_MSKOR_RTN_B64", VReg_64>;
+def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "DS_WRXCHG_RTN_B64", VReg_64>;
+//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "DS_WRXCHG2_RTN_B64", VReg_64>;
+//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "DS_WRXCHG2_RTN_B64", VReg_64>;
+def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "DS_CMPST_RTN_B64", VReg_64>;
+def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "DS_CMPST_RTN_F64", VReg_64>;
+def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "DS_MIN_F64", VReg_64>;
+def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "DS_MAX_F64", VReg_64>;
+
+//let SubtargetPredicate = isCI in {
+// DS_CONDXCHG32_RTN_B64
+// DS_CONDXCHG32_RTN_B128
+//} // End isCI
+
+// TODO: _SRC2_* forms
+
def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>;
def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>;
@@ -744,32 +846,46 @@ defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMA
//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>;
-defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <0x00000008, "BUFFER_LOAD_UBYTE", VReg_32>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <0x00000009, "BUFFER_LOAD_SBYTE", VReg_32>;
-defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <0x0000000a, "BUFFER_LOAD_USHORT", VReg_32>;
-defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32>;
-defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
-defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
-defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <
+  0x00000008, "BUFFER_LOAD_UBYTE", VReg_32, i32, az_extloadi8_global
+>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <
+  0x00000009, "BUFFER_LOAD_SBYTE", VReg_32, i32, sextloadi8_global
+>;
+defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <
+  0x0000000a, "BUFFER_LOAD_USHORT", VReg_32, i32, az_extloadi16_global
+>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
+  0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32, i32, sextloadi16_global
+>;
+defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
+  0x0000000c, "BUFFER_LOAD_DWORD", VReg_32, i32, global_load
+>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
+  0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64, v2i32, global_load
+>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
+  0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load
+>;

def BUFFER_STORE_BYTE : MUBUF_Store_Helper <
-  0x00000018, "BUFFER_STORE_BYTE", VReg_32
+  0x00000018, "BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global
>;

def BUFFER_STORE_SHORT : MUBUF_Store_Helper <
-  0x0000001a, "BUFFER_STORE_SHORT", VReg_32
+  0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global
>;

def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
-  0x0000001c, "BUFFER_STORE_DWORD", VReg_32
+  0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store
>;

def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
-  0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64
+  0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store
>;

def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
-  0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128
+  0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store
>;
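The new pattern operands give each buffer load and store its memory semantics:
the sub-word loads are extending loads ("az" means any-extend, so either sign-
or zero-extension satisfies the pattern; the sext forms sign-extend), and the
sub-word stores truncate. The same semantics in C++, modeling any-extension as
zero-extension:

    #include <cstdint>

    uint32_t load_ubyte(const uint8_t *P)   { return *P; } // az_extloadi8_global
    int32_t  load_sbyte(const int8_t *P)    { return *P; } // sextloadi8_global
    uint32_t load_ushort(const uint16_t *P) { return *P; } // az_extloadi16_global
    int32_t  load_sshort(const int16_t *P)  { return *P; } // sextloadi16_global
    void store_byte(uint8_t *P, uint32_t V)   { *P = uint8_t(V); }  // truncstorei8_global
    void store_short(uint16_t *P, uint32_t V) { *P = uint16_t(V); } // truncstorei16_global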
//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
@@ -885,31 +1001,31 @@ defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">;
//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>;
//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>;
//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>;
-//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>;
-//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>;
-//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L", 0x00000044>;
-//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>;
-//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>;
-//def IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>;
-//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>;
-//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>;
-//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>;
-//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>;
-//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 0x0000004e>;
-//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>;
-//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>;
-//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>;
-//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>;
-//def IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>;
-//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>;
-//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>;
-//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>;
-//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>;
-//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>;
-//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>;
-//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 0x0000005e>;
-//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>;
-//def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>;
+defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "IMAGE_GATHER4">;
+defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "IMAGE_GATHER4_CL">;
+defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "IMAGE_GATHER4_L">;
+defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "IMAGE_GATHER4_B">;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "IMAGE_GATHER4_B_CL">;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "IMAGE_GATHER4_LZ">;
+defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "IMAGE_GATHER4_C">;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "IMAGE_GATHER4_C_CL">;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "IMAGE_GATHER4_C_L">;
+defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "IMAGE_GATHER4_C_B">;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "IMAGE_GATHER4_C_B_CL">;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "IMAGE_GATHER4_C_LZ">;
+defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "IMAGE_GATHER4_O">;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "IMAGE_GATHER4_CL_O">;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "IMAGE_GATHER4_L_O">;
+defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "IMAGE_GATHER4_B_O">;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "IMAGE_GATHER4_B_CL_O">;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "IMAGE_GATHER4_LZ_O">;
+defm IMAGE_GATHER4_C_O : MIMG_Gather <0x00000058, "IMAGE_GATHER4_C_O">;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "IMAGE_GATHER4_C_CL_O">;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "IMAGE_GATHER4_C_L_O">;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "IMAGE_GATHER4_C_B_O">;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "IMAGE_GATHER4_C_B_CL_O">;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "IMAGE_GATHER4_C_LZ_O">;
+defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "IMAGE_GET_LOD">;
//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>;
//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>;
//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>;
@@ -962,8 +1078,12 @@ defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
  [(set i32:$dst, (fp_to_sint f32:$src0))]
>;
defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
-////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>;
-//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>;
+defm V_CVT_F16_F32 : VOP1_32 <0x0000000a, "V_CVT_F16_F32",
+  [(set i32:$dst, (f32_to_f16 f32:$src0))]
+>;
+defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16",
+  [(set f32:$dst, (f16_to_f32 i32:$src0))]
+>;
//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
@@ -973,10 +1093,18 @@ defm V_CVT_F32_F64 : VOP1_32_64 <0x0000000f, "V_CVT_F32_F64",
defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32",
  [(set f64:$dst, (fextend f32:$src0))]
>;
-//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>;
-//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>;
-//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>;
-//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>;
+defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0",
+  [(set f32:$dst, (AMDGPUcvt_f32_ubyte0 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1",
+  [(set f32:$dst, (AMDGPUcvt_f32_ubyte1 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2",
+  [(set f32:$dst, (AMDGPUcvt_f32_ubyte2 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3",
+  [(set f32:$dst, (AMDGPUcvt_f32_ubyte3 i32:$src0))]
+>;
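V_CVT_F32_UBYTEn selects byte n of the 32-bit source and converts it to float,
which is the semantics the new AMDGPUcvt_f32_ubyte0..3 patterns expose. In C++:

    #include <cstdint>

    // Byte N (0..3) of Src, zero-extended and converted to float.
    float cvt_f32_ubyte(uint32_t Src, unsigned N) {
      return float((Src >> (8 * N)) & 0xffu);
    }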
(AMDGPUrsq_clamped f64:$src0))] +>; defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", [(set f32:$dst, (fsqrt f32:$src0))] >; @@ -1211,7 +1348,7 @@ defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>; defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>; defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>; -//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>; +defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>; defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>; defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>; @@ -1303,16 +1440,20 @@ defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; //def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>; defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; ////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>; -defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; -def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>; +defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", + [(set f32:$dst, (AMDGPUdiv_fixup f32:$src0, f32:$src1, f32:$src2))] +>; +def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", + [(set f64:$dst, (AMDGPUdiv_fixup f64:$src0, f64:$src1, f64:$src2))] +>; -def V_LSHL_B64 : VOP3_64_Shift <0x00000161, "V_LSHL_B64", +def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64", [(set i64:$dst, (shl i64:$src0, i32:$src1))] >; -def V_LSHR_B64 : VOP3_64_Shift <0x00000162, "V_LSHR_B64", +def V_LSHR_B64 : VOP3_64_32 <0x00000162, "V_LSHR_B64", [(set i64:$dst, (srl i64:$src0, i32:$src1))] >; -def V_ASHR_I64 : VOP3_64_Shift <0x00000163, "V_ASHR_I64", +def V_ASHR_I64 : VOP3_64_32 <0x00000163, "V_ASHR_I64", [(set i64:$dst, (sra i64:$src0, i32:$src1))] >; @@ -1336,14 +1477,23 @@ defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; } // isCommutable = 1 -defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; -def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; -defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; -def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>; +def V_DIV_SCALE_F32 : VOP3b_32 <0x0000016d, "V_DIV_SCALE_F32", []>; + +// Double precision division pre-scale. +def V_DIV_SCALE_F64 : VOP3b_64 <0x0000016e, "V_DIV_SCALE_F64", []>; + +defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", + [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))] +>; +def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", + [(set f64:$dst, (AMDGPUdiv_fmas f64:$src0, f64:$src1, f64:$src2))] +>; //def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>; //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>; -def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>; +def V_TRIG_PREOP_F64 : VOP3_64_32 <0x00000174, "V_TRIG_PREOP_F64", + [(set f64:$dst, (AMDGPUtrig_preop f64:$src0, i32:$src1))] +>; //===----------------------------------------------------------------------===// // Pseudo Instructions @@ -1500,7 +1650,7 @@ let usesCustomInserter = 1 in { // constant that can be used with the ADDR64 MUBUF instructions. 
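// (Presumably the reason $ptr is widened to SSrc_64 below: the SSrc_* operand
// classes accept immediates as well as SGPRs, so the pseudo can wrap either a
// register pointer or a constant one in the MUBUF patterns in this file.)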
def SI_ADDR64_RSRC : InstSI < (outs SReg_128:$srsrc), - (ins SReg_64:$ptr), + (ins SSrc_64:$ptr), "", [] >; @@ -1508,7 +1658,7 @@ def V_SUB_F64 : InstSI < (outs VReg_64:$dst), (ins VReg_64:$src0, VReg_64:$src1), "V_SUB_F64 $dst, $src0, $src1", - [] + [(set f64:$dst, (fsub f64:$src0, f64:$src1))] >; } // end usesCustomInserter @@ -1529,6 +1679,7 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { } +defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>; defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>; defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; @@ -1552,7 +1703,7 @@ def : Pat < /* int_SI_vs_load_input */ def : Pat< - (SIload_input v4i32:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr), + (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0) >; @@ -1564,11 +1715,6 @@ def : Pat < $src0, $src1, $src2, $src3) >; -def : Pat < - (f64 (fsub f64:$src0, f64:$src1)), - (V_SUB_F64 $src0, $src1) ->; - //===----------------------------------------------------------------------===// // SMRD Patterns //===----------------------------------------------------------------------===// @@ -1596,7 +1742,6 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>; defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; @@ -1615,6 +1760,24 @@ def : Pat < (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) >; +} // Predicates = [isSI] in { + +//===----------------------------------------------------------------------===// +// SOP1 Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isSI, isCFDepth0] in { + +def : Pat < + (i64 (ctpop i64:$src)), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (S_BCNT1_I32_B64 $src), sub0), + (S_MOV_B32 0), sub1) +>; + +} // Predicates = [isSI, isCFDepth0] + +let Predicates = [isSI] in { //===----------------------------------------------------------------------===// // SOP2 Patterns //===----------------------------------------------------------------------===// @@ -1625,18 +1788,39 @@ def : Pat < >; //===----------------------------------------------------------------------===// -// VOP2 Patterns +// SOPP Patterns //===----------------------------------------------------------------------===// def : Pat < - (or i64:$src0, i64:$src1), + (int_AMDGPU_barrier_global), + (S_BARRIER) +>; + +//===----------------------------------------------------------------------===// +// VOP1 Patterns +//===----------------------------------------------------------------------===// + +def : RcpPat<V_RCP_F32_e32, f32>; +def : RcpPat<V_RCP_F64_e32, f64>; +defm : RsqPat<V_RSQ_F32_e32, f32>; +defm : RsqPat<V_RSQ_F64_e32, f64>; + +//===----------------------------------------------------------------------===// +// VOP2 Patterns +//===----------------------------------------------------------------------===// + +class BinOp64Pat <SDNode node, Instruction inst> : Pat < + (node i64:$src0, i64:$src1), (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub0), + (inst (EXTRACT_SUBREG i64:$src0, sub0), 
(EXTRACT_SUBREG i64:$src1, sub0)), sub0), - (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub1), + (inst (EXTRACT_SUBREG i64:$src0, sub1), (EXTRACT_SUBREG i64:$src1, sub1)), sub1) >; +def : BinOp64Pat <or, V_OR_B32_e32>; +def : BinOp64Pat <xor, V_XOR_B32_e32>; + class SextInReg <ValueType vt, int ShiftAmt> : Pat < (sext_inreg i32:$src0, vt), (V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0)) @@ -1645,10 +1829,82 @@ class SextInReg <ValueType vt, int ShiftAmt> : Pat < def : SextInReg <i8, 24>; def : SextInReg <i16, 16>; +def : Pat < + (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), + (V_BCNT_U32_B32_e32 $popcnt, $val) +>; + +def : Pat < + (i32 (ctpop i32:$popcnt)), + (V_BCNT_U32_B32_e64 $popcnt, 0, 0, 0) +>; + +def : Pat < + (i64 (ctpop i64:$src)), + (INSERT_SUBREG + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (V_BCNT_U32_B32_e32 (EXTRACT_SUBREG $src, sub1), + (V_BCNT_U32_B32_e64 (EXTRACT_SUBREG $src, sub0), 0, 0, 0)), + sub0), + (V_MOV_B32_e32 0), sub1) +>; + /********** ======================= **********/ /********** Image sampling patterns **********/ /********** ======================= **********/ +class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, i32:$dmask, i32:$unorm, + i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), + (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), + $addr, $rsrc, $sampler) +>; + +// Only the variants which make sense are defined. +def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>; +def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>; +def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>; + +def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>; + +def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_b_cl_o, 
IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>; + +def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_cl_o, IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>; +def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>; +def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>; + +def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>; +def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>; +def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>; + /* SIsample for simple 1D texture lookup */ def : Pat < (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), @@ -1864,7 +2120,10 @@ def : BitConvert <v2f32, v2i32, VReg_64>; def : BitConvert <v2i32, v2f32, VReg_64>; def : BitConvert <v2i32, i64, VReg_64>; def : BitConvert <i64, v2i32, VReg_64>; - +def : BitConvert <v2f32, i64, VReg_64>; +def : BitConvert <i64, v2f32, VReg_64>; +def : BitConvert <v2i32, f64, VReg_64>; +def : BitConvert <f64, v2i32, VReg_64>; def : BitConvert <v4f32, v4i32, VReg_128>; def : BitConvert <v4i32, v4f32, VReg_128>; @@ -1894,7 +2153,7 @@ def FCLAMP_SI : AMDGPUShaderInst < } def : Pat < - (int_AMDIL_clamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)), + (AMDGPUclamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)), (FCLAMP_SI f32:$src) >; @@ -2106,7 +2365,7 @@ def : Pat < (V_MUL_HI_I32 $src0, $src1, (i32 0)) >; -defm : BFIPatterns <V_BFI_B32>; +defm : BFIPatterns <V_BFI_B32, S_MOV_B32>; def : ROTRPattern <V_ALIGNBIT_B32>; /********** ======================= **********/ @@ -2130,7 +2389,7 @@ defm : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>; defm : DSReadPat <DS_READ_I16, i32, sextloadi16_local>; defm : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>; defm : DSReadPat <DS_READ_B32, i32, local_load>; -defm : DSReadPat <DS_READ_B64, i64, local_load>; +defm : DSReadPat <DS_READ_B64, v2i32, local_load>; multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> { def : Pat < @@ -2139,48 +2398,109 @@ multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> { >; def : Pat < - (frag vt:$src1, i32:$src0), - (inst 0, $src0, $src1, 0) + (frag vt:$val, i32:$ptr), + (inst 0, $ptr, $val, 0) >; } defm : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>; defm : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>; defm : DSWritePat <DS_WRITE_B32, i32, local_store>; -defm : DSWritePat <DS_WRITE_B64, i64, local_store>; +defm : DSWritePat <DS_WRITE_B64, v2i32, local_store>; -def : Pat <(atomic_load_add_local i32:$ptr, i32:$val), - (DS_ADD_U32_RTN 0, $ptr, $val, 0)>; - -def : Pat <(atomic_load_sub_local i32:$ptr, i32:$val), - (DS_SUB_U32_RTN 0, $ptr, $val, 0)>; +multiclass DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> { + def : Pat < + (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$value), + (inst (i1 0), $ptr, $value, (as_i16imm $offset)) + >; -//===----------------------------------------------------------------------===// -// MUBUF Patterns -//===----------------------------------------------------------------------===// + def : Pat < + (frag i32:$ptr, vt:$val), + (inst 0, $ptr, $val, 0) + >; +} -multiclass 
MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt, - PatFrag global_ld, PatFrag constant_ld> { +// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec +// +// We need to use something for the data0, so we set a register to +// -1. For the non-rtn variants, the manual says it does +// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max +// will always do the increment so I'm assuming it's the same. +// +// We also load this -1 with s_mov_b32 / s_mov_b64 even though this +// needs to be a VGPR. The SGPR copy pass will fix this, and it's +// easier since there is no v_mov_b64. +multiclass DSAtomicIncRetPat<DS inst, ValueType vt, + Instruction LoadImm, PatFrag frag> { def : Pat < - (vt (global_ld (mubuf_vaddr_offset i64:$ptr, i64:$offset, IMM12bit:$imm_offset))), - (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, (as_i16imm $imm_offset)) + (frag (add i32:$ptr, (i32 IMM16bit:$offset)), (vt 1)), + (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset)) >; def : Pat < - (vt (global_ld (add i64:$ptr, (i64 IMM12bit:$offset)))), - (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset)) + (frag i32:$ptr, (vt 1)), + (inst 0, $ptr, (LoadImm (vt -1)), 0) >; +} +multiclass DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> { def : Pat < - (vt (global_ld i64:$ptr)), - (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, 0) + (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$cmp, vt:$swap), + (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset)) >; def : Pat < - (vt (global_ld (add i64:$ptr, i64:$offset))), - (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0) + (frag i32:$ptr, vt:$cmp, vt:$swap), + (inst 0, $ptr, $cmp, $swap, 0) >; +} + + +// 32-bit atomics. +defm : DSAtomicIncRetPat<DS_INC_RTN_U32, i32, + S_MOV_B32, atomic_load_add_local>; +defm : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32, + S_MOV_B32, atomic_load_sub_local>; + +defm : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>; +defm : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>; +defm : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>; +defm : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>; +defm : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>; +defm : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>; +defm : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>; +defm : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>; +defm : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>; +defm : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>; + +defm : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>; + +// 64-bit atomics. 
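+// These mirror the 32-bit patterns above: assuming the 64-bit DS_*_RTN
+// opcodes share the 32-bit operand order, the only differences are the i64
+// value type and the use of S_MOV_B64 to materialize the -1 for the
+// inc / dec trick. E.g. an i64 atomic_load_add_local of constant 1 selects,
+// roughly, to (DS_INC_RTN_U64 0, $ptr, (S_MOV_B64 -1), 0).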
+defm : DSAtomicIncRetPat<DS_INC_RTN_U64, i64, + S_MOV_B64, atomic_load_add_local>; +defm : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64, + S_MOV_B64, atomic_load_sub_local>; + +defm : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>; +defm : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>; +defm : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>; +defm : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>; +defm : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>; +defm : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>; +defm : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>; +defm : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>; +defm : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>; +defm : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>; + +defm : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>; + +//===----------------------------------------------------------------------===// +// MUBUF Patterns +//===----------------------------------------------------------------------===// + +multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt, + PatFrag constant_ld> { def : Pat < (vt (constant_ld (add i64:$ptr, i64:$offset))), (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0) @@ -2188,53 +2508,19 @@ multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt, } defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, - sextloadi8_global, sextloadi8_constant>; + sextloadi8_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, - az_extloadi8_global, az_extloadi8_constant>; + az_extloadi8_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, - sextloadi16_global, sextloadi16_constant>; + sextloadi16_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, - az_extloadi16_global, az_extloadi16_constant>; + az_extloadi16_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, - global_load, constant_load>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, i64, - global_load, constant_load>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, i64, - az_extloadi32_global, az_extloadi32_constant>; + constant_load>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, - global_load, constant_load>; + constant_load>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, - global_load, constant_load>; - -multiclass MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> { - - def : Pat < - (st vt:$value, (mubuf_vaddr_offset i64:$ptr, i64:$offset, IMM12bit:$imm_offset)), - (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, (as_i16imm $imm_offset)) - >; - - def : Pat < - (st vt:$value, (add i64:$ptr, IMM12bit:$offset)), - (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset)) - >; - - def : Pat < - (st vt:$value, i64:$ptr), - (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, 0) - >; - - def : Pat < - (st vt:$value, (add i64:$ptr, i64:$offset)), - (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, 0) - >; -} - -defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE, i32, truncstorei8_global>; -defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT, i32, truncstorei16_global>; -defm : MUBUFStore_Pattern <BUFFER_STORE_DWORD, i32, global_store>; -defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, i64, global_store>; -defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, v2i32, global_store>; -defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4, v4i32, global_store>; + constant_load>; // BUFFER_LOAD_DWORD*, addr64=0 multiclass 
MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen, @@ -2301,7 +2587,7 @@ def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>; def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; -let Predicates = [isCI] in { +let SubtargetPredicate = isCI in { // Sea island new arithmetic instructinos let neverHasSideEffects = 1 in { @@ -2348,7 +2634,7 @@ def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>; // BUFFER_LOAD_DWORDX3 // BUFFER_STORE_DWORDX3 -} // End Predicates = [isCI] +} // End iSCI /********** ====================== **********/ @@ -2360,13 +2646,13 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST I // 1. Extract with offset def : Pat< (vector_extract vt:$vec, (add i32:$idx, imm:$off)), - (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off)) + (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off)) >; // 2. Extract without offset def : Pat< (vector_extract vt:$vec, i32:$idx), - (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0)) + (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0)) >; // 3. Insert with offset @@ -2392,20 +2678,6 @@ defm : SI_INDIRECT_Pattern <v4i32, i32, SI_INDIRECT_DST_V4>; defm : SI_INDIRECT_Pattern <v8i32, i32, SI_INDIRECT_DST_V8>; defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>; -/********** =============== **********/ -/********** Conditions **********/ -/********** =============== **********/ - -def : Pat< - (i1 (setcc f32:$src0, f32:$src1, SETO)), - (V_CMP_O_F32_e64 $src0, $src1) ->; - -def : Pat< - (i1 (setcc f32:$src0, f32:$src1, SETUO)), - (V_CMP_U_F32_e64 $src0, $src1) ->; - //===----------------------------------------------------------------------===// // Conversion Patterns //===----------------------------------------------------------------------===// @@ -2439,6 +2711,62 @@ def : Pat < (S_MOV_B32 -1), sub1) >; +class ZExt_i64_i32_Pat <SDNode ext> : Pat < + (i64 (ext i32:$src)), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0), + (S_MOV_B32 0), sub1) +>; + +class ZExt_i64_i1_Pat <SDNode ext> : Pat < + (i64 (ext i1:$src)), + (INSERT_SUBREG + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0), + (S_MOV_B32 0), sub1) +>; + + +def : ZExt_i64_i32_Pat<zext>; +def : ZExt_i64_i32_Pat<anyext>; +def : ZExt_i64_i1_Pat<zext>; +def : ZExt_i64_i1_Pat<anyext>; + +def : Pat < + (i64 (sext i32:$src)), + (INSERT_SUBREG + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0), + (S_ASHR_I32 $src, 31), sub1) +>; + +def : Pat < + (i64 (sext i1:$src)), + (INSERT_SUBREG + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), + (V_CNDMASK_B32_e64 0, -1, $src), sub0), + (V_CNDMASK_B32_e64 0, -1, $src), sub1) +>; + +def : Pat < + (f32 (sint_to_fp i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) +>; + +def : Pat < + (f32 (uint_to_fp i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src) +>; + +def : Pat < + (f64 (sint_to_fp i1:$src)), + (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) +>; + +def : Pat < + (f64 (uint_to_fp i1:$src)), + (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)) +>; + //===----------------------------------------------------------------------===// // Miscellaneous Patterns //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td index 00e32c03a99e..df690a4f4b87 100644 --- 
a/lib/Target/R600/SIIntrinsics.td +++ b/lib/Target/R600/SIIntrinsics.td @@ -56,11 +56,61 @@ let TargetPrefix = "SI", isTarget = 1 in { class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; + // Fully-flexible SAMPLE instruction. + class SampleRaw : Intrinsic < + [llvm_v4f32_ty], // vdata(VGPR) + [llvm_anyint_ty, // vaddr(VGPR) + llvm_v32i8_ty, // rsrc(SGPR) + llvm_v16i8_ty, // sampler(SGPR) + llvm_i32_ty, // dmask(imm) + llvm_i32_ty, // unorm(imm) + llvm_i32_ty, // r128(imm) + llvm_i32_ty, // da(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty, // tfe(imm) + llvm_i32_ty], // lwe(imm) + [IntrNoMem]>; + def int_SI_sample : Sample; def int_SI_sampleb : Sample; def int_SI_sampled : Sample; def int_SI_samplel : Sample; + // Basic gather4 + def int_SI_gather4 : SampleRaw; + def int_SI_gather4_cl : SampleRaw; + def int_SI_gather4_l : SampleRaw; + def int_SI_gather4_b : SampleRaw; + def int_SI_gather4_b_cl : SampleRaw; + def int_SI_gather4_lz : SampleRaw; + + // Gather4 with comparison + def int_SI_gather4_c : SampleRaw; + def int_SI_gather4_c_cl : SampleRaw; + def int_SI_gather4_c_l : SampleRaw; + def int_SI_gather4_c_b : SampleRaw; + def int_SI_gather4_c_b_cl : SampleRaw; + def int_SI_gather4_c_lz : SampleRaw; + + // Gather4 with offsets + def int_SI_gather4_o : SampleRaw; + def int_SI_gather4_cl_o : SampleRaw; + def int_SI_gather4_l_o : SampleRaw; + def int_SI_gather4_b_o : SampleRaw; + def int_SI_gather4_b_cl_o : SampleRaw; + def int_SI_gather4_lz_o : SampleRaw; + + // Gather4 with comparison and offsets + def int_SI_gather4_c_o : SampleRaw; + def int_SI_gather4_c_cl_o : SampleRaw; + def int_SI_gather4_c_l_o : SampleRaw; + def int_SI_gather4_c_b_o : SampleRaw; + def int_SI_gather4_c_b_cl_o : SampleRaw; + def int_SI_gather4_c_lz_o : SampleRaw; + + def int_SI_getlod : SampleRaw; + def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index 6601f2a98065..9f5ff29ad93a 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -86,6 +86,7 @@ private: void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); + void InitM0ForLDS(MachineBasicBlock::iterator MI); void LoadM0(MachineInstr &MI, MachineInstr *MovRel); void IndirectSrc(MachineInstr &MI); void IndirectDst(MachineInstr &MI); @@ -320,6 +321,14 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { MI.eraseFromParent(); } +/// The m0 register stores the maximum allowable address for LDS reads and +/// writes. Its value must be at least the size in bytes of LDS allocated by +/// the shader. For simplicity, we set it to the maximum possible value. 
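+/// In assembly terms the helper below boils down to a single instruction
+/// (SI assembly syntax assumed):
+///   s_mov_b32 m0, 0xffffffff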
+void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) { + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), + AMDGPU::M0).addImm(0xffffffff); +} + void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { MachineBasicBlock &MBB = *MI.getParent(); @@ -333,52 +342,57 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) .addReg(Idx); MBB.insert(I, MovRel); - MI.eraseFromParent(); - return; - } + } else { - assert(AMDGPU::SReg_64RegClass.contains(Save)); - assert(AMDGPU::VReg_32RegClass.contains(Idx)); + assert(AMDGPU::SReg_64RegClass.contains(Save)); + assert(AMDGPU::VReg_32RegClass.contains(Idx)); - // Save the EXEC mask - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) - .addReg(AMDGPU::EXEC); + // Save the EXEC mask + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) + .addReg(AMDGPU::EXEC); - // Read the next variant into VCC (lower 32 bits) <- also loop target - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - AMDGPU::VCC_LO) - .addReg(Idx); + // Read the next variant into VCC (lower 32 bits) <- also loop target + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + AMDGPU::VCC_LO) + .addReg(Idx); - // Move index from VCC into M0 - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(AMDGPU::VCC_LO); + // Move index from VCC into M0 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(AMDGPU::VCC_LO); - // Compare the just read M0 value to all possible Idx values - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) - .addReg(AMDGPU::M0) - .addReg(Idx); + // Compare the just read M0 value to all possible Idx values + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) + .addReg(AMDGPU::M0) + .addReg(Idx); - // Update EXEC, save the original EXEC value to VCC - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) - .addReg(AMDGPU::VCC); + // Update EXEC, save the original EXEC value to VCC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) + .addReg(AMDGPU::VCC); - // Do the actual move - MBB.insert(I, MovRel); + // Do the actual move + MBB.insert(I, MovRel); - // Update EXEC, switch all done bits to 0 and all todo bits to 1 - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(AMDGPU::VCC); + // Update EXEC, switch all done bits to 0 and all todo bits to 1 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); - // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(-7) - .addReg(AMDGPU::EXEC); + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(-7) + .addReg(AMDGPU::EXEC); - // Restore EXEC - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addReg(Save); + // Restore EXEC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addReg(Save); + } + // FIXME: Are there any values other than the LDS address clamp that need to + // be stored in the m0 register and may be live for more than a few + // instructions? If so, we should save the m0 register at the beginning + // of this function and restore it here. + // FIXME: Add support for LDS direct loads. 
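+  // Both branches above clobbered m0 with the index value, so the LDS
+  // address clamp must be re-established before any later LDS access.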
+ InitM0ForLDS(&MI); MI.eraseFromParent(); } @@ -523,8 +537,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { MachineBasicBlock &MBB = MF.front(); // Initialize M0 to a value that won't cause LDS access to be discarded // due to offset clamping - BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_MOV_B32), - AMDGPU::M0).addImm(0xffffffff); + InitM0ForLDS(MBB.getFirstNonPHI()); } if (NeedWQM && MFI->ShaderType == ShaderType::PIXEL) { diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp index af609958129c..e2df950fd271 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.cpp +++ b/lib/Target/R600/SIMachineFunctionInfo.cpp @@ -62,8 +62,10 @@ static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) { return VGPR; } } - MF->getFunction()->getContext().emitError( - "Could not found S_ENGPGM instrtuction."); + + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Could not find S_ENDPGM instruction."); + return VGPR; } diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index c72d549a3dd1..d0b677a41f27 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -14,21 +14,20 @@ #include "SIRegisterInfo.h" -#include "AMDGPUTargetMachine.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" using namespace llvm; -SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm) -: AMDGPURegisterInfo(tm), - TM(tm) +SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st) +: AMDGPURegisterInfo(st) { } BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::EXEC); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); - const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(TM.getInstrInfo()); + const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo()); TII->reserveIndirectRegisters(Reserved, MF); return Reserved; } @@ -38,15 +37,6 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, return RC->getNumRegs(); } -const TargetRegisterClass * -SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const { - switch (rc->getID()) { - case AMDGPU::GPRF32RegClassID: - return &AMDGPU::VReg_32RegClass; - default: return rc; - } -} - const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( MVT VT) const { switch(VT.SimpleTy) { @@ -135,3 +125,19 @@ unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, unsigned Index = getHWRegIndex(Reg); return SubRC->getRegister(Index + Channel); } + +bool SIRegisterInfo::regClassCanUseImmediate(int RCID) const { + switch (RCID) { + default: return false; + case AMDGPU::SSrc_32RegClassID: + case AMDGPU::SSrc_64RegClassID: + case AMDGPU::VSrc_32RegClassID: + case AMDGPU::VSrc_64RegClassID: + return true; + } +} + +bool SIRegisterInfo::regClassCanUseImmediate( + const TargetRegisterClass *RC) const { + return regClassCanUseImmediate(RC->getID()); +} diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h index 36b4fcd32a89..c9305fbad40c 100644 --- a/lib/Target/R600/SIRegisterInfo.h +++ b/lib/Target/R600/SIRegisterInfo.h @@ -20,24 +20,15 @@ namespace llvm { -class AMDGPUTargetMachine; - struct SIRegisterInfo : public AMDGPURegisterInfo { - AMDGPUTargetMachine &TM; - SIRegisterInfo(AMDGPUTargetMachine &tm); + SIRegisterInfo(const AMDGPUSubtarget &st); BitVector getReservedRegs(const MachineFunction &MF) const override; unsigned 
getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; - /// \param RC is an AMDIL reg class. - /// - /// \returns the SI register class that is equivalent to \p RC. - const TargetRegisterClass * - getISARegClass(const TargetRegisterClass *RC) const override; - /// \brief get the register class of the specified type to use in the /// CFGStructurizer const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; @@ -69,6 +60,14 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { /// \returns The sub-register of Reg that is in Channel. unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, unsigned Channel) const; + + /// \returns True if operands defined with this register class can accept + /// inline immediates. + bool regClassCanUseImmediate(int RCID) const; + + /// \returns True if operands defined with this register class can accept + /// inline immediates. + bool regClassCanUseImmediate(const TargetRegisterClass *RC) const; }; } // End namespace llvm diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td index f1f01deaf361..8974b6300625 100644 --- a/lib/Target/R600/SIRegisterInfo.td +++ b/lib/Target/R600/SIRegisterInfo.td @@ -168,7 +168,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64, (add SGPR_64Regs, VCCReg, EXECReg) >; -def SReg_128 : RegisterClass<"AMDGPU", [v4i32], 128, (add SGPR_128)>; +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>; def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>; diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp index a0b6907dd7d4..367963aebb00 100644 --- a/lib/Target/R600/SITypeRewriter.cpp +++ b/lib/Target/R600/SITypeRewriter.cpp @@ -119,8 +119,7 @@ void SITypeRewriter::visitCallInst(CallInst &I) { Type::getInt32Ty(I.getContext())){ Type *ElementTy = Arg->getType()->getVectorElementType(); std::string TypeName = "i32"; - InsertElementInst *Def = dyn_cast<InsertElementInst>(Arg); - assert(Def); + InsertElementInst *Def = cast<InsertElementInst>(Arg); Args.push_back(Def->getOperand(1)); Types.push_back(ElementTy); std::string VecTypeName = "v1" + TypeName; diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index da88820f52d2..9df005401897 100644 --- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -47,31 +47,27 @@ class SparcAsmParser : public MCTargetAsmParser { // public interface of the MCTargetAsmParser. bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, bool MatchingInlineAsm) override; bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand*> &Operands) override; + SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; - unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; // Custom parse functions for Sparc specific operands. 
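  // (A note on the signature changes below: OperandVector, assumed here to be
  // this LLVM vintage's SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>>
  // typedef, replaces the raw MCParsedAsmOperand* vectors, so the parser now
  // owns parsed operands through unique_ptr.)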
- OperandMatchResultTy - parseMEMOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + OperandMatchResultTy parseMEMOperand(OperandVector &Operands); - OperandMatchResultTy - parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - StringRef Name); + OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Name); OperandMatchResultTy - parseSparcAsmOperand(SparcOperand *&Operand, bool isCall = false); + parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Operand, + bool isCall = false); - OperandMatchResultTy - parseBranchModifiers(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + OperandMatchResultTy parseBranchModifiers(OperandVector &Operands); // returns true if Tok is matched to a register and returns register in RegNo. bool matchRegisterName(const AsmToken &Tok, unsigned &RegNo, @@ -153,8 +149,6 @@ private: SMLoc StartLoc, EndLoc; - SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} - struct Token { const char *Data; unsigned Length; @@ -182,6 +176,8 @@ private: struct MemOp Mem; }; public: + SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} + bool isToken() const override { return Kind == k_Token; } bool isReg() const override { return Kind == k_Register; } bool isImm() const override { return Kind == k_Immediate; } @@ -291,8 +287,8 @@ public: addExpr(Inst, Expr); } - static SparcOperand *CreateToken(StringRef Str, SMLoc S) { - SparcOperand *Op = new SparcOperand(k_Token); + static std::unique_ptr<SparcOperand> CreateToken(StringRef Str, SMLoc S) { + auto Op = make_unique<SparcOperand>(k_Token); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; @@ -300,10 +296,9 @@ public: return Op; } - static SparcOperand *CreateReg(unsigned RegNum, - unsigned Kind, - SMLoc S, SMLoc E) { - SparcOperand *Op = new SparcOperand(k_Register); + static std::unique_ptr<SparcOperand> CreateReg(unsigned RegNum, unsigned Kind, + SMLoc S, SMLoc E) { + auto Op = make_unique<SparcOperand>(k_Register); Op->Reg.RegNum = RegNum; Op->Reg.Kind = (SparcOperand::RegisterKind)Kind; Op->StartLoc = S; @@ -311,49 +306,51 @@ public: return Op; } - static SparcOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - SparcOperand *Op = new SparcOperand(k_Immediate); + static std::unique_ptr<SparcOperand> CreateImm(const MCExpr *Val, SMLoc S, + SMLoc E) { + auto Op = make_unique<SparcOperand>(k_Immediate); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static SparcOperand *MorphToDoubleReg(SparcOperand *Op) { - unsigned Reg = Op->getReg(); - assert(Op->Reg.Kind == rk_FloatReg); + static bool MorphToDoubleReg(SparcOperand &Op) { + unsigned Reg = Op.getReg(); + assert(Op.Reg.Kind == rk_FloatReg); unsigned regIdx = Reg - Sparc::F0; if (regIdx % 2 || regIdx > 31) - return nullptr; - Op->Reg.RegNum = DoubleRegs[regIdx / 2]; - Op->Reg.Kind = rk_DoubleReg; - return Op; + return false; + Op.Reg.RegNum = DoubleRegs[regIdx / 2]; + Op.Reg.Kind = rk_DoubleReg; + return true; } - static SparcOperand *MorphToQuadReg(SparcOperand *Op) { - unsigned Reg = Op->getReg(); + static bool MorphToQuadReg(SparcOperand &Op) { + unsigned Reg = Op.getReg(); unsigned regIdx = 0; - switch (Op->Reg.Kind) { - default: assert(0 && "Unexpected register kind!"); + switch (Op.Reg.Kind) { + default: llvm_unreachable("Unexpected register kind!"); case rk_FloatReg: regIdx = Reg - Sparc::F0; if (regIdx % 4 || regIdx > 31) - return nullptr; + return false; Reg = QuadFPRegs[regIdx / 4]; break; case rk_DoubleReg: regIdx = Reg - Sparc::D0; if (regIdx % 2 || regIdx > 31) - return 
nullptr; + return false; Reg = QuadFPRegs[regIdx / 2]; break; } - Op->Reg.RegNum = Reg; - Op->Reg.Kind = rk_QuadReg; - return Op; + Op.Reg.RegNum = Reg; + Op.Reg.Kind = rk_QuadReg; + return true; } - static SparcOperand *MorphToMEMrr(unsigned Base, SparcOperand *Op) { + static std::unique_ptr<SparcOperand> + MorphToMEMrr(unsigned Base, std::unique_ptr<SparcOperand> Op) { unsigned offsetReg = Op->getReg(); Op->Kind = k_MemoryReg; Op->Mem.Base = Base; @@ -362,10 +359,9 @@ public: return Op; } - static SparcOperand *CreateMEMri(unsigned Base, - const MCExpr *Off, - SMLoc S, SMLoc E) { - SparcOperand *Op = new SparcOperand(k_MemoryImm); + static std::unique_ptr<SparcOperand> + CreateMEMri(unsigned Base, const MCExpr *Off, SMLoc S, SMLoc E) { + auto Op = make_unique<SparcOperand>(k_MemoryImm); Op->Mem.Base = Base; Op->Mem.OffsetReg = 0; Op->Mem.Off = Off; @@ -374,7 +370,8 @@ public: return Op; } - static SparcOperand *MorphToMEMri(unsigned Base, SparcOperand *Op) { + static std::unique_ptr<SparcOperand> + MorphToMEMri(unsigned Base, std::unique_ptr<SparcOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryImm; Op->Mem.Base = Base; @@ -386,11 +383,11 @@ public: } // end namespace -bool SparcAsmParser:: -MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) { +bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) { MCInst Inst; SmallVector<MCInst, 8> Instructions; unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, @@ -415,7 +412,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); - ErrorLoc = ((SparcOperand*) Operands[ErrorInfo])->getStartLoc(); + ErrorLoc = ((SparcOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } @@ -450,11 +447,9 @@ ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) static void applyMnemonicAliases(StringRef &Mnemonic, unsigned Features, unsigned VariantID); -bool SparcAsmParser:: -ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand*> &Operands) -{ +bool SparcAsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { // First operand in MCInst is instruction mnemonic. Operands.push_back(SparcOperand::CreateToken(Name, NameLoc)); @@ -548,9 +543,8 @@ bool SparcAsmParser:: parseDirectiveWord(unsigned Size, SMLoc L) { return false; } -SparcAsmParser::OperandMatchResultTy SparcAsmParser:: -parseMEMOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) -{ +SparcAsmParser::OperandMatchResultTy +SparcAsmParser::parseMEMOperand(OperandVector &Operands) { SMLoc S, E; unsigned BaseReg = 0; @@ -575,23 +569,20 @@ parseMEMOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) break; } - SparcOperand *Offset = nullptr; + std::unique_ptr<SparcOperand> Offset; OperandMatchResultTy ResTy = parseSparcAsmOperand(Offset); if (ResTy != MatchOperand_Success || !Offset) return MatchOperand_NoMatch; - Offset = (Offset->isImm() - ? SparcOperand::MorphToMEMri(BaseReg, Offset) - : SparcOperand::MorphToMEMrr(BaseReg, Offset)); + Operands.push_back( + Offset->isImm() ? 
SparcOperand::MorphToMEMri(BaseReg, std::move(Offset)) + : SparcOperand::MorphToMEMrr(BaseReg, std::move(Offset))); - Operands.push_back(Offset); return MatchOperand_Success; } -SparcAsmParser::OperandMatchResultTy SparcAsmParser:: -parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - StringRef Mnemonic) -{ +SparcAsmParser::OperandMatchResultTy +SparcAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); @@ -637,21 +628,21 @@ parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, return MatchOperand_Success; } - SparcOperand *Op = nullptr; + std::unique_ptr<SparcOperand> Op; ResTy = parseSparcAsmOperand(Op, (Mnemonic == "call")); if (ResTy != MatchOperand_Success || !Op) return MatchOperand_ParseFail; // Push the parsed operand into the list of operands - Operands.push_back(Op); + Operands.push_back(std::move(Op)); return MatchOperand_Success; } SparcAsmParser::OperandMatchResultTy -SparcAsmParser::parseSparcAsmOperand(SparcOperand *&Op, bool isCall) -{ +SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op, + bool isCall) { SMLoc S = Parser.getTok().getLoc(); SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); @@ -718,8 +709,8 @@ SparcAsmParser::parseSparcAsmOperand(SparcOperand *&Op, bool isCall) return (Op) ? MatchOperand_Success : MatchOperand_ParseFail; } -SparcAsmParser::OperandMatchResultTy SparcAsmParser:: -parseBranchModifiers(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +SparcAsmParser::OperandMatchResultTy +SparcAsmParser::parseBranchModifiers(OperandVector &Operands) { // parse (,a|,pn|,pt)+ @@ -928,18 +919,14 @@ extern "C" void LLVMInitializeSparcAsmParser() { #define GET_MATCHER_IMPLEMENTATION #include "SparcGenAsmMatcher.inc" - - -unsigned SparcAsmParser:: -validateTargetOperandClass(MCParsedAsmOperand *GOp, - unsigned Kind) -{ - SparcOperand *Op = (SparcOperand*)GOp; - if (Op->isFloatOrDoubleReg()) { +unsigned SparcAsmParser::validateTargetOperandClass(MCParsedAsmOperand &GOp, + unsigned Kind) { + SparcOperand &Op = (SparcOperand &)GOp; + if (Op.isFloatOrDoubleReg()) { switch (Kind) { default: break; case MCK_DFPRegs: - if (!Op->isFloatReg() || SparcOperand::MorphToDoubleReg(Op)) + if (!Op.isFloatReg() || SparcOperand::MorphToDoubleReg(Op)) return MCTargetAsmParser::Match_Success; break; case MCK_QFPRegs: diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp index 261fb3838d37..5975a517994a 100644 --- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp +++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp @@ -173,6 +173,6 @@ void SparcInstPrinter::printCCOperand(const MCInst *MI, int opNum, bool SparcInstPrinter::printGetPCX(const MCInst *MI, unsigned opNum, raw_ostream &O) { - assert(0 && "FIXME: Implement SparcInstPrinter::printGetPCX."); + llvm_unreachable("FIXME: Implement SparcInstPrinter::printGetPCX."); return true; } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index 7d517b699b22..dcd81e3d6249 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -196,12 +196,12 @@ namespace { const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override { // FIXME. 
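    // (As elsewhere in these Sparc files, assert(0 && ...) becomes
    // llvm_unreachable(...), presumably because the latter also marks the
    // path unreachable in release builds, where asserts compile away.)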
- assert(0 && "fixupNeedsRelaxation() unimplemented"); + llvm_unreachable("fixupNeedsRelaxation() unimplemented"); return false; } void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { // FIXME. - assert(0 && "relaxInstruction() unimplemented"); + llvm_unreachable("relaxInstruction() unimplemented"); } bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override { diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp index b19ad7b45ca6..eea9626c17b0 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp @@ -133,7 +133,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, if (Expr->EvaluateAsAbsolute(Res)) return Res; - assert(0 && "Unhandled expression!"); + llvm_unreachable("Unhandled expression!"); return 0; } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp index ae57fdc4eebd..7f01ab06879f 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELF.h" +#include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Object/ELF.h" @@ -124,7 +125,7 @@ SparcMCExpr::VariantKind SparcMCExpr::parseVariantKind(StringRef name) Sparc::Fixups SparcMCExpr::getFixupKind(SparcMCExpr::VariantKind Kind) { switch (Kind) { - default: assert(0 && "Unhandled SparcMCExpr::VariantKind"); + default: llvm_unreachable("Unhandled SparcMCExpr::VariantKind"); case VK_Sparc_LO: return Sparc::fixup_sparc_lo10; case VK_Sparc_HI: return Sparc::fixup_sparc_hi22; case VK_Sparc_H44: return Sparc::fixup_sparc_h44; @@ -219,35 +220,6 @@ void SparcMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const { fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm); } -// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps -// that method should be made public? -// FIXME: really do above: now that at least three other backends are using it. 
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { - switch (Value->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expr!"); - break; - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value); - AddValueSymbolsImpl(BE->getLHS(), Asm); - AddValueSymbolsImpl(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: - Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm); - break; - } -} - -void SparcMCExpr::AddValueSymbols(MCAssembler *Asm) const { - AddValueSymbolsImpl(getSubExpr(), Asm); +void SparcMCExpr::visitUsedExpr(MCStreamer &Streamer) const { + Streamer.visitUsedExpr(*getSubExpr()); } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h index 78dd945e2277..f0d0ef363ad8 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h @@ -88,7 +88,7 @@ public: void PrintImpl(raw_ostream &OS) const override; bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout) const override; - void AddValueSymbols(MCAssembler *) const override; + void visitUsedExpr(MCStreamer &Streamer) const override; const MCSection *FindAssociatedSection() const override { return getSubExpr()->FindAssociatedSection(); } diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp index a37da94df1a9..3cdfda3e059a 100644 --- a/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/lib/Target/Sparc/SparcFrameLowering.cpp @@ -14,6 +14,7 @@ #include "SparcFrameLowering.h" #include "SparcInstrInfo.h" #include "SparcMachineFunctionInfo.h" +#include "SparcSubtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -32,6 +33,9 @@ DisableLeafProc("disable-sparc-leaf-proc", cl::desc("Disable Sparc leaf procedure optimization."), cl::Hidden); +SparcFrameLowering::SparcFrameLowering(const SparcSubtarget &ST) + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, + ST.is64Bit() ? 16 : 8, 0, ST.is64Bit() ? 
16 : 8) {} void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF, MachineBasicBlock &MBB, @@ -99,7 +103,9 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF) const { SAVEri = SP::ADDri; SAVErr = SP::ADDrr; } - NumBytes = - SubTarget.getAdjustedFrameSize(NumBytes); + NumBytes = + -MF.getTarget().getSubtarget<SparcSubtarget>().getAdjustedFrameSize( + NumBytes); emitSPAdjustment(MF, MBB, MBBI, NumBytes, SAVErr, SAVEri); MachineModuleInfo &MMI = MF.getMMI(); @@ -162,7 +168,8 @@ void SparcFrameLowering::emitEpilogue(MachineFunction &MF, if (NumBytes == 0) return; - NumBytes = SubTarget.getAdjustedFrameSize(NumBytes); + NumBytes = MF.getTarget().getSubtarget<SparcSubtarget>().getAdjustedFrameSize( + NumBytes); emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri); } diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h index bda7b7cd185e..a7d1b8902dcd 100644 --- a/lib/Target/Sparc/SparcFrameLowering.h +++ b/lib/Target/Sparc/SparcFrameLowering.h @@ -15,19 +15,14 @@ #define SPARC_FRAMEINFO_H #include "Sparc.h" -#include "SparcSubtarget.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { - class SparcSubtarget; +class SparcSubtarget; class SparcFrameLowering : public TargetFrameLowering { - const SparcSubtarget &SubTarget; public: - explicit SparcFrameLowering(const SparcSubtarget &ST) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, - ST.is64Bit() ? 16 : 8, 0, ST.is64Bit() ? 16 : 8), - SubTarget(ST) {} + explicit SparcFrameLowering(const SparcSubtarget &ST); /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index ef614667ee6d..990f52a97275 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -2030,7 +2030,7 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG, } TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(SDLoc(Op)).setChain(Chain) - .setCallee(CallingConv::C, RetTyABI, Callee, &Args, 0); + .setCallee(CallingConv::C, RetTyABI, Callee, std::move(Args), 0); std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); @@ -2086,7 +2086,7 @@ SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(DL).setChain(Chain) - .setCallee(CallingConv::C, RetTy, Callee, &Args, 0); + .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0); std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); diff --git a/lib/Target/Sparc/SparcJITInfo.cpp b/lib/Target/Sparc/SparcJITInfo.cpp index c775e9e83ee6..d0eec98b5e91 100644 --- a/lib/Target/Sparc/SparcJITInfo.cpp +++ b/lib/Target/Sparc/SparcJITInfo.cpp @@ -213,7 +213,8 @@ extern "C" void *SparcCompilationCallbackC(intptr_t StubAddr) { void SparcJITInfo::replaceMachineCodeForFunction(void *Old, void *New) { - assert(0 && "FIXME: Implement SparcJITInfo::replaceMachineCodeForFunction"); + llvm_unreachable("FIXME: Implement SparcJITInfo::" + "replaceMachineCodeForFunction"); } diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp index eb36d2940b76..a308fc5e739e 100644 --- a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp +++ b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp @@ -11,13 +11,13 @@ // //===----------------------------------------------------------------------===// -#include "SparcTargetMachine.h" +#include "SparcSelectionDAGInfo.h" using namespace llvm; #define 
DEBUG_TYPE "sparc-selectiondag-info" -SparcSelectionDAGInfo::SparcSelectionDAGInfo(const SparcTargetMachine &TM) - : TargetSelectionDAGInfo(TM) { +SparcSelectionDAGInfo::SparcSelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) { } SparcSelectionDAGInfo::~SparcSelectionDAGInfo() { diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.h b/lib/Target/Sparc/SparcSelectionDAGInfo.h index dcd42037253d..2346f4109dcb 100644 --- a/lib/Target/Sparc/SparcSelectionDAGInfo.h +++ b/lib/Target/Sparc/SparcSelectionDAGInfo.h @@ -22,7 +22,7 @@ class SparcTargetMachine; class SparcSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit SparcSelectionDAGInfo(const SparcTargetMachine &TM); + explicit SparcSelectionDAGInfo(const DataLayout &DL); ~SparcSelectionDAGInfo(); }; diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp index e38fb02c9a89..eea0c8c33c6a 100644 --- a/lib/Target/Sparc/SparcSubtarget.cpp +++ b/lib/Target/Sparc/SparcSubtarget.cpp @@ -26,20 +26,44 @@ using namespace llvm; void SparcSubtarget::anchor() { } -SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64Bit) : - SparcGenSubtargetInfo(TT, CPU, FS), - IsV9(false), - V8DeprecatedInsts(false), - IsVIS(false), - Is64Bit(is64Bit), - HasHardQuad(false), - UsePopc(false) { +static std::string computeDataLayout(const SparcSubtarget &ST) { + // Sparc is big endian. + std::string Ret = "E-m:e"; + + // Some ABIs have 32bit pointers. + if (!ST.is64Bit()) + Ret += "-p:32:32"; + + // Alignments for 64 bit integers. + Ret += "-i64:64"; + + // On SparcV9 128 floats are aligned to 128 bits, on others only to 64. + // On SparcV9 registers can hold 64 or 32 bits, on others only 32. + if (ST.is64Bit()) + Ret += "-n32:64"; + else + Ret += "-f128:64-n32"; + + if (ST.is64Bit()) + Ret += "-S128"; + else + Ret += "-S64"; + + return Ret; +} + +SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { + IsV9 = false; + V8DeprecatedInsts = false; + IsVIS = false; + HasHardQuad = false; + UsePopc = false; // Determine default and user specified characteristics std::string CPUName = CPU; if (CPUName.empty()) - CPUName = (is64Bit) ? "v9" : "v8"; + CPUName = (Is64Bit) ? "v9" : "v8"; // Parse features string. ParseSubtargetFeatures(CPUName, FS); @@ -47,8 +71,16 @@ SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU, // Popc is a v9-only instruction. 
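  // The features string parsed above may still have enabled it on a pre-v9
  // CPU, so it is cleared again here.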
if (!IsV9) UsePopc = false; + + return *this; } +SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, TargetMachine &TM, + bool is64Bit) + : SparcGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), + DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS))), + InstrInfo(*this), TLInfo(TM), TSInfo(DL), FrameLowering(*this) {} int SparcSubtarget::getAdjustedFrameSize(int frameSize) const { diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h index 4025622be8e4..a3357786cded 100644 --- a/lib/Target/Sparc/SparcSubtarget.h +++ b/lib/Target/Sparc/SparcSubtarget.h @@ -14,6 +14,13 @@ #ifndef SPARC_SUBTARGET_H #define SPARC_SUBTARGET_H +#include "SparcFrameLowering.h" +#include "SparcInstrInfo.h" +#include "SparcISelLowering.h" +#include "SparcJITInfo.h" +#include "SparcSelectionDAGInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -31,10 +38,26 @@ class SparcSubtarget : public SparcGenSubtargetInfo { bool Is64Bit; bool HasHardQuad; bool UsePopc; + const DataLayout DL; // Calculates type size & alignment + SparcInstrInfo InstrInfo; + SparcTargetLowering TLInfo; + SparcSelectionDAGInfo TSInfo; + SparcFrameLowering FrameLowering; + SparcJITInfo JITInfo; public: SparcSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool is64bit); + const std::string &FS, TargetMachine &TM, bool is64bit); + + const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; } + const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; } + const SparcRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + const SparcTargetLowering *getTargetLowering() const { return &TLInfo; } + const SparcSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + SparcJITInfo *getJITInfo() { return &JITInfo; } + const DataLayout *getDataLayout() const { return &DL; } bool isV9() const { return IsV9; } bool isVIS() const { return IsVIS; } @@ -47,6 +70,7 @@ public: /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + SparcSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); bool is64Bit() const { return Is64Bit; } diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 2469d9312c16..0130face3ff6 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -23,32 +23,6 @@ extern "C" void LLVMInitializeSparcTarget() { RegisterTargetMachine<SparcV9TargetMachine> Y(TheSparcV9Target); } -static std::string computeDataLayout(const SparcSubtarget &ST) { - // Sparc is big endian. - std::string Ret = "E-m:e"; - - // Some ABIs have 32bit pointers. - if (!ST.is64Bit()) - Ret += "-p:32:32"; - - // Alignments for 64 bit integers. - Ret += "-i64:64"; - - // On SparcV9 128 floats are aligned to 128 bits, on others only to 64. - // On SparcV9 registers can hold 64 or 32 bits, on others only 32. 
- if (ST.is64Bit()) - Ret += "-n32:64"; - else - Ret += "-f128:64-n32"; - - if (ST.is64Bit()) - Ret += "-S128"; - else - Ret += "-S64"; - - return Ret; -} - /// SparcTargetMachine ctor - Create an ILP32 architecture model /// SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT, @@ -58,11 +32,7 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT, CodeGenOpt::Level OL, bool is64bit) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, is64bit), - DL(computeDataLayout(Subtarget)), - InstrInfo(Subtarget), - TLInfo(*this), TSInfo(*this), - FrameLowering(Subtarget) { + Subtarget(TT, CPU, FS, *this, is64bit) { initAsmInfo(); } diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h index 7d043388e8cf..03b513746dfe 100644 --- a/lib/Target/Sparc/SparcTargetMachine.h +++ b/lib/Target/Sparc/SparcTargetMachine.h @@ -14,50 +14,40 @@ #ifndef SPARCTARGETMACHINE_H #define SPARCTARGETMACHINE_H -#include "SparcFrameLowering.h" -#include "SparcISelLowering.h" #include "SparcInstrInfo.h" -#include "SparcJITInfo.h" -#include "SparcSelectionDAGInfo.h" #include "SparcSubtarget.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { class SparcTargetMachine : public LLVMTargetMachine { SparcSubtarget Subtarget; - const DataLayout DL; // Calculates type size & alignment - SparcInstrInfo InstrInfo; - SparcTargetLowering TLInfo; - SparcSelectionDAGInfo TSInfo; - SparcFrameLowering FrameLowering; - SparcJITInfo JITInfo; public: SparcTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool is64bit); - const SparcInstrInfo *getInstrInfo() const override { return &InstrInfo; } - const TargetFrameLowering *getFrameLowering() const override { - return &FrameLowering; + const SparcInstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); + } + const TargetFrameLowering *getFrameLowering() const override { + return getSubtargetImpl()->getFrameLowering(); } const SparcSubtarget *getSubtargetImpl() const override { return &Subtarget; } const SparcRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); + return getSubtargetImpl()->getRegisterInfo(); } - const SparcTargetLowering* getTargetLowering() const override { - return &TLInfo; + const SparcTargetLowering *getTargetLowering() const override { + return getSubtargetImpl()->getTargetLowering(); } - const SparcSelectionDAGInfo* getSelectionDAGInfo() const override { - return &TSInfo; + const SparcSelectionDAGInfo *getSelectionDAGInfo() const override { + return getSubtargetImpl()->getSelectionDAGInfo(); } - SparcJITInfo *getJITInfo() override { - return &JITInfo; + SparcJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); } - const DataLayout *getDataLayout() const override { return &DL; } // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 71de64fff87e..758be41ce263 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -104,10 +104,6 @@ private: MemOp Mem; }; - 
SystemZOperand(OperandKind kind, SMLoc startLoc, SMLoc endLoc) - : Kind(kind), StartLoc(startLoc), EndLoc(endLoc) - {} - void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediates when possible. Null MCExpr = 0. if (!Expr) @@ -119,40 +115,44 @@ private: } public: + SystemZOperand(OperandKind kind, SMLoc startLoc, SMLoc endLoc) + : Kind(kind), StartLoc(startLoc), EndLoc(endLoc) {} + // Create particular kinds of operand. - static SystemZOperand *createInvalid(SMLoc StartLoc, SMLoc EndLoc) { - return new SystemZOperand(KindInvalid, StartLoc, EndLoc); + static std::unique_ptr<SystemZOperand> createInvalid(SMLoc StartLoc, + SMLoc EndLoc) { + return make_unique<SystemZOperand>(KindInvalid, StartLoc, EndLoc); } - static SystemZOperand *createToken(StringRef Str, SMLoc Loc) { - SystemZOperand *Op = new SystemZOperand(KindToken, Loc, Loc); + static std::unique_ptr<SystemZOperand> createToken(StringRef Str, SMLoc Loc) { + auto Op = make_unique<SystemZOperand>(KindToken, Loc, Loc); Op->Token.Data = Str.data(); Op->Token.Length = Str.size(); return Op; } - static SystemZOperand *createReg(RegisterKind Kind, unsigned Num, - SMLoc StartLoc, SMLoc EndLoc) { - SystemZOperand *Op = new SystemZOperand(KindReg, StartLoc, EndLoc); + static std::unique_ptr<SystemZOperand> + createReg(RegisterKind Kind, unsigned Num, SMLoc StartLoc, SMLoc EndLoc) { + auto Op = make_unique<SystemZOperand>(KindReg, StartLoc, EndLoc); Op->Reg.Kind = Kind; Op->Reg.Num = Num; return Op; } - static SystemZOperand *createAccessReg(unsigned Num, SMLoc StartLoc, - SMLoc EndLoc) { - SystemZOperand *Op = new SystemZOperand(KindAccessReg, StartLoc, EndLoc); + static std::unique_ptr<SystemZOperand> + createAccessReg(unsigned Num, SMLoc StartLoc, SMLoc EndLoc) { + auto Op = make_unique<SystemZOperand>(KindAccessReg, StartLoc, EndLoc); Op->AccessReg = Num; return Op; } - static SystemZOperand *createImm(const MCExpr *Expr, SMLoc StartLoc, - SMLoc EndLoc) { - SystemZOperand *Op = new SystemZOperand(KindImm, StartLoc, EndLoc); + static std::unique_ptr<SystemZOperand> + createImm(const MCExpr *Expr, SMLoc StartLoc, SMLoc EndLoc) { + auto Op = make_unique<SystemZOperand>(KindImm, StartLoc, EndLoc); Op->Imm = Expr; return Op; } - static SystemZOperand *createMem(RegisterKind RegKind, unsigned Base, - const MCExpr *Disp, unsigned Index, - const MCExpr *Length, SMLoc StartLoc, - SMLoc EndLoc) { - SystemZOperand *Op = new SystemZOperand(KindMem, StartLoc, EndLoc); + static std::unique_ptr<SystemZOperand> + createMem(RegisterKind RegKind, unsigned Base, const MCExpr *Disp, + unsigned Index, const MCExpr *Length, SMLoc StartLoc, + SMLoc EndLoc) { + auto Op = make_unique<SystemZOperand>(KindMem, StartLoc, EndLoc); Op->Mem.RegKind = RegKind; Op->Mem.Base = Base; Op->Mem.Index = Index; @@ -313,21 +313,19 @@ private: bool parseRegister(Register &Reg, RegisterGroup Group, const unsigned *Regs, bool IsAddress = false); - OperandMatchResultTy - parseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - RegisterGroup Group, const unsigned *Regs, RegisterKind Kind); + OperandMatchResultTy parseRegister(OperandVector &Operands, + RegisterGroup Group, const unsigned *Regs, + RegisterKind Kind); bool parseAddress(unsigned &Base, const MCExpr *&Disp, unsigned &Index, const MCExpr *&Length, const unsigned *Regs, RegisterKind RegKind); - OperandMatchResultTy - parseAddress(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - const unsigned *Regs, RegisterKind RegKind, - MemoryKind MemKind); + OperandMatchResultTy parseAddress(OperandVector 
&Operands, + const unsigned *Regs, RegisterKind RegKind, + MemoryKind MemKind); - bool parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - StringRef Mnemonic); + bool parseOperand(OperandVector &Operands, StringRef Mnemonic); public: SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, @@ -343,87 +341,66 @@ public: // Override MCTargetAsmParser. bool ParseDirective(AsmToken DirectiveID) override; bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseInstruction(ParseInstructionInfo &Info, - StringRef Name, SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand*> &Operands) - override; + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, bool MatchingInlineAsm) override; // Used by the TableGen code to parse particular operand types. - OperandMatchResultTy - parseGR32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseGR32(OperandVector &Operands) { return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, GR32Reg); } - OperandMatchResultTy - parseGRH32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseGRH32(OperandVector &Operands) { return parseRegister(Operands, RegGR, SystemZMC::GRH32Regs, GRH32Reg); } - OperandMatchResultTy - parseGRX32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseGRX32(OperandVector &Operands) { llvm_unreachable("GRX32 should only be used for pseudo instructions"); } - OperandMatchResultTy - parseGR64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseGR64(OperandVector &Operands) { return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, GR64Reg); } - OperandMatchResultTy - parseGR128(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseGR128(OperandVector &Operands) { return parseRegister(Operands, RegGR, SystemZMC::GR128Regs, GR128Reg); } - OperandMatchResultTy - parseADDR32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseADDR32(OperandVector &Operands) { return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, ADDR32Reg); } - OperandMatchResultTy - parseADDR64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseADDR64(OperandVector &Operands) { return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, ADDR64Reg); } - OperandMatchResultTy - parseADDR128(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseADDR128(OperandVector &Operands) { llvm_unreachable("Shouldn't be used as an operand"); } - OperandMatchResultTy - parseFP32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseFP32(OperandVector &Operands) { return parseRegister(Operands, RegFP, SystemZMC::FP32Regs, FP32Reg); } - OperandMatchResultTy - parseFP64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseFP64(OperandVector &Operands) { return parseRegister(Operands, RegFP, SystemZMC::FP64Regs, FP64Reg); } - OperandMatchResultTy - parseFP128(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseFP128(OperandVector &Operands) { return parseRegister(Operands, RegFP, SystemZMC::FP128Regs, FP128Reg); } - OperandMatchResultTy - parseBDAddr32(SmallVectorImpl<MCParsedAsmOperand*> 
&Operands) { + OperandMatchResultTy parseBDAddr32(OperandVector &Operands) { return parseAddress(Operands, SystemZMC::GR32Regs, ADDR32Reg, BDMem); } - OperandMatchResultTy - parseBDAddr64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseBDAddr64(OperandVector &Operands) { return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDMem); } - OperandMatchResultTy - parseBDXAddr64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseBDXAddr64(OperandVector &Operands) { return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDXMem); } - OperandMatchResultTy - parseBDLAddr64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseBDLAddr64(OperandVector &Operands) { return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDLMem); } - OperandMatchResultTy - parseAccessReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands); - OperandMatchResultTy - parsePCRel(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - int64_t MinVal, int64_t MaxVal); - OperandMatchResultTy - parsePCRel16(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parseAccessReg(OperandVector &Operands); + OperandMatchResultTy parsePCRel(OperandVector &Operands, int64_t MinVal, + int64_t MaxVal); + OperandMatchResultTy parsePCRel16(OperandVector &Operands) { return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1); } - OperandMatchResultTy - parsePCRel32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + OperandMatchResultTy parsePCRel32(OperandVector &Operands) { return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1); } }; @@ -497,9 +474,8 @@ bool SystemZAsmParser::parseRegister(Register &Reg, RegisterGroup Group, // Parse a register and add it to Operands. The other arguments are as above. SystemZAsmParser::OperandMatchResultTy -SystemZAsmParser::parseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - RegisterGroup Group, const unsigned *Regs, - RegisterKind Kind) { +SystemZAsmParser::parseRegister(OperandVector &Operands, RegisterGroup Group, + const unsigned *Regs, RegisterKind Kind) { if (Parser.getTok().isNot(AsmToken::Percent)) return MatchOperand_NoMatch; @@ -566,9 +542,8 @@ bool SystemZAsmParser::parseAddress(unsigned &Base, const MCExpr *&Disp, // Parse a memory operand and add it to Operands. The other arguments // are as above. SystemZAsmParser::OperandMatchResultTy -SystemZAsmParser::parseAddress(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - const unsigned *Regs, RegisterKind RegKind, - MemoryKind MemKind) { +SystemZAsmParser::parseAddress(OperandVector &Operands, const unsigned *Regs, + RegisterKind RegKind, MemoryKind MemKind) { SMLoc StartLoc = Parser.getTok().getLoc(); unsigned Base, Index; const MCExpr *Disp; @@ -622,9 +597,9 @@ bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, return false; } -bool SystemZAsmParser:: -ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +bool SystemZAsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { Operands.push_back(SystemZOperand::createToken(Name, NameLoc)); // Read the remaining operands. 
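
A note on the SystemZAsmParser hunks above: the operand factories now return std::unique_ptr<SystemZOperand> instead of raw pointers, and the parse entry points take the owning OperandVector rather than SmallVectorImpl<MCParsedAsmOperand*>. The stand-alone sketch below illustrates only that ownership pattern; ParsedOperand and TokenOperand are hypothetical stand-ins, not the real MCParsedAsmOperand hierarchy, and it uses C++14 std::make_unique where the 3.5-era tree used LLVM's own make_unique helper.

#include <memory>
#include <string>
#include <vector>

struct ParsedOperand {                 // stand-in for MCParsedAsmOperand
  virtual ~ParsedOperand() = default;
};

struct TokenOperand : ParsedOperand {  // stand-in for SystemZOperand
  std::string Token;
  // The factory returns an owning pointer, mirroring the new createToken().
  static std::unique_ptr<TokenOperand> create(std::string Tok) {
    auto Op = std::make_unique<TokenOperand>();
    Op->Token = std::move(Tok);
    return Op;
  }
};

// Analogue of OperandVector: the container owns every operand, so the
// early-return paths of a parser cannot leak half-built operand lists.
using OperandList = std::vector<std::unique_ptr<ParsedOperand>>;

int main() {
  OperandList Operands;
  Operands.push_back(TokenOperand::create("brc"));
  return Operands.size() == 1 ? 0 : 1;
}

With the container owning the operands, error paths in ParseInstruction and parseOperand no longer have to free partially built operand lists by hand.
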
@@ -655,9 +630,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, return false; } -bool SystemZAsmParser:: -parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - StringRef Mnemonic) { +bool SystemZAsmParser::parseOperand(OperandVector &Operands, + StringRef Mnemonic) { // Check if the current operand has a custom associated parser, if so, try to // custom parse the operand, or fallback to the general approach. OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); @@ -700,11 +674,11 @@ parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, return false; } -bool SystemZAsmParser:: -MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) { +bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) { MCInst Inst; unsigned MatchResult; @@ -739,7 +713,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); - ErrorLoc = ((SystemZOperand*)Operands[ErrorInfo])->getStartLoc(); + ErrorLoc = ((SystemZOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } @@ -753,8 +727,8 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, llvm_unreachable("Unexpected match type"); } -SystemZAsmParser::OperandMatchResultTy SystemZAsmParser:: -parseAccessReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +SystemZAsmParser::OperandMatchResultTy +SystemZAsmParser::parseAccessReg(OperandVector &Operands) { if (Parser.getTok().isNot(AsmToken::Percent)) return MatchOperand_NoMatch; @@ -768,9 +742,9 @@ parseAccessReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { return MatchOperand_Success; } -SystemZAsmParser::OperandMatchResultTy SystemZAsmParser:: -parsePCRel(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - int64_t MinVal, int64_t MaxVal) { +SystemZAsmParser::OperandMatchResultTy +SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal, + int64_t MaxVal) { MCContext &Ctx = getContext(); MCStreamer &Out = getStreamer(); const MCExpr *Expr; diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td index c4f641e7bdec..fb0d1d8a3fe7 100644 --- a/lib/Target/SystemZ/SystemZCallingConv.td +++ b/lib/Target/SystemZ/SystemZCallingConv.td @@ -13,7 +13,7 @@ class CCIfExtend<CCAction A> : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>; //===----------------------------------------------------------------------===// -// SVR4 return value calling convention +// z/Linux return value calling convention //===----------------------------------------------------------------------===// def RetCC_SystemZ : CallingConv<[ // Promote i32 to i64 if it has an explicit extension type. @@ -39,7 +39,7 @@ def RetCC_SystemZ : CallingConv<[ ]>; //===----------------------------------------------------------------------===// -// SVR4 argument calling conventions +// z/Linux argument calling conventions //===----------------------------------------------------------------------===// def CC_SystemZ : CallingConv<[ // Promote i32 to i64 if it has an explicit extension type. @@ -63,3 +63,9 @@ def CC_SystemZ : CallingConv<[ // Other arguments are passed in 8-byte-aligned 8-byte stack slots. 
CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>> ]>; + +//===----------------------------------------------------------------------===// +// z/Linux callee-saved registers +//===----------------------------------------------------------------------===// +def CSR_SystemZ : CalleeSavedRegs<(add (sequence "R%dD", 6, 15), + (sequence "F%dD", 8, 15))>; diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp index 65f3caf64e4c..055dbe914995 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -10,8 +10,9 @@ #include "SystemZFrameLowering.h" #include "SystemZCallingConv.h" #include "SystemZInstrBuilder.h" +#include "SystemZInstrInfo.h" #include "SystemZMachineFunctionInfo.h" -#include "SystemZTargetMachine.h" +#include "SystemZRegisterInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" @@ -44,11 +45,9 @@ static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = { }; } // end anonymous namespace -SystemZFrameLowering::SystemZFrameLowering(const SystemZTargetMachine &tm, - const SystemZSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, - -SystemZMC::CallFrameSize, 8), - TM(tm), STI(sti) { +SystemZFrameLowering::SystemZFrameLowering() + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, + -SystemZMC::CallFrameSize, 8) { // Create a mapping from register number to save slot offset. RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS); for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I) @@ -108,9 +107,8 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // instruction, or an implicit one that comes between the explicit start // and end registers. static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB, - const SystemZTargetMachine &TM, unsigned GPR64, bool IsImplicit) { - const SystemZRegisterInfo *RI = TM.getRegisterInfo(); + const TargetRegisterInfo *RI = MBB.getParent()->getTarget().getRegisterInfo(); unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32); bool IsLive = MBB.isLiveIn(GPR64) || MBB.isLiveIn(GPR32); if (!IsLive || !IsImplicit) { @@ -176,8 +174,8 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::STMG)); // Add the explicit register operands. - addSavedGPR(MBB, MIB, TM, LowGPR, false); - addSavedGPR(MBB, MIB, TM, HighGPR, false); + addSavedGPR(MBB, MIB, LowGPR, false); + addSavedGPR(MBB, MIB, HighGPR, false); // Add the address. MIB.addReg(SystemZ::R15D).addImm(StartOffset); @@ -187,13 +185,13 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, for (unsigned I = 0, E = CSI.size(); I != E; ++I) { unsigned Reg = CSI[I].getReg(); if (SystemZ::GR64BitRegClass.contains(Reg)) - addSavedGPR(MBB, MIB, TM, Reg, true); + addSavedGPR(MBB, MIB, Reg, true); } // ...likewise GPR varargs. if (IsVarArg) for (unsigned I = ZFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I) - addSavedGPR(MBB, MIB, TM, SystemZ::ArgGPRs[I], true); + addSavedGPR(MBB, MIB, SystemZ::ArgGPRs[I], true); } // Save FPRs in the normal TargetInstrInfo way. 
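
A note on the SystemZFrameLowering.cpp hunks above: with the cached TM and STI members gone, addSavedGPR recovers the register info from the block it is editing through MBB.getParent()->getTarget().getRegisterInfo(). The fragment below restates that lookup chain in isolation; it is a sketch against the 3.5-era headers this commit targets, uses only calls already visible in the hunk, and regInfoFor is a hypothetical helper name.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetMachine.h"

// Hypothetical helper, not part of the patch: walk MBB -> MachineFunction
// -> TargetMachine at the point of use instead of caching the target
// machine in the frame-lowering object when it is constructed.
static const llvm::TargetRegisterInfo *
regInfoFor(const llvm::MachineBasicBlock &MBB) {
  const llvm::MachineFunction *MF = MBB.getParent(); // enclosing function
  return MF->getTarget().getRegisterInfo();          // its target's reg info
}

Decoupling the lookup from a stored TargetMachine is what lets the new SystemZFrameLowering() constructor take no arguments at all.
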
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h index 70e25fb243b2..4d5fe6dce62d 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/lib/Target/SystemZ/SystemZFrameLowering.h @@ -10,7 +10,6 @@ #ifndef SYSTEMZFRAMELOWERING_H #define SYSTEMZFRAMELOWERING_H -#include "SystemZSubtarget.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/Target/TargetFrameLowering.h" @@ -21,13 +20,8 @@ class SystemZSubtarget; class SystemZFrameLowering : public TargetFrameLowering { IndexedMap<unsigned> RegSpillOffsets; -protected: - const SystemZTargetMachine &TM; - const SystemZSubtarget &STI; - public: - SystemZFrameLowering(const SystemZTargetMachine &tm, - const SystemZSubtarget &sti); + SystemZFrameLowering(); // Override TargetFrameLowering. bool isFPCloseToIncomingSP() const override { return false; } diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 6fe1fb9f7d3d..00c65f5bba6b 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -80,9 +80,9 @@ static MachineOperand earlyUseOperand(MachineOperand Op) { return Op; } -SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm) - : TargetLowering(tm, new TargetLoweringObjectFileELF()), - Subtarget(*tm.getSubtargetImpl()), TM(tm) { +SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm) + : TargetLowering(tm, new TargetLoweringObjectFileELF()), + Subtarget(tm.getSubtarget<SystemZSubtarget>()) { MVT PtrVT = getPointerTy(); // Set up the register classes. @@ -673,11 +673,13 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, MachineRegisterInfo &MRI = MF.getRegInfo(); SystemZMachineFunctionInfo *FuncInfo = MF.getInfo<SystemZMachineFunctionInfo>(); - auto *TFL = static_cast<const SystemZFrameLowering *>(TM.getFrameLowering()); + auto *TFL = static_cast<const SystemZFrameLowering *>( + DAG.getTarget().getFrameLowering()); // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, IsVarArg, MF, TM, ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), ArgLocs, + *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ); unsigned NumFixedGPRs = 0; @@ -815,7 +817,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Analyze the operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState ArgCCInfo(CallConv, IsVarArg, MF, TM, ArgLocs, *DAG.getContext()); + CCState ArgCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), ArgLocs, + *DAG.getContext()); ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ); // We don't support GuaranteedTailCallOpt, only automatically-detected @@ -911,6 +914,12 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(DAG.getRegister(RegsToPass[I].first, RegsToPass[I].second.getValueType())); + // Add a register mask operand representing the call-preserved registers. + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + // Glue the call to the argument copies, if any. if (Glue.getNode()) Ops.push_back(Glue); @@ -931,7 +940,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Assign locations to each value returned by this call. 
SmallVector<CCValAssign, 16> RetLocs; - CCState RetCCInfo(CallConv, IsVarArg, MF, TM, RetLocs, *DAG.getContext()); + CCState RetCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), RetLocs, + *DAG.getContext()); RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ); // Copy all of the result registers out of their specified physreg. @@ -962,7 +972,8 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, // Assign locations to each returned value. SmallVector<CCValAssign, 16> RetLocs; - CCState RetCCInfo(CallConv, IsVarArg, MF, TM, RetLocs, *DAG.getContext()); + CCState RetCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), RetLocs, + *DAG.getContext()); RetCCInfo.AnalyzeReturn(Outs, RetCC_SystemZ); // Quick exit for void returns @@ -1786,8 +1797,8 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node, const GlobalValue *GV = Node->getGlobal(); int64_t Offset = Node->getOffset(); EVT PtrVT = getPointerTy(); - Reloc::Model RM = TM.getRelocationModel(); - CodeModel::Model CM = TM.getCodeModel(); + Reloc::Model RM = DAG.getTarget().getRelocationModel(); + CodeModel::Model CM = DAG.getTarget().getCodeModel(); SDValue Result; if (Subtarget.isPC32DBLSymbol(GV, RM, CM)) { @@ -1824,7 +1835,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SDLoc DL(Node); const GlobalValue *GV = Node->getGlobal(); EVT PtrVT = getPointerTy(); - TLSModel::Model model = TM.getTLSModel(GV); + TLSModel::Model model = DAG.getTarget().getTLSModel(GV); if (model != TLSModel::LocalExec) llvm_unreachable("only local-exec TLS mode supported"); @@ -2287,9 +2298,9 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op, // Use an addition if the operand is constant and either LAA(G) is // available or the negative value is in the range of A(G)FHI. int64_t Value = (-Op2->getAPIntValue()).getSExtValue(); - if (isInt<32>(Value) || TM.getSubtargetImpl()->hasInterlockedAccess1()) + if (isInt<32>(Value) || Subtarget.hasInterlockedAccess1()) NegSrc2 = DAG.getConstant(Value, MemVT); - } else if (TM.getSubtargetImpl()->hasInterlockedAccess1()) + } else if (Subtarget.hasInterlockedAccess1()) // Use LAA(G) if available. NegSrc2 = DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, MemVT), Src2); @@ -2602,7 +2613,8 @@ static unsigned forceReg(MachineInstr *MI, MachineOperand &Base, MachineBasicBlock * SystemZTargetLowering::emitSelect(MachineInstr *MI, MachineBasicBlock *MBB) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); + const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>( + MBB->getParent()->getTarget().getInstrInfo()); unsigned DestReg = MI->getOperand(0).getReg(); unsigned TrueReg = MI->getOperand(1).getReg(); @@ -2650,7 +2662,8 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI, MachineBasicBlock *MBB, unsigned StoreOpcode, unsigned STOCOpcode, bool Invert) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); + const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>( + MBB->getParent()->getTarget().getInstrInfo()); unsigned SrcReg = MI->getOperand(0).getReg(); MachineOperand Base = MI->getOperand(1); @@ -2665,7 +2678,7 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI, // Use STOCOpcode if possible. We could use different store patterns in // order to avoid matching the index register, but the performance trade-offs // might be more complicated in that case. 
- if (STOCOpcode && !IndexReg && TM.getSubtargetImpl()->hasLoadStoreOnCond()) { + if (STOCOpcode && !IndexReg && Subtarget.hasLoadStoreOnCond()) { if (Invert) CCMask ^= CCValid; BuildMI(*MBB, MI, DL, TII->get(STOCOpcode)) @@ -2717,8 +2730,9 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI, unsigned BinOpcode, unsigned BitSize, bool Invert) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); MachineFunction &MF = *MBB->getParent(); + const SystemZInstrInfo *TII = + static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); bool IsSubWord = (BitSize < 32); @@ -2840,8 +2854,9 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI, unsigned CompareOpcode, unsigned KeepOldMask, unsigned BitSize) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); MachineFunction &MF = *MBB->getParent(); + const SystemZInstrInfo *TII = + static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); bool IsSubWord = (BitSize < 32); @@ -2951,8 +2966,9 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI, MachineBasicBlock * SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI, MachineBasicBlock *MBB) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); MachineFunction &MF = *MBB->getParent(); + const SystemZInstrInfo *TII = + static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); // Extract the operands. Base can be a register or a frame index. @@ -3067,8 +3083,9 @@ MachineBasicBlock * SystemZTargetLowering::emitExt128(MachineInstr *MI, MachineBasicBlock *MBB, bool ClearEven, unsigned SubReg) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); MachineFunction &MF = *MBB->getParent(); + const SystemZInstrInfo *TII = + static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI->getDebugLoc(); @@ -3098,8 +3115,9 @@ MachineBasicBlock * SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI, MachineBasicBlock *MBB, unsigned Opcode) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); MachineFunction &MF = *MBB->getParent(); + const SystemZInstrInfo *TII = + static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI->getDebugLoc(); @@ -3267,8 +3285,9 @@ MachineBasicBlock * SystemZTargetLowering::emitStringWrapper(MachineInstr *MI, MachineBasicBlock *MBB, unsigned Opcode) const { - const SystemZInstrInfo *TII = TM.getInstrInfo(); MachineFunction &MF = *MBB->getParent(); + const SystemZInstrInfo *TII = + static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); DebugLoc DL = MI->getDebugLoc(); diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index bceb25e036e7..e21b0501933f 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -198,7 +198,7 @@ class SystemZTargetMachine; class SystemZTargetLowering : public TargetLowering { public: - explicit SystemZTargetLowering(SystemZTargetMachine &TM); + explicit SystemZTargetLowering(const TargetMachine &TM); // Override TargetLowering. MVT getScalarShiftAmountTy(EVT LHSTy) const override { @@ -249,7 +249,6 @@ public: private: const SystemZSubtarget &Subtarget; - const SystemZTargetMachine &TM; // Implement LowerOperation for individual opcodes. 
SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index a1e782cdfd77..e8841e131324 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -133,6 +133,13 @@ def LEDBR : UnaryRRE<"ledb", 0xB344, fround, FP32, FP64>; def LEXBR : UnaryRRE<"lexb", 0xB346, null_frag, FP128, FP128>; def LDXBR : UnaryRRE<"ldxb", 0xB345, null_frag, FP128, FP128>; +def LEDBRA : UnaryRRF4<"ledbra", 0xB344, FP32, FP64>, + Requires<[FeatureFPExtension]>; +def LEXBRA : UnaryRRF4<"lexbra", 0xB346, FP128, FP128>, + Requires<[FeatureFPExtension]>; +def LDXBRA : UnaryRRF4<"ldxbra", 0xB345, FP128, FP128>, + Requires<[FeatureFPExtension]>; + def : Pat<(f32 (fround FP128:$src)), (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hh32)>; def : Pat<(f64 (fround FP128:$src)), diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index add675a22ccc..9f59a1c8e7e3 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -511,34 +511,24 @@ class InstSS<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern> // to store. Other stored registers are added as implicit uses. // // Unary: -// One register output operand and one input operand. The input -// operand may be a register, immediate or memory. +// One register output operand and one input operand. // // Binary: -// One register output operand and two input operands. The first -// input operand is always a register and the second may be a register, -// immediate or memory. -// -// Shift: -// One register output operand and two input operands. The first -// input operand is a register and the second has the same form as -// an address (although it isn't actually used to address memory). +// One register output operand and two input operands. // // Compare: -// Two input operands. The first operand is always a register, -// the second may be a register, immediate or memory. +// Two input operands and an implicit CC output operand. // // Ternary: -// One register output operand and three register input operands. +// One register output operand and three input operands. // // LoadAndOp: -// One output operand and two input operands. The first input operand -// is a register and the second is an address. +// One output operand and two input operands, one of which is an address. +// The instruction both reads from and writes to the address. // // CmpSwap: -// One output operand and three input operands. The first two -// operands are registers and the third is an address. The instruction -// both reads from and writes to the address. +// One output operand and three input operands, one of which is an address. +// The instruction both reads from and writes to the address. // // RotateSelect: // One output operand and five input operands. 
The first two operands @@ -691,7 +681,7 @@ class CondStoreRSY<string mnemonic, bits<16> opcode, class AsmCondStoreRSY<string mnemonic, bits<16> opcode, RegisterOperand cls, bits<5> bytes, AddressingMode mode = bdaddr20only> - : InstRSY<opcode, (outs), (ins cls:$R1, mode:$BD2, uimm8zx4:$R3), + : InstRSY<opcode, (outs), (ins cls:$R1, mode:$BD2, imm32zx4:$R3), mnemonic#"\t$R1, $BD2, $R3", []>, Requires<[FeatureLoadStoreOnCond]> { let mayStore = 1; @@ -730,7 +720,7 @@ class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator, class UnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2> - : InstRRF<opcode, (outs cls1:$R1), (ins uimm8zx4:$R3, cls2:$R2), + : InstRRF<opcode, (outs cls1:$R1), (ins imm32zx4:$R3, cls2:$R2), mnemonic#"r\t$R1, $R3, $R2", []> { let OpKey = mnemonic ## cls1; let OpType = "reg"; @@ -739,7 +729,7 @@ class UnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1, class UnaryRRF4<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2> - : InstRRF<opcode, (outs cls1:$R1), (ins uimm8zx4:$R3, cls2:$R2, uimm8zx4:$R4), + : InstRRF<opcode, (outs cls1:$R1), (ins imm32zx4:$R3, cls2:$R2, imm32zx4:$R4), mnemonic#"\t$R1, $R3, $R2, $R4", []>; // These instructions are generated by if conversion. The old value of R1 @@ -757,7 +747,7 @@ class CondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1, // mask is the third operand rather than being part of the mnemonic. class AsmCondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2> - : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2, uimm8zx4:$R3), + : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2, imm32zx4:$R3), mnemonic#"r\t$R1, $R2, $R3", []>, Requires<[FeatureLoadStoreOnCond]> { let Constraints = "$R1 = $R1src"; @@ -823,7 +813,7 @@ class CondUnaryRSY<string mnemonic, bits<16> opcode, class AsmCondUnaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls, bits<5> bytes, AddressingMode mode = bdaddr20only> - : InstRSY<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$BD2, uimm8zx4:$R3), + : InstRSY<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$BD2, imm32zx4:$R3), mnemonic#"\t$R1, $BD2, $R3", []>, Requires<[FeatureLoadStoreOnCond]> { let mayLoad = 1; @@ -993,6 +983,33 @@ class BinaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator, let DisableEncoding = "$R1src"; } +class BinaryRS<string mnemonic, bits<8> opcode, SDPatternOperator operator, + RegisterOperand cls> + : InstRS<opcode, (outs cls:$R1), (ins cls:$R1src, shift12only:$BD2), + mnemonic#"\t$R1, $BD2", + [(set cls:$R1, (operator cls:$R1src, shift12only:$BD2))]> { + let R3 = 0; + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; +} + +class BinaryRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator, + RegisterOperand cls> + : InstRSY<opcode, (outs cls:$R1), (ins cls:$R3, shift20only:$BD2), + mnemonic#"\t$R1, $R3, $BD2", + [(set cls:$R1, (operator cls:$R3, shift20only:$BD2))]>; + +multiclass BinaryRSAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2, + SDPatternOperator operator, RegisterOperand cls> { + let NumOpsKey = mnemonic in { + let NumOpsValue = "3" in + def K : BinaryRSY<mnemonic##"k", opcode2, null_frag, cls>, + Requires<[FeatureDistinctOps]>; + let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in + def "" : BinaryRS<mnemonic, opcode1, operator, cls>; + } +} + class BinaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator, RegisterOperand cls, 
SDPatternOperator load, bits<5> bytes, AddressingMode mode = bdxaddr12only> @@ -1077,33 +1094,6 @@ multiclass BinarySIPair<string mnemonic, bits<8> siOpcode, } } -class ShiftRS<string mnemonic, bits<8> opcode, SDPatternOperator operator, - RegisterOperand cls> - : InstRS<opcode, (outs cls:$R1), (ins cls:$R1src, shift12only:$BD2), - mnemonic#"\t$R1, $BD2", - [(set cls:$R1, (operator cls:$R1src, shift12only:$BD2))]> { - let R3 = 0; - let Constraints = "$R1 = $R1src"; - let DisableEncoding = "$R1src"; -} - -class ShiftRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator, - RegisterOperand cls> - : InstRSY<opcode, (outs cls:$R1), (ins cls:$R3, shift20only:$BD2), - mnemonic#"\t$R1, $R3, $BD2", - [(set cls:$R1, (operator cls:$R3, shift20only:$BD2))]>; - -multiclass ShiftRSAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2, - SDPatternOperator operator, RegisterOperand cls> { - let NumOpsKey = mnemonic in { - let NumOpsValue = "3" in - def K : ShiftRSY<mnemonic##"k", opcode2, null_frag, cls>, - Requires<[FeatureDistinctOps]>; - let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in - def "" : ShiftRS<mnemonic, opcode1, operator, cls>; - } -} - class CompareRR<string mnemonic, bits<8> opcode, SDPatternOperator operator, RegisterOperand cls1, RegisterOperand cls2> : InstRR<opcode, (outs), (ins cls1:$R1, cls2:$R2), @@ -1315,22 +1305,23 @@ multiclass CmpSwapRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode, class RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2> : InstRIEf<opcode, (outs cls1:$R1), - (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5), + (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4, + imm32zx6:$I5), mnemonic#"\t$R1, $R2, $I3, $I4, $I5", []> { let Constraints = "$R1 = $R1src"; let DisableEncoding = "$R1src"; } class PrefetchRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator> - : InstRXY<opcode, (outs), (ins uimm8zx4:$R1, bdxaddr20only:$XBD2), + : InstRXY<opcode, (outs), (ins imm32zx4:$R1, bdxaddr20only:$XBD2), mnemonic##"\t$R1, $XBD2", - [(operator uimm8zx4:$R1, bdxaddr20only:$XBD2)]>; + [(operator imm32zx4:$R1, bdxaddr20only:$XBD2)]>; class PrefetchRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator> - : InstRIL<opcode, (outs), (ins uimm8zx4:$R1, pcrel32:$I2), + : InstRIL<opcode, (outs), (ins imm32zx4:$R1, pcrel32:$I2), mnemonic##"\t$R1, $I2", - [(operator uimm8zx4:$R1, pcrel32:$I2)]> { + [(operator imm32zx4:$R1, pcrel32:$I2)]> { // We want PC-relative addresses to be tried ahead of BD and BDX addresses. // However, BDXs have two extra operands and are therefore 6 units more // complex. @@ -1450,7 +1441,8 @@ class StoreRXYPseudo<SDPatternOperator operator, RegisterOperand cls, // of registers. class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2> : Pseudo<(outs cls1:$R1), - (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5), + (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4, + imm32zx6:$I5), []> { let Constraints = "$R1 = $R1src"; let DisableEncoding = "$R1src"; @@ -1460,9 +1452,9 @@ class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2> // the value of the PSW's 2-bit condition code field. 
class SelectWrapper<RegisterOperand cls> : Pseudo<(outs cls:$dst), - (ins cls:$src1, cls:$src2, uimm8zx4:$valid, uimm8zx4:$cc), + (ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc), [(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2, - uimm8zx4:$valid, uimm8zx4:$cc))]> { + imm32zx4:$valid, imm32zx4:$cc))]> { let usesCustomInserter = 1; // Although the instructions used by these nodes do not in themselves // change CC, the insertion requires new blocks, and CC cannot be live @@ -1476,14 +1468,14 @@ multiclass CondStores<RegisterOperand cls, SDPatternOperator store, SDPatternOperator load, AddressingMode mode> { let Defs = [CC], Uses = [CC], usesCustomInserter = 1 in { def "" : Pseudo<(outs), - (ins cls:$new, mode:$addr, uimm8zx4:$valid, uimm8zx4:$cc), + (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc), [(store (z_select_ccmask cls:$new, (load mode:$addr), - uimm8zx4:$valid, uimm8zx4:$cc), + imm32zx4:$valid, imm32zx4:$cc), mode:$addr)]>; def Inv : Pseudo<(outs), - (ins cls:$new, mode:$addr, uimm8zx4:$valid, uimm8zx4:$cc), + (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc), [(store (z_select_ccmask (load mode:$addr), cls:$new, - uimm8zx4:$valid, uimm8zx4:$cc), + imm32zx4:$valid, imm32zx4:$cc), mode:$addr)]>; } } @@ -1611,6 +1603,7 @@ class CompareAliasRI<SDPatternOperator operator, RegisterOperand cls, // An alias of a RotateSelectRIEf, but with different register sizes. class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2> : Alias<6, (outs cls1:$R1), - (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5), []> { + (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4, + imm32zx6:$I5), []> { let Constraints = "$R1 = $R1src"; } diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 6a18b2dea9ea..f58ab474fbbc 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -40,9 +40,9 @@ static bool isHighReg(unsigned int Reg) { // Pin the vtable to this file. void SystemZInstrInfo::anchor() {} -SystemZInstrInfo::SystemZInstrInfo(SystemZTargetMachine &tm) +SystemZInstrInfo::SystemZInstrInfo(SystemZSubtarget &sti) : SystemZGenInstrInfo(SystemZ::ADJCALLSTACKDOWN, SystemZ::ADJCALLSTACKUP), - RI(tm), TM(tm) { + RI(), STI(sti) { } // MI is a 128-bit load or store. 
Split it into two 64-bit loads or stores, @@ -488,7 +488,7 @@ SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare, bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0; if (Value == 0 && !IsLogical && - removeIPMBasedCompare(Compare, SrcReg, MRI, TM.getRegisterInfo())) + removeIPMBasedCompare(Compare, SrcReg, MRI, &RI)) return true; return false; } @@ -505,7 +505,7 @@ static unsigned getConditionalMove(unsigned Opcode) { bool SystemZInstrInfo::isPredicable(MachineInstr *MI) const { unsigned Opcode = MI->getOpcode(); - if (TM.getSubtargetImpl()->hasLoadStoreOnCond() && + if (STI.hasLoadStoreOnCond() && getConditionalMove(Opcode)) return true; return false; @@ -537,7 +537,7 @@ PredicateInstruction(MachineInstr *MI, unsigned CCMask = Pred[1].getImm(); assert(CCMask > 0 && CCMask < 15 && "Invalid predicate"); unsigned Opcode = MI->getOpcode(); - if (TM.getSubtargetImpl()->hasLoadStoreOnCond()) { + if (STI.hasLoadStoreOnCond()) { if (unsigned CondOpcode = getConditionalMove(Opcode)) { MI->setDesc(get(CondOpcode)); MachineInstrBuilder(*MI->getParent()->getParent(), MI) @@ -685,7 +685,7 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // We prefer to keep the two-operand form where possible both // because it tends to be shorter and because some instructions // have memory forms that can be used during spilling. - if (TM.getSubtargetImpl()->hasDistinctOps()) { + if (STI.hasDistinctOps()) { MachineOperand &Dest = MI->getOperand(0); MachineOperand &Src = MI->getOperand(1); unsigned DestReg = Dest.getReg(); diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h index 09aee5d20293..83009cb8d426 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/lib/Target/SystemZ/SystemZInstrInfo.h @@ -110,9 +110,10 @@ struct Branch { }; } // end namespace SystemZII +class SystemZSubtarget; class SystemZInstrInfo : public SystemZGenInstrInfo { const SystemZRegisterInfo RI; - SystemZTargetMachine &TM; + SystemZSubtarget &STI; void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const; void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const; @@ -130,7 +131,7 @@ class SystemZInstrInfo : public SystemZGenInstrInfo { virtual void anchor(); public: - explicit SystemZInstrInfo(SystemZTargetMachine &TM); + explicit SystemZInstrInfo(SystemZSubtarget &STI); // Override TargetInstrInfo. 
unsigned isLoadFromStackSlot(const MachineInstr *MI, diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index e70df92ffe81..f4951ad8e0ac 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -63,11 +63,11 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in { def BRCL : InstRIL<0xC04, (outs), (ins cond4:$valid, cond4:$R1, brtarget32:$I2), "jg$R1\t$I2", []>; } - def AsmBRC : InstRI<0xA74, (outs), (ins uimm8zx4:$R1, brtarget16:$I2), + def AsmBRC : InstRI<0xA74, (outs), (ins imm32zx4:$R1, brtarget16:$I2), "brc\t$R1, $I2", []>; - def AsmBRCL : InstRIL<0xC04, (outs), (ins uimm8zx4:$R1, brtarget32:$I2), + def AsmBRCL : InstRIL<0xC04, (outs), (ins imm32zx4:$R1, brtarget32:$I2), "brcl\t$R1, $I2", []>; - def AsmBCR : InstRR<0x07, (outs), (ins uimm8zx4:$R1, GR64:$R2), + def AsmBCR : InstRR<0x07, (outs), (ins imm32zx4:$R1, GR64:$R2), "bcr\t$R1, $R2", []>; } @@ -109,7 +109,7 @@ multiclass CompareBranches<Operand ccmask, string pos1, string pos2> { } let isCodeGenOnly = 1 in defm C : CompareBranches<cond4, "$M3", "">; -defm AsmC : CompareBranches<uimm8zx4, "", "$M3, ">; +defm AsmC : CompareBranches<imm32zx4, "", "$M3, ">; // Define AsmParser mnemonics for each general condition-code mask // (integer or floating-point) @@ -233,9 +233,7 @@ defm CondStore64 : CondStores<GR64, nonvolatile_store, // Call instructions //===----------------------------------------------------------------------===// -// The definitions here are for the call-clobbered registers. -let isCall = 1, Defs = [R0D, R1D, R2D, R3D, R4D, R5D, R14D, - F0D, F1D, F2D, F3D, F4D, F5D, F6D, F7D, CC] in { +let isCall = 1, Defs = [R14D, CC] in { def CallBRASL : Alias<6, (outs), (ins pcrel32:$I2, variable_ops), [(z_call pcrel32:$I2)]>; def CallBASR : Alias<2, (outs), (ins ADDR64:$R2, variable_ops), @@ -855,7 +853,7 @@ let Defs = [CC] in { } // AND to memory - defm NI : BinarySIPair<"ni", 0x94, 0xEB54, null_frag, uimm8>; + defm NI : BinarySIPair<"ni", 0x94, 0xEB54, null_frag, imm32zx8>; // Block AND. let mayLoad = 1, mayStore = 1 in @@ -912,7 +910,7 @@ let Defs = [CC] in { } // OR to memory - defm OI : BinarySIPair<"oi", 0x96, 0xEB56, null_frag, uimm8>; + defm OI : BinarySIPair<"oi", 0x96, 0xEB56, null_frag, imm32zx8>; // Block OR. let mayLoad = 1, mayStore = 1 in @@ -952,7 +950,7 @@ let Defs = [CC] in { } // XOR to memory - defm XI : BinarySIPair<"xi", 0x97, 0xEB57, null_frag, uimm8>; + defm XI : BinarySIPair<"xi", 0x97, 0xEB57, null_frag, imm32zx8>; // Block XOR. let mayLoad = 1, mayStore = 1 in @@ -1015,26 +1013,26 @@ def DLG : BinaryRXY<"dlg", 0xE387, z_udivrem64, GR128, load, 8>; // Shift left. let neverHasSideEffects = 1 in { - defm SLL : ShiftRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>; - def SLLG : ShiftRSY<"sllg", 0xEB0D, shl, GR64>; + defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>; + def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>; } // Logical shift right. let neverHasSideEffects = 1 in { - defm SRL : ShiftRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>; - def SRLG : ShiftRSY<"srlg", 0xEB0C, srl, GR64>; + defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>; + def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>; } // Arithmetic shift right. let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in { - defm SRA : ShiftRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>; - def SRAG : ShiftRSY<"srag", 0xEB0A, sra, GR64>; + defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>; + def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>; } // Rotate left. 
let neverHasSideEffects = 1 in { - def RLL : ShiftRSY<"rll", 0xEB1D, rotl, GR32>; - def RLLG : ShiftRSY<"rllg", 0xEB1C, rotl, GR64>; + def RLL : BinaryRSY<"rll", 0xEB1D, rotl, GR32>; + def RLLG : BinaryRSY<"rllg", 0xEB1C, rotl, GR64>; } // Rotate second operand left and inserted selected bits into first operand. @@ -1403,15 +1401,15 @@ def : Pat<(sub GR64:$src1, (azextloadi32 bdxaddr20only:$addr)), // Optimize sign-extended 1/0 selects to -1/0 selects. This is important // for vector legalization. -def : Pat<(sra (shl (i32 (z_select_ccmask 1, 0, uimm8zx4:$valid, uimm8zx4:$cc)), +def : Pat<(sra (shl (i32 (z_select_ccmask 1, 0, imm32zx4:$valid, imm32zx4:$cc)), (i32 31)), (i32 31)), - (Select32 (LHI -1), (LHI 0), uimm8zx4:$valid, uimm8zx4:$cc)>; -def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, uimm8zx4:$valid, - uimm8zx4:$cc)))), + (Select32 (LHI -1), (LHI 0), imm32zx4:$valid, imm32zx4:$cc)>; +def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, imm32zx4:$valid, + imm32zx4:$cc)))), (i32 63)), (i32 63)), - (Select64 (LGHI -1), (LGHI 0), uimm8zx4:$valid, uimm8zx4:$cc)>; + (Select64 (LGHI -1), (LGHI 0), imm32zx4:$valid, imm32zx4:$cc)>; // Peepholes for turning scalar operations into block operations. defm : BlockLoadStore<anyextloadi8, i32, MVCSequence, NCSequence, OCSequence, diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td index 3ad146c57d92..7be81dca727b 100644 --- a/lib/Target/SystemZ/SystemZOperands.td +++ b/lib/Target/SystemZ/SystemZOperands.td @@ -202,21 +202,6 @@ def S32Imm : ImmediateAsmOperand<"S32Imm">; def U32Imm : ImmediateAsmOperand<"U32Imm">; //===----------------------------------------------------------------------===// -// 8-bit immediates -//===----------------------------------------------------------------------===// - -def uimm8zx4 : Immediate<i8, [{ - return isUInt<4>(N->getZExtValue()); -}], NOOP_SDNodeXForm, "U4Imm">; - -def uimm8zx6 : Immediate<i8, [{ - return isUInt<6>(N->getZExtValue()); -}], NOOP_SDNodeXForm, "U6Imm">; - -def simm8 : Immediate<i8, [{}], SIMM8, "S8Imm">; -def uimm8 : Immediate<i8, [{}], UIMM8, "U8Imm">; - -//===----------------------------------------------------------------------===// // i32 immediates //===----------------------------------------------------------------------===// @@ -241,6 +226,14 @@ def imm32lh16c : Immediate<i32, [{ }], LH16, "U16Imm">; // Short immediates +def imm32zx4 : Immediate<i32, [{ + return isUInt<4>(N->getZExtValue()); +}], NOOP_SDNodeXForm, "U4Imm">; + +def imm32zx6 : Immediate<i32, [{ + return isUInt<6>(N->getZExtValue()); +}], NOOP_SDNodeXForm, "U6Imm">; + def imm32sx8 : Immediate<i32, [{ return isInt<8>(N->getSExtValue()); }], SIMM8, "S8Imm">; @@ -470,13 +463,13 @@ def AccessReg : AsmOperandClass { let Name = "AccessReg"; let ParserMethod = "parseAccessReg"; } -def access_reg : Immediate<i8, [{ return N->getZExtValue() < 16; }], +def access_reg : Immediate<i32, [{ return N->getZExtValue() < 16; }], NOOP_SDNodeXForm, "AccessReg"> { let ParserMatchClass = AccessReg; } // A 4-bit condition-code mask. 
-def cond4 : PatLeaf<(i8 imm), [{ return (N->getZExtValue() < 16); }]>, - Operand<i8> { +def cond4 : PatLeaf<(i32 imm), [{ return (N->getZExtValue() < 16); }]>, + Operand<i32> { let PrintMethod = "printCond4Operand"; } diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index a3919618f8f2..c70e662db427 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -19,14 +19,14 @@ def SDT_ZICmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>; def SDT_ZBRCCMask : SDTypeProfile<0, 3, - [SDTCisVT<0, i8>, - SDTCisVT<1, i8>, + [SDTCisVT<0, i32>, + SDTCisVT<1, i32>, SDTCisVT<2, OtherVT>]>; def SDT_ZSelectCCMask : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, - SDTCisVT<3, i8>, - SDTCisVT<4, i8>]>; + SDTCisVT<3, i32>, + SDTCisVT<4, i32>]>; def SDT_ZWrapPtr : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; @@ -37,7 +37,7 @@ def SDT_ZWrapOffset : SDTypeProfile<1, 2, def SDT_ZAdjDynAlloc : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>; def SDT_ZExtractAccess : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, - SDTCisVT<1, i8>]>; + SDTCisVT<1, i32>]>; def SDT_ZGR128Binary32 : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, SDTCisVT<1, untyped>, @@ -77,7 +77,7 @@ def SDT_ZString : SDTypeProfile<1, 3, SDTCisVT<3, i32>]>; def SDT_ZI32Intrinsic : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>; def SDT_ZPrefetch : SDTypeProfile<0, 2, - [SDTCisVT<0, i8>, + [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td index c0f94ecbe2c9..e307f8a888ee 100644 --- a/lib/Target/SystemZ/SystemZPatterns.td +++ b/lib/Target/SystemZ/SystemZPatterns.td @@ -101,15 +101,15 @@ multiclass CondStores64<Instruction insn, Instruction insninv, SDPatternOperator store, SDPatternOperator load, AddressingMode mode> { def : Pat<(store (z_select_ccmask GR64:$new, (load mode:$addr), - uimm8zx4:$valid, uimm8zx4:$cc), + imm32zx4:$valid, imm32zx4:$cc), mode:$addr), (insn (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr, - uimm8zx4:$valid, uimm8zx4:$cc)>; + imm32zx4:$valid, imm32zx4:$cc)>; def : Pat<(store (z_select_ccmask (load mode:$addr), GR64:$new, - uimm8zx4:$valid, uimm8zx4:$cc), + imm32zx4:$valid, imm32zx4:$cc), mode:$addr), (insninv (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr, - uimm8zx4:$valid, uimm8zx4:$cc)>; + imm32zx4:$valid, imm32zx4:$cc)>; } // Try to use MVC instruction INSN for a load of type LOAD followed by a store diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp index a04d703d09fa..f03bcc412d51 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -7,31 +7,29 @@ // //===----------------------------------------------------------------------===// +#include "SystemZInstrInfo.h" #include "SystemZRegisterInfo.h" -#include "SystemZTargetMachine.h" +#include "SystemZSubtarget.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetFrameLowering.h" using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "SystemZGenRegisterInfo.inc" -SystemZRegisterInfo::SystemZRegisterInfo(SystemZTargetMachine &tm) - : SystemZGenRegisterInfo(SystemZ::R14D), TM(tm) {} +SystemZRegisterInfo::SystemZRegisterInfo() + : SystemZGenRegisterInfo(SystemZ::R14D) {} -const MCPhysReg* +const MCPhysReg * 
SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - static const MCPhysReg CalleeSavedRegs[] = { - SystemZ::R6D, SystemZ::R7D, SystemZ::R8D, SystemZ::R9D, - SystemZ::R10D, SystemZ::R11D, SystemZ::R12D, SystemZ::R13D, - SystemZ::R14D, SystemZ::R15D, - SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D, - SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D, - 0 - }; - - return CalleeSavedRegs; + return CSR_SystemZ_SaveList; +} + +const uint32_t * +SystemZRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { + return CSR_SystemZ_RegMask; } BitVector @@ -63,7 +61,8 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, MachineBasicBlock &MBB = *MI->getParent(); MachineFunction &MF = *MBB.getParent(); - auto *TII = static_cast<const SystemZInstrInfo*>(TM.getInstrInfo()); + auto *TII = + static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo()); const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); DebugLoc DL = MI->getDebugLoc(); diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h index e236f712e7d8..9bffa467a15d 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.h +++ b/lib/Target/SystemZ/SystemZRegisterInfo.h @@ -29,15 +29,9 @@ inline unsigned odd128(bool Is32bit) { } } // end namespace SystemZ -class SystemZSubtarget; -class SystemZInstrInfo; - struct SystemZRegisterInfo : public SystemZGenRegisterInfo { -private: - SystemZTargetMachine &TM; - public: - SystemZRegisterInfo(SystemZTargetMachine &tm); + SystemZRegisterInfo(); // Override TargetRegisterInfo.h. bool requiresRegisterScavenging(const MachineFunction &MF) const override { @@ -51,6 +45,7 @@ public: } const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; + const uint32_t *getCallPreservedMask(CallingConv::ID CC) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td index 93d7c8375b3d..47ac20dae78a 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -119,6 +119,29 @@ defm ADDR128 : SystemZRegClass<"ADDR128", untyped, 128, (sub GR128Bit, R0Q)>; // Floating-point registers //===----------------------------------------------------------------------===// +// Maps FPR register numbers to their DWARF encoding. 
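The DwarfMapping helpers that follow replace the old !add(I, 16) formula in the foreach further down, because the s390x ABI interleaves FPR DWARF numbers rather than counting straight through the register file. Flattened into a C++ table (derived directly from the sixteen defs below):

    // Indexed by FPR number 0-15; values are the DWARF register numbers.
    static const unsigned FPRDwarfNum[16] = {16, 20, 17, 21, 18, 22, 19, 23,
                                             24, 28, 25, 29, 26, 30, 27, 31};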
+class DwarfMapping<int id> { int Id = id; } + +def F0Dwarf : DwarfMapping<16>; +def F2Dwarf : DwarfMapping<17>; +def F4Dwarf : DwarfMapping<18>; +def F6Dwarf : DwarfMapping<19>; + +def F1Dwarf : DwarfMapping<20>; +def F3Dwarf : DwarfMapping<21>; +def F5Dwarf : DwarfMapping<22>; +def F7Dwarf : DwarfMapping<23>; + +def F8Dwarf : DwarfMapping<24>; +def F10Dwarf : DwarfMapping<25>; +def F12Dwarf : DwarfMapping<26>; +def F14Dwarf : DwarfMapping<27>; + +def F9Dwarf : DwarfMapping<28>; +def F11Dwarf : DwarfMapping<29>; +def F13Dwarf : DwarfMapping<30>; +def F15Dwarf : DwarfMapping<31>; + // Lower 32 bits of one of the 16 64-bit floating-point registers class FPR32<bits<16> num, string n> : SystemZReg<n> { let HWEncoding = num; @@ -142,7 +165,7 @@ class FPR128<bits<16> num, string n, FPR64 low, FPR64 high> foreach I = 0-15 in { def F#I#S : FPR32<I, "f"#I>; def F#I#D : FPR64<I, "f"#I, !cast<FPR32>("F"#I#"S")>, - DwarfRegNum<[!add(I, 16)]>; + DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>; } foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in { diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index 97abee303b09..a3cba64b9ed2 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -18,10 +18,8 @@ using namespace llvm; #define DEBUG_TYPE "systemz-selectiondag-info" -SystemZSelectionDAGInfo:: -SystemZSelectionDAGInfo(const SystemZTargetMachine &TM) - : TargetSelectionDAGInfo(TM) { -} +SystemZSelectionDAGInfo::SystemZSelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) {} SystemZSelectionDAGInfo::~SystemZSelectionDAGInfo() { } diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h index 79e7fab20c18..e9de146af1d6 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h @@ -22,7 +22,7 @@ class SystemZTargetMachine; class SystemZSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit SystemZSelectionDAGInfo(const SystemZTargetMachine &TM); + explicit SystemZSelectionDAGInfo(const DataLayout &DL); ~SystemZSelectionDAGInfo(); SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain, diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index a011157dcdfc..e160bc86f225 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -20,16 +20,11 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "SystemZGenSubtargetInfo.inc" -// Pin the vtabel to this file. +// Pin the vtable to this file. void SystemZSubtarget::anchor() {} -SystemZSubtarget::SystemZSubtarget(const std::string &TT, - const std::string &CPU, - const std::string &FS) - : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false), - HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false), - HasFastSerialization(false), HasInterlockedAccess1(false), - TargetTriple(TT) { +SystemZSubtarget & +SystemZSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { std::string CPUName = CPU; if (CPUName.empty()) CPUName = "generic"; @@ -37,11 +32,26 @@ SystemZSubtarget::SystemZSubtarget(const std::string &TT, if (CPUName == "generic") CPUName = sys::getHostCPUName(); #endif - // Parse features string. 
ParseSubtargetFeatures(CPUName, FS); + return *this; } +SystemZSubtarget::SystemZSubtarget(const std::string &TT, + const std::string &CPU, + const std::string &FS, + const TargetMachine &TM) + : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false), + HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false), + HasFastSerialization(false), HasInterlockedAccess1(false), + TargetTriple(TT), + // Make sure that global data has at least 16 bits of alignment by + // default, so that we can refer to it using LARL. We don't have any + // special requirements for stack variables though. + DL("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64"), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM), + TSInfo(DL), FrameLowering() {} + // Return true if GV binds locally under reloc model RM. static bool bindsLocally(const GlobalValue *GV, Reloc::Model RM) { // For non-PIC, all symbols bind locally. diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h index ffca2d8113a8..4e8c710bdefd 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.h +++ b/lib/Target/SystemZ/SystemZSubtarget.h @@ -14,6 +14,12 @@ #ifndef SYSTEMZSUBTARGET_H #define SYSTEMZSUBTARGET_H +#include "SystemZFrameLowering.h" +#include "SystemZISelLowering.h" +#include "SystemZInstrInfo.h" +#include "SystemZRegisterInfo.h" +#include "SystemZSelectionDAGInfo.h" +#include "llvm/IR/DataLayout.h" #include "llvm/ADT/Triple.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -37,10 +43,26 @@ protected: private: Triple TargetTriple; - + const DataLayout DL; + SystemZInstrInfo InstrInfo; + SystemZTargetLowering TLInfo; + SystemZSelectionDAGInfo TSInfo; + SystemZFrameLowering FrameLowering; + + SystemZSubtarget &initializeSubtargetDependencies(StringRef CPU, + StringRef FS); public: SystemZSubtarget(const std::string &TT, const std::string &CPU, - const std::string &FS); + const std::string &FS, const TargetMachine &TM); + + const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; } + const SystemZInstrInfo *getInstrInfo() const { return &InstrInfo; } + const DataLayout *getDataLayout() const { return &DL; } + const SystemZRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + const SystemZTargetLowering *getTargetLowering() const { return &TLInfo; } + const TargetSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } // This is important for reducing register pressure in vector code. bool useAA() const override { return true; } diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index 4c9ce29c7e2a..0122e99f8a77 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -22,17 +22,10 @@ extern "C" void LLVMInitializeSystemZTarget() { SystemZTargetMachine::SystemZTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, - CodeModel::Model CM, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS), - // Make sure that global data has at least 16 bits of alignment by default, - // so that we can refer to it using LARL. We don't have any special - // requirements for stack variables though. 
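This comment and the DataLayout string it explains are moving up into the SystemZSubtarget constructor, whose member-init list relies on a deliberate ordering idiom: InstrInfo(initializeSubtargetDependencies(CPU, FS)) runs the feature-string parse before any member that depends on it is constructed, because ctor-initializers execute in member declaration order. A stripped-down sketch of the pattern (hypothetical names; assumes the helper touches no members declared after it):

    #include <string>
    struct MySubtarget;
    struct MyInstrInfo { explicit MyInstrInfo(MySubtarget &) {} };
    struct MySubtarget {
      MySubtarget(const std::string &CPU, const std::string &FS)
          : InstrInfo(initDeps(CPU, FS)) {} // helper runs first
      MySubtarget &initDeps(const std::string &, const std::string &) {
        // stands in for ParseSubtargetFeatures(CPUName, FS)
        return *this;
      }
      MyInstrInfo InstrInfo; // built only after the features are parsed
    };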
- DL("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64"), - InstrInfo(*this), TLInfo(*this), TSInfo(*this), - FrameLowering(*this, Subtarget) { + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } @@ -65,7 +58,8 @@ bool SystemZPassConfig::addInstSelector() { } bool SystemZPassConfig::addPreSched2() { - if (getSystemZTargetMachine().getSubtargetImpl()->hasLoadStoreOnCond()) + if (getOptLevel() != CodeGenOpt::None && + getSystemZTargetMachine().getSubtargetImpl()->hasLoadStoreOnCond()) addPass(&IfConverterID); return true; } diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h index 1db717b7126d..ded07e912443 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/lib/Target/SystemZ/SystemZTargetMachine.h @@ -15,25 +15,15 @@ #ifndef SYSTEMZTARGETMACHINE_H #define SYSTEMZTARGETMACHINE_H -#include "SystemZFrameLowering.h" -#include "SystemZISelLowering.h" -#include "SystemZInstrInfo.h" -#include "SystemZRegisterInfo.h" -#include "SystemZSelectionDAGInfo.h" #include "SystemZSubtarget.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { +class TargetFrameLowering; + class SystemZTargetMachine : public LLVMTargetMachine { SystemZSubtarget Subtarget; - const DataLayout DL; - SystemZInstrInfo InstrInfo; - SystemZTargetLowering TLInfo; - SystemZSelectionDAGInfo TSInfo; - SystemZFrameLowering FrameLowering; public: SystemZTargetMachine(const Target &T, StringRef TT, StringRef CPU, @@ -43,25 +33,25 @@ public: // Override TargetMachine. const TargetFrameLowering *getFrameLowering() const override { - return &FrameLowering; + return getSubtargetImpl()->getFrameLowering(); } const SystemZInstrInfo *getInstrInfo() const override { - return &InstrInfo; + return getSubtargetImpl()->getInstrInfo(); } const SystemZSubtarget *getSubtargetImpl() const override { return &Subtarget; } const DataLayout *getDataLayout() const override { - return &DL; + return getSubtargetImpl()->getDataLayout(); } const SystemZRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); + return getSubtargetImpl()->getRegisterInfo(); } const SystemZTargetLowering *getTargetLowering() const override { - return &TLInfo; + return getSubtargetImpl()->getTargetLowering(); } const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } // Override LLVMTargetMachine diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index 8365f64dc54a..95c8cb66f402 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -88,8 +88,8 @@ CodeModel::Model TargetMachine::getCodeModel() const { } /// Get the IR-specified TLS model for Var. -static TLSModel::Model getSelectedTLSModel(const GlobalVariable *Var) { - switch (Var->getThreadLocalMode()) { +static TLSModel::Model getSelectedTLSModel(const GlobalValue *GV) { + switch (GV->getThreadLocalMode()) { case GlobalVariable::NotThreadLocal: llvm_unreachable("getSelectedTLSModel for non-TLS variable"); break; @@ -127,13 +127,10 @@ TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const { Model = TLSModel::InitialExec; } - const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV); - if (Var) { - // If the user specified a more specific model, use that. 
- TLSModel::Model SelectedModel = getSelectedTLSModel(Var); - if (SelectedModel > Model) - return SelectedModel; - } + // If the user specified a more specific model, use that. + TLSModel::Model SelectedModel = getSelectedTLSModel(GV); + if (SelectedModel > Model) + return SelectedModel; return Model; } diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp index 3ca13dac035d..87b6b661d318 100644 --- a/lib/Target/TargetSubtargetInfo.cpp +++ b/lib/Target/TargetSubtargetInfo.cpp @@ -39,10 +39,23 @@ bool TargetSubtargetInfo::useMachineScheduler() const { return enableMachineScheduler(); } +bool TargetSubtargetInfo::enableAtomicExpandLoadLinked() const { + return true; +} + bool TargetSubtargetInfo::enableMachineScheduler() const { return false; } +bool TargetSubtargetInfo::enableRALocalReassignment( + CodeGenOpt::Level OptLevel) const { + return true; +} + +bool TargetSubtargetInfo::enablePostMachineScheduler() const { + return false; +} + bool TargetSubtargetInfo::enablePostRAScheduler( CodeGenOpt::Level OptLevel, AntiDepBreakMode& Mode, diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk index 0d0a9ca2a32e..e2c4be7c3c6b 100644 --- a/lib/Target/X86/Android.mk +++ b/lib/Target/X86/Android.mk @@ -12,6 +12,7 @@ x86_codegen_TBLGEN_TABLES := \ x86_codegen_SRC_FILES := \ X86AsmPrinter.cpp \ + X86AtomicExpandPass.cpp \ X86CodeEmitter.cpp \ X86FastISel.cpp \ X86FixupLEAs.cpp \ diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index f3e6b3f4193b..a365f62190d2 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -20,6 +20,7 @@ #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetAsmParser.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/Support/CommandLine.h" @@ -36,8 +37,8 @@ bool IsStackReg(unsigned Reg) { } std::string FuncName(unsigned AccessSize, bool IsWrite) { - return std::string("__sanitizer_sanitize_") + (IsWrite ? "store" : "load") + - (utostr(AccessSize)); + return std::string("__asan_report_") + (IsWrite ? "store" : "load") + + utostr(AccessSize); } class X86AddressSanitizer : public X86AsmInstrumentation { @@ -47,47 +48,55 @@ public: // X86AsmInstrumentation implementation: virtual void InstrumentInstruction( - const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) override { + const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, MCStreamer &Out) override { InstrumentMOV(Inst, Operands, Ctx, MII, Out); } // Should be implemented differently in x86_32 and x86_64 subclasses. 
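The single InstrumentMemOperandImpl hook is split in two below because of how the shadow memory is encoded: one shadow byte covers eight application bytes, so an 8- or 16-byte access is valid exactly when its shadow byte is zero, while 1/2/4-byte accesses need an extra partial-poisoning comparison. Both variants start from the same address translation (a hedged sketch; the offsets are the kShadowOffset constants added further down):

    #include <cstdint>
    // ShadowOffset is 0x20000000 in the 32-bit instrumentation,
    // 0x7fff8000 in the 64-bit one.
    static inline int8_t *shadowFor(uintptr_t Addr, uintptr_t ShadowOffset) {
      return reinterpret_cast<int8_t *>((Addr >> 3) + ShadowOffset);
    }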
- virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize, - bool IsWrite, MCContext &Ctx, - MCStreamer &Out) = 0; + virtual void InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) = 0; + virtual void InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) = 0; - void InstrumentMemOperand(MCParsedAsmOperand *Op, unsigned AccessSize, + void InstrumentMemOperand(MCParsedAsmOperand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, MCStreamer &Out); - void InstrumentMOV(const MCInst &Inst, - SmallVectorImpl<MCParsedAsmOperand *> &Operands, + void InstrumentMOV(const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); void EmitInstruction(MCStreamer &Out, const MCInst &Inst) { Out.EmitInstruction(Inst, STI); } + void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); } + protected: const MCSubtargetInfo &STI; }; void X86AddressSanitizer::InstrumentMemOperand( - MCParsedAsmOperand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCParsedAsmOperand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, MCStreamer &Out) { - assert(Op && Op->isMem() && "Op should be a memory operand."); + assert(Op.isMem() && "Op should be a memory operand."); assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 && "AccessSize should be a power of two, less or equal than 16."); - X86Operand *MemOp = static_cast<X86Operand *>(Op); + X86Operand &MemOp = static_cast<X86Operand &>(Op); // FIXME: get rid of this limitation. - if (IsStackReg(MemOp->getMemBaseReg()) || IsStackReg(MemOp->getMemIndexReg())) + if (IsStackReg(MemOp.getMemBaseReg()) || IsStackReg(MemOp.getMemIndexReg())) return; - InstrumentMemOperandImpl(MemOp, AccessSize, IsWrite, Ctx, Out); + // FIXME: take into account load/store alignment. + if (AccessSize < 8) + InstrumentMemOperandSmallImpl(MemOp, AccessSize, IsWrite, Ctx, Out); + else + InstrumentMemOperandLargeImpl(MemOp, AccessSize, IsWrite, Ctx, Out); } void X86AddressSanitizer::InstrumentMOV( - const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) { + const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, MCStreamer &Out) { // Access size in bytes. 
unsigned AccessSize = 0; @@ -124,107 +133,351 @@ void X86AddressSanitizer::InstrumentMOV( const bool IsWrite = MII.get(Inst.getOpcode()).mayStore(); for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) { - MCParsedAsmOperand *Op = Operands[Ix]; - if (Op && Op->isMem()) + assert(Operands[Ix]); + MCParsedAsmOperand &Op = *Operands[Ix]; + if (Op.isMem()) InstrumentMemOperand(Op, AccessSize, IsWrite, Ctx, Out); } } class X86AddressSanitizer32 : public X86AddressSanitizer { public: + static const long kShadowOffset = 0x20000000; + X86AddressSanitizer32(const MCSubtargetInfo &STI) : X86AddressSanitizer(STI) {} virtual ~X86AddressSanitizer32() {} - virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize, - bool IsWrite, MCContext &Ctx, - MCStreamer &Out) override; + virtual void InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) override; + virtual void InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) override; + + private: + void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize, + bool IsWrite, unsigned AddressReg) { + EmitInstruction(Out, MCInstBuilder(X86::CLD)); + EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); + + EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::ESP) + .addReg(X86::ESP).addImm(-16)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(AddressReg)); + + + const std::string& Fn = FuncName(AccessSize, IsWrite); + MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn)); + const MCSymbolRefExpr *FnExpr = + MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr)); + } }; -void X86AddressSanitizer32::InstrumentMemOperandImpl( - X86Operand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, +void X86AddressSanitizer32::InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, MCStreamer &Out) { - // FIXME: emit .cfi directives for correct stack unwinding. 
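Restated as C++, the check that the 32-bit small-access sequence below spells out instruction by instruction (a hedged model, not code from the patch; asanReport stands in for the __asan_report_* callbacks named by FuncName):

    #include <cstdint>
    void asanReport(uintptr_t Addr); // __asan_report_{load,store}{1,2,4}
    void checkSmall(uintptr_t Addr, unsigned Size) { // Size is 1, 2 or 4
      int8_t Shadow = *reinterpret_cast<int8_t *>((Addr >> 3) + 0x20000000);
      if (Shadow != 0 &&                                      // TEST; je done
          static_cast<int32_t>((Addr & 7) + Size - 1) >= Shadow) // MOVSX; CMP; jl
        asanReport(Addr);
    }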
EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EDX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); + { MCInst Inst; Inst.setOpcode(X86::LEA32r); Inst.addOperand(MCOperand::CreateReg(X86::EAX)); + Op.addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + EmitInstruction( + Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX) + .addReg(X86::ECX).addImm(3)); + + { + MCInst Inst; + Inst.setOpcode(X86::MOV8rm); + Inst.addOperand(MCOperand::CreateReg(X86::CL)); + const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + EmitInstruction(Out, + MCInstBuilder(X86::TEST8rr).addReg(X86::CL).addReg(X86::CL)); + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + + EmitInstruction( + Out, MCInstBuilder(X86::MOV32rr).addReg(X86::EDX).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::EDX) + .addReg(X86::EDX).addImm(7)); + + switch (AccessSize) { + case 1: + break; + case 2: { + MCInst Inst; + Inst.setOpcode(X86::LEA32r); + Inst.addOperand(MCOperand::CreateReg(X86::EDX)); + + const MCExpr *Disp = MCConstantExpr::Create(1, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(0, Disp, X86::EDX, 0, 1, SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); EmitInstruction(Out, Inst); + break; } + case 4: + EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::EDX) + .addReg(X86::EDX).addImm(3)); + break; + default: + assert(false && "Incorrect access size"); + break; + } + + EmitInstruction( + Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::ECX).addReg(X86::CL)); + EmitInstruction( + Out, MCInstBuilder(X86::CMP32rr).addReg(X86::EDX).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr)); + + EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX); + EmitLabel(Out, DoneSym); + + EmitInstruction(Out, MCInstBuilder(X86::POPF32)); + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EDX)); + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); +} + +void X86AddressSanitizer32::InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) { EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); + { - const std::string Func = FuncName(AccessSize, IsWrite); - const MCSymbol *FuncSym = Ctx.GetOrCreateSymbol(StringRef(Func)); - const MCSymbolRefExpr *FuncExpr = - MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FuncExpr)); + MCInst Inst; + Inst.setOpcode(X86::LEA32r); + Inst.addOperand(MCOperand::CreateReg(X86::EAX)); + Op.addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); } - EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); + EmitInstruction( + Out, 
MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX) + .addReg(X86::ECX).addImm(3)); + { + MCInst Inst; + switch (AccessSize) { + case 8: + Inst.setOpcode(X86::CMP8mi); + break; + case 16: + Inst.setOpcode(X86::CMP16mi); + break; + default: + assert(false && "Incorrect access size"); + break; + } + const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + Inst.addOperand(MCOperand::CreateImm(0)); + EmitInstruction(Out, Inst); + } + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + + EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX); + EmitLabel(Out, DoneSym); + + EmitInstruction(Out, MCInstBuilder(X86::POPF32)); + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX)); EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); } class X86AddressSanitizer64 : public X86AddressSanitizer { public: + static const long kShadowOffset = 0x7fff8000; + X86AddressSanitizer64(const MCSubtargetInfo &STI) : X86AddressSanitizer(STI) {} virtual ~X86AddressSanitizer64() {} - virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize, - bool IsWrite, MCContext &Ctx, - MCStreamer &Out) override; -}; + virtual void InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) override; + virtual void InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) override; -void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand *Op, - unsigned AccessSize, - bool IsWrite, - MCContext &Ctx, - MCStreamer &Out) { - // FIXME: emit .cfi directives for correct stack unwinding. - - // Set %rsp below current red zone (128 bytes wide) using LEA instruction to - // preserve flags. 
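The comment being deleted here still describes the new EmitAdjustRSP helper: the x86-64 SysV ABI reserves the 128 bytes below %rsp as a red zone that leaf code may use freely, so the instrumentation must hop below it before pushing anything, and it does so with LEA rather than SUB/ADD because LEA leaves EFLAGS intact until PUSHF64 has saved them. In rough assembly terms (hedged shape, not the exact emission):

    // leaq -128(%rsp), %rsp    <- EmitAdjustRSP(Ctx, Out, -128)
    //   ... pushes, PUSHF64, shadow check, POPF64, pops ...
    // leaq  128(%rsp), %rsp    <- EmitAdjustRSP(Ctx, Out, 128)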
- { +private: + void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) { MCInst Inst; Inst.setOpcode(X86::LEA64r); Inst.addOperand(MCOperand::CreateReg(X86::RSP)); - const MCExpr *Disp = MCConstantExpr::Create(-128, Ctx); + const MCExpr *Disp = MCConstantExpr::Create(Offset, Ctx); std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } + + void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize, + bool IsWrite) { + EmitInstruction(Out, MCInstBuilder(X86::CLD)); + EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); + + EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::RSP) + .addReg(X86::RSP).addImm(-16)); + + const std::string& Fn = FuncName(AccessSize, IsWrite); + MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn)); + const MCSymbolRefExpr *FnExpr = + MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr)); + } +}; + +void X86AddressSanitizer64::InstrumentMemOperandSmallImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) { + EmitAdjustRSP(Ctx, Out, -128); + EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RCX)); EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RDI)); + EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); { MCInst Inst; Inst.setOpcode(X86::LEA64r); Inst.addOperand(MCOperand::CreateReg(X86::RDI)); - Op->addMemOperands(Inst, 5); + Op.addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } + EmitInstruction( + Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RAX).addReg(X86::RDI)); + EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX) + .addReg(X86::RAX).addImm(3)); { - const std::string Func = FuncName(AccessSize, IsWrite); - const MCSymbol *FuncSym = Ctx.GetOrCreateSymbol(StringRef(Func)); - const MCSymbolRefExpr *FuncExpr = - MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FuncExpr)); + MCInst Inst; + Inst.setOpcode(X86::MOV8rm); + Inst.addOperand(MCOperand::CreateReg(X86::AL)); + const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + EmitInstruction(Out, + MCInstBuilder(X86::TEST8rr).addReg(X86::AL).addReg(X86::AL)); + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + + EmitInstruction( + Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EDI)); + EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::ECX) + .addReg(X86::ECX).addImm(7)); + + switch (AccessSize) { + case 1: + break; + case 2: { + MCInst Inst; + Inst.setOpcode(X86::LEA32r); + Inst.addOperand(MCOperand::CreateReg(X86::ECX)); + + const MCExpr *Disp = MCConstantExpr::Create(1, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + break; } + case 4: + EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::ECX) + .addReg(X86::ECX).addImm(3)); + break; + default: + assert(false && "Incorrect access size"); + break; + } + + 
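One non-obvious detail ahead of the MOVSX that follows (a hedged reading of the shadow encoding, not something the patch states): shadow bytes are signed. Values 1-7 mean only that many leading bytes of the 8-byte granule are addressable, while redzone markers are negative, so sign-extending the shadow byte before the CMP/JL makes every negative value fail the signed (addr & 7) + size - 1 < shadow test and fall through to EmitCallAsanReport.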
EmitInstruction( + Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::EAX).addReg(X86::AL)); + EmitInstruction( + Out, MCInstBuilder(X86::CMP32rr).addReg(X86::ECX).addReg(X86::EAX)); + EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr)); + + EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite); + EmitLabel(Out, DoneSym); + + EmitInstruction(Out, MCInstBuilder(X86::POPF64)); EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RDI)); + EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RCX)); + EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX)); + EmitAdjustRSP(Ctx, Out, 128); +} + +void X86AddressSanitizer64::InstrumentMemOperandLargeImpl( + X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out) { + EmitAdjustRSP(Ctx, Out, -128); + EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX)); + EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); - // Restore old %rsp value. { MCInst Inst; Inst.setOpcode(X86::LEA64r); - Inst.addOperand(MCOperand::CreateReg(X86::RSP)); - - const MCExpr *Disp = MCConstantExpr::Create(128, Ctx); + Inst.addOperand(MCOperand::CreateReg(X86::RAX)); + Op.addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX) + .addReg(X86::RAX).addImm(3)); + { + MCInst Inst; + switch (AccessSize) { + case 8: + Inst.setOpcode(X86::CMP8mi); + break; + case 16: + Inst.setOpcode(X86::CMP16mi); + break; + default: + assert(false && "Incorrect access size"); + break; + } + const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); + Inst.addOperand(MCOperand::CreateImm(0)); EmitInstruction(Out, Inst); } + + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + + EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite); + EmitLabel(Out, DoneSym); + + EmitInstruction(Out, MCInstBuilder(X86::POPF64)); + EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX)); + EmitAdjustRSP(Ctx, Out, 128); } } // End anonymous namespace @@ -233,8 +486,8 @@ X86AsmInstrumentation::X86AsmInstrumentation() {} X86AsmInstrumentation::~X86AsmInstrumentation() {} void X86AsmInstrumentation::InstrumentInstruction( - const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) {} + const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, MCStreamer &Out) {} X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h index 0369b14d98d2..1bc3c0930153 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -12,6 +12,8 @@ #include "llvm/ADT/SmallVector.h" +#include <memory> + namespace llvm { class MCContext; @@ -35,10 +37,9 @@ public: // Instruments Inst. Should be called just before the original // instruction is sent to Out. 
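For reference while reading the new signature below: OperandVector, which the parser hunks in the rest of this patch use, is (as best we can tell from the MCTargetAsmParser.h of this revision):

    typedef SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> OperandVector;

Owning the operands through unique_ptr is what lets the X86AsmParser hunks further down drop their manual delete &Op and delete Operands[...] calls.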
virtual void InstrumentInstruction( - const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands, - MCContext &Ctx, - const MCInstrInfo &MII, - MCStreamer &Out); + const MCInst &Inst, + SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands, + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); protected: friend X86AsmInstrumentation * diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index d3e695e31ab1..f0765ed50d71 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -235,6 +235,7 @@ private: IES_RSHIFT, IES_PLUS, IES_MINUS, + IES_NOT, IES_MULTIPLY, IES_DIVIDE, IES_LBRAC, @@ -372,6 +373,7 @@ private: State = IES_ERROR; break; case IES_PLUS: + case IES_NOT: case IES_MULTIPLY: case IES_DIVIDE: case IES_LPAREN: @@ -401,6 +403,19 @@ private: } PrevState = CurrState; } + void onNot() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_NOT: + State = IES_NOT; + break; + } + PrevState = CurrState; + } void onRegister(unsigned Reg) { IntelExprState CurrState = State; switch (State) { @@ -438,6 +453,7 @@ private: break; case IES_PLUS: case IES_MINUS: + case IES_NOT: State = IES_INTEGER; Sym = SymRef; SymName = SymRefName; @@ -453,6 +469,7 @@ private: break; case IES_PLUS: case IES_MINUS: + case IES_NOT: case IES_OR: case IES_AND: case IES_LSHIFT: @@ -476,11 +493,22 @@ private: PrevState == IES_OR || PrevState == IES_AND || PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || - PrevState == IES_LPAREN || PrevState == IES_LBRAC) && + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT) && CurrState == IES_MINUS) { // Unary minus. No need to pop the minus operand because it was never // pushed. IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm. + } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS || + PrevState == IES_OR || PrevState == IES_AND || + PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || + PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT) && + CurrState == IES_NOT) { + // Unary not. No need to pop the not operand because it was never + // pushed. + IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm. } else { IC.pushOperand(IC_IMM, TmpInt); } @@ -561,6 +589,7 @@ private: break; case IES_PLUS: case IES_MINUS: + case IES_NOT: case IES_OR: case IES_AND: case IES_LSHIFT: @@ -568,13 +597,14 @@ private: case IES_MULTIPLY: case IES_DIVIDE: case IES_LPAREN: - // FIXME: We don't handle this type of unary minus, yet. + // FIXME: We don't handle this type of unary minus or not, yet. 
if ((PrevState == IES_PLUS || PrevState == IES_MINUS || PrevState == IES_OR || PrevState == IES_AND || PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || - PrevState == IES_LPAREN || PrevState == IES_LBRAC) && - CurrState == IES_MINUS) { + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT) && + (CurrState == IES_MINUS || CurrState == IES_NOT)) { State = IES_ERROR; break; } @@ -618,52 +648,52 @@ private: return Error(L, Msg, Ranges, MatchingInlineAsm); } - X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) { + std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) { Error(Loc, Msg); return nullptr; } - X86Operand *DefaultMemSIOperand(SMLoc Loc); - X86Operand *DefaultMemDIOperand(SMLoc Loc); - X86Operand *ParseOperand(); - X86Operand *ParseATTOperand(); - X86Operand *ParseIntelOperand(); - X86Operand *ParseIntelOffsetOfOperator(); + std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc); + std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc); + std::unique_ptr<X86Operand> ParseOperand(); + std::unique_ptr<X86Operand> ParseATTOperand(); + std::unique_ptr<X86Operand> ParseIntelOperand(); + std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator(); bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp); - X86Operand *ParseIntelOperator(unsigned OpKind); - X86Operand *ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size); - X86Operand *ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, - unsigned Size); + std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind); + std::unique_ptr<X86Operand> + ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size); + std::unique_ptr<X86Operand> + ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, unsigned Size); bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); - X86Operand *ParseIntelBracExpression(unsigned SegReg, SMLoc Start, - int64_t ImmDisp, unsigned Size); + std::unique_ptr<X86Operand> ParseIntelBracExpression(unsigned SegReg, + SMLoc Start, + int64_t ImmDisp, + unsigned Size); bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier, InlineAsmIdentifierInfo &Info, bool IsUnevaluatedOperand, SMLoc &End); - X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc); + std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc); - X86Operand *CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, - unsigned BaseReg, unsigned IndexReg, - unsigned Scale, SMLoc Start, SMLoc End, - unsigned Size, StringRef Identifier, - InlineAsmIdentifierInfo &Info); + std::unique_ptr<X86Operand> + CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, + unsigned IndexReg, unsigned Scale, SMLoc Start, + SMLoc End, unsigned Size, StringRef Identifier, + InlineAsmIdentifierInfo &Info); bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); - bool processInstruction(MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &Ops); + bool processInstruction(MCInst &Inst, const OperandVector &Ops); /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds /// instrumentation around Inst. 
- void EmitInstruction(MCInst &Inst, - SmallVectorImpl<MCParsedAsmOperand *> &Operands, - MCStreamer &Out); + void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out); bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, bool MatchingInlineAsm) override; /// doSrcDstMatch - Returns true if operands are matching in their @@ -674,8 +704,8 @@ private: /// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z}) /// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required. /// \return \c true if no parsing errors occurred, \c false otherwise. - bool HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - const MCParsedAsmOperand &Op); + bool HandleAVX512Operand(OperandVector &Operands, + const MCParsedAsmOperand &Op); bool is64BitMode() const { // FIXME: Can tablegen auto-generate this? @@ -725,9 +755,8 @@ public: bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool - ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand*> &Operands) override; + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; }; @@ -908,7 +937,7 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, return false; } -X86Operand *X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { +std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { unsigned basereg = is64BitMode() ? X86::RSI : (is32BitMode() ? X86::ESI : X86::SI); const MCExpr *Disp = MCConstantExpr::Create(0, getContext()); @@ -916,7 +945,7 @@ X86Operand *X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0); } -X86Operand *X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { +std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { unsigned basereg = is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI); const MCExpr *Disp = MCConstantExpr::Create(0, getContext()); @@ -924,7 +953,7 @@ X86Operand *X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0); } -X86Operand *X86AsmParser::ParseOperand() { +std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() { if (isParsingIntelSyntax()) return ParseIntelOperand(); return ParseATTOperand(); @@ -946,12 +975,10 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) { return Size; } -X86Operand * -X86AsmParser::CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, - unsigned BaseReg, unsigned IndexReg, - unsigned Scale, SMLoc Start, SMLoc End, - unsigned Size, StringRef Identifier, - InlineAsmIdentifierInfo &Info){ +std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( + unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, + unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, + InlineAsmIdentifierInfo &Info) { // If this is not a VarDecl then assume it is a FuncDecl or some other label // reference. We need an 'r' constraint here, so we need to create register // operand to ensure proper matching. 
Just pick a GPR based on the size of @@ -1064,7 +1091,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac) break; - switch (getLexer().getKind()) { + AsmToken::TokenKind TK = getLexer().getKind(); + switch (TK) { default: { if (SM.isValidEndState()) { Done = true; @@ -1076,13 +1104,14 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { Done = true; break; } + case AsmToken::String: case AsmToken::Identifier: { // This could be a register or a symbolic displacement. unsigned TmpReg; const MCExpr *Val; SMLoc IdentLoc = Tok.getLoc(); StringRef Identifier = Tok.getString(); - if(!ParseRegister(TmpReg, IdentLoc, End)) { + if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) { SM.onRegister(TmpReg); UpdateLocLex = false; break; @@ -1142,6 +1171,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { } case AsmToken::Plus: SM.onPlus(); break; case AsmToken::Minus: SM.onMinus(); break; + case AsmToken::Tilde: SM.onNot(); break; case AsmToken::Star: SM.onStar(); break; case AsmToken::Slash: SM.onDivide(); break; case AsmToken::Pipe: SM.onOr(); break; @@ -1164,9 +1194,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { return false; } -X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, - int64_t ImmDisp, - unsigned Size) { +std::unique_ptr<X86Operand> +X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, + int64_t ImmDisp, unsigned Size) { const AsmToken &Tok = Parser.getTok(); SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc(); if (getLexer().isNot(AsmToken::LBrac)) @@ -1270,9 +1300,9 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, } /// \brief Parse intel style segment override. -X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, - SMLoc Start, - unsigned Size) { +std::unique_ptr<X86Operand> +X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, + unsigned Size) { assert(SegReg != 0 && "Tried to parse a segment override without a segment!"); const AsmToken &Tok = Parser.getTok(); // Eat colon. if (Tok.isNot(AsmToken::Colon)) @@ -1321,8 +1351,9 @@ X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, } /// ParseIntelMemOperand - Parse intel style memory operand. -X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start, - unsigned Size) { +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, + SMLoc Start, + unsigned Size) { const AsmToken &Tok = Parser.getTok(); SMLoc End; @@ -1425,7 +1456,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, /// Parse the 'offset' operator. This operator is used to specify the /// location rather then the content of a variable. -X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() { +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { const AsmToken &Tok = Parser.getTok(); SMLoc OffsetOfLoc = Tok.getLoc(); Parser.Lex(); // Eat offset. @@ -1462,7 +1493,7 @@ enum IntelOperatorKind { /// variable. A variable's size is the product of its LENGTH and TYPE. The /// TYPE operator returns the size of a C or C++ type or variable. If the /// variable is an array, TYPE returns the size of a single element. 
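Putting numbers to the doc comment above (a hedged example, assuming a 4-byte int; not from the patch): for a C variable int arr[10] referenced from MS-style inline asm,

    // LENGTH arr -> 10   (element count)
    // TYPE arr   -> 4    (size of one element)
    // SIZE arr   -> 40   (LENGTH * TYPE)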
-X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) { +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { const AsmToken &Tok = Parser.getTok(); SMLoc TypeLoc = Tok.getLoc(); Parser.Lex(); // Eat operator. @@ -1495,7 +1526,7 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) { return X86Operand::CreateImm(Imm, Start, End); } -X86Operand *X86AsmParser::ParseIntelOperand() { +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { const AsmToken &Tok = Parser.getTok(); SMLoc Start, End; @@ -1523,7 +1554,7 @@ X86Operand *X86AsmParser::ParseIntelOperand() { // Immediate. if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) || - getLexer().is(AsmToken::LParen)) { + getLexer().is(AsmToken::Tilde) || getLexer().is(AsmToken::LParen)) { AsmToken StartTok = Tok; IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true, /*AddImmPrefix=*/false); @@ -1577,7 +1608,7 @@ X86Operand *X86AsmParser::ParseIntelOperand() { return ParseIntelMemOperand(/*Disp=*/0, Start, Size); } -X86Operand *X86AsmParser::ParseATTOperand() { +std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { switch (getLexer().getKind()) { default: // Parse a memory operand with no segment register. @@ -1613,9 +1644,8 @@ X86Operand *X86AsmParser::ParseATTOperand() { } } -bool -X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - const MCParsedAsmOperand &Op) { +bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, + const MCParsedAsmOperand &Op) { if(STI.getFeatureBits() & X86::FeatureAVX512) { if (getLexer().is(AsmToken::LCurly)) { // Eat "{" and mark the current place. @@ -1653,8 +1683,8 @@ X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands } else { // Parse mask register {%k1} Operands.push_back(X86Operand::CreateToken("{", consumedToken)); - if (X86Operand *Op = ParseOperand()) { - Operands.push_back(Op); + if (std::unique_ptr<X86Operand> Op = ParseOperand()) { + Operands.push_back(std::move(Op)); if (!getLexer().is(AsmToken::RCurly)) return !ErrorAndEatStatement(getLexer().getLoc(), "Expected } at this point"); @@ -1682,7 +1712,8 @@ X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands /// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix /// has already been parsed if present. -X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { +std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, + SMLoc MemStart) { // We have to disambiguate a parenthesized expression "(4+5)" from the start // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The @@ -1845,9 +1876,8 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { MemStart, MemEnd); } -bool X86AsmParser:: -ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl<MCParsedAsmOperand*> &Operands) { +bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { InstInfo = &Info; StringRef PatchedName = Name; @@ -1940,9 +1970,9 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, // Read the operands. 
while(1) { - if (X86Operand *Op = ParseOperand()) { - Operands.push_back(Op); - if (!HandleAVX512Operand(Operands, *Op)) + if (std::unique_ptr<X86Operand> Op = ParseOperand()) { + Operands.push_back(std::move(Op)); + if (!HandleAVX512Operand(Operands, *Operands.back())) return true; } else { Parser.eatToEndOfStatement(); @@ -1973,27 +2003,25 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, // documented form in various unofficial manuals, so a lot of code uses it. if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") && Operands.size() == 3) { - X86Operand &Op = *(X86Operand*)Operands.back(); + X86Operand &Op = (X86Operand &)*Operands.back(); if (Op.isMem() && Op.Mem.SegReg == 0 && isa<MCConstantExpr>(Op.Mem.Disp) && cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { SMLoc Loc = Op.getEndLoc(); Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); - delete &Op; } } // Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al". if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") && Operands.size() == 3) { - X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op = (X86Operand &)*Operands[1]; if (Op.isMem() && Op.Mem.SegReg == 0 && isa<MCConstantExpr>(Op.Mem.Disp) && cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { SMLoc Loc = Op.getEndLoc(); - Operands.begin()[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); - delete &Op; + Operands[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); } } @@ -2060,8 +2088,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, Operands.push_back(DefaultMemSIOperand(NameLoc)); } } else if (Operands.size() == 3) { - X86Operand &Op = *(X86Operand*)Operands.begin()[1]; - X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + X86Operand &Op = (X86Operand &)*Operands[1]; + X86Operand &Op2 = (X86Operand &)*Operands[2]; if (!doSrcDstMatch(Op, Op2)) return Error(Op.getStartLoc(), "mismatching source and destination index registers"); @@ -2076,10 +2104,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, (Name == "smov" || Name == "smovb" || Name == "smovw" || Name == "smovl" || Name == "smovd" || Name == "smovq"))) { if (Operands.size() == 1) { - if (Name == "movsd") { - delete Operands.back(); + if (Name == "movsd") Operands.back() = X86Operand::CreateToken("movsl", NameLoc); - } if (isParsingIntelSyntax()) { Operands.push_back(DefaultMemDIOperand(NameLoc)); Operands.push_back(DefaultMemSIOperand(NameLoc)); @@ -2088,8 +2114,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, Operands.push_back(DefaultMemDIOperand(NameLoc)); } } else if (Operands.size() == 3) { - X86Operand &Op = *(X86Operand*)Operands.begin()[1]; - X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + X86Operand &Op = (X86Operand &)*Operands[1]; + X86Operand &Op2 = (X86Operand &)*Operands[2]; if (!doSrcDstMatch(Op, Op2)) return Error(Op.getStartLoc(), "mismatching source and destination index registers"); @@ -2105,31 +2131,26 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, Operands.size() == 3) { if (isParsingIntelSyntax()) { // Intel syntax - X86Operand *Op1 = static_cast<X86Operand*>(Operands[2]); - if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && - cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { - delete Operands[2]; + X86Operand 
&Op1 = static_cast<X86Operand &>(*Operands[2]); + if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) && + cast<MCConstantExpr>(Op1.getImm())->getValue() == 1) Operands.pop_back(); - } } else { - X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); - if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && - cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { - delete Operands[1]; + X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]); + if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) && + cast<MCConstantExpr>(Op1.getImm())->getValue() == 1) Operands.erase(Operands.begin() + 1); - } } } // Transforms "int $3" into "int3" as a size optimization. We can't write an // instalias with an immediate operand yet. if (Name == "int" && Operands.size() == 2) { - X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); - if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && - cast<MCConstantExpr>(Op1->getImm())->getValue() == 3) { - delete Operands[1]; + X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]); + if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) && + cast<MCConstantExpr>(Op1.getImm())->getValue() == 3) { Operands.erase(Operands.begin() + 1); - static_cast<X86Operand*>(Operands[0])->setTokenValue("int3"); + static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3"); } } @@ -2175,9 +2196,7 @@ static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode, return convertToSExti8(Inst, Opcode, X86::RAX, isCmp); } -bool X86AsmParser:: -processInstruction(MCInst &Inst, - const SmallVectorImpl<MCParsedAsmOperand*> &Ops) { +bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { switch (Inst.getOpcode()) { default: return false; case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8); @@ -2258,51 +2277,47 @@ processInstruction(MCInst &Inst, static const char *getSubtargetFeatureName(unsigned Val); -void X86AsmParser::EmitInstruction( - MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands, - MCStreamer &Out) { +void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands, + MCStreamer &Out) { Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), MII, Out); Out.EmitInstruction(Inst, STI); } -bool X86AsmParser:: -MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) { +bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, unsigned &ErrorInfo, + bool MatchingInlineAsm) { assert(!Operands.empty() && "Unexpect empty operand list!"); - X86Operand *Op = static_cast<X86Operand*>(Operands[0]); - assert(Op->isToken() && "Leading operand should always be a mnemonic!"); + X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); + assert(Op.isToken() && "Leading operand should always be a mnemonic!"); ArrayRef<SMRange> EmptyRanges = None; // First, handle aliases that expand to multiple instructions. // FIXME: This should be replaced with a real .td file alias mechanism. // Also, MatchInstructionImpl should actually *do* the EmitInstruction // call. 
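In instruction terms, the rewrite below expands each wait-prefixed x87 mnemonic into an emitted WAIT followed by the no-wait form:

    // fstsw %ax   =>   wait
    //                  fnstsw %ax

with finit->fninit, fsave->fnsave, fstcw->fnstcw, fstenv->fnstenv and fclex->fnclex handled by the StringSwitch; the hunk itself only changes operand access from pointer to reference and drops the manual delete.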
- if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" || - Op->getToken() == "fstsww" || Op->getToken() == "fstcww" || - Op->getToken() == "finit" || Op->getToken() == "fsave" || - Op->getToken() == "fstenv" || Op->getToken() == "fclex") { + if (Op.getToken() == "fstsw" || Op.getToken() == "fstcw" || + Op.getToken() == "fstsww" || Op.getToken() == "fstcww" || + Op.getToken() == "finit" || Op.getToken() == "fsave" || + Op.getToken() == "fstenv" || Op.getToken() == "fclex") { MCInst Inst; Inst.setOpcode(X86::WAIT); Inst.setLoc(IDLoc); if (!MatchingInlineAsm) EmitInstruction(Inst, Operands, Out); - const char *Repl = - StringSwitch<const char*>(Op->getToken()) - .Case("finit", "fninit") - .Case("fsave", "fnsave") - .Case("fstcw", "fnstcw") - .Case("fstcww", "fnstcw") - .Case("fstenv", "fnstenv") - .Case("fstsw", "fnstsw") - .Case("fstsww", "fnstsw") - .Case("fclex", "fnclex") - .Default(nullptr); + const char *Repl = StringSwitch<const char *>(Op.getToken()) + .Case("finit", "fninit") + .Case("fsave", "fnsave") + .Case("fstcw", "fnstcw") + .Case("fstcww", "fnstcw") + .Case("fstenv", "fnstenv") + .Case("fstsw", "fnstsw") + .Case("fstsww", "fnstsw") + .Case("fclex", "fnclex") + .Default(nullptr); assert(Repl && "Unknown wait-prefixed instruction"); - delete Operands[0]; Operands[0] = X86Operand::CreateToken(Repl, IDLoc); } @@ -2355,11 +2370,11 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // following hack. // Change the operand to point to a temporary token. - StringRef Base = Op->getToken(); + StringRef Base = Op.getToken(); SmallString<16> Tmp; Tmp += Base; Tmp += ' '; - Op->setTokenValue(Tmp.str()); + Op.setTokenValue(Tmp.str()); // If this instruction starts with an 'f', then it is a floating point stack // instruction. These come in up to three forms for 32-bit, 64-bit, and @@ -2400,7 +2415,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, ErrorInfoMissingFeature = ErrorInfoIgnore; // Restore the old token. - Op->setTokenValue(Base); + Op.setTokenValue(Base); // If exactly one matched, then we treat that as a successful match (and the // instruction will already have been filled in correctly, since the failing @@ -2450,8 +2465,8 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) && (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) { if (!WasOriginallyInvalidOperand) { - ArrayRef<SMRange> Ranges = MatchingInlineAsm ? EmptyRanges : - Op->getLocRange(); + ArrayRef<SMRange> Ranges = + MatchingInlineAsm ? 
EmptyRanges : Op.getLocRange(); return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", Ranges, MatchingInlineAsm); } @@ -2462,10 +2477,10 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, "too few operands for instruction", EmptyRanges, MatchingInlineAsm); - X86Operand *Operand = (X86Operand*)Operands[ErrorInfo]; - if (Operand->getStartLoc().isValid()) { - SMRange OperandRange = Operand->getLocRange(); - return Error(Operand->getStartLoc(), "invalid operand for instruction", + X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo]; + if (Operand.getStartLoc().isValid()) { + SMRange OperandRange = Operand.getLocRange(); + return Error(Operand.getStartLoc(), "invalid operand for instruction", OperandRange, MatchingInlineAsm); } } diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index de3be38c1d5a..1bbfc1192447 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -13,6 +13,7 @@ #include "X86AsmParserCommon.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/ADT/STLExtras.h" namespace llvm { @@ -410,20 +411,19 @@ struct X86Operand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::CreateReg(getMemSegReg())); } - static X86Operand *CreateToken(StringRef Str, SMLoc Loc) { + static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) { SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size()); - X86Operand *Res = new X86Operand(Token, Loc, EndLoc); + auto Res = llvm::make_unique<X86Operand>(Token, Loc, EndLoc); Res->Tok.Data = Str.data(); Res->Tok.Length = Str.size(); return Res; } - static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, - bool AddressOf = false, - SMLoc OffsetOfLoc = SMLoc(), - StringRef SymName = StringRef(), - void *OpDecl = nullptr) { - X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc); + static std::unique_ptr<X86Operand> + CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, + bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(), + StringRef SymName = StringRef(), void *OpDecl = nullptr) { + auto Res = llvm::make_unique<X86Operand>(Register, StartLoc, EndLoc); Res->Reg.RegNo = RegNo; Res->AddressOf = AddressOf; Res->OffsetOfLoc = OffsetOfLoc; @@ -432,17 +432,18 @@ struct X86Operand : public MCParsedAsmOperand { return Res; } - static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){ - X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc); + static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val, + SMLoc StartLoc, SMLoc EndLoc) { + auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc); Res->Imm.Val = Val; return Res; } /// Create an absolute memory operand. - static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0, StringRef SymName = StringRef(), - void *OpDecl = nullptr) { - X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); + static std::unique_ptr<X86Operand> + CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0, + StringRef SymName = StringRef(), void *OpDecl = nullptr) { + auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; Res->Mem.BaseReg = 0; @@ -456,12 +457,11 @@ struct X86Operand : public MCParsedAsmOperand { } /// Create a generalized memory operand. 
- static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp, - unsigned BaseReg, unsigned IndexReg, - unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0, - StringRef SymName = StringRef(), - void *OpDecl = nullptr) { + static std::unique_ptr<X86Operand> + CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, + unsigned IndexReg, unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, + unsigned Size = 0, StringRef SymName = StringRef(), + void *OpDecl = nullptr) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); @@ -469,7 +469,7 @@ struct X86Operand : public MCParsedAsmOperand { // The scale should always be one of {1,2,4,8}. assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) && "Invalid scale!"); - X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); + auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); Res->Mem.SegReg = SegReg; Res->Mem.Disp = Disp; Res->Mem.BaseReg = BaseReg; diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index c54fbc1dc806..a09767e1eaff 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -14,6 +14,7 @@ add_public_tablegen_target(X86CommonTableGen) set(sources X86AsmPrinter.cpp + X86AtomicExpandPass.cpp X86CodeEmitter.cpp X86FastISel.cpp X86FloatingPoint.cpp diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index 804606d917ba..55587d462ff6 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -1620,7 +1620,8 @@ static int readVVVV(struct InternalInstruction* insn) { int vvvv; if (insn->vectorExtensionType == TYPE_EVEX) - vvvv = vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]); + vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 | + vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2])); else if (insn->vectorExtensionType == TYPE_VEX_3B) vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]); else if (insn->vectorExtensionType == TYPE_VEX_2B) diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index bf30a8e66633..23bca0d53d3d 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -73,11 +73,12 @@ public: }; class X86AsmBackend : public MCAsmBackend { - StringRef CPU; + const StringRef CPU; bool HasNopl; + const uint64_t MaxNopLength; public: X86AsmBackend(const Target &T, StringRef _CPU) - : MCAsmBackend(), CPU(_CPU) { + : MCAsmBackend(), CPU(_CPU), MaxNopLength(_CPU == "slm" ? 7 : 15) { HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && @@ -331,7 +332,7 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { // 15 is the longest single nop instruction. Emit as many 15-byte nops as // needed, then emit a nop of the remaining length. do { - const uint8_t ThisNopLength = (uint8_t) std::min(Count, (uint64_t) 15); + const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength); const uint8_t Prefixes = ThisNopLength <= 10 ? 
0 : ThisNopLength - 10; for (uint8_t i = 0; i < Prefixes; i++) OW->Write8(0x66); @@ -365,6 +366,17 @@ public: } }; +class ELFX86_X32AsmBackend : public ELFX86AsmBackend { +public: + ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, + ELF::EM_X86_64); + } +}; + class ELFX86_64AsmBackend : public ELFX86AsmBackend { public: ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) @@ -717,11 +729,10 @@ public: }; class DarwinX86_32AsmBackend : public DarwinX86AsmBackend { - bool SupportsCU; public: DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef CPU, bool SupportsCU) - : DarwinX86AsmBackend(T, MRI, CPU, false), SupportsCU(SupportsCU) {} + StringRef CPU) + : DarwinX86AsmBackend(T, MRI, CPU, false) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { return createX86MachObjectWriter(OS, /*Is64Bit=*/false, @@ -732,20 +743,16 @@ public: /// \brief Generate the compact unwind encoding for the CFI instructions. uint32_t generateCompactUnwindEncoding( ArrayRef<MCCFIInstruction> Instrs) const override { - return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0; + return generateCompactUnwindEncodingImpl(Instrs); } }; class DarwinX86_64AsmBackend : public DarwinX86AsmBackend { - bool SupportsCU; const MachO::CPUSubTypeX86 Subtype; public: DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef CPU, bool SupportsCU, - MachO::CPUSubTypeX86 st) - : DarwinX86AsmBackend(T, MRI, CPU, true), SupportsCU(SupportsCU), - Subtype(st) { - } + StringRef CPU, MachO::CPUSubTypeX86 st) + : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { return createX86MachObjectWriter(OS, /*Is64Bit=*/true, @@ -788,7 +795,7 @@ public: /// \brief Generate the compact unwind encoding for the CFI instructions. uint32_t generateCompactUnwindEncoding( ArrayRef<MCCFIInstruction> Instrs) const override { - return SupportsCU ? 
generateCompactUnwindEncodingImpl(Instrs) : 0; + return generateCompactUnwindEncodingImpl(Instrs); } }; @@ -801,9 +808,7 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, Triple TheTriple(TT); if (TheTriple.isOSBinFormatMachO()) - return new DarwinX86_32AsmBackend(T, MRI, CPU, - TheTriple.isMacOSX() && - !TheTriple.isMacOSXVersionLT(10, 7)); + return new DarwinX86_32AsmBackend(T, MRI, CPU); if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF()) return new WindowsX86AsmBackend(T, false, CPU); @@ -823,14 +828,15 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName()) .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H) .Default(MachO::CPU_SUBTYPE_X86_64_ALL); - return new DarwinX86_64AsmBackend(T, MRI, CPU, - TheTriple.isMacOSX() && - !TheTriple.isMacOSXVersionLT(10, 7), CS); + return new DarwinX86_64AsmBackend(T, MRI, CPU, CS); } if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF()) return new WindowsX86AsmBackend(T, true, CPU); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + + if (TheTriple.getEnvironment() == Triple::GNUX32) + return new ELFX86_X32AsmBackend(T, OSABI, CPU); return new ELFX86_64AsmBackend(T, OSABI, CPU); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 39480eaaac16..83b27779a2d7 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -74,8 +74,9 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { // FIXME: this should not depend on the target OS version, but on the ld64 // version in use. From at least >= ld64-97.17 (Xcode 3.2.6) the abs-ified - // FDE relocs may be used. - DwarfFDESymbolsUseAbsDiff = T.isMacOSX() && !T.isMacOSXVersionLT(10, 6); + // FDE relocs may be used. We also use them for the ios simulator. 
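For context on the new ELFX86_X32AsmBackend selected above: x32 is the ILP32 ABI on the x86-64 instruction set, and it is distinguished from plain x86-64 ELF only by the triple's environment component. A small sketch using the public Triple API (the helper name is made up):

#include "llvm/ADT/Triple.h"

// Hypothetical helper: true for triples such as x86_64-linux-gnux32,
// the case routed to ELFX86_X32AsmBackend in the hunk above.
static bool isX32Triple(const llvm::Triple &T) {
  return T.getArch() == llvm::Triple::x86_64 &&
         T.getEnvironment() == llvm::Triple::GNUX32;
}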
+ DwarfFDESymbolsUseAbsDiff = (T.isMacOSX() && !T.isMacOSXVersionLT(10, 6)) + || T.isiOS(); UseIntegratedAssembler = true; } @@ -142,8 +143,11 @@ getNonexecutableStackSection(MCContext &Ctx) const { void X86MCAsmInfoMicrosoft::anchor() { } X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { - if (Triple.getArch() == Triple::x86_64) + if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; + PointerSize = 8; + ExceptionsType = ExceptionHandling::WinEH; + } AssemblerDialect = AsmWriterFlavor; @@ -157,17 +161,18 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { void X86MCAsmInfoGNUCOFF::anchor() { } X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { + assert(Triple.isOSWindows() && "Windows is the only supported COFF target"); if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; PointerSize = 8; + ExceptionsType = ExceptionHandling::WinEH; + } else { + ExceptionsType = ExceptionHandling::DwarfCFI; } AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; - // Exceptions handling - ExceptionsType = ExceptionHandling::DwarfCFI; - UseIntegratedAssembler = true; } diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index e63036c1a557..5e29e5c359ac 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -197,14 +197,13 @@ void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family, } } -unsigned X86_MC::getDwarfRegFlavour(StringRef TT, bool isEH) { - Triple TheTriple(TT); - if (TheTriple.getArch() == Triple::x86_64) +unsigned X86_MC::getDwarfRegFlavour(Triple TT, bool isEH) { + if (TT.getArch() == Triple::x86_64) return DWARFFlavour::X86_64; - if (TheTriple.isOSDarwin()) + if (TT.isOSDarwin()) return isEH ? 
DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic; - if (TheTriple.isOSCygMing()) + if (TT.isOSCygMing()) // Unsupported by now, just quick fallback return DWARFFlavour::X86_32_Generic; return DWARFFlavour::X86_32_Generic; @@ -251,8 +250,8 @@ static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitX86MCRegisterInfo(X, RA, - X86_MC::getDwarfRegFlavour(TT, false), - X86_MC::getDwarfRegFlavour(TT, true), + X86_MC::getDwarfRegFlavour(TheTriple, false), + X86_MC::getDwarfRegFlavour(TheTriple, true), RA); X86_MC::InitLLVM2SEHRegisterMapping(X); return X; diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 8fe40fd23620..ebe74cfeaddc 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -28,6 +28,7 @@ class MCSubtargetInfo; class MCRelocationInfo; class MCStreamer; class Target; +class Triple; class StringRef; class raw_ostream; @@ -64,7 +65,7 @@ namespace X86_MC { void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model); - unsigned getDwarfRegFlavour(StringRef TT, bool isEH); + unsigned getDwarfRegFlavour(Triple TT, bool isEH); void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI); diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index c62fd0a9390a..7fa4180217b4 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -19,12 +19,12 @@ public: raw_ostream &OS) : MCWinCOFFStreamer(C, AB, *CE, OS) { } - void EmitWin64EHHandlerData() override; + void EmitWinEHHandlerData() override; void FinishImpl() override; }; -void X86WinCOFFStreamer::EmitWin64EHHandlerData() { - MCStreamer::EmitWin64EHHandlerData(); +void X86WinCOFFStreamer::EmitWinEHHandlerData() { + MCStreamer::EmitWinEHHandlerData(); // We have to emit the unwind info now, because this directive // actually switches to the .xdata section! diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 64e8ea834f47..d5522ed95eb4 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -24,6 +24,10 @@ class ImmutablePass; class JITCodeEmitter; class X86TargetMachine; +/// createX86AtomicExpandPass - This pass expands atomic operations that cannot +/// be handled natively in terms of a loop using cmpxchg. +FunctionPass *createX86AtomicExpandPass(const X86TargetMachine *TM); + /// createX86ISelDag - This pass converts a legalized DAG into a /// X86-specific DAG, ready for instruction scheduling. /// diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 6912b579b12c..93f516a151a9 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -168,6 +168,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", "LEA instruction needs inputs at AG stage">; def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; +def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", + "INC and DEC instructions are slower than ADD and SUB">; //===----------------------------------------------------------------------===// // X86 processors supported. 
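FeatureSlowIncDec, defined above, follows the usual SubtargetFeature pattern: the "slow-incdec" string toggles the SlowIncDec field on X86Subtarget, and codegen consults an accessor before choosing INC/DEC. A hedged sketch of the consuming side (the accessor name mirrors the field and is assumed here, since it is not part of this diff):

#include "X86Subtarget.h"

// Prefer ADD/SUB over INC/DEC on processors that set the slow-incdec
// feature bit; INC/DEC update only part of EFLAGS, which creates a
// partial-flag dependency on those cores.
static bool shouldUseIncDec(const llvm::X86Subtarget &ST) {
  return !ST.slowIncDec(); // assumed accessor generated for SlowIncDec
}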
@@ -228,7 +230,7 @@ def : ProcessorModel<"slm",  SLMModel, [ProcIntelSLM,
                                FeaturePCLMUL, FeatureAES,
                                FeatureCallRegIndirect,
                                FeaturePRFCHW,
-                               FeatureSlowLEA,
+                               FeatureSlowLEA, FeatureSlowIncDec,
                                FeatureSlowBTMem, FeatureFastUAMem]>;
 // "Arrandale" along with corei3 and corei5
 def : ProcessorModel<"corei7", SandyBridgeModel,
@@ -271,7 +273,8 @@ def : ProcessorModel<"knl", HaswellModel,
                      FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT,
                      FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
                      FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
-                     FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE]>;
+                     FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
+                     FeatureSlowIncDec]>;

 def : Proc<"k6",              [FeatureMMX]>;
 def : Proc<"k6-2",            [Feature3DNow]>;
diff --git a/lib/Target/X86/X86AtomicExpandPass.cpp b/lib/Target/X86/X86AtomicExpandPass.cpp
new file mode 100644
index 000000000000..61eefbbf75b1
--- /dev/null
+++ b/lib/Target/X86/X86AtomicExpandPass.cpp
@@ -0,0 +1,287 @@
+//===-- X86AtomicExpandPass.cpp - Expand illegal atomic instructions -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass (at IR level) to replace atomic instructions which
+// cannot be implemented as a single instruction with cmpxchg-based loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-atomic-expand"
+
+namespace {
+  class X86AtomicExpandPass : public FunctionPass {
+    const X86TargetMachine *TM;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit X86AtomicExpandPass(const X86TargetMachine *TM)
+      : FunctionPass(ID), TM(TM) {}
+
+    bool runOnFunction(Function &F) override;
+    bool expandAtomicInsts(Function &F);
+
+    bool needsCmpXchgNb(Type *MemType);
+
+    /// There are four kinds of atomic operations. Two never need expanding:
+    /// cmpxchg is what we expand the others *to*, and loads are easily handled
+    /// by ISelLowering. Atomicrmw and store can need expanding in some
+    /// circumstances.
+    bool shouldExpand(Instruction *Inst);
+
+    /// 128-bit atomic stores (64-bit on i686) need to be implemented in terms
+    /// of trivial cmpxchg16b loops. A simple store isn't necessarily atomic.
+    bool shouldExpandStore(StoreInst *SI);
+
+    /// Only some atomicrmw instructions need expanding -- some operations
+    /// (e.g. max) have absolutely no architectural support; some (e.g. or) have
+    /// limited support but can't return the previous value; some (e.g. add)
+    /// have complete support in the instruction set.
+    ///
+    /// Also, naturally, 128-bit operations always need to be expanded.
+ bool shouldExpandAtomicRMW(AtomicRMWInst *AI); + + bool expandAtomicRMW(AtomicRMWInst *AI); + bool expandAtomicStore(StoreInst *SI); + }; +} + +char X86AtomicExpandPass::ID = 0; + +FunctionPass *llvm::createX86AtomicExpandPass(const X86TargetMachine *TM) { + return new X86AtomicExpandPass(TM); +} + +bool X86AtomicExpandPass::runOnFunction(Function &F) { + SmallVector<Instruction *, 1> AtomicInsts; + + // Changing control-flow while iterating through it is a bad idea, so gather a + // list of all atomic instructions before we start. + for (BasicBlock &BB : F) + for (Instruction &Inst : BB) { + if (isa<AtomicRMWInst>(&Inst) || + (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic())) + AtomicInsts.push_back(&Inst); + } + + bool MadeChange = false; + for (Instruction *Inst : AtomicInsts) { + if (!shouldExpand(Inst)) + continue; + + if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) + MadeChange |= expandAtomicRMW(AI); + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + MadeChange |= expandAtomicStore(SI); + + assert(MadeChange && "Atomic inst not expanded when it should be?"); + Inst->eraseFromParent(); + } + + return MadeChange; +} + +/// Returns true if operations on the given type will need to use either +/// cmpxchg8b or cmpxchg16b. This occurs if the type is 1 step up from the +/// native width, and the instructions are available (otherwise we leave them +/// alone to become __sync_fetch_and_... calls). +bool X86AtomicExpandPass::needsCmpXchgNb(llvm::Type *MemType) { + const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>(); + if (!Subtarget.hasCmpxchg16b()) + return false; + + unsigned CmpXchgNbWidth = Subtarget.is64Bit() ? 128 : 64; + + unsigned OpWidth = MemType->getPrimitiveSizeInBits(); + if (OpWidth == CmpXchgNbWidth) + return true; + + return false; +} + + +bool X86AtomicExpandPass::shouldExpandAtomicRMW(AtomicRMWInst *AI) { + const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>(); + unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; + + if (needsCmpXchgNb(AI->getType())) + return true; + + if (AI->getType()->getPrimitiveSizeInBits() > NativeWidth) + return false; + + AtomicRMWInst::BinOp Op = AI->getOperation(); + switch (Op) { + default: + llvm_unreachable("Unknown atomic operation"); + case AtomicRMWInst::Xchg: + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + // It's better to use xadd, xsub or xchg for these in all cases. + return false; + case AtomicRMWInst::Or: + case AtomicRMWInst::And: + case AtomicRMWInst::Xor: + // If the atomicrmw's result isn't actually used, we can just add a "lock" + // prefix to a normal instruction for these operations. + return !AI->use_empty(); + case AtomicRMWInst::Nand: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + // These always require a non-trivial set of data operations on x86. We must + // use a cmpxchg loop. + return true; + } +} + +bool X86AtomicExpandPass::shouldExpandStore(StoreInst *SI) { + if (needsCmpXchgNb(SI->getValueOperand()->getType())) + return true; + + return false; +} + +bool X86AtomicExpandPass::shouldExpand(Instruction *Inst) { + if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) + return shouldExpandAtomicRMW(AI); + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + return shouldExpandStore(SI); + return false; +} + +/// Emit IR to implement the given atomicrmw operation on values in registers, +/// returning the new value. 
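performAtomicOp, defined next, supplies the "some_op" step of the expansion; the loop the pass wraps around it has a direct analogue in ordinary C++. A conceptual sketch with std::atomic (not the pass's actual IR output), shown here for an atomicrmw max:

#include <atomic>

long fetch_max(std::atomic<long> &Addr, long Inc) {
  long Loaded = Addr.load();
  long NewVal;
  do {
    NewVal = Loaded > Inc ? Loaded : Inc; // the performAtomicOp step (Max)
    // compare_exchange_weak reloads Loaded with the current value on
    // failure, like the phi/extractvalue pair in the IR skeleton below.
  } while (!Addr.compare_exchange_weak(Loaded, NewVal));
  return Loaded; // the old value, matching atomicrmw semantics
}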
+static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, + Value *Loaded, Value *Inc) { + Value *NewVal; + switch (Op) { + case AtomicRMWInst::Xchg: + return Inc; + case AtomicRMWInst::Add: + return Builder.CreateAdd(Loaded, Inc, "new"); + case AtomicRMWInst::Sub: + return Builder.CreateSub(Loaded, Inc, "new"); + case AtomicRMWInst::And: + return Builder.CreateAnd(Loaded, Inc, "new"); + case AtomicRMWInst::Nand: + return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); + case AtomicRMWInst::Or: + return Builder.CreateOr(Loaded, Inc, "new"); + case AtomicRMWInst::Xor: + return Builder.CreateXor(Loaded, Inc, "new"); + case AtomicRMWInst::Max: + NewVal = Builder.CreateICmpSGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::Min: + NewVal = Builder.CreateICmpSLE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMax: + NewVal = Builder.CreateICmpUGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMin: + NewVal = Builder.CreateICmpULE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + default: + break; + } + llvm_unreachable("Unknown atomic op"); +} + +bool X86AtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) { + AtomicOrdering Order = + AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering(); + Value *Addr = AI->getPointerOperand(); + BasicBlock *BB = AI->getParent(); + Function *F = BB->getParent(); + LLVMContext &Ctx = F->getContext(); + + // Given: atomicrmw some_op iN* %addr, iN %incr ordering + // + // The standard expansion we produce is: + // [...] + // %init_loaded = load atomic iN* %addr + // br label %loop + // loop: + // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] + // %new = some_op iN %loaded, %incr + // %pair = cmpxchg iN* %addr, iN %loaded, iN %new + // %new_loaded = extractvalue { iN, i1 } %pair, 0 + // %success = extractvalue { iN, i1 } %pair, 1 + // br i1 %success, label %atomicrmw.end, label %loop + // atomicrmw.end: + // [...] + BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); + + // This grabs the DebugLoc from AI. + IRBuilder<> Builder(AI); + + // The split call above "helpfully" added a branch at the end of BB (to the + // wrong place), but we want a load. It's easiest to just remove + // the branch entirely. + std::prev(BB->end())->eraseFromParent(); + Builder.SetInsertPoint(BB); + LoadInst *InitLoaded = Builder.CreateLoad(Addr); + InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits()); + Builder.CreateBr(LoopBB); + + // Start the main loop block now that we've taken care of the preliminaries. 
+  Builder.SetInsertPoint(LoopBB);
+  PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded");
+  Loaded->addIncoming(InitLoaded, BB);
+
+  Value *NewVal =
+      performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());
+
+  Value *Pair = Builder.CreateAtomicCmpXchg(
+      Addr, Loaded, NewVal, Order,
+      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
+  Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
+  Loaded->addIncoming(NewLoaded, LoopBB);
+
+  Value *Success = Builder.CreateExtractValue(Pair, 1, "success");
+  Builder.CreateCondBr(Success, ExitBB, LoopBB);
+
+  AI->replaceAllUsesWith(NewLoaded);
+
+  return true;
+}
+
+bool X86AtomicExpandPass::expandAtomicStore(StoreInst *SI) {
+  // An atomic store might need cmpxchg16b (or cmpxchg8b on 32-bit x86) to
+  // execute. Express this in terms of the usual expansion to "atomicrmw xchg".
+  IRBuilder<> Builder(SI);
+  AtomicOrdering Order =
+      SI->getOrdering() == Unordered ? Monotonic : SI->getOrdering();
+  AtomicRMWInst *AI =
+      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
+                              SI->getValueOperand(), Order);
+
+  // Now we have an appropriate swap instruction, lower it as usual.
+  if (shouldExpandAtomicRMW(AI)) {
+    expandAtomicRMW(AI);
+    AI->eraseFromParent();
+    return true;
+  }
+
+  // The swap alone already made the store atomic; report the change.
+  return true;
+}
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index 76718d0b6877..a3ae7ee315a3 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -1113,9 +1113,14 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
     case TargetOpcode::INLINEASM:
       // We allow inline assembler nodes with empty bodies - they can
       // implicitly define registers, which is ok for JIT.
-      if (MI.getOperand(0).getSymbolName()[0])
+      if (MI.getOperand(0).getSymbolName()[0]) {
+        DebugLoc DL = MI.getDebugLoc();
+        DL.print(MI.getParent()->getParent()->getFunction()->getContext(),
+                 llvm::errs());
         report_fatal_error("JIT does not support inline asm!");
+      }
       break;
+
     case TargetOpcode::DBG_VALUE:
     case TargetOpcode::CFI_INSTRUCTION:
       break;
     case TargetOpcode::GC_LABEL:
@@ -1126,6 +1131,16 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
     case TargetOpcode::IMPLICIT_DEF:
     case TargetOpcode::KILL:
       break;
+
+    case X86::SEH_PushReg:
+    case X86::SEH_SaveReg:
+    case X86::SEH_SaveXMM:
+    case X86::SEH_StackAlloc:
+    case X86::SEH_SetFrame:
+    case X86::SEH_PushFrame:
+    case X86::SEH_EndPrologue:
+      break;
+
     case X86::MOVPC32r: {
       // This emits the "call" portion of this pseudo instruction.
MCE.emitByte(BaseOpcode); diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 56bcfa30ff90..ce554ba21d63 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -16,10 +16,12 @@ #include "X86.h" #include "X86CallingConv.h" #include "X86InstrBuilder.h" +#include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -78,12 +80,14 @@ public: private: bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT); - bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR); + bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO, + unsigned &ResultReg); bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM, - bool Aligned = false); - bool X86FastEmitStore(EVT VT, unsigned ValReg, const X86AddressMode &AM, - bool Aligned = false); + MachineMemOperand *MMO = nullptr, bool Aligned = false); + bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, + const X86AddressMode &AM, + MachineMemOperand *MMO = nullptr, bool Aligned = false); bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg); @@ -107,6 +111,12 @@ private: bool X86SelectDivRem(const Instruction *I); + bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I); + + bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I); + + bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I); + bool X86SelectSelect(const Instruction *I); bool X86SelectTrunc(const Instruction *I); @@ -147,10 +157,182 @@ private: bool TryEmitSmallMemcpy(X86AddressMode DestAM, X86AddressMode SrcAM, uint64_t Len); + + bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, + const Value *Cond); }; } // end anonymous namespace. +static CmpInst::Predicate optimizeCmpPredicate(const CmpInst *CI) { + // If both operands are the same, then try to optimize or fold the cmp. 
+ CmpInst::Predicate Predicate = CI->getPredicate(); + if (CI->getOperand(0) != CI->getOperand(1)) + return Predicate; + + switch (Predicate) { + default: llvm_unreachable("Invalid predicate!"); + case CmpInst::FCMP_FALSE: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_OEQ: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_OGT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_OGE: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_OLT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_OLE: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_ONE: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::FCMP_ORD: Predicate = CmpInst::FCMP_ORD; break; + case CmpInst::FCMP_UNO: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_UEQ: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::FCMP_UGT: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_UGE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::FCMP_ULT: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_ULE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::FCMP_UNE: Predicate = CmpInst::FCMP_UNO; break; + case CmpInst::FCMP_TRUE: Predicate = CmpInst::FCMP_TRUE; break; + + case CmpInst::ICMP_EQ: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_NE: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_UGT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_UGE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_ULT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_ULE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_SGT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_SGE: Predicate = CmpInst::FCMP_TRUE; break; + case CmpInst::ICMP_SLT: Predicate = CmpInst::FCMP_FALSE; break; + case CmpInst::ICMP_SLE: Predicate = CmpInst::FCMP_TRUE; break; + } + + return Predicate; +} + +static std::pair<X86::CondCode, bool> +getX86ConditionCode(CmpInst::Predicate Predicate) { + X86::CondCode CC = X86::COND_INVALID; + bool NeedSwap = false; + switch (Predicate) { + default: break; + // Floating-point Predicates + case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; + case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGT: CC = X86::COND_A; break; + case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; + case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULT: CC = X86::COND_B; break; + case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; + case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; + case CmpInst::FCMP_UNO: CC = X86::COND_P; break; + case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; + case CmpInst::FCMP_OEQ: // fall-through + case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; + + // Integer Predicates + case CmpInst::ICMP_EQ: CC = X86::COND_E; break; + case CmpInst::ICMP_NE: CC = X86::COND_NE; break; + case CmpInst::ICMP_UGT: CC = X86::COND_A; break; + case CmpInst::ICMP_UGE: CC = X86::COND_AE; break; + case CmpInst::ICMP_ULT: CC = X86::COND_B; break; + case CmpInst::ICMP_ULE: CC = X86::COND_BE; break; + case CmpInst::ICMP_SGT: CC = X86::COND_G; break; + case CmpInst::ICMP_SGE: CC = X86::COND_GE; break; + case CmpInst::ICMP_SLT: CC = X86::COND_L; break; + case CmpInst::ICMP_SLE: CC = X86::COND_LE; break; + } + + return std::make_pair(CC, NeedSwap); +} + +static std::pair<unsigned, bool> 
+getX86SSEConditionCode(CmpInst::Predicate Predicate) { + unsigned CC; + bool NeedSwap = false; + + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD + switch (Predicate) { + default: llvm_unreachable("Unexpected predicate"); + case CmpInst::FCMP_OEQ: CC = 0; break; + case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLT: CC = 1; break; + case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLE: CC = 2; break; + case CmpInst::FCMP_UNO: CC = 3; break; + case CmpInst::FCMP_UNE: CC = 4; break; + case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGE: CC = 5; break; + case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGT: CC = 6; break; + case CmpInst::FCMP_ORD: CC = 7; break; + case CmpInst::FCMP_UEQ: + case CmpInst::FCMP_ONE: CC = 8; break; + } + + return std::make_pair(CC, NeedSwap); +} + +/// \brief Check if it is possible to fold the condition from the XALU intrinsic +/// into the user. The condition code will only be updated on success. +bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, + const Value *Cond) { + if (!isa<ExtractValueInst>(Cond)) + return false; + + const auto *EV = cast<ExtractValueInst>(Cond); + if (!isa<IntrinsicInst>(EV->getAggregateOperand())) + return false; + + const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand()); + MVT RetVT; + const Function *Callee = II->getCalledFunction(); + Type *RetTy = + cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U); + if (!isTypeLegal(RetTy, RetVT)) + return false; + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return false; + + X86::CondCode TmpCC; + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break; + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break; + } + + // Check if both instructions are in the same basic block. + if (II->getParent() != I->getParent()) + return false; + + // Make sure nothing is in the way + BasicBlock::const_iterator Start = I; + BasicBlock::const_iterator End = II; + for (auto Itr = std::prev(Start); Itr != End; --Itr) { + // We only expect extractvalue instructions between the intrinsic and the + // instruction to be selected. + if (!isa<ExtractValueInst>(Itr)) + return false; + + // Check that the extractvalue operand comes from the intrinsic. + const auto *EVI = cast<ExtractValueInst>(Itr); + if (EVI->getAggregateOperand() != II) + return false; + } + + CC = TmpCC; + return true; +} + bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true); if (evt == MVT::Other || !evt.isSimple()) @@ -180,7 +362,7 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. /// Return true and the result register by reference if it is possible. bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, - unsigned &ResultReg) { + MachineMemOperand *MMO, unsigned &ResultReg) { // Get opcode and regclass of the output for the given load instruction. 
unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; @@ -228,8 +410,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, } ResultReg = createResultReg(RC); - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc), ResultReg), AM); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + addFullAddress(MIB, AM); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); return true; } @@ -237,9 +422,9 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, /// type VT. The address is either pre-computed, consisted of a base ptr, Ptr /// and a displacement offset, or a GlobalAddress, /// i.e. V. Return true if it is possible. -bool -X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, - const X86AddressMode &AM, bool Aligned) { +bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, + const X86AddressMode &AM, + MachineMemOperand *MMO, bool Aligned) { // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -249,7 +434,8 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, // Mask out all but lowest bit. unsigned AndResult = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::AND8ri), AndResult).addReg(ValReg).addImm(1); + TII.get(X86::AND8ri), AndResult) + .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1); ValReg = AndResult; } // FALLTHROUGH, handling i1 as i8. @@ -288,13 +474,18 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, break; } - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc)), AM).addReg(ValReg); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill)); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); + return true; } bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, - const X86AddressMode &AM, bool Aligned) { + const X86AddressMode &AM, + MachineMemOperand *MMO, bool Aligned) { // Handle 'null' like i32/i64 0. if (isa<ConstantPointerNull>(Val)) Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext())); @@ -317,10 +508,12 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, } if (Opc) { - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc)), AM) - .addImm(Signed ? (uint64_t) CI->getSExtValue() : - CI->getZExtValue()); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue() + : CI->getZExtValue()); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); return true; } } @@ -329,7 +522,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, if (ValReg == 0) return false; - return X86FastEmitStore(VT, ValReg, AM, Aligned); + bool ValKill = hasTrivialKill(Val); + return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned); } /// X86FastEmitExtend - Emit a machine instruction to extend a value Src of @@ -355,17 +549,8 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { return false; // Can't handle TLS yet. - if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) - if (GVar->isThreadLocal()) - return false; - - // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how - // it works...). 
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) - if (const GlobalVariable *GVar = - dyn_cast_or_null<GlobalVariable>(GA->getAliasee())) - if (GVar->isThreadLocal()) - return false; + if (GV->isThreadLocal()) + return false; // RIP-relative addresses can't have additional register operands, so if // we've already folded stuff into the addressing mode, just force the @@ -696,7 +881,7 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { (AM.Base.Reg != 0 || AM.IndexReg != 0)) return false; - // Can't handle DbgLocLImport. + // Can't handle DLL Import. if (GV->hasDLLImportStorageClass()) return false; @@ -749,19 +934,24 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { if (S->isAtomic()) return false; - unsigned SABIAlignment = - DL.getABITypeAlignment(S->getValueOperand()->getType()); - bool Aligned = S->getAlignment() == 0 || S->getAlignment() >= SABIAlignment; + const Value *Val = S->getValueOperand(); + const Value *Ptr = S->getPointerOperand(); MVT VT; - if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true)) + if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true)) return false; + unsigned Alignment = S->getAlignment(); + unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType()); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = ABIAlignment; + bool Aligned = Alignment >= ABIAlignment; + X86AddressMode AM; - if (!X86SelectAddress(I->getOperand(1), AM)) + if (!X86SelectAddress(Ptr, AM)) return false; - return X86FastEmitStore(VT, I->getOperand(0), AM, Aligned); + return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned); } /// X86SelectRet - Select and emit code to implement ret instructions. @@ -896,25 +1086,29 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { /// X86SelectLoad - Select and emit code to implement load instructions. /// -bool X86FastISel::X86SelectLoad(const Instruction *I) { +bool X86FastISel::X86SelectLoad(const Instruction *I) { + const LoadInst *LI = cast<LoadInst>(I); + // Atomic loads need special handling. - if (cast<LoadInst>(I)->isAtomic()) + if (LI->isAtomic()) return false; MVT VT; - if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true)) + if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true)) return false; + const Value *Ptr = LI->getPointerOperand(); + X86AddressMode AM; - if (!X86SelectAddress(I->getOperand(0), AM)) + if (!X86SelectAddress(Ptr, AM)) return false; unsigned ResultReg = 0; - if (X86FastEmitLoad(VT, AM, ResultReg)) { - UpdateValueMap(I, ResultReg); - return true; - } - return false; + if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg)) + return false; + + UpdateValueMap(I, ResultReg); + return true; } static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { @@ -994,73 +1188,89 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { if (!isTypeLegal(I->getOperand(0)->getType(), VT)) return false; - unsigned ResultReg = createResultReg(&X86::GR8RegClass); - unsigned SetCCOpc; - bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0. - switch (CI->getPredicate()) { - case CmpInst::FCMP_OEQ: { - if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT)) + // Try to optimize or fold the cmp. 
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + unsigned ResultReg = 0; + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: { + ResultReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), + ResultReg); + ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, + X86::sub_8bit); + if (!ResultReg) return false; + break; + } + case CmpInst::FCMP_TRUE: { + ResultReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), + ResultReg).addImm(1); + break; + } + } - unsigned EReg = createResultReg(&X86::GR8RegClass); - unsigned NPReg = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETEr), EReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::SETNPr), NPReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg); + if (ResultReg) { UpdateValueMap(I, ResultReg); return true; } - case CmpInst::FCMP_UNE: { - if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT)) + + const Value *LHS = CI->getOperand(0); + const Value *RHS = CI->getOperand(1); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. + // We don't have to materialize a zero constant for this case and can just use + // %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *RHSC = dyn_cast<ConstantFP>(RHS); + if (RHSC && RHSC->isNullValue()) + RHS = LHS; + } + + // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. + static unsigned SETFOpcTable[2][3] = { + { X86::SETEr, X86::SETNPr, X86::AND8rr }, + { X86::SETNEr, X86::SETPr, X86::OR8rr } + }; + unsigned *SETFOpc = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break; + case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break; + } + + ResultReg = createResultReg(&X86::GR8RegClass); + if (SETFOpc) { + if (!X86FastEmitCompare(LHS, RHS, VT)) return false; - unsigned NEReg = createResultReg(&X86::GR8RegClass); - unsigned PReg = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETNEr), NEReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETPr), PReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::OR8rr),ResultReg) - .addReg(PReg).addReg(NEReg); + unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); + unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), + FlagReg1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), + FlagReg2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]), + ResultReg).addReg(FlagReg1).addReg(FlagReg2); UpdateValueMap(I, ResultReg); return true; } - case CmpInst::FCMP_OGT: SwapArgs = false; SetCCOpc = X86::SETAr; break; - case CmpInst::FCMP_OGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break; - case CmpInst::FCMP_OLT: SwapArgs = true; SetCCOpc = X86::SETAr; break; - case CmpInst::FCMP_OLE: SwapArgs = true; SetCCOpc = X86::SETAEr; break; - case CmpInst::FCMP_ONE: SwapArgs = false; SetCCOpc = X86::SETNEr; break; - case CmpInst::FCMP_ORD: SwapArgs = false; SetCCOpc = X86::SETNPr; break; - case CmpInst::FCMP_UNO: SwapArgs = false; SetCCOpc = X86::SETPr; break; - case CmpInst::FCMP_UEQ: SwapArgs = 
false; SetCCOpc = X86::SETEr; break; - case CmpInst::FCMP_UGT: SwapArgs = true; SetCCOpc = X86::SETBr; break; - case CmpInst::FCMP_UGE: SwapArgs = true; SetCCOpc = X86::SETBEr; break; - case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break; - case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break; - - case CmpInst::ICMP_EQ: SwapArgs = false; SetCCOpc = X86::SETEr; break; - case CmpInst::ICMP_NE: SwapArgs = false; SetCCOpc = X86::SETNEr; break; - case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr; break; - case CmpInst::ICMP_UGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break; - case CmpInst::ICMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break; - case CmpInst::ICMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break; - case CmpInst::ICMP_SGT: SwapArgs = false; SetCCOpc = X86::SETGr; break; - case CmpInst::ICMP_SGE: SwapArgs = false; SetCCOpc = X86::SETGEr; break; - case CmpInst::ICMP_SLT: SwapArgs = false; SetCCOpc = X86::SETLr; break; - case CmpInst::ICMP_SLE: SwapArgs = false; SetCCOpc = X86::SETLEr; break; - default: - return false; - } - const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); + X86::CondCode CC; + bool SwapArgs; + std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + unsigned Opc = X86::getSETFromCond(CC); + if (SwapArgs) - std::swap(Op0, Op1); + std::swap(LHS, RHS); - // Emit a compare of Op0/Op1. - if (!X86FastEmitCompare(Op0, Op1, VT)) + // Emit a compare of LHS/RHS. + if (!X86FastEmitCompare(LHS, RHS, VT)) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SetCCOpc), ResultReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); UpdateValueMap(I, ResultReg); return true; } @@ -1126,73 +1336,88 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { // Fold the common case of a conditional branch with a comparison // in the same block (values defined on other blocks may not have // initialized registers). + X86::CondCode CC; if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { if (CI->hasOneUse() && CI->getParent() == I->getParent()) { EVT VT = TLI.getValueType(CI->getOperand(0)->getType()); + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: FastEmitBranch(FalseMBB, DbgLoc); return true; + case CmpInst::FCMP_TRUE: FastEmitBranch(TrueMBB, DbgLoc); return true; + } + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, + // 0.0. + // We don't have to materialize a zero constant for this case and can just + // use %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS); + if (CmpRHSC && CmpRHSC->isNullValue()) + CmpRHS = CmpLHS; + } + // Try to take advantage of fallthrough opportunities. - CmpInst::Predicate Predicate = CI->getPredicate(); if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { std::swap(TrueMBB, FalseMBB); Predicate = CmpInst::getInversePredicate(Predicate); } - bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0. - unsigned BranchOpc; // Opcode to jump on, e.g. "X86::JA" - + // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition + // code check. 
Instead two branch instructions are required to check all
+      // the flags. First we change the predicate to a supported condition code,
+      // which will be the first branch. Later on we will emit the second
+      // branch.
+      bool NeedExtraBranch = false;
       switch (Predicate) {
+      default: break;
       case CmpInst::FCMP_OEQ:
-        std::swap(TrueMBB, FalseMBB);
-        Predicate = CmpInst::FCMP_UNE;
-        // FALL THROUGH
-      case CmpInst::FCMP_UNE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
-      case CmpInst::FCMP_OGT: SwapArgs = false; BranchOpc = X86::JA_4;  break;
-      case CmpInst::FCMP_OGE: SwapArgs = false; BranchOpc = X86::JAE_4; break;
-      case CmpInst::FCMP_OLT: SwapArgs = true;  BranchOpc = X86::JA_4;  break;
-      case CmpInst::FCMP_OLE: SwapArgs = true;  BranchOpc = X86::JAE_4; break;
-      case CmpInst::FCMP_ONE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
-      case CmpInst::FCMP_ORD: SwapArgs = false; BranchOpc = X86::JNP_4; break;
-      case CmpInst::FCMP_UNO: SwapArgs = false; BranchOpc = X86::JP_4;  break;
-      case CmpInst::FCMP_UEQ: SwapArgs = false; BranchOpc = X86::JE_4;  break;
-      case CmpInst::FCMP_UGT: SwapArgs = true;  BranchOpc = X86::JB_4;  break;
-      case CmpInst::FCMP_UGE: SwapArgs = true;  BranchOpc = X86::JBE_4; break;
-      case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4;  break;
-      case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break;
-
-      case CmpInst::ICMP_EQ:  SwapArgs = false; BranchOpc = X86::JE_4;  break;
-      case CmpInst::ICMP_NE:  SwapArgs = false; BranchOpc = X86::JNE_4; break;
-      case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA_4;  break;
-      case CmpInst::ICMP_UGE: SwapArgs = false; BranchOpc = X86::JAE_4; break;
-      case CmpInst::ICMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4;  break;
-      case CmpInst::ICMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break;
-      case CmpInst::ICMP_SGT: SwapArgs = false; BranchOpc = X86::JG_4;  break;
-      case CmpInst::ICMP_SGE: SwapArgs = false; BranchOpc = X86::JGE_4; break;
-      case CmpInst::ICMP_SLT: SwapArgs = false; BranchOpc = X86::JL_4;  break;
-      case CmpInst::ICMP_SLE: SwapArgs = false; BranchOpc = X86::JLE_4; break;
-      default:
-        return false;
+        std::swap(TrueMBB, FalseMBB); // fall-through
+      case CmpInst::FCMP_UNE:
+        NeedExtraBranch = true;
+        Predicate = CmpInst::FCMP_ONE;
+        break;
       }
-      const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+      bool SwapArgs;
+      unsigned BranchOpc;
+      std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+      assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+      BranchOpc = X86::GetCondBranchFromCond(CC);
       if (SwapArgs)
-        std::swap(Op0, Op1);
+        std::swap(CmpLHS, CmpRHS);

       // Emit a compare of the LHS and RHS, setting the flags.
-      if (!X86FastEmitCompare(Op0, Op1, VT))
+      if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT))
         return false;

       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
         .addMBB(TrueMBB);

-      if (Predicate == CmpInst::FCMP_UNE) {
-        // X86 requires a second branch to handle UNE (and OEQ,
-        // which is mapped to UNE above).
+      // X86 requires a second branch to handle UNE (and OEQ, which is mapped
+      // to UNE above).
+      if (NeedExtraBranch) {
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_4))
           .addMBB(TrueMBB);
       }

+      // Obtain the branch weight and add the TrueBB to the successor list.
+      uint32_t BranchWeight = 0;
+      if (FuncInfo.BPI)
+        BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+                                                   TrueMBB->getBasicBlock());
+      FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
+
+      // Emits an unconditional branch to the FalseBB, obtains the branch
+      // weight, and adds it to the successor list.
       FastEmitBranch(FalseMBB, DbgLoc);
-      FuncInfo.MBB->addSuccessor(TrueMBB);
+
       return true;
     }
   } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
@@ -1224,10 +1449,32 @@
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
           .addMBB(TrueMBB);
         FastEmitBranch(FalseMBB, DbgLoc);
-        FuncInfo.MBB->addSuccessor(TrueMBB);
+        uint32_t BranchWeight = 0;
+        if (FuncInfo.BPI)
+          BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+                                                     TrueMBB->getBasicBlock());
+        FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
         return true;
       }
     }
+  } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
+    // Fake request the condition, otherwise the intrinsic might be completely
+    // optimized away.
+    unsigned TmpReg = getRegForValue(BI->getCondition());
+    if (TmpReg == 0)
+      return false;
+
+    unsigned BranchOpc = X86::GetCondBranchFromCond(CC);
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
+      .addMBB(TrueMBB);
+    FastEmitBranch(FalseMBB, DbgLoc);
+    uint32_t BranchWeight = 0;
+    if (FuncInfo.BPI)
+      BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+                                                 TrueMBB->getBasicBlock());
+    FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
+    return true;
   }

   // Otherwise do a clumsy setcc and re-test it.
@@ -1241,7 +1488,11 @@
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_4))
     .addMBB(TrueMBB);
   FastEmitBranch(FalseMBB, DbgLoc);
-  FuncInfo.MBB->addSuccessor(TrueMBB);
+  uint32_t BranchWeight = 0;
+  if (FuncInfo.BPI)
+    BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+                                               TrueMBB->getBasicBlock());
+  FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
   return true;
 }

@@ -1478,50 +1729,319 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
   return true;
 }

-bool X86FastISel::X86SelectSelect(const Instruction *I) {
-  MVT VT;
-  if (!isTypeLegal(I->getType(), VT))
+/// \brief Emit a conditional move instruction (if they are supported) to lower
+/// the select.
+bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
+  // Check if the subtarget supports these instructions.
+  if (!Subtarget->hasCMov())
     return false;

-  // We only use cmov here, if we don't have a cmov instruction bail.
-  if (!Subtarget->hasCMov()) return false;
+  // FIXME: Add support for i8.
+  if (RetVT < MVT::i16 || RetVT > MVT::i64)
+    return false;

-  unsigned Opc = 0;
-  const TargetRegisterClass *RC = nullptr;
-  if (VT == MVT::i16) {
-    Opc = X86::CMOVE16rr;
-    RC = &X86::GR16RegClass;
-  } else if (VT == MVT::i32) {
-    Opc = X86::CMOVE32rr;
-    RC = &X86::GR32RegClass;
-  } else if (VT == MVT::i64) {
-    Opc = X86::CMOVE64rr;
-    RC = &X86::GR64RegClass;
-  } else {
+  const Value *Cond = I->getOperand(0);
+  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+  bool NeedTest = true;
+  X86::CondCode CC = X86::COND_NE;
+
+  // Optimize conditions coming from a compare if both instructions are in the
+  // same basic block (values defined in other basic blocks may not have
+  // initialized registers).
+ const auto *CI = dyn_cast<CmpInst>(Cond); + if (CI && (CI->getParent() == I->getParent())) { + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. + static unsigned SETFOpcTable[2][3] = { + { X86::SETNPr, X86::SETEr , X86::TEST8rr }, + { X86::SETPr, X86::SETNEr, X86::OR8rr } + }; + unsigned *SETFOpc = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: + SETFOpc = &SETFOpcTable[0][0]; + Predicate = CmpInst::ICMP_NE; + break; + case CmpInst::FCMP_UNE: + SETFOpc = &SETFOpcTable[1][0]; + Predicate = CmpInst::ICMP_NE; + break; + } + + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + EVT CmpVT = TLI.getValueType(CmpLHS->getType()); + // Emit a compare of the LHS and RHS, setting the flags. + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) + return false; + + if (SETFOpc) { + unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); + unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), + FlagReg1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), + FlagReg2); + auto const &II = TII.get(SETFOpc[2]); + if (II.getNumDefs()) { + unsigned TmpReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg) + .addReg(FlagReg2).addReg(FlagReg1); + } else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(FlagReg2).addReg(FlagReg1); + } + } + NeedTest = false; + } else if (foldX86XALUIntrinsic(CC, I, Cond)) { + // Fake-request the condition; otherwise the intrinsic might be completely + // optimized away. + unsigned TmpReg = getRegForValue(Cond); + if (TmpReg == 0) + return false; + + NeedTest = false; + } + + if (NeedTest) { + // Selects operate on i1; however, CondReg is 8 bits wide and may contain + // garbage. Indeed, only the least significant bit is supposed to be + // accurate. If we read more than the lsb, we may see non-zero values + // whereas the lsb is zero. Therefore, we have to truncate Op0Reg to i1 for + // the select. This is achieved by performing TEST against 1. + unsigned CondReg = getRegForValue(Cond); + if (CondReg == 0) + return false; + bool CondIsKill = hasTrivialKill(Cond); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + if (!LHSReg || !RHSReg) + return false; + + unsigned Opc = X86::getCMovFromCond(CC, RC->getSize()); + unsigned ResultReg = FastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, + LHSReg, LHSIsKill); + UpdateValueMap(I, ResultReg); + return true; +} + +/// \brief Emit SSE instructions to lower the select. +/// +/// Try to use SSE1/SSE2 instructions to simulate a select without branches. +/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary +/// SSE instructions are available.
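The CMP/AND/ANDN/OR sequence just described can be written down directly with SSE intrinsics; a minimal sketch using the standard <xmmintrin.h> intrinsics, not the FastISel code path itself:

#include <cstdio>
#include <xmmintrin.h>

// Branchless float select: mask is all-ones when a < b, all-zeros otherwise;
// result = (mask & t) | (~mask & f), mirroring CMPSS/ANDPS/ANDNPS/ORPS.
static float select_lt(float a, float b, float t, float f) {
  __m128 mask = _mm_cmplt_ss(_mm_set_ss(a), _mm_set_ss(b));
  __m128 res = _mm_or_ps(_mm_and_ps(mask, _mm_set_ss(t)),
                         _mm_andnot_ps(mask, _mm_set_ss(f)));
  return _mm_cvtss_f32(res);
}

int main() {
  // Prints "10 20": the first compare is true, the second false.
  std::printf("%g %g\n", select_lt(1, 2, 10, 20), select_lt(3, 2, 10, 20));
}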
+bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). + const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0)); + if (!CI || (CI->getParent() != I->getParent())) return false; + + if (I->getType() != CI->getOperand(0)->getType() || + !((Subtarget->hasSSE1() && RetVT == MVT::f32) || + (Subtarget->hasSSE2() && RetVT == MVT::f64) )) + return false; + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. + // We don't have to materialize a zero constant for this case and can just use + // %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS); + if (CmpRHSC && CmpRHSC->isNullValue()) + CmpRHS = CmpLHS; } - unsigned Op0Reg = getRegForValue(I->getOperand(0)); - if (Op0Reg == 0) return false; - unsigned Op1Reg = getRegForValue(I->getOperand(1)); - if (Op1Reg == 0) return false; - unsigned Op2Reg = getRegForValue(I->getOperand(2)); - if (Op2Reg == 0) return false; - - // Selects operate on i1, however, Op0Reg is 8 bits width and may contain - // garbage. Indeed, only the less significant bit is supposed to be accurate. - // If we read more than the lsb, we may see non-zero values whereas lsb - // is zero. Therefore, we have to truncate Op0Reg to i1 for the select. - // This is achieved by performing TEST against 1. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) - .addReg(Op0Reg).addImm(1); - unsigned ResultReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(Op1Reg).addReg(Op2Reg); + unsigned CC; + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate); + if (CC > 7) + return false; + + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + static unsigned OpcTable[2][2][4] = { + { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, + { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } }, + { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr }, + { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } } + }; + + bool HasAVX = Subtarget->hasAVX(); + unsigned *Opc = nullptr; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break; + case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break; + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned CmpLHSReg = getRegForValue(CmpLHS); + bool CmpLHSIsKill = hasTrivialKill(CmpLHS); + + unsigned CmpRHSReg = getRegForValue(CmpRHS); + bool CmpRHSIsKill = hasTrivialKill(CmpRHS); + + if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg) + return false; + + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + unsigned CmpReg = FastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + unsigned AndReg = FastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, + LHSReg, LHSIsKill); + unsigned AndNReg = FastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, + RHSReg, RHSIsKill); +
unsigned ResultReg = FastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, + AndReg, /*IsKill=*/true); + UpdateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { + // These are pseudo CMOV instructions and will be later expanded into control- + // flow. + unsigned Opc; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::i8: Opc = X86::CMOV_GR8; break; + case MVT::i16: Opc = X86::CMOV_GR16; break; + case MVT::i32: Opc = X86::CMOV_GR32; break; + case MVT::f32: Opc = X86::CMOV_FR32; break; + case MVT::f64: Opc = X86::CMOV_FR64; break; + } + + const Value *Cond = I->getOperand(0); + X86::CondCode CC = X86::COND_NE; + + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). + const auto *CI = dyn_cast<CmpInst>(Cond); + if (CI && (CI->getParent() == I->getParent())) { + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate()); + if (CC > X86::LAST_VALID_COND) + return false; + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + EVT CmpVT = TLI.getValueType(CmpLHS->getType()); + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) + return false; + } else { + unsigned CondReg = getRegForValue(Cond); + if (CondReg == 0) + return false; + bool CondIsKill = hasTrivialKill(Cond); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + if (!LHSReg || !RHSReg) + return false; + + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + + unsigned ResultReg = + FastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); UpdateValueMap(I, ResultReg); return true; } +bool X86FastISel::X86SelectSelect(const Instruction *I) { + MVT RetVT; + if (!isTypeLegal(I->getType(), RetVT)) + return false; + + // Check if we can fold the select. + if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) { + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + const Value *Opnd = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break; + case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break; + } + // No need for a select anymore - this is an unconditional move. + if (Opnd) { + unsigned OpReg = getRegForValue(Opnd); + if (OpReg == 0) + return false; + bool OpIsKill = hasTrivialKill(Opnd); + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(OpReg, getKillRegState(OpIsKill)); + UpdateValueMap(I, ResultReg); + return true; + } + } + + // First try to use real conditional move instructions. + if (X86FastEmitCMoveSelect(RetVT, I)) + return true; + + // Try to use a sequence of SSE instructions to simulate a conditional move. + if (X86FastEmitSSESelect(RetVT, I)) + return true; + + // Fall-back to pseudo conditional move instructions, which will be later + // converted to control-flow. 
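The pseudo CMOVs chosen by the fallback above are expanded after instruction selection into an ordinary compare-and-branch diamond; behaviorally they compute nothing more than the following C++ sketch, an illustration of the semantics rather than the expansion code itself:

// What a pseudo CMOV computes once expanded into control flow: test the
// condition's low bit, branch, and copy one of the two incoming values.
template <typename T>
static T pseudoCMov(bool cond, T trueVal, T falseVal) {
  if (cond)           // TEST8ri $1, %cond ; JNE over the false path
    return trueVal;   // copy from the true-value register
  return falseVal;    // copy from the false-value register
}

int main() { return pseudoCMov(false, 1, 0); }  // returns 0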
+ if (X86FastEmitPseudoSelect(RetVT, I)) + return true; + + return false; +} + bool X86FastISel::X86SelectFPExt(const Instruction *I) { // fpext from float to double. if (X86ScalarSSEf64 && @@ -1633,8 +2153,8 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, } unsigned Reg; - bool RV = X86FastEmitLoad(VT, SrcAM, Reg); - RV &= X86FastEmitStore(VT, Reg, DestAM); + bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg); + RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM); assert(RV && "Failed to emit load or store??"); unsigned Size = VT.getSizeInBits()/8; @@ -1646,10 +2166,74 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, return true; } +static bool isCommutativeIntrinsic(IntrinsicInst const &I) { + switch (I.getIntrinsicID()) { + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + return true; + default: + return false; + } +} + bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // FIXME: Handle more intrinsics. switch (I.getIntrinsicID()) { default: return false; + case Intrinsic::frameaddress: { + Type *RetTy = I.getCalledFunction()->getReturnType(); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + unsigned Opc; + const TargetRegisterClass *RC = nullptr; + + switch (VT.SimpleTy) { + default: llvm_unreachable("Invalid result type for frameaddress."); + case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break; + case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break; + } + + // This needs to be set before we call getFrameRegister, otherwise we get + // the wrong frame register. + MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); + unsigned FrameReg = RegInfo->getFrameRegister(*(FuncInfo.MF)); + assert(((FrameReg == X86::RBP && VT == MVT::i64) || + (FrameReg == X86::EBP && VT == MVT::i32)) && + "Invalid Frame Register!"); + + // Always make a copy of the frame register to a vreg first, so that we + // never directly reference the frame register (the TwoAddressInstruction- + // Pass doesn't like that). + unsigned SrcReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg); + + // Now recursively load from the frame address. + // movq (%rbp), %rax + // movq (%rax), %rax + // movq (%rax), %rax + // ... + unsigned DestReg; + unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue(); + while (Depth--) { + DestReg = createResultReg(RC); + addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), DestReg), SrcReg); + SrcReg = DestReg; + } + + UpdateValueMap(&I, SrcReg); + return true; + } case Intrinsic::memcpy: { const MemCpyInst &MCI = cast<MemCpyInst>(I); // Don't handle volatile or variable length memcpys. @@ -1726,52 +2310,233 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP)); return true; } - case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: { - // FIXME: Should fold immediates. + case Intrinsic::sqrt: { + if (!Subtarget->hasSSE1()) + return false; - // Replace "add with overflow" intrinsics with an "add" instruction followed - // by a seto/setc instruction.
- const Function *Callee = I.getCalledFunction(); - Type *RetTy = - cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0)); + Type *RetTy = I.getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; - const Value *Op1 = I.getArgOperand(0); - const Value *Op2 = I.getArgOperand(1); - unsigned Reg1 = getRegForValue(Op1); - unsigned Reg2 = getRegForValue(Op2); + // Unfortunately we can't use FastEmit_r, because the AVX version of FSQRT + // is not generated by FastISel yet. + // FIXME: Update this code once tablegen can handle it. + static const unsigned SqrtOpc[2][2] = { + {X86::SQRTSSr, X86::VSQRTSSr}, + {X86::SQRTSDr, X86::VSQRTSDr} + }; + bool HasAVX = Subtarget->hasAVX(); + unsigned Opc; + const TargetRegisterClass *RC; + switch (VT.SimpleTy) { + default: return false; + case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break; + case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break; + } + + const Value *SrcVal = I.getArgOperand(0); + unsigned SrcReg = getRegForValue(SrcVal); - if (Reg1 == 0 || Reg2 == 0) - // FIXME: Handle values *not* in registers. + if (SrcReg == 0) return false; - unsigned OpC = 0; - if (VT == MVT::i32) - OpC = X86::ADD32rr; - else if (VT == MVT::i64) - OpC = X86::ADD64rr; - else + unsigned ImplicitDefReg = 0; + if (HasAVX) { + ImplicitDefReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + } + + unsigned ResultReg = createResultReg(RC); + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), + ResultReg); + + if (ImplicitDefReg) + MIB.addReg(ImplicitDefReg); + + MIB.addReg(SrcReg); + + UpdateValueMap(&I, ResultReg); + return true; + } + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: { + // This implements the basic lowering of the xalu with overflow intrinsics + // into add/sub/mul followed by either seto or setb. + const Function *Callee = I.getCalledFunction(); + auto *Ty = cast<StructType>(Callee->getReturnType()); + Type *RetTy = Ty->getTypeAtIndex(0U); + Type *CondTy = Ty->getTypeAtIndex(1); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + if (VT < MVT::i8 || VT > MVT::i64) + return false; + + const Value *LHS = I.getArgOperand(0); + const Value *RHS = I.getArgOperand(1); + + // Canonicalize immediate to the RHS. 
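The add/sub/mul-plus-seto/setb scheme described above has a direct source-level analogue in the GCC/Clang overflow builtins, which lower to exactly this kind of flag-reading sequence; a minimal runnable check:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t u;
  bool uo = __builtin_add_overflow(0xffffffffu, 1u, &u);  // ADD32 + SETB (CF)
  int32_t s;
  bool so = __builtin_add_overflow(INT32_MAX, 1, &s);     // ADD32 + SETO (OF)
  // Prints "0 1 -2147483648 1": unsigned wrap sets CF, signed wrap sets OF.
  std::printf("%u %d %d %d\n", u, uo, s, so);
}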
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && + isCommutativeIntrinsic(I)) + std::swap(LHS, RHS); + + unsigned BaseOpc, CondOpc; + switch (I.getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::sadd_with_overflow: + BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break; + case Intrinsic::uadd_with_overflow: + BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break; + case Intrinsic::ssub_with_overflow: + BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break; + case Intrinsic::usub_with_overflow: + BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break; + case Intrinsic::smul_with_overflow: + BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break; + case Intrinsic::umul_with_overflow: + BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break; + } + + unsigned LHSReg = getRegForValue(LHS); + if (LHSReg == 0) return false; + bool LHSIsKill = hasTrivialKill(LHS); - // The call to CreateRegs builds two sequential registers, to store the - // both the returned values. - unsigned ResultReg = FuncInfo.CreateRegs(I.getType()); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpC), ResultReg) - .addReg(Reg1).addReg(Reg2); + unsigned ResultReg = 0; + // Check if we have an immediate version. + if (auto const *C = dyn_cast<ConstantInt>(RHS)) { + ResultReg = FastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, + C->getZExtValue()); + } - unsigned Opc = X86::SETBr; - if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow) - Opc = X86::SETOr; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), - ResultReg + 1); + unsigned RHSReg; + bool RHSIsKill; + if (!ResultReg) { + RHSReg = getRegForValue(RHS); + if (RHSReg == 0) + return false; + RHSIsKill = hasTrivialKill(RHS); + ResultReg = FastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg, + RHSIsKill); + } + + // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit + // it manually. + if (BaseOpc == X86ISD::UMUL && !ResultReg) { + static const unsigned MULOpc[] = + { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r }; + static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX }; + // First copy the first operand into RAX, which is an implicit input to + // the X86::MUL*r instruction. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8]) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + ResultReg = FastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], + TLI.getRegClassFor(VT), RHSReg, RHSIsKill); + } else if (BaseOpc == X86ISD::SMUL && !ResultReg) { + static const unsigned MULOpc[] = + { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr }; + if (VT == MVT::i8) { + // Copy the first operand into AL, which is an implicit input to the + // X86::IMUL8r instruction. 
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), X86::AL) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + ResultReg = FastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg, + RHSIsKill); + } else + ResultReg = FastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8], + TLI.getRegClassFor(VT), LHSReg, LHSIsKill, + RHSReg, RHSIsKill); + } + + if (!ResultReg) + return false; + + unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy); + assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), + ResultReg2); UpdateValueMap(&I, ResultReg, 2); return true; } + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: { + bool IsInputDouble; + switch (I.getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic."); + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + if (!Subtarget->hasSSE1()) + return false; + IsInputDouble = false; + break; + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: + if (!Subtarget->hasSSE2()) + return false; + IsInputDouble = true; + break; + } + + Type *RetTy = I.getCalledFunction()->getReturnType(); + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + static const unsigned CvtOpc[2][2][2] = { + { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr }, + { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } }, + { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr }, + { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } } + }; + bool HasAVX = Subtarget->hasAVX(); + unsigned Opc; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected result type."); + case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break; + case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break; + } + + // Check if we can fold insertelement instructions into the convert. + const Value *Op = I.getArgOperand(0); + while (auto *IE = dyn_cast<InsertElementInst>(Op)) { + const Value *Index = IE->getOperand(2); + if (!isa<ConstantInt>(Index)) + break; + unsigned Idx = cast<ConstantInt>(Index)->getZExtValue(); + + if (Idx == 0) { + Op = IE->getOperand(1); + break; + } + Op = IE->getOperand(0); + } + + unsigned Reg = getRegForValue(Op); + if (Reg == 0) + return false; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(Reg); + + UpdateValueMap(&I, ResultReg); + return true; + } } } @@ -1794,31 +2559,43 @@ bool X86FastISel::FastLowerArguments() { return false; // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments. - unsigned Idx = 1; - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I, ++Idx) { - if (Idx > 6) - return false; - + unsigned GPRCnt = 0; + unsigned FPRCnt = 0; + unsigned Idx = 0; + for (auto const &Arg : F->args()) { + // The first argument is at index 1. 
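For reference, the CVTTSS2SI/CVTTSD2SI instructions selected above truncate toward zero, which is also the semantics of a C++ floating-point-to-integer cast; a one-line sanity check:

#include <cstdio>

int main() {
  // Truncation toward zero, as performed by cvttss2si / cvttsd2si:
  std::printf("%d %d %lld\n", (int)1.9f, (int)-1.9f, (long long)-2.9); // 1 -1 -2
}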
+ ++Idx; if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || F->getAttributes().hasAttribute(Idx, Attribute::InReg) || F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || F->getAttributes().hasAttribute(Idx, Attribute::Nest)) return false; - Type *ArgTy = I->getType(); + Type *ArgTy = Arg.getType(); if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) return false; EVT ArgVT = TLI.getValueType(ArgTy); if (!ArgVT.isSimple()) return false; switch (ArgVT.getSimpleVT().SimpleTy) { + default: return false; case MVT::i32: case MVT::i64: + ++GPRCnt; + break; + case MVT::f32: + case MVT::f64: + if (!Subtarget->hasSSE1()) + return false; + ++FPRCnt; break; - default: - return false; } + + if (GPRCnt > 6) + return false; + + if (FPRCnt > 8) + return false; } static const MCPhysReg GPR32ArgRegs[] = { @@ -1827,24 +2604,33 @@ bool X86FastISel::FastLowerArguments() { static const MCPhysReg GPR64ArgRegs[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9 }; + static const MCPhysReg XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; - Idx = 0; - const TargetRegisterClass *RC32 = TLI.getRegClassFor(MVT::i32); - const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64); - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I, ++Idx) { - bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32; - const TargetRegisterClass *RC = is32Bit ? RC32 : RC64; - unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx]; + unsigned GPRIdx = 0; + unsigned FPRIdx = 0; + for (auto const &Arg : F->args()) { + MVT VT = TLI.getSimpleValueType(Arg.getType()); + const TargetRegisterClass *RC = TLI.getRegClassFor(VT); + unsigned SrcReg; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type."); + case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break; + case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break; + case MVT::f32: // fall-through + case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break; + } unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. // Without this, EmitLiveInCopies may eliminate the livein if its only // use is a bitcast (which isn't turned into an instruction). unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), - ResultReg).addReg(DstReg, getKillRegState(true)); - UpdateValueMap(I, ResultReg); + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(DstReg, getKillRegState(true)); + UpdateValueMap(&Arg, ResultReg); } return true; } @@ -2147,7 +2933,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (!X86FastEmitStore(ArgVT, ArgVal, AM)) return false; } else { - if (!X86FastEmitStore(ArgVT, Arg, AM)) + if (!X86FastEmitStore(ArgVT, Arg, /*ValIsKill=*/false, AM)) return false; } } @@ -2430,7 +3216,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { return 0; } - // Materialize addresses with LEA instructions. + // Materialize addresses with LEA/MOV instructions. if (isa<GlobalValue>(C)) { X86AddressMode AM; if (X86SelectAddress(C, AM)) { @@ -2440,10 +3226,19 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) return AM.Base.Reg; - Opc = TLI.getPointerTy() == MVT::i32 ? 
X86::LEA32r : X86::LEA64r; unsigned ResultReg = createResultReg(RC); - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + if (TM.getRelocationModel() == Reloc::Static && + TLI.getPointerTy() == MVT::i64) { + // The displacement could be more than 32 bits away so we need to use + // an instruction with a 64 bit immediate + Opc = X86::MOV64ri; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg).addGlobalAddress(cast<GlobalValue>(C)); + } else { + Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); + } return ResultReg; } return 0; @@ -2544,8 +3339,9 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, const LoadInst *LI) { + const Value *Ptr = LI->getPointerOperand(); X86AddressMode AM; - if (!X86SelectAddress(LI->getOperand(0), AM)) + if (!X86SelectAddress(Ptr, AM)) return false; const X86InstrInfo &XII = (const X86InstrInfo&)TII; @@ -2553,13 +3349,18 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, unsigned Size = DL.getTypeAllocSize(LI->getType()); unsigned Alignment = LI->getAlignment(); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = DL.getABITypeAlignment(LI->getType()); + SmallVector<MachineOperand, 8> AddrOps; AM.getFullAddress(AddrOps); MachineInstr *Result = XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment); - if (!Result) return false; + if (!Result) + return false; + Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); FuncInfo.MBB->insert(FuncInfo.InsertPt, Result); MI->eraseFromParent(); return true; diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 6c5b86f61969..4be766a19f96 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -32,86 +32,89 @@ using namespace llvm; STATISTIC(NumLEAs, "Number of LEA instructions created"); namespace { - class FixupLEAPass : public MachineFunctionPass { - enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; - static char ID; - /// \brief Loop over all of the instructions in the basic block - /// replacing applicable instructions with LEA instructions, - /// where appropriate. - bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); +class FixupLEAPass : public MachineFunctionPass { + enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; + static char ID; + /// \brief Loop over all of the instructions in the basic block + /// replacing applicable instructions with LEA instructions, + /// where appropriate. + bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); - const char *getPassName() const override { return "X86 Atom LEA Fixup";} + const char *getPassName() const override { return "X86 Atom LEA Fixup"; } - /// \brief Given a machine register, look for the instruction - /// which writes it in the current basic block. If found, - /// try to replace it with an equivalent LEA instruction. - /// If replacement succeeds, then also process the the newly created - /// instruction. - void seekLEAFixup(MachineOperand& p, MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI); + /// \brief Given a machine register, look for the instruction + /// which writes it in the current basic block. If found, + /// try to replace it with an equivalent LEA instruction.
+ /// If replacement succeeds, then also process the newly created + /// instruction. + void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); - /// \brief Given a memory access or LEA instruction - /// whose address mode uses a base and/or index register, look for - /// an opportunity to replace the instruction which sets the base or index - /// register with an equivalent LEA instruction. - void processInstruction(MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI); + /// \brief Given a memory access or LEA instruction + /// whose address mode uses a base and/or index register, look for + /// an opportunity to replace the instruction which sets the base or index + /// register with an equivalent LEA instruction. + void processInstruction(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); - /// \brief Given a LEA instruction which is unprofitable - /// on Silvermont try to replace it with an equivalent ADD instruction - void processInstructionForSLM(MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI); + /// \brief Given a LEA instruction which is unprofitable + /// on Silvermont try to replace it with an equivalent ADD instruction + void processInstructionForSLM(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); - /// \brief Determine if an instruction references a machine register - /// and, if so, whether it reads or writes the register. - RegUsageState usesRegister(MachineOperand& p, - MachineBasicBlock::iterator I); + /// \brief Determine if an instruction references a machine register + /// and, if so, whether it reads or writes the register. + RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I); - /// \brief Step backwards through a basic block, looking - /// for an instruction which writes a register within - /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles. - MachineBasicBlock::iterator searchBackwards(MachineOperand& p, - MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI); + /// \brief Step backwards through a basic block, looking + /// for an instruction which writes a register within + /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles. + MachineBasicBlock::iterator searchBackwards(MachineOperand &p, + MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); - /// \brief if an instruction can be converted to an - /// equivalent LEA, insert the new instruction into the basic block - /// and return a pointer to it. Otherwise, return zero. - MachineInstr* postRAConvertToLEA(MachineFunction::iterator &MFI, - MachineBasicBlock::iterator &MBBI) const; + /// \brief If an instruction can be converted to an + /// equivalent LEA, insert the new instruction into the basic block + /// and return a pointer to it. Otherwise, return zero. + MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI) const; - public: - FixupLEAPass() : MachineFunctionPass(ID) {} +public: + FixupLEAPass() : MachineFunctionPass(ID) {} - /// \brief Loop over all of the basic blocks, - /// replacing instructions by equivalent LEA instructions - /// if needed and when possible. - bool runOnMachineFunction(MachineFunction &MF) override; + /// \brief Loop over all of the basic blocks, + /// replacing instructions by equivalent LEA instructions + /// if needed and when possible.
+ bool runOnMachineFunction(MachineFunction &MF) override; - private: - MachineFunction *MF; - const TargetMachine *TM; - const X86InstrInfo *TII; // Machine instruction info. - - }; - char FixupLEAPass::ID = 0; +private: + MachineFunction *MF; + const TargetMachine *TM; + const X86InstrInfo *TII; // Machine instruction info. +}; +char FixupLEAPass::ID = 0; } MachineInstr * FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI) const { - MachineInstr* MI = MBBI; - MachineInstr* NewMI; + MachineInstr *MI = MBBI; + MachineInstr *NewMI; switch (MI->getOpcode()) { case X86::MOV32rr: case X86::MOV64rr: { - const MachineOperand& Src = MI->getOperand(1); - const MachineOperand& Dest = MI->getOperand(0); + const MachineOperand &Src = MI->getOperand(1); + const MachineOperand &Dest = MI->getOperand(0); NewMI = BuildMI(*MF, MI->getDebugLoc(), - TII->get( MI->getOpcode() == X86::MOV32rr ? X86::LEA32r : X86::LEA64r)) - .addOperand(Dest) - .addOperand(Src).addImm(1).addReg(0).addImm(0).addReg(0); - MFI->insert(MBBI, NewMI); // Insert the new inst + TII->get(MI->getOpcode() == X86::MOV32rr ? X86::LEA32r + : X86::LEA64r)) + .addOperand(Dest) + .addOperand(Src) + .addImm(1) + .addReg(0) + .addImm(0) + .addReg(0); + MFI->insert(MBBI, NewMI); // Insert the new inst return NewMI; } case X86::ADD64ri32: @@ -144,17 +147,16 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, return TII->convertToThreeAddress(MFI, MBBI, nullptr); } -FunctionPass *llvm::createX86FixupLEAs() { - return new FixupLEAPass(); -} +FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); } bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { + MF = &Func; TM = &Func.getTarget(); const X86Subtarget &ST = TM->getSubtarget<X86Subtarget>(); if (!ST.LEAusesAG() && !ST.slowLEA()) return false; - TII = static_cast<const X86InstrInfo*>(TM->getInstrInfo()); + TII = static_cast<const X86InstrInfo *>(TM->getInstrInfo()); DEBUG(dbgs() << "Start X86FixupLEAs\n";); // Process all basic blocks. @@ -165,14 +167,14 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { return true; } -FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p, - MachineBasicBlock::iterator I) { +FixupLEAPass::RegUsageState +FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { RegUsageState RegUsage = RU_NotUsed; - MachineInstr* MI = I; + MachineInstr *MI = I; for (unsigned int i = 0; i < MI->getNumOperands(); ++i) { - MachineOperand& opnd = MI->getOperand(i); - if (opnd.isReg() && opnd.getReg() == p.getReg()){ + MachineOperand &opnd = MI->getOperand(i); + if (opnd.isReg() && opnd.getReg() == p.getReg()) { if (opnd.isDef()) return RU_Write; RegUsage = RU_Read; @@ -185,23 +187,22 @@ FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p, /// block, return a reference to the previous instruction in the block, /// wrapping around to the last instruction of the block if the block /// branches to itself. 
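searchBackwards, declared above, walks backwards from the use, charging each visited instruction its scheduling latency and giving up once the running total exceeds INSTR_DISTANCE_THRESHOLD (the real code also stops at calls and inline asm, omitted here). A toy version of that scan over a flat instruction list, with a hypothetical Instr type rather than the LLVM API:

#include <cstdio>
#include <vector>

struct Instr { int latency; bool writesTargetReg; };

// Returns the index of the closest earlier writer within the latency budget,
// or -1 if none is close enough to be worth fixing up.
static int searchBackwardsToy(const std::vector<Instr> &bb, int from,
                              int threshold = 5) {
  int distance = 1;
  for (int i = from - 1; i >= 0; --i) {
    if (distance > threshold)
      break;                        // too far back to make a difference
    if (bb[i].writesTargetReg)
      return i;                     // found the defining instruction in budget
    distance += bb[i].latency;
  }
  return -1;
}

int main() {
  std::vector<Instr> bb = {{1, true}, {3, false}, {3, false}, {1, false}};
  std::printf("%d\n", searchBackwardsToy(bb, 4)); // -1: the writer is too far back
}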
-static inline bool getPreviousInstr(MachineBasicBlock::iterator& I, +static inline bool getPreviousInstr(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { if (I == MFI->begin()) { if (MFI->isPredecessor(MFI)) { I = --MFI->end(); return true; - } - else + } else return false; } --I; return true; } -MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, - MachineBasicBlock::iterator& I, - MachineFunction::iterator MFI) { +MachineBasicBlock::iterator +FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { int InstrDistance = 1; MachineBasicBlock::iterator CurInst; static const int INSTR_DISTANCE_THRESHOLD = 5; @@ -209,12 +210,12 @@ MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, CurInst = I; bool Found; Found = getPreviousInstr(CurInst, MFI); - while( Found && I != CurInst) { + while (Found && I != CurInst) { if (CurInst->isCall() || CurInst->isInlineAsm()) break; if (InstrDistance > INSTR_DISTANCE_THRESHOLD) break; // too far back to make a difference - if (usesRegister(p, CurInst) == RU_Write){ + if (usesRegister(p, CurInst) == RU_Write) { return CurInst; } InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst); @@ -223,32 +224,32 @@ MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, return nullptr; } -void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I, +void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { // Process a load, store, or LEA instruction. MachineInstr *MI = I; int opcode = MI->getOpcode(); - const MCInstrDesc& Desc = MI->getDesc(); + const MCInstrDesc &Desc = MI->getDesc(); int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode); if (AddrOffset >= 0) { AddrOffset += X86II::getOperandBias(Desc); - MachineOperand& p = MI->getOperand(AddrOffset + X86::AddrBaseReg); + MachineOperand &p = MI->getOperand(AddrOffset + X86::AddrBaseReg); if (p.isReg() && p.getReg() != X86::ESP) { seekLEAFixup(p, I, MFI); } - MachineOperand& q = MI->getOperand(AddrOffset + X86::AddrIndexReg); + MachineOperand &q = MI->getOperand(AddrOffset + X86::AddrIndexReg); if (q.isReg() && q.getReg() != X86::ESP) { seekLEAFixup(q, I, MFI); } } } -void FixupLEAPass::seekLEAFixup(MachineOperand& p, - MachineBasicBlock::iterator& I, +void FixupLEAPass::seekLEAFixup(MachineOperand &p, + MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI); if (MBI) { - MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI); + MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI); if (NewMI) { ++NumLEAs; DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump();); @@ -256,7 +257,7 @@ void FixupLEAPass::seekLEAFixup(MachineOperand& p, DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump();); MFI->erase(MBI); MachineBasicBlock::iterator J = - static_cast<MachineBasicBlock::iterator> (NewMI); + static_cast<MachineBasicBlock::iterator>(NewMI); processInstruction(J, MFI); } } @@ -299,7 +300,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, } DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump();); DEBUG(dbgs() << "FixLEA: Replaced by: ";); - MachineInstr *NewMI = 0; + MachineInstr *NewMI = nullptr; const MachineOperand &Dst = MI->getOperand(0); // Make ADD instruction for two registers writing to LEA's destination if (SrcR1 != 0 && SrcR2 != 0) { diff --git 
a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 4c1374f70f42..8c029a8c22d5 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -29,6 +29,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Support/Debug.h" using namespace llvm; @@ -45,7 +46,7 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineModuleInfo &MMI = MF.getMMI(); - const TargetRegisterInfo *RegInfo = TM.getRegisterInfo(); + const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || RegInfo->needsStackRealignment(MF) || @@ -305,65 +306,25 @@ static bool isEAXLiveIn(MachineFunction &MF) { return false; } -void X86FrameLowering::emitCalleeSavedFrameMoves( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, - unsigned FramePtr) const { +void +X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); // Add callee saved registers to move list. const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); if (CSI.empty()) return; - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); - bool HasFP = hasFP(MF); - - // Calculate amount of bytes used for return address storing. - int stackGrowth = -RegInfo->getSlotSize(); - - // FIXME: This is dirty hack. The code itself is pretty mess right now. - // It should be rewritten from scratch and generalized sometimes. - - // Determine maximum offset (minimum due to stack growth). - int64_t MaxOffset = 0; - for (std::vector<CalleeSavedInfo>::const_iterator - I = CSI.begin(), E = CSI.end(); I != E; ++I) - MaxOffset = std::min(MaxOffset, - MFI->getObjectOffset(I->getFrameIdx())); - // Calculate offsets. - int64_t saveAreaOffset = (HasFP ? 3 : 2) * stackGrowth; for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) { int64_t Offset = MFI->getObjectOffset(I->getFrameIdx()); unsigned Reg = I->getReg(); - Offset = MaxOffset - Offset + saveAreaOffset; - - // Don't output a new machine move if we're re-saving the frame - // pointer. This happens when the PrologEpilogInserter has inserted an extra - // "PUSH" of the frame pointer -- the "emitPrologue" method automatically - // generates one when frame pointers are used. If we generate a "machine - // move" for this extra "PUSH", the linker will lose track of the fact that - // the frame pointer should have the value of the first "PUSH" when it's - // trying to unwind. - // - // FIXME: This looks inelegant. It's possibly correct, but it's covering up - // another bug. I.e., one where we generate a prolog like this: - // - // pushl %ebp - // movl %esp, %ebp - // pushl %ebp - // pushl %esi - // ... - // - // The immediate re-push of EBP is unnecessary. At the least, it's an - // optimization bug. EBP can be used as a scratch register in certain - // cases, but probably not when we have a frame pointer. 
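hasFP above returns true whenever frame-pointer elimination is disabled or the stack needs realignment, among other cases; with typical x86-64 codegen, a single over-aligned local is enough to trigger the realignment path and hence a frame pointer. A sketch of common compiler behavior, not a guarantee of any particular compiler:

#include <cstdio>

void realigned() {
  // 32-byte alignment exceeds the 16-byte ABI stack alignment, so the
  // prologue typically realigns RSP and keeps RBP as a frame pointer,
  // even under -fomit-frame-pointer.
  alignas(32) char buf[64];
  std::printf("%p\n", static_cast<void *>(buf));
}

int main() { realigned(); }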
- if (HasFP && FramePtr == Reg) - continue; unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); unsigned CFIIndex = @@ -395,23 +356,107 @@ static bool usesTheStack(const MachineFunction &MF) { /// automatically adjust the stack pointer. Adjust the stack pointer to allocate /// space for local variables. Also emit labels used by the exception handler to /// generate the exception handling frames. + +/* + Here's a gist of what gets emitted: + + ; Establish frame pointer, if needed + [if needs FP] + push %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + .seh_pushreg %rbp + mov %rsp, %rbp + .cfi_def_cfa_register %rbp + + ; Spill general-purpose registers + [for all callee-saved GPRs] + pushq %<reg> + [if not needs FP] + .cfi_def_cfa_offset (offset from RETADDR) + .seh_pushreg %<reg> + + ; If the required stack alignment > default stack alignment + ; rsp needs to be re-aligned. This creates a "re-alignment gap" + ; of unknown size in the stack frame. + [if stack needs re-alignment] + and $MASK, %rsp + + ; Allocate space for locals + [if target is Windows and allocated space > 4096 bytes] + ; Windows needs special care for allocations larger + ; than one page. + mov $NNN, %rax + call ___chkstk_ms/___chkstk + sub %rax, %rsp + [else] + sub $NNN, %rsp + + [if needs FP] + .seh_stackalloc (size of XMM spill slots) + .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots + [else] + .seh_stackalloc NNN + + ; Spill XMMs + ; Note that, while only the Windows 64 ABI specifies XMMs as callee-preserved, + ; they may get spilled on any platform, if the current function + ; calls @llvm.eh.unwind.init + [if needs FP] + [for all callee-saved XMM registers] + movaps %<xmm reg>, -MMM(%rbp) + [for all callee-saved XMM registers] + .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset) + ; i.e. the offset relative to (%rbp - SEHFrameOffset) + [else] + [for all callee-saved XMM registers] + movaps %<xmm reg>, KKK(%rsp) + [for all callee-saved XMM registers] + .seh_savexmm %<xmm reg>, KKK + + .seh_endprologue + + [if needs base pointer] + mov %rsp, %rbx + + ; Emit CFI info + [if needs FP] + [for all callee-saved registers] + .cfi_offset %<reg>, (offset from %rbp) + [else] + .cfi_def_cfa_offset (offset from RETADDR) + [for all callee-saved registers] + .cfi_offset %<reg>, (offset from %rsp) + + Notes: + - .seh directives are emitted only for Windows 64 ABI + - .cfi directives are emitted for all other ABIs + - for 32-bit code, substitute %e?? registers for %r?? +*/ + void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - bool needsFrameMoves = MMI.hasDebugInfo() || - Fn->needsUnwindTableEntry(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate.
bool HasFP = hasFP(MF); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); bool IsLP64 = STI.isTarget64BitLP64(); bool IsWin64 = STI.isTargetWin64(); + bool IsWinEH = + MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() == + ExceptionHandling::WinEH; // Not necessarily synonymous with IsWin64. + bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry(); + bool NeedsDwarfCFI = + !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); bool UseLEA = STI.useLeaForSP(); unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); @@ -509,7 +554,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addReg(FramePtr, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); - if (needsFrameMoves) { + if (NeedsDwarfCFI) { // Mark the place where EBP/RBP was saved. // Define the current CFA rule to use the provided offset. assert(StackSize); @@ -527,13 +572,19 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addCFIIndex(CFIIndex); } + if (NeedsWinEH) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) + .addImm(FramePtr) + .setMIFlag(MachineInstr::FrameSetup); + } + // Update EBP with the new base value. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); - if (needsFrameMoves) { + if (NeedsDwarfCFI) { // Mark effective beginning of when frame pointer becomes valid. // Define the current CFA to use the EBP/RBP register. unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); @@ -543,9 +594,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addCFIIndex(CFIIndex); } - // Mark the FramePtr as live-in in every block except the entry. - for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end(); - I != E; ++I) + // Mark the FramePtr as live-in in every block. + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) I->addLiveIn(FramePtr); } else { NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); @@ -559,10 +609,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { PushedRegs = true; - MBBI->setFlag(MachineInstr::FrameSetup); + unsigned Reg = MBBI->getOperand(0).getReg(); ++MBBI; - if (!HasFP && needsFrameMoves) { + if (!HasFP && NeedsDwarfCFI) { // Mark callee-saved push instruction. // Define the current CFA rule to use the provided offset. assert(StackSize); @@ -572,16 +622,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addCFIIndex(CFIIndex); StackOffset += stackGrowth; } + + if (NeedsWinEH) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag( + MachineInstr::FrameSetup); + } } // Realign stack after we pushed callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). - - // NOTE: We push the registers before realigning the stack, so - // vector callee-saved (xmm) registers may be saved w/o proper - // alignment in this way. However, currently these regs are saved in - // stack slots (see X86FrameLowering::spillCalleeSavedRegisters()), so - // this shouldn't be a problem. 
if (RegInfo->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); MachineInstr *MI = @@ -680,23 +729,88 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MI->setFlag(MachineInstr::FrameSetup); MBB.insert(MBBI, MI); } - } else if (NumBytes) + } else if (NumBytes) { emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64, UseLEA, TII, *RegInfo); + } + + int SEHFrameOffset = 0; + if (NeedsWinEH) { + if (HasFP) { + // We need to set frame base offset low enough such that all saved + // register offsets would be positive relative to it, but we can't + // just use NumBytes, because .seh_setframe offset must be <=240. + // So we pretend to have only allocated enough space to spill the + // non-volatile registers. + // We don't care about the rest of stack allocation, because the unwinder + // will restore SP to (BP - SEHFrameOffset) + for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { + int offset = MFI->getObjectOffset(Info.getFrameIdx()); + SEHFrameOffset = std::max(SEHFrameOffset, abs(offset)); + } + SEHFrameOffset += SEHFrameOffset % 16; // ensure alignment + + // This only needs to account for XMM spill slots; GPR slots + // are covered by the .seh_pushreg's emitted above. + unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize(); + if (Size) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) + .addImm(Size) + .setMIFlag(MachineInstr::FrameSetup); + } + + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) + .addImm(FramePtr) + .addImm(SEHFrameOffset) + .setMIFlag(MachineInstr::FrameSetup); + } else { + // SP will be the base register for restoring XMMs + if (NumBytes) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } + } + } + + // Skip the rest of the register spilling code + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) + ++MBBI; + + // Emit SEH info for non-GPRs + if (NeedsWinEH) { + for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { + unsigned Reg = Info.getReg(); + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + continue; + assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class"); + + int Offset = getFrameIndexOffset(MF, Info.getFrameIdx()); + Offset += SEHFrameOffset; + + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) + .addImm(Reg) + .addImm(Offset) + .setMIFlag(MachineInstr::FrameSetup); + } + + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) + .setMIFlag(MachineInstr::FrameSetup); + } // If we need a base pointer, set it up here. It's whatever the value // of the stack pointer is at this point. Any variable size objects // will be allocated after this, so we can still use the base pointer // to reference locals. if (RegInfo->hasBasePointer(MF)) { - // Update the frame pointer with the current stack pointer. + // Update the base pointer with the current stack pointer. unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); } - if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) { + if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { // Mark end of stack pointer adjustment. if (!HasFP && NumBytes) { // Define the current CFA rule to use the provided offset. @@ -711,7 +825,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Emit DWARF info specifying the offsets of the callee-saved registers.
if (PushedRegs) - emitCalleeSavedFrameMoves(MBB, MBBI, DL, HasFP ? FramePtr : StackPtr); + emitCalleeSavedFrameMoves(MBB, MBBI, DL); } } @@ -719,12 +833,14 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); assert(MBBI != MBB.end() && "Returning block has no instructions"); unsigned RetOpcode = MBBI->getOpcode(); DebugLoc DL = MBBI->getDebugLoc(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); bool IsLP64 = STI.isTarget64BitLP64(); bool UseLEA = STI.useLeaForSP(); @@ -969,46 +1085,97 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return getFrameIndexOffset(MF, FI); } -bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { - if (CSI.empty()) - return false; +bool X86FrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); + unsigned SlotSize = RegInfo->getSlotSize(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - DebugLoc DL = MBB.findDebugLoc(MI); + unsigned CalleeSavedFrameSize = 0; + int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); - MachineFunction &MF = *MBB.getParent(); + if (hasFP(MF)) { + // emitPrologue always spills the frame register first. + SpillSlotOffset -= SlotSize; + MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + + // Since emitPrologue and emitEpilogue will handle spilling and restoring of + // the frame register, we can delete it from the CSI list and not have to worry + // about avoiding it later. + unsigned FPReg = RegInfo->getFrameRegister(MF); + for (unsigned i = 0; i < CSI.size(); ++i) { + if (CSI[i].getReg() == FPReg) { + CSI.erase(CSI.begin() + i); + break; + } + } + } + + // Assign slots for GPRs. It increases frame size. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) + continue; - unsigned SlotSize = STI.is64Bit() ? 8 : 4; - unsigned FPReg = TRI->getFrameRegister(MF); - unsigned CalleeFrameSize = 0; + SpillSlotOffset -= SlotSize; + CalleeSavedFrameSize += SlotSize; + + int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + CSI[i - 1].setFrameIdx(SlotIndex); + } + + X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); + + // Assign slots for XMMs.
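assignCalleeSavedSpillSlots walks SpillSlotOffset downward: one pointer-sized slot per GPR above, then alignment padding plus a register-sized slot per XMM in the loop that follows. A standalone sketch of that offset arithmetic with made-up values, plain C++ rather than the MachineFrameInfo API:

#include <cstdio>
#include <cstdlib>

int main() {
  int offset = -8;                  // start below the saved frame pointer slot
  for (int i = 0; i < 3; ++i) {     // three GPR slots, 8 bytes each
    offset -= 8;
    std::printf("gpr slot at %d\n", offset);  // -16, -24, -32
  }
  // XMM slot: align the (negative) offset down to 16, then carve 16 bytes.
  offset -= std::abs(offset) % 16;  // -32 stays -32; -40 would become -48
  offset -= 16;
  std::printf("xmm slot at %d\n", offset);    // -48, 16-byte aligned
}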
+ for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + continue; + + const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); + // ensure alignment + SpillSlotOffset -= abs(SpillSlotOffset) % RC->getAlignment(); + // spill into slot + SpillSlotOffset -= RC->getSize(); + int SlotIndex = + MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); + CSI[i - 1].setFrameIdx(SlotIndex); + MFI->ensureMaxAlignment(RC->getAlignment()); + } + + return true; +} +bool X86FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + DebugLoc DL = MBB.findDebugLoc(MI); + + MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); - if (!X86::GR64RegClass.contains(Reg) && - !X86::GR32RegClass.contains(Reg)) + unsigned Reg = CSI[i - 1].getReg(); + + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); - if (Reg == FPReg) - // X86RegisterInfo::emitPrologue will handle spilling of frame register. - continue; - CalleeFrameSize += SlotSize; + BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); } - X86FI->setCalleeSavedFrameSize(CalleeFrameSize); - // Make XMM regs spilled. X86 does not have ability of push/pop XMM. // It can be done by spilling XMMs to stack frame. - // Note that only Win64 ABI might spill XMMs. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); if (X86::GR64RegClass.contains(Reg) || @@ -1017,8 +1184,12 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), - RC, TRI); + + TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC, + TRI); + --MI; + MI->setFlag(MachineInstr::FrameSetup); + ++MI; } return true; @@ -1035,6 +1206,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); // Reload XMMs from stack frame. for (unsigned i = 0, e = CSI.size(); i != e; ++i) { @@ -1042,22 +1214,19 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), - RC, TRI); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); } // POP GPRs. - unsigned FPReg = TRI->getFrameRegister(MF); unsigned Opc = STI.is64Bit() ? 
X86::POP64r : X86::POP32r; for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; - if (Reg == FPReg) - // X86RegisterInfo::emitEpilogue will handle restoring of frame register. - continue; + BuildMI(MBB, MI, DL, TII.get(Opc), Reg); } return true; @@ -1065,9 +1234,10 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, void X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { + RegScavenger *RS) const { MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1087,22 +1257,6 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, TailCallReturnAddrDelta - SlotSize, true); } - if (hasFP(MF)) { - assert((TailCallReturnAddrDelta <= 0) && - "The Delta should always be zero or negative"); - const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering(); - - // Create a frame entry for the EBP register that must be saved. - int FrameIdx = MFI->CreateFixedObject(SlotSize, - -(int)SlotSize + - TFI.getOffsetOfLocalArea() + - TailCallReturnAddrDelta, - true); - assert(FrameIdx == MFI->getObjectIndexBegin() && - "Slot for EBP register must be last in order to be found!"); - (void)FrameIdx; - } - // Spill the BasePtr if it's used. if (RegInfo->hasBasePointer(MF)) MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister()); @@ -1160,8 +1314,9 @@ void X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MachineBasicBlock &prologueMBB = MF.front(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); uint64_t StackSize; + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); unsigned TlsReg, TlsOffset; DebugLoc DL; @@ -1368,9 +1523,12 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { /// temp0 = sp - MaxStack /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { - const X86InstrInfo &TII = *TM.getInstrInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const unsigned SlotSize = TM.getRegisterInfo()->getSlotSize(); + const unsigned SlotSize = + static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()) + ->getSlotSize(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); const bool Is64Bit = STI.is64Bit(); DebugLoc DL; // HiPE-specific values @@ -1499,12 +1657,14 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - const X86InstrInfo &TII = *TM.getInstrInfo(); - const X86RegisterInfo &RegInfo = *TM.getRegisterInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const X86RegisterInfo &RegInfo = + *static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); unsigned StackPtr = RegInfo.getStackRegister(); bool reseveCallFrame = hasReservedCallFrame(MF); int Opcode = 
I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL = I->getDebugLoc(); uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0; @@ -1522,7 +1682,8 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); + unsigned StackAlign = + MF.getTarget().getFrameLowering()->getStackAlignment(); Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; MachineInstr *New = nullptr; diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 208bb8bd6a34..5ad3d4dc2e97 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -14,7 +14,6 @@ #ifndef X86_FRAMELOWERING_H #define X86_FRAMELOWERING_H -#include "X86Subtarget.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { @@ -23,19 +22,13 @@ class MCSymbol; class X86TargetMachine; class X86FrameLowering : public TargetFrameLowering { - const X86TargetMachine &TM; - const X86Subtarget &STI; public: - explicit X86FrameLowering(const X86TargetMachine &tm, const X86Subtarget &sti) - : TargetFrameLowering(StackGrowsDown, - sti.getStackAlignment(), - (sti.is64Bit() ? -8 : -4)), - TM(tm), STI(sti) { - } + explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO) + : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {} void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, - unsigned FramePtr) const; + MachineBasicBlock::iterator MBBI, + DebugLoc DL) const; /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. 
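With the subtarget no longer cached in X86FrameLowering, whoever constructs the object must now supply the stack alignment and local-area offset that the old constructor read from X86Subtarget itself. A minimal sketch of a plausible construction site follows; the factory name is hypothetical and not part of this commit (note also that the new constructor shown above passes StackGrowsDown regardless of its D parameter):

    #include "X86FrameLowering.h"
    #include "X86Subtarget.h"
    using namespace llvm;

    // Hypothetical factory: feeds the constructor the same three values the
    // old one derived from the subtarget (growth direction, alignment, LAO).
    static X86FrameLowering *createX86FrameLowering(const X86Subtarget &ST) {
      return new X86FrameLowering(TargetFrameLowering::StackGrowsDown,
                                  ST.getStackAlignment(),
                                  ST.is64Bit() ? -8 : -4);
    }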
@@ -49,6 +42,11 @@ public: void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS = nullptr) const override; + bool + assignCalleeSavedSpillSlots(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const override; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 74386d33990d..ba2f5f645d09 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2126,38 +2126,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return getGlobalBaseReg(); - case X86ISD::ATOMOR64_DAG: - case X86ISD::ATOMXOR64_DAG: - case X86ISD::ATOMADD64_DAG: - case X86ISD::ATOMSUB64_DAG: - case X86ISD::ATOMNAND64_DAG: - case X86ISD::ATOMAND64_DAG: - case X86ISD::ATOMMAX64_DAG: - case X86ISD::ATOMMIN64_DAG: - case X86ISD::ATOMUMAX64_DAG: - case X86ISD::ATOMUMIN64_DAG: - case X86ISD::ATOMSWAP64_DAG: { - unsigned Opc; - switch (Opcode) { - default: llvm_unreachable("Impossible opcode"); - case X86ISD::ATOMOR64_DAG: Opc = X86::ATOMOR6432; break; - case X86ISD::ATOMXOR64_DAG: Opc = X86::ATOMXOR6432; break; - case X86ISD::ATOMADD64_DAG: Opc = X86::ATOMADD6432; break; - case X86ISD::ATOMSUB64_DAG: Opc = X86::ATOMSUB6432; break; - case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break; - case X86ISD::ATOMAND64_DAG: Opc = X86::ATOMAND6432; break; - case X86ISD::ATOMMAX64_DAG: Opc = X86::ATOMMAX6432; break; - case X86ISD::ATOMMIN64_DAG: Opc = X86::ATOMMIN6432; break; - case X86ISD::ATOMUMAX64_DAG: Opc = X86::ATOMUMAX6432; break; - case X86ISD::ATOMUMIN64_DAG: Opc = X86::ATOMUMIN6432; break; - case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break; - } - SDNode *RetVal = SelectAtomic64(Node, Opc); - if (RetVal) - return RetVal; - break; - } - case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index cbaf44e35e12..5ccff20e2943 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -44,11 +44,13 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include <bitset> +#include <numeric> #include <cctype> using namespace llvm; @@ -56,6 +58,17 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); +static cl::opt<bool> ExperimentalVectorWideningLegalization( + "x86-experimental-vector-widening-legalization", cl::init(false), + cl::desc("Enable an experimental vector type legalization through widening " + "rather than promotion."), + cl::Hidden); + +static cl::opt<bool> ExperimentalVectorShuffleLowering( + "x86-experimental-vector-shuffle-lowering", cl::init(false), + cl::desc("Enable an experimental vector shuffle lowering code path."), + cl::Hidden); + // Forward declarations. 
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); @@ -178,29 +191,28 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } -static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { - const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); - bool is64Bit = Subtarget->is64Bit(); - - if (Subtarget->isTargetMacho()) { - if (is64Bit) +static TargetLoweringObjectFile *createTLOF(const Triple &TT) { + if (TT.isOSBinFormatMachO()) { + if (TT.getArch() == Triple::x86_64) return new X86_64MachoTargetObjectFile(); return new TargetLoweringObjectFileMachO(); } - if (Subtarget->isTargetLinux()) + if (TT.isOSLinux()) return new X86LinuxTargetObjectFile(); - if (Subtarget->isTargetELF()) + if (TT.isOSBinFormatELF()) return new TargetLoweringObjectFileELF(); - if (Subtarget->isTargetKnownWindowsMSVC()) + if (TT.isKnownWindowsMSVCEnvironment()) return new X86WindowsTargetObjectFile(); - if (Subtarget->isTargetCOFF()) + if (TT.isOSBinFormatCOFF()) return new TargetLoweringObjectFileCOFF(); llvm_unreachable("unknown subtarget type"); } +// FIXME: This should stop caching the target machine as soon as +// we can remove resetOperationActions et al. X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) - : TargetLowering(TM, createTLOF(TM)) { + : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) { Subtarget = &TM.getSubtarget<X86Subtarget>(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); @@ -443,7 +455,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::BR_CC , MVT::i16, Expand); setOperationAction(ISD::BR_CC , MVT::i32, Expand); setOperationAction(ISD::BR_CC , MVT::i64, Expand); - setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i64, Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); @@ -497,6 +515,14 @@ void X86TargetLowering::resetOperationActions() { } } + // Special handling for half-precision floating point conversions. + // If we don't have F16C support, then lower half float conversions + // into library calls. 
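For reference, "Expand" here ends up as a runtime library call (typically a compiler-rt/libgcc helper such as __gnu_h2f_ieee); the hardware path uses the F16C VCVTPH2PS/VCVTPS2PH instructions instead. The sketch below is an illustrative model of the normal-number case such a helper computes; it deliberately ignores subnormals, infinities, and NaNs, and is not the code LLVM emits:

    #include <cstdint>
    #include <cstring>

    // Illustrative half (IEEE binary16) -> float, normal numbers only.
    static float halfToFloatNormal(uint16_t H) {
      uint32_t Sign = uint32_t(H & 0x8000u) << 16;      // sign to bit 31
      uint32_t Exp  = ((H >> 10) & 0x1Fu) - 15u + 127u; // rebias 5-bit exponent
      uint32_t Mant = uint32_t(H & 0x3FFu) << 13;       // widen 10-bit mantissa
      uint32_t Bits = Sign | (Exp << 23) | Mant;
      float F;
      std::memcpy(&F, &Bits, sizeof F);                 // bit-cast to float
      return F;                                         // e.g. 0x3C00 -> 1.0f
    }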
+ if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) { + setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand); + setOperationAction(ISD::FP32_TO_FP16, MVT::i16, Expand); + } + if (Subtarget->hasPOPCNT()) { setOperationAction(ISD::CTPOP , MVT::i8 , Promote); } else { @@ -575,34 +601,18 @@ void X86TargetLowering::resetOperationActions() { // Expand certain atomics for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; - setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } - if (!Subtarget->is64Bit()) { - setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom); - setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom); - } - if (Subtarget->hasCmpxchg16b()) { - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags - if (!Subtarget->isTargetDarwin() && - !Subtarget->isTargetELF() && - !Subtarget->isTargetCygMing()) { + if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() && + !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } @@ -861,6 +871,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE; InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) setTruncStoreAction(VT, @@ -1433,6 +1444,11 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::OR, MVT::v16i32, Legal); setOperationAction(ISD::XOR, MVT::v16i32, Legal); + if (Subtarget->hasCDI()) { + setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); + } + // Custom lower several nodes. for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { @@ -1563,6 +1579,7 @@ void X86TargetLowering::resetOperationActions() { setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); @@ -1585,6 +1602,16 @@ void X86TargetLowering::resetOperationActions() { setPrefFunctionAlignment(4); // 2^4 bytes. 
} +TargetLoweringBase::LegalizeTypeAction +X86TargetLowering::getPreferredVectorAction(EVT VT) const { + if (ExperimentalVectorWideningLegalization && + VT.getVectorNumElements() != 1 && + VT.getVectorElementType().getSimpleVT() != MVT::i1) + return TypeWidenVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) return Subtarget->hasAVX512() ? MVT::i1: MVT::i8; @@ -1725,7 +1752,7 @@ const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid,MCContext &Ctx) const{ - assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ && + assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ && Subtarget->isPICStyleGOT()); // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF // entries. @@ -1824,7 +1851,7 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(), RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } @@ -1844,7 +1871,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); @@ -2016,7 +2043,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVector<CCValAssign, 16> RVLocs; bool Is64Bit = Subtarget->is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + DAG.getTarget(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. @@ -2166,8 +2193,8 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; - bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv, - getTargetMachine().Options.GuaranteedTailCallOpt); + bool AlwaysUseMutable = FuncIsMadeTailCallSafe( + CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; @@ -2224,7 +2251,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 @@ -2388,7 +2415,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, TotalNumXMMRegs = 0; if (IsWin64) { - const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); + const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering(); // Get to the caller-allocated home save location. Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; @@ -2587,7 +2614,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Analyze operands of the call, assigning locations to each operand. 
SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 @@ -2602,7 +2629,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; - else if (getTargetMachine().Options.GuaranteedTailCallOpt && + else if (MF.getTarget().Options.GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); @@ -2649,7 +2676,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[i].Flags; @@ -2840,7 +2867,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag = Chain.getValue(1); } - if (getTargetMachine().getCodeModel() == CodeModel::Large) { + if (DAG.getTarget().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls // through a register, since the call instruction's 32-bit @@ -2864,7 +2891,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // has hidden or protected visibility, or if it is static or local, then // we don't need to use the PLT - we can directly call it. if (Subtarget->isTargetELF() && - getTargetMachine().getRelocationModel() == Reloc::PIC_ && + DAG.getTarget().getRelocationModel() == Reloc::PIC_ && GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && @@ -2906,7 +2933,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // On ELF targets, in either X86-64 or X86-32 mode, direct calls to // external symbols should go through the PLT. if (Subtarget->isTargetELF() && - getTargetMachine().getRelocationModel() == Reloc::PIC_) { + DAG.getTarget().getRelocationModel() == Reloc::PIC_) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (!Subtarget->getTargetTriple().isMacOSX() || @@ -2945,7 +2972,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -2969,7 +2996,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Create the CALLSEQ_END node. 
unsigned NumBytesForCalleeToPop; if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, - getTargetMachine().Options.GuaranteedTailCallOpt)) + DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything else if (!Is64Bit && !IsTailCallConvention(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && @@ -3140,7 +3167,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); - if (getTargetMachine().Options.GuaranteedTailCallOpt) { + if (DAG.getTarget().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) return true; return false; @@ -3152,7 +3179,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); if (RegInfo->needsStackRealignment(MF)) return false; @@ -3181,7 +3208,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + DAG.getTarget(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) @@ -3202,7 +3229,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (Unused) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + DAG.getTarget(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; @@ -3216,12 +3243,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (!CCMatch) { SmallVector<CCValAssign, 16> RVLocs1; CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs1, *DAG.getContext()); + DAG.getTarget(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector<CCValAssign, 16> RVLocs2; CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs2, *DAG.getContext()); + DAG.getTarget(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) @@ -3248,7 +3275,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // argument is passed on the stack. 
SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + DAG.getTarget(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsCalleeWin64) @@ -3265,7 +3292,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = - ((const X86TargetMachine&)getTargetMachine()).getInstrInfo(); + static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; @@ -3288,12 +3315,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (!Subtarget->is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) && !isa<ExternalSymbolSDNode>(Callee)) || - getTargetMachine().getRelocationModel() == Reloc::PIC_)) { + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { unsigned NumInRegs = 0; // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = - (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3; + (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -3417,7 +3444,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); int ReturnAddrIndex = FuncInfo->getRAIndex(); @@ -3967,14 +3994,22 @@ static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) { unsigned CorrectPosV1 = 0; unsigned CorrectPosV2 = 0; - for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) + for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) { + if (Mask[i] == -1) { + ++CorrectPosV1; + ++CorrectPosV2; + continue; + } + if (Mask[i] == i) ++CorrectPosV1; else if (Mask[i] == i + 4) ++CorrectPosV2; + } if (CorrectPosV1 == 3 || CorrectPosV2 == 3) - // We have 3 elements from one vector, and one from another. + // We have 3 elements (undefs count as elements from any vector) from one + // vector, and one from another. return true; return false; @@ -4823,19 +4858,6 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, return true; } -/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are -/// all the same. -static bool isSplatVector(SDNode *N) { - if (N->getOpcode() != ISD::BUILD_VECTOR) - return false; - - SDValue SplatValue = N->getOperand(0); - for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) - if (N->getOperand(i) != SplatValue) - return false; - return true; -} - /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved /// to an zero vector. /// FIXME: move to dag combiner / method on ShuffleVectorSDNode @@ -5744,18 +5766,22 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, return SDValue(); case ISD::BUILD_VECTOR: { - // The BUILD_VECTOR node must be a splat. 
- if (!isSplatVector(Op.getNode())) + auto *BVOp = cast<BuildVectorSDNode>(Op.getNode()); + BitVector UndefElements; + SDValue Splat = BVOp->getSplatValue(&UndefElements); + + // We need a splat of a single value to use broadcast, and it doesn't + // make any sense if the value is only in one element of the vector. + if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) return SDValue(); - Ld = Op.getOperand(0); + Ld = Splat; ConstSplatVal = (Ld.getOpcode() == ISD::Constant || - Ld.getOpcode() == ISD::ConstantFP); + Ld.getOpcode() == ISD::ConstantFP); - // The suspected load node has several users. Make sure that all - // of its users are from the BUILD_VECTOR node. - // Constants may have multiple users. - if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) + // Make sure that all of the users of a non-constant load are from the + // BUILD_VECTOR node. + if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); break; } @@ -6042,6 +6068,433 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, dl, VT, Select); } +/// \brief Return true if \p N implements a horizontal binop and return the +/// operands for the horizontal binop into V0 and V1. +/// +/// This is a helper function of PerformBUILD_VECTORCombine. +/// This function checks that the build_vector \p N in input implements a +/// horizontal operation. Parameter \p Opcode defines the kind of horizontal +/// operation to match. +/// For example, if \p Opcode is equal to ISD::ADD, then this function +/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode +/// is equal to ISD::SUB, then this function checks if this is a horizontal +/// arithmetic sub. +/// +/// This function only analyzes elements of \p N whose indices are +/// in range [BaseIdx, LastIdx). +static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, + SelectionDAG &DAG, + unsigned BaseIdx, unsigned LastIdx, + SDValue &V0, SDValue &V1) { + EVT VT = N->getValueType(0); + + assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); + assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && + "Invalid Vector in input!"); + + bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); + bool CanFold = true; + unsigned ExpectedVExtractIdx = BaseIdx; + unsigned NumElts = LastIdx - BaseIdx; + V0 = DAG.getUNDEF(VT); + V1 = DAG.getUNDEF(VT); + + // Check if N implements a horizontal binop. + for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { + SDValue Op = N->getOperand(i + BaseIdx); + + // Skip UNDEFs. + if (Op->getOpcode() == ISD::UNDEF) { + // Update the expected vector extract index. 
+ if (i * 2 == NumElts) + ExpectedVExtractIdx = BaseIdx; + ExpectedVExtractIdx += 2; + continue; + } + + CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); + + if (!CanFold) + break; + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + // Try to match the following pattern: + // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) + CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Op0.getOperand(0) == Op1.getOperand(0) && + isa<ConstantSDNode>(Op0.getOperand(1)) && + isa<ConstantSDNode>(Op1.getOperand(1))); + if (!CanFold) + break; + + unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); + unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue(); + + if (i * 2 < NumElts) { + if (V0.getOpcode() == ISD::UNDEF) + V0 = Op0.getOperand(0); + } else { + if (V1.getOpcode() == ISD::UNDEF) + V1 = Op0.getOperand(0); + if (i * 2 == NumElts) + ExpectedVExtractIdx = BaseIdx; + } + + SDValue Expected = (i * 2 < NumElts) ? V0 : V1; + if (I0 == ExpectedVExtractIdx) + CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; + else if (IsCommutable && I1 == ExpectedVExtractIdx) { + // Try to match the following dag sequence: + // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) + CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; + } else + CanFold = false; + + ExpectedVExtractIdx += 2; + } + + return CanFold; +} + +/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by +/// a concat_vector. +/// +/// This is a helper function of PerformBUILD_VECTORCombine. +/// This function expects two 256-bit vectors called V0 and V1. +/// At first, each vector is split into two separate 128-bit vectors. +/// Then, the resulting 128-bit vectors are used to implement two +/// horizontal binary operations. +/// +/// The kind of horizontal binary operation is defined by \p X86Opcode. +/// +/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed to +/// the two new horizontal binops. +/// When Mode is set, the first horizontal binop dag node would take as input +/// the lower 128-bit of V0 and the upper 128-bit of V0. The second +/// horizontal binop dag node would take as input the lower 128-bit of V1 +/// and the upper 128-bit of V1. +/// Example: +/// HADD V0_LO, V0_HI +/// HADD V1_LO, V1_HI +/// +/// Otherwise, the first horizontal binop dag node takes as input the lower +/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop +/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. +/// Example: +/// HADD V0_LO, V1_LO +/// HADD V0_HI, V1_HI +/// +/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower +/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to +/// the upper 128-bits of the result.
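As a concrete reference for what the horizontal nodes built here compute, independent of the SelectionDAG machinery, the following illustrative model (not part of this patch) spells out 128-bit HADDPS semantics, i.e. the pattern isHorizontalBinOp recognizes with Opcode == ISD::FADD: each output lane is the sum of one adjacent pair of input lanes, pairs from the first operand first, then from the second:

    // Illustrative scalar model of HADDPS (the X86ISD::FHADD node).
    static void haddpsModel(const float A[4], const float B[4], float Out[4]) {
      Out[0] = A[0] + A[1]; // adjacent pair from the first operand
      Out[1] = A[2] + A[3];
      Out[2] = B[0] + B[1]; // adjacent pair from the second operand
      Out[3] = B[2] + B[3];
    }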
+static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, + SDLoc DL, SelectionDAG &DAG, + unsigned X86Opcode, bool Mode, + bool isUndefLO, bool isUndefHI) { + EVT VT = V0.getValueType(); + assert(VT.is256BitVector() && VT == V1.getValueType() && + "Invalid nodes in input!"); + + unsigned NumElts = VT.getVectorNumElements(); + SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL); + SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL); + SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL); + SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL); + EVT NewVT = V0_LO.getValueType(); + + SDValue LO = DAG.getUNDEF(NewVT); + SDValue HI = DAG.getUNDEF(NewVT); + + if (Mode) { + // Don't emit a horizontal binop if the result is expected to be UNDEF. + if (!isUndefLO && V0->getOpcode() != ISD::UNDEF) + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); + if (!isUndefHI && V1->getOpcode() != ISD::UNDEF) + HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); + } else { + // Don't emit a horizontal binop if the result is expected to be UNDEF. + if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF || + V1_LO->getOpcode() != ISD::UNDEF)) + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); + + if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF || + V1_HI->getOpcode() != ISD::UNDEF)) + HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); + } + + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); +} + +/// \brief Try to fold a build_vector that performs an 'addsub' into the +/// sequence of 'vadd + vsub + blendi'. +static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc DL(BV); + EVT VT = BV->getValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + SDValue InVec0 = DAG.getUNDEF(VT); + SDValue InVec1 = DAG.getUNDEF(VT); + + assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || + VT == MVT::v2f64) && "build_vector with an invalid type found!"); + + // Don't try to emit a VSELECT that cannot be lowered into a blend. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) + return SDValue(); + + // Odd-numbered elements in the input build vector are obtained from + // adding two integer/float elements. + // Even-numbered elements in the input build vector are obtained from + // subtracting two integer/float elements. + unsigned ExpectedOpcode = ISD::FSUB; + unsigned NextExpectedOpcode = ISD::FADD; + bool AddFound = false; + bool SubFound = false; + + for (unsigned i = 0, e = NumElts; i != e; i++) { + SDValue Op = BV->getOperand(i); + + // Skip 'undef' values. + unsigned Opcode = Op.getOpcode(); + if (Opcode == ISD::UNDEF) { + std::swap(ExpectedOpcode, NextExpectedOpcode); + continue; + } + + // Early exit if we found an unexpected opcode. + if (Opcode != ExpectedOpcode) + return SDValue(); + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + // Try to match the following pattern: + // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) + // Early exit if we cannot match that sequence. + if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(Op0.getOperand(1)) || + !isa<ConstantSDNode>(Op1.getOperand(1)) || + Op0.getOperand(1) != Op1.getOperand(1)) + return SDValue(); + + unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); + if (I0 != i) + return SDValue(); + + // We found a valid add/sub node. 
Update the information accordingly. + if (i & 1) + AddFound = true; + else + SubFound = true; + + // Update InVec0 and InVec1. + if (InVec0.getOpcode() == ISD::UNDEF) + InVec0 = Op0.getOperand(0); + if (InVec1.getOpcode() == ISD::UNDEF) + InVec1 = Op1.getOperand(0); + + // Make sure that the operands of each add/sub node always + // come from the same pair of vectors. + if (InVec0 != Op0.getOperand(0)) { + if (ExpectedOpcode == ISD::FSUB) + return SDValue(); + + // FADD is commutable. Try to commute the operands + // and then test again. + std::swap(Op0, Op1); + if (InVec0 != Op0.getOperand(0)) + return SDValue(); + } + + if (InVec1 != Op1.getOperand(0)) + return SDValue(); + + // Update the pair of expected opcodes. + std::swap(ExpectedOpcode, NextExpectedOpcode); + } + + // Don't try to fold this build_vector into a VSELECT if it has + // too many UNDEF operands. + if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF && + InVec1.getOpcode() != ISD::UNDEF) { + // Emit a sequence of vector add and sub followed by a VSELECT. + // The new VSELECT will be lowered into a BLENDI. + // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI' + // and emit a single ADDSUB instruction. + SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1); + SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1); + + // Construct the VSELECT mask. + EVT MaskVT = VT.changeVectorElementTypeToInteger(); + EVT SVT = MaskVT.getVectorElementType(); + unsigned SVTBits = SVT.getSizeInBits(); + SmallVector<SDValue, 8> Ops; + + for (unsigned i = 0, e = NumElts; i != e; ++i) { + APInt Value = i & 1 ? APInt::getNullValue(SVTBits) : + APInt::getAllOnesValue(SVTBits); + SDValue Constant = DAG.getConstant(Value, SVT); + Ops.push_back(Constant); + } + + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops); + return DAG.getSelect(DL, VT, Mask, Sub, Add); + } + + return SDValue(); +} + +static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N); + SDValue InVec0, InVec1; + + // Try to match an ADDSUB. + if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) { + SDValue Value = matchAddSub(BV, DAG, Subtarget); + if (Value.getNode()) + return Value; + } + + // Try to match horizontal ADD/SUB. + unsigned NumUndefsLO = 0; + unsigned NumUndefsHI = 0; + unsigned Half = NumElts/2; + + // Count the number of UNDEF operands in the input build_vector. + for (unsigned i = 0, e = Half; i != e; ++i) + if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) + NumUndefsLO++; + + for (unsigned i = Half, e = NumElts; i != e; ++i) + if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) + NumUndefsHI++; + + // Early exit if this is either a build_vector of all UNDEFs or all the + // operands but one are UNDEF. + if (NumUndefsLO + NumUndefsHI + 1 >= NumElts) + return SDValue(); + + if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) { + // Try to match an SSE3 float HADD/HSUB.
+ if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); + } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { + // Try to match an SSSE3 integer HADD/HSUB. + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); + } + + if (!Subtarget->hasAVX()) + return SDValue(); + + if ((VT == MVT::v8f32 || VT == MVT::v4f64)) { + // Try to match an AVX horizontal add/sub of packed single/double + // precision floating point values from 256-bit vectors. + SDValue InVec2, InVec3; + if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); + } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { + // Try to match an AVX2 horizontal add/sub of signed integers. + SDValue InVec2, InVec3; + unsigned X86Opcode; + bool CanFold = true; + + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + X86Opcode = X86ISD::HADD; + else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + X86Opcode = X86ISD::HSUB; + else + CanFold = false; + + if (CanFold) { + // Fold this build_vector into a single horizontal add/sub. + // Do this only if the target has AVX2. + if (Subtarget->hasAVX2()) + return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); + + // Do not try to expand this build_vector into a pair of horizontal + // add/sub if we can emit a pair of scalar add/sub. + if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) + return SDValue(); + + // Convert this build_vector into a pair of horizontal binop followed by + // a concat vector. 
+ bool isUndefLO = NumUndefsLO == Half; + bool isUndefHI = NumUndefsHI == Half; + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false, + isUndefLO, isUndefHI); + } + } + + if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || + VT == MVT::v16i16) && Subtarget->hasAVX()) { + unsigned X86Opcode; + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::HADD; + else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::HSUB; + else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::FHADD; + else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::FHSUB; + else + return SDValue(); + + // Don't try to expand this build_vector into a pair of horizontal add/sub + // if we can simply emit a pair of scalar add/sub. + if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) + return SDValue(); + + // Convert this build_vector into two horizontal add/sub followed by + // a concat vector. + bool isUndefLO = NumUndefsLO == Half; + bool isUndefHI = NumUndefsHI == Half; + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, + isUndefLO, isUndefHI); + } + + return SDValue(); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -6429,38 +6882,1160 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return LowerAVXCONCAT_VECTORS(Op, DAG); } -// Try to lower a shuffle node into a simple blend instruction. -static SDValue -LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, - const X86Subtarget *Subtarget, SelectionDAG &DAG) { - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - MVT VT = SVOp->getSimpleValueType(0); + +//===----------------------------------------------------------------------===// +// Vector shuffle lowering +// +// This is an experimental code path for lowering vector shuffles on x86. It is +// designed to handle arbitrary vector shuffles and blends, gracefully +// degrading performance as necessary. It works hard to recognize idiomatic +// shuffles and lower them to optimal instruction patterns without leaving +// a framework that allows reasonably efficient handling of all vector shuffle +// patterns. +//===----------------------------------------------------------------------===// + +/// \brief Tiny helper function to identify a no-op mask. +/// +/// This is a somewhat boring predicate function. It checks whether the mask +/// array input, which is assumed to be a single-input shuffle mask of the kind +/// used by the X86 shuffle instructions (not a fully general +/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an +/// in-place shuffle are 'no-op's. +static bool isNoopShuffleMask(ArrayRef<int> Mask) { + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] != -1 && Mask[i] != i) + return false; + return true; +} + +/// \brief Helper function to classify a mask as a single-input mask. +/// +/// This isn't a generic single-input test because in the vector shuffle +/// lowering we canonicalize single inputs to be the first input operand. This +/// means we can more quickly test for a single input by only checking whether +/// an input from the second operand exists. We also assume that the size of +/// mask corresponds to the size of the input vectors which isn't true in the +/// fully general case. 
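A couple of worked mask examples may help here; this standalone mirror of the predicate (illustrative only, specialized to 4 lanes) shows which masks count as single-input under the convention that elements >= Mask.size() select lanes of the second operand and -1 means undef:

    #include <array>
    #include <cstdio>

    // Standalone mirror of the single-input test for a 4-lane mask.
    static bool singleInput(const std::array<int, 4> &Mask) {
      for (int M : Mask)
        if (M >= 4)     // lane taken from the second input vector
          return false;
      return true;      // only first-input lanes and undef (-1)
    }

    int main() {
      std::array<int, 4> A = {2, 2, 0, -1}; // first input + one undef lane
      std::array<int, 4> B = {0, 5, 2, 7};  // 5 and 7 index the second input
      std::printf("%d %d\n", singleInput(A), singleInput(B)); // prints: 1 0
    }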
+static bool isSingleInputShuffleMask(ArrayRef<int> Mask) { + for (int M : Mask) + if (M >= (int)Mask.size()) + return false; + return true; +} + +/// \brief Get a 4-lane 8-bit shuffle immediate for a mask. +/// +/// This helper function produces an 8-bit shuffle immediate corresponding to +/// the ubiquitous shuffle encoding scheme used in x86 instructions for +/// shuffling 4 lanes. It can be used with most of the PSHUF instructions, for +/// example. +/// +/// NB: We rely heavily on "undef" masks preserving the input lane. +static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); + assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); + assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); + assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); + assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); + + unsigned Imm = 0; + Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0; + Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2; + Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4; + Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6; + return DAG.getConstant(Imm, MVT::i8); +} + +/// \brief Handle lowering of 2-lane 64-bit floating point shuffles. +/// +/// This is the basis function for the 2-lane 64-bit shuffles as we have full +/// support for floating point shuffles but not integer shuffles. These +/// instructions will incur a domain crossing penalty on some chips though so +/// it is better to avoid lowering through this for integer vectors where +/// possible. +static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); + + if (isSingleInputShuffleMask(Mask)) { + // Straight shuffle of a single input vector. Simulate this by using the + // single input as both of the "inputs" to this instruction. + unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); + return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1, + DAG.getConstant(SHUFPDMask, MVT::i8)); + } + assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); + assert(Mask[1] >= 2 && "Non-canonicalized blend!"); + + unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); + return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2, + DAG.getConstant(SHUFPDMask, MVT::i8)); +} + +/// \brief Handle lowering of 2-lane 64-bit integer shuffles. +/// +/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by +/// the integer unit to minimize domain crossing penalties. However, for blends +/// it falls back to the floating point shuffle operation with appropriate bit +/// casting.
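The single-input path below widens the 2 x i64 mask to the 4 x i32 mask PSHUFD expects: 64-bit lane k becomes the 32-bit lane pair {2k, 2k+1}, with undef (-1) mapped to lane 0. A small standalone illustration (not part of the patch) of that widening, plus the 2-bits-per-lane immediate packing from getV4X86ShuffleImm8ForMask above:

    #include <algorithm>
    #include <cstdio>

    int main() {
      int Mask[2] = {1, 0}; // swap the two 64-bit lanes
      int W[4] = {std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
                  std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
      // Same 2-bit-per-lane packing as getV4X86ShuffleImm8ForMask.
      unsigned Imm = W[0] | (W[1] << 2) | (W[2] << 4) | (W[3] << 6);
      std::printf("lanes %d%d%d%d -> imm 0x%02X\n", W[0], W[1], W[2], W[3], Imm);
      // Prints "lanes 2301 -> imm 0x4E": the familiar PSHUFD qword swap.
    }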
+static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); + + if (isSingleInputShuffleMask(Mask)) { + // Straight shuffle of a single input vector. For everything from SSE2 + // onward this has a single fast instruction with no scary immediates. + // We have to map the mask as it is actually a v4i32 shuffle instruction. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1); + int WidenedMask[4] = { + std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, + std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; + return DAG.getNode( + ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1, + getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); + } + + // We implement this with SHUFPD which is pretty lame because it will likely + // incur 2 cycles of stall for integer vectors on Nehalem and older chips. + // However, all the alternatives are still more cycles and newer chips don't + // have this problem. It would be really nice if x86 had better shuffles here. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2); + return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); +} + +/// \brief Lower 4-lane 32-bit floating point shuffles. +/// +/// Uses instructions exclusively from the floating point unit to minimize +/// domain crossing penalties, as these are sufficient to implement all v4f32 +/// shuffles. +static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SDValue LowV = V1, HighV = V2; + int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) + // Straight shuffle of a single input vector. We pass the input vector to + // both operands to simulate this with a SHUFPS. + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + if (NumV2Elements == 1) { + int V2Index = + std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - + Mask.begin(); + // Compute the index adjacent to V2Index and in the same half by toggling + // the low bit. + int V2AdjIndex = V2Index ^ 1; + + if (Mask[V2AdjIndex] == -1) { + // Handles all the cases where we have a single V2 element and an undef. + // This will only ever happen in the high lanes because we commute the + // vector otherwise. + if (V2Index < 2) + std::swap(LowV, HighV); + NewMask[V2Index] -= 4; + } else { + // Handle the case where the V2 element ends up adjacent to a V1 element. 
+ // To make this work, blend them together as the first step. + int V1Index = V2AdjIndex; + int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; + V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1, + getV4X86ShuffleImm8ForMask(BlendMask, DAG)); + + // Now proceed to reconstruct the final blend as we have the necessary + // high or low half formed. + if (V2Index < 2) { + LowV = V2; + HighV = V1; + } else { + HighV = V2; + } + NewMask[V1Index] = 2; // We put the V1 element in V2[2]. + NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. + } + } else if (NumV2Elements == 2) { + if (Mask[0] < 4 && Mask[1] < 4) { + // Handle the easy case where we have V1 in the low lanes and V2 in the + // high lanes. We never see this reversed because we sort the shuffle. + NewMask[2] -= 4; + NewMask[3] -= 4; + } else { + // We have a mixture of V1 and V2 in both low and high lanes. Rather than + // trying to place elements directly, just blend them and set up the final + // shuffle to place them. + + // The first two blend mask elements are for V1, the second two are for + // V2. + int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], + Mask[2] < 4 ? Mask[2] : Mask[3], + (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, + (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; + V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2, + getV4X86ShuffleImm8ForMask(BlendMask, DAG)); + + // Now we do a normal shuffle of V1 by giving V1 as both operands to + // a blend. + LowV = HighV = V1; + NewMask[0] = Mask[0] < 4 ? 0 : 2; + NewMask[1] = Mask[0] < 4 ? 2 : 0; + NewMask[2] = Mask[2] < 4 ? 1 : 3; + NewMask[3] = Mask[2] < 4 ? 3 : 1; + } + } + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV, + getV4X86ShuffleImm8ForMask(NewMask, DAG)); +} + +/// \brief Lower 4-lane i32 vector shuffles. +/// +/// We try to handle these with integer-domain shuffles where we can, but for +/// blends we use the floating point domain blend instructions. +static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + if (isSingleInputShuffleMask(Mask)) + // Straight shuffle of a single input vector. For everything from SSE2 + // onward this has a single fast instruction with no scary immediates. + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // We implement this with SHUFPS because it can blend from two vectors. + // Because we're going to eventually use SHUFPS, we use SHUFPS even to build + // up the inputs, bypassing domain shift penalties that we would incur if we + // directly used PSHUFD on Nehalem and older. For newer chips, this isn't + // relevant. + return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, + DAG.getVectorShuffle( + MVT::v4f32, DL, + DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1), + DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask)); +} + +/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 +/// shuffle lowering, and the most complex part.
+/// +/// The lowering strategy is to try to form pairs of input lanes which are +/// targeted at the same half of the final vector, and then use a dword shuffle +/// to place them onto the right half, and finally unpack the paired lanes into +/// their final position. +/// +/// The exact breakdown of how to form these dword pairs and align them on the +/// correct sides is really tricky. See the comments within the function for +/// more of the details. +static SDValue lowerV8I16SingleInputVectorShuffle( + SDLoc DL, SDValue V, MutableArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); + MutableArrayRef<int> LoMask = Mask.slice(0, 4); + MutableArrayRef<int> HiMask = Mask.slice(4, 4); + + SmallVector<int, 4> LoInputs; + std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs), + [](int M) { return M >= 0; }); + std::sort(LoInputs.begin(), LoInputs.end()); + LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); + SmallVector<int, 4> HiInputs; + std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs), + [](int M) { return M >= 0; }); + std::sort(HiInputs.begin(), HiInputs.end()); + HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); + int NumLToL = + std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); + int NumHToL = LoInputs.size() - NumLToL; + int NumLToH = + std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); + int NumHToH = HiInputs.size() - NumLToH; + MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL); + MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH); + MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); + MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); + + // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all + // such inputs we can swap two of the dwords across the half mark and end up + // with <=2 inputs to each half in each half. Once there, we can fall through + // to the generic code below. For example: + // + // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] + // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] + // + // Before we had 3-1 in the low half and 3-1 in the high half. Afterward, 2-2 + // and 2-2. + auto balanceSides = [&](ArrayRef<int> ThreeInputs, int OneInput, + int ThreeInputHalfSum, int OneInputHalfOffset) { + // Compute the index of dword with only one word among the three inputs in + // a half by taking the sum of the half with three inputs and subtracting + // the sum of the actual three inputs. The difference is the remaining + // slot. + int DWordA = (ThreeInputHalfSum - + std::accumulate(ThreeInputs.begin(), ThreeInputs.end(), 0)) / + 2; + int DWordB = OneInputHalfOffset / 2 + (OneInput / 2 + 1) % 2; + + int PSHUFDMask[] = {0, 1, 2, 3}; + PSHUFDMask[DWordA] = DWordB; + PSHUFDMask[DWordB] = DWordA; + V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + + // Adjust the mask to match the new locations of A and B. + for (int &M : Mask) + if (M != -1 && M/2 == DWordA) + M = 2 * DWordB + M % 2; + else if (M != -1 && M/2 == DWordB) + M = 2 * DWordA + M % 2; + + // Recurse back into this routine to re-compute state now that this isn't + // a 3 and 1 problem. 
+ return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), + Mask); + }; + if (NumLToL == 3 && NumHToL == 1) + return balanceSides(LToLInputs, HToLInputs[0], 0 + 1 + 2 + 3, 4); + else if (NumLToL == 1 && NumHToL == 3) + return balanceSides(HToLInputs, LToLInputs[0], 4 + 5 + 6 + 7, 0); + else if (NumLToH == 1 && NumHToH == 3) + return balanceSides(HToHInputs, LToHInputs[0], 4 + 5 + 6 + 7, 0); + else if (NumLToH == 3 && NumHToH == 1) + return balanceSides(LToHInputs, HToHInputs[0], 0 + 1 + 2 + 3, 4); + + // At this point there are at most two inputs to the low and high halves from + // each half. That means the inputs can always be grouped into dwords and + // those dwords can then be moved to the correct half with a dword shuffle. + // We use at most one low and one high word shuffle to collect these paired + // inputs into dwords, and finally a dword shuffle to place them. + int PSHUFLMask[4] = {-1, -1, -1, -1}; + int PSHUFHMask[4] = {-1, -1, -1, -1}; + int PSHUFDMask[4] = {-1, -1, -1, -1}; + + // First fix the masks for all the inputs that are staying in their + // original halves. This will then dictate the targets of the cross-half + // shuffles. + auto fixInPlaceInputs = [&PSHUFDMask]( + ArrayRef<int> InPlaceInputs, MutableArrayRef<int> SourceHalfMask, + MutableArrayRef<int> HalfMask, int HalfOffset) { + if (InPlaceInputs.empty()) + return; + if (InPlaceInputs.size() == 1) { + SourceHalfMask[InPlaceInputs[0] - HalfOffset] = + InPlaceInputs[0] - HalfOffset; + PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; + return; + } + + assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); + SourceHalfMask[InPlaceInputs[0] - HalfOffset] = + InPlaceInputs[0] - HalfOffset; + // Put the second input next to the first so that they are packed into + // a dword. We find the adjacent index by toggling the low bit. + int AdjIndex = InPlaceInputs[0] ^ 1; + SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; + std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); + PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; + }; + if (!HToLInputs.empty()) + fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0); + if (!LToHInputs.empty()) + fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4); + + // Now gather the cross-half inputs and place them into a free dword of + // their target half. + // FIXME: This operation could almost certainly be simplified dramatically to + // look more like the 3-1 fixing operation. + auto moveInputsToRightHalf = [&PSHUFDMask]( + MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs, + MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask, + int SourceOffset, int DestOffset) { + auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) { + return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; + }; + auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask, + int Word) { + int LowWord = Word & ~1; + int HighWord = Word | 1; + return isWordClobbered(SourceHalfMask, LowWord) || + isWordClobbered(SourceHalfMask, HighWord); + }; + + if (IncomingInputs.empty()) + return; + + if (ExistingInputs.empty()) { + // Map any dwords with inputs from them into the right half. + for (int Input : IncomingInputs) { + // If the source half mask maps over the inputs, turn those into + // swaps and use the swapped lane. 
+ if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { + if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) { + SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = + Input - SourceOffset; + // We have to swap the uses in our half mask in one sweep. + for (int &M : HalfMask) + if (M == SourceHalfMask[Input - SourceOffset]) + M = Input; + else if (M == Input) + M = SourceHalfMask[Input - SourceOffset] + SourceOffset; + } else { + assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == + Input - SourceOffset && + "Previous placement doesn't match!"); + } + // Note that this correctly re-maps both when we do a swap and when + // we observe the other side of the swap above. We rely on that to + // avoid swapping the members of the input list directly. + Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; + } + + // Map the input's dword into the correct half. + if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1) + PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; + else + assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == + Input / 2 && + "Previous placement doesn't match!"); + } + + // And just directly shift any other-half mask elements to be same-half + // as we will have mirrored the dword containing the element into the + // same position within that half. + for (int &M : HalfMask) + if (M >= SourceOffset && M < SourceOffset + 4) { + M = M - SourceOffset + DestOffset; + assert(M >= 0 && "This should never wrap below zero!"); + } + return; + } + + // Ensure we have the input in a viable dword of its current half. This + // is particularly tricky because the original position may be clobbered + // by inputs being moved and *staying* in that half. + if (IncomingInputs.size() == 1) { + if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { + int InputFixed = std::find(std::begin(SourceHalfMask), + std::end(SourceHalfMask), -1) - + std::begin(SourceHalfMask) + SourceOffset; + SourceHalfMask[InputFixed - SourceOffset] = + IncomingInputs[0] - SourceOffset; + std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], + InputFixed); + IncomingInputs[0] = InputFixed; + } + } else if (IncomingInputs.size() == 2) { + if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || + isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { + int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2; + assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) && + "Not all dwords can be clobbered!"); + SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset; + SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset; + for (int &M : HalfMask) + if (M == IncomingInputs[0]) + M = SourceDWordBase + SourceOffset; + else if (M == IncomingInputs[1]) + M = SourceDWordBase + 1 + SourceOffset; + IncomingInputs[0] = SourceDWordBase + SourceOffset; + IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset; + } + } else { + llvm_unreachable("Unhandled input size!"); + } + + // Now hoist the DWord down to the right half. + int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 
0 : 1) + DestOffset / 2; + assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); + PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; + for (int Input : IncomingInputs) + std::replace(HalfMask.begin(), HalfMask.end(), Input, + FreeDWord * 2 + Input % 2); + }; + moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, + /*SourceOffset*/ 4, /*DestOffset*/ 0); + moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, + /*SourceOffset*/ 0, /*DestOffset*/ 4); + + // Now enact all the shuffles we've computed to move the inputs into their + // target half. + if (!isNoopShuffleMask(PSHUFLMask)) + V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG)); + if (!isNoopShuffleMask(PSHUFHMask)) + V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG)); + if (!isNoopShuffleMask(PSHUFDMask)) + V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + + // At this point, each half should contain all its inputs, and we can then + // just shuffle them into their final position. + assert(std::count_if(LoMask.begin(), LoMask.end(), + [](int M) { return M >= 4; }) == 0 && + "Failed to lift all the high half inputs to the low mask!"); + assert(std::count_if(HiMask.begin(), HiMask.end(), + [](int M) { return M >= 0 && M < 4; }) == 0 && + "Failed to lift all the low half inputs to the high mask!"); + + // Do a half shuffle for the low mask. + if (!isNoopShuffleMask(LoMask)) + V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(LoMask, DAG)); + + // Do a half shuffle with the high mask after shifting its values down. + for (int &M : HiMask) + if (M >= 0) + M -= 4; + if (!isNoopShuffleMask(HiMask)) + V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(HiMask, DAG)); + + return V; +} + +/// \brief Detect whether the mask pattern should be lowered through +/// interleaving. +/// +/// This essentially tests whether viewing the mask as an interleaving of two +/// sub-sequences reduces the cross-input traffic of a blend operation. If so, +/// lowering it through interleaving is a significantly better strategy. +static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) { + int NumEvenInputs[2] = {0, 0}; + int NumOddInputs[2] = {0, 0}; + int NumLoInputs[2] = {0, 0}; + int NumHiInputs[2] = {0, 0}; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] < 0) + continue; + + int InputIdx = Mask[i] >= Size; + + if (i < Size / 2) + ++NumLoInputs[InputIdx]; + else + ++NumHiInputs[InputIdx]; + + if ((i % 2) == 0) + ++NumEvenInputs[InputIdx]; + else + ++NumOddInputs[InputIdx]; + } + + // The minimum number of cross-input results for both the interleaved and + // split cases. If interleaving results in fewer cross-input results, return + // true. + int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0], + NumEvenInputs[0] + NumOddInputs[1]); + int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0], + NumLoInputs[0] + NumHiInputs[1]); + return InterleavedCrosses < SplitCrosses; +} + +/// \brief Blend two v8i16 vectors using a naive unpack strategy. +/// +/// This strategy only works when the inputs from each vector fit into a single +/// half of that vector, and generally there are not so many inputs as to leave +/// the in-place shuffles required highly constrained (and thus expensive). 
It
+/// shifts all the inputs into a single side of both input vectors and then
+/// uses an unpack to interleave these inputs in a single vector. At that
+/// point, we will fall back on the generic single input shuffle lowering.
+static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
+                                                 SDValue V2,
+                                                 MutableArrayRef<int> Mask,
+                                                 const X86Subtarget *Subtarget,
+                                                 SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+  SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
+  for (int i = 0; i < 8; ++i)
+    if (Mask[i] >= 0 && Mask[i] < 4)
+      LoV1Inputs.push_back(i);
+    else if (Mask[i] >= 4 && Mask[i] < 8)
+      HiV1Inputs.push_back(i);
+    else if (Mask[i] >= 8 && Mask[i] < 12)
+      LoV2Inputs.push_back(i);
+    else if (Mask[i] >= 12)
+      HiV2Inputs.push_back(i);
+
+  int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
+  int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
+  (void)NumV1Inputs;
+  (void)NumV2Inputs;
+  assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
+  assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
+  assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
+
+  bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
+                     HiV1Inputs.size() + HiV2Inputs.size();
+
+  auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
+                              ArrayRef<int> HiInputs, bool MoveToLo,
+                              int MaskOffset) {
+    ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
+    ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
+    if (BadInputs.empty())
+      return V;
+
+    int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+    int MoveOffset = MoveToLo ? 0 : 4;
+
+    if (GoodInputs.empty()) {
+      for (int BadInput : BadInputs) {
+        MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
+        Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
+      }
+    } else {
+      if (GoodInputs.size() == 2) {
+        // If the low inputs are spread across two dwords, pack them into
+        // a single dword.
+        MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] =
+            Mask[GoodInputs[0]] - MaskOffset;
+        MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] =
+            Mask[GoodInputs[1]] - MaskOffset;
+        Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
+        Mask[GoodInputs[1]] = Mask[GoodInputs[1]] % 2 + MoveOffset + MaskOffset;
+      } else {
+        // Otherwise pin the low inputs.
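+        // (These inputs already lie in the half being gathered into, so just
+        // keep them in place and leave the remaining slots free for the bad
+        // inputs moved below.)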
+ for (int GoodInput : GoodInputs) + MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset; + } + + int MoveMaskIdx = + std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) - + std::begin(MoveMask); + assert(MoveMaskIdx >= MoveOffset && "Established above"); + + if (BadInputs.size() == 2) { + assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot"); + assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot"); + MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] = + Mask[BadInputs[0]] - MaskOffset; + MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] = + Mask[BadInputs[1]] - MaskOffset; + Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset; + Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset; + } else { + assert(BadInputs.size() == 1 && "All sizes handled"); + MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; + Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; + } + } + + return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), + MoveMask); + }; + V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo, + /*MaskOffset*/ 0); + V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo, + /*MaskOffset*/ 8); + + // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes + // cross-half traffic in the final shuffle. + + // Munge the mask to be a single-input mask after the unpack merges the + // results. + for (int &M : Mask) + if (M != -1) + M = 2 * (M % 4) + (M / 8); + + return DAG.getVectorShuffle( + MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, + DL, MVT::v8i16, V1, V2), + DAG.getUNDEF(MVT::v8i16), Mask); +} + +/// \brief Generic lowering of 8-lane i16 shuffles. +/// +/// This handles both single-input shuffles and combined shuffle/blends with +/// two inputs. The single input shuffles are immediately delegated to +/// a dedicated lowering routine. +/// +/// The blends are lowered in one of three fundamental ways. If there are few +/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle +/// of the input is significantly cheaper when lowered as an interleaving of +/// the two inputs, try to interleave them. Otherwise, blend the low and high +/// halves of the inputs separately (making them have relatively few inputs) +/// and then concatenate them. 
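+///
+/// For example, the two-input mask <0, 8, 1, 9, 2, 10, 3, 11> draws every
+/// even element from V1 and every odd element from V2. Splitting it into its
+/// even and odd sub-sequences leaves no cross-input traffic at all: each
+/// sub-sequence is a single-input shuffle, and one UNPCKL of the two results
+/// reassembles the requested interleave.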
+static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> OrigMask = SVOp->getMask(); + int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], + OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]}; + MutableArrayRef<int> Mask(MaskStorage); + + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + auto isV1 = [](int M) { return M >= 0 && M < 8; }; + auto isV2 = [](int M) { return M >= 8; }; + + int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1); + int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2); + + if (NumV2Inputs == 0) + return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG); + + assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized " + "to be V1-input shuffles."); + + if (NumV1Inputs + NumV2Inputs <= 4) + return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG); + + // Check whether an interleaving lowering is likely to be more efficient. + // This isn't perfect but it is a strong heuristic that tends to work well on + // the kinds of shuffles that show up in practice. + // + // FIXME: Handle 1x, 2x, and 4x interleaving. + if (shouldLowerAsInterleaving(Mask)) { + // FIXME: Figure out whether we should pack these into the low or high + // halves. + + int EMask[8], OMask[8]; + for (int i = 0; i < 4; ++i) { + EMask[i] = Mask[2*i]; + OMask[i] = Mask[2*i + 1]; + EMask[i + 4] = -1; + OMask[i + 4] = -1; + } + + SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask); + SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask); + + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds); + } + + int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + + for (int i = 0; i < 4; ++i) { + LoBlendMask[i] = Mask[i]; + HiBlendMask[i] = Mask[i + 4]; + } + + SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); + SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); + LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV); + HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV); + + return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV)); +} + +/// \brief Generic lowering of v16i8 shuffles. +/// +/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to +/// detect any complexity reducing interleaving. If that doesn't help, it uses +/// UNPCK to spread the i8 elements across two i16-element vectors, and uses +/// the existing lowering for v8i16 blends on each half, finally PACK-ing them +/// back together. 
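+///
+/// For example, UNPCKL of V1 with a zero vector yields
+/// <V1[0], 0, V1[1], 0, ..., V1[7], 0>; read as v8i16, each lane is the
+/// zero-extension of one source byte. After the v8i16 blends, the final
+/// PACKUS (unsigned saturation) simply drops the known-zero high bytes.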
+static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> OrigMask = SVOp->getMask(); + assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + int MaskStorage[16] = { + OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], + OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7], + OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11], + OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]}; + MutableArrayRef<int> Mask(MaskStorage); + MutableArrayRef<int> LoMask = Mask.slice(0, 8); + MutableArrayRef<int> HiMask = Mask.slice(8, 8); + + // For single-input shuffles, there are some nicer lowering tricks we can use. + if (isSingleInputShuffleMask(Mask)) { + // Check whether we can widen this to an i16 shuffle by duplicating bytes. + // Notably, this handles splat and partial-splat shuffles more efficiently. + // However, it only makes sense if the pre-duplication shuffle simplifies + // things significantly. Currently, this means we need to be able to + // express the pre-duplication shuffle as an i16 shuffle. + // + // FIXME: We should check for other patterns which can be widened into an + // i16 shuffle as well. + auto canWidenViaDuplication = [](ArrayRef<int> Mask) { + for (int i = 0; i < 16; i += 2) { + if (Mask[i] != Mask[i + 1]) + return false; + } + return true; + }; + auto tryToWidenViaDuplication = [&]() -> SDValue { + if (!canWidenViaDuplication(Mask)) + return SDValue(); + SmallVector<int, 4> LoInputs; + std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), + [](int M) { return M >= 0 && M < 8; }); + std::sort(LoInputs.begin(), LoInputs.end()); + LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), + LoInputs.end()); + SmallVector<int, 4> HiInputs; + std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), + [](int M) { return M >= 8; }); + std::sort(HiInputs.begin(), HiInputs.end()); + HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), + HiInputs.end()); + + bool TargetLo = LoInputs.size() >= HiInputs.size(); + ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs; + ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs; + + int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; + SmallDenseMap<int, int, 8> LaneMap; + for (int I : InPlaceInputs) { + PreDupI16Shuffle[I/2] = I/2; + LaneMap[I] = I; + } + int j = TargetLo ? 0 : 4, je = j + 4; + for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { + // Check if j is already a shuffle of this input. This happens when + // there are two adjacent bytes after we move the low one. + if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { + // If we haven't yet mapped the input, search for a slot into which + // we can map it. + while (j < je && PreDupI16Shuffle[j] != -1) + ++j; + + if (j == je) + // We can't place the inputs into a single half with a simple i16 shuffle, so bail. + return SDValue(); + + // Map this input with the i16 shuffle. + PreDupI16Shuffle[j] = MovingInputs[i] / 2; + } + + // Update the lane map based on the mapping we ended up with. 
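+        // (The new byte index is twice the chosen word slot plus the byte's
+        // parity within its word.)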
+ LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; + } + V1 = DAG.getNode( + ISD::BITCAST, DL, MVT::v16i8, + DAG.getVectorShuffle(MVT::v8i16, DL, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), + DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); + + // Unpack the bytes to form the i16s that will be shuffled into place. + V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, + MVT::v16i8, V1, V1); + + int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + for (int i = 0; i < 16; i += 2) { + if (Mask[i] != -1) + PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); + assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!"); + } + return DAG.getNode( + ISD::BITCAST, DL, MVT::v16i8, + DAG.getVectorShuffle(MVT::v8i16, DL, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), + DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); + }; + if (SDValue V = tryToWidenViaDuplication()) + return V; + } + + // Check whether an interleaving lowering is likely to be more efficient. + // This isn't perfect but it is a strong heuristic that tends to work well on + // the kinds of shuffles that show up in practice. + // + // FIXME: We need to handle other interleaving widths (i16, i32, ...). + if (shouldLowerAsInterleaving(Mask)) { + // FIXME: Figure out whether we should pack these into the low or high + // halves. + + int EMask[16], OMask[16]; + for (int i = 0; i < 8; ++i) { + EMask[i] = Mask[2*i]; + OMask[i] = Mask[2*i + 1]; + EMask[i + 8] = -1; + OMask[i + 8] = -1; + } + + SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask); + SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask); + + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds); + } + + int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + + auto buildBlendMasks = [](MutableArrayRef<int> HalfMask, + MutableArrayRef<int> V1HalfBlendMask, + MutableArrayRef<int> V2HalfBlendMask) { + for (int i = 0; i < 8; ++i) + if (HalfMask[i] >= 0 && HalfMask[i] < 16) { + V1HalfBlendMask[i] = HalfMask[i]; + HalfMask[i] = i; + } else if (HalfMask[i] >= 16) { + V2HalfBlendMask[i] = HalfMask[i] - 16; + HalfMask[i] = i + 8; + } + }; + buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask); + buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask); + + SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); + + auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask, + MutableArrayRef<int> HiBlendMask) { + SDValue V1, V2; + // Check if any of the odd lanes in the v16i8 are used. If not, we can mask + // them out and avoid using UNPCK{L,H} to extract the elements of V as + // i16s. + if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(), + [](int M) { return M >= 0 && M % 2 == 1; }) && + std::none_of(HiBlendMask.begin(), HiBlendMask.end(), + [](int M) { return M >= 0 && M % 2 == 1; })) { + // Use a mask to drop the high bytes. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1, + DAG.getConstant(0x00FF, MVT::v8i16)); + + // This will be a single vector shuffle instead of a blend so nuke V2. + V2 = DAG.getUNDEF(MVT::v8i16); + + // Squash the masks to point directly into V1. 
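+      // (After the 0x00FF mask, i16 lane i of V1 holds the byte that was at
+      // byte index 2 * i, so halving the byte indices retargets the masks
+      // onto the right i16 lanes.)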
+      for (int &M : LoBlendMask)
+        if (M >= 0)
+          M /= 2;
+      for (int &M : HiBlendMask)
+        if (M >= 0)
+          M /= 2;
+    } else {
+      // Otherwise just unpack the low half of V into V1 and the high half into
+      // V2 so that we can blend them as i16s.
+      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                       DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+      V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                       DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+    }
+
+    SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
+    SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
+    return std::make_pair(BlendedLo, BlendedHi);
+  };
+  SDValue V1Lo, V1Hi, V2Lo, V2Hi;
+  std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
+  std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
+
+  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
+  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
+
+  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
+}
+
+/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
+///
+/// This routine breaks down the specific type of 128-bit shuffle and
+/// dispatches to the lowering routines accordingly.
+static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                        MVT VT, const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  switch (VT.SimpleTy) {
+  case MVT::v2i64:
+    return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v2f64:
+    return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v4i32:
+    return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v4f32:
+    return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v8i16:
+    return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v16i8:
+    return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+  default:
+    llvm_unreachable("Unimplemented!");
+  }
+}
+
+/// \brief Tiny helper function to test whether adjacent mask elements form
+/// sequential pairs.
+static bool areAdjacentMasksSequential(ArrayRef<int> Mask) {
+  for (int i = 0, Size = Mask.size(); i < Size; i += 2)
+    if (Mask[i] + 1 != Mask[i+1])
+      return false;
+
+  return true;
+}
+
+/// \brief Top-level lowering for x86 vector shuffles.
+///
+/// This handles decomposition, canonicalization, and lowering of all x86
+/// vector shuffles. Most of the specific lowering strategies are encapsulated
+/// above in helper routines. The canonicalization attempts to widen shuffles
+/// to involve fewer lanes of wider elements, consolidate symmetric patterns
+/// such that only one of the two inputs needs to be tested, etc.
+static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+                                  SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  MVT VT = Op.getSimpleValueType();
+  int NumElements = VT.getVectorNumElements();
+  SDLoc dl(Op);
+
+  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
+
+  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  if (V1IsUndef && V2IsUndef)
+    return DAG.getUNDEF(VT);
+
+  // When we create a shuffle node we put the UNDEF node as the second
+  // operand, but in some cases the first operand may be transformed to UNDEF.
+  // In this case we should just commute the node.
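+  // (Commuting swaps the two operands and rewrites the mask to match.)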
+  if (V1IsUndef)
+    return CommuteVectorShuffle(SVOp, DAG);
+
+  // Check for non-undef masks pointing at an undef vector and make the masks
+  // undef as well. This makes it easier to match the shuffle based solely on
+  // the mask.
+  if (V2IsUndef)
+    for (int M : Mask)
+      if (M >= NumElements) {
+        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+        for (int &M : NewMask)
+          if (M >= NumElements)
+            M = -1;
+        return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
+      }
+
+  // For integer vector shuffles, try to collapse them into a shuffle of fewer
+  // lanes but wider integers. We cap this to not form integers larger than i64
+  // but it might be interesting to form i128 integers to handle flipping the
+  // low and high halves of AVX 256-bit vectors.
+  if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
+      areAdjacentMasksSequential(Mask)) {
+    SmallVector<int, 8> NewMask;
+    for (int i = 0, Size = Mask.size(); i < Size; i += 2)
+      NewMask.push_back(Mask[i] / 2);
+    MVT NewVT =
+        MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
+                         VT.getVectorNumElements() / 2);
+    V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
+    return DAG.getNode(ISD::BITCAST, dl, VT,
+                       DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask));
+  }
+
+  int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
+  for (int M : SVOp->getMask())
+    if (M < 0)
+      ++NumUndefElements;
+    else if (M < NumElements)
+      ++NumV1Elements;
+    else
+      ++NumV2Elements;
+
+  // Commute the shuffle as needed such that more elements come from V1 than
+  // V2. This allows us to match the shuffle pattern strictly on how many
+  // elements come from V1 without handling the symmetric cases.
+  if (NumV2Elements > NumV1Elements)
+    return CommuteVectorShuffle(SVOp, DAG);
+
+  // When the number of V1 and V2 elements is the same, try to minimize the
+  // number of uses of V2 in the low half of the vector.
+  if (NumV1Elements == NumV2Elements) {
+    int LowV1Elements = 0, LowV2Elements = 0;
+    for (int M : SVOp->getMask().slice(0, NumElements / 2))
+      if (M >= NumElements)
+        ++LowV2Elements;
+      else if (M >= 0)
+        ++LowV1Elements;
+    if (LowV2Elements > LowV1Elements)
+      return CommuteVectorShuffle(SVOp, DAG);
+  }
+
+  // For each vector width, delegate to a specialized lowering routine.
+  if (VT.getSizeInBits() == 128)
+    return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+
+  llvm_unreachable("Unimplemented!");
+}
+
+
+//===----------------------------------------------------------------------===//
+// Legacy vector shuffle lowering
+//
+// This is the legacy code handling vector shuffles until the code above
+// replaces it in both functionality and performance.
+//===----------------------------------------------------------------------===//
+
+static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
+                        bool hasInt256, unsigned *MaskOut = nullptr) {
   MVT EltVT = VT.getVectorElementType();
-  unsigned NumElems = VT.getVectorNumElements();
 
   // There is no blend with immediate in AVX-512.
   if (VT.is512BitVector())
-    return SDValue();
+    return false;
 
-  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
-    return SDValue();
-  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
-    return SDValue();
+  if (!hasSSE41 || EltVT == MVT::i8)
+    return false;
+  if (!hasInt256 && VT == MVT::v16i16)
+    return false;
 
-  // Check the mask for BLEND and build the value.
   unsigned MaskValue = 0;
+  unsigned NumElems = VT.getVectorNumElements();
   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
-  unsigned NumLanes = (NumElems-1)/8 + 1;
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
   unsigned NumElemsInLane = NumElems / NumLanes;
 
   // Blend for v16i16 should be symmetric for both lanes.
   for (unsigned i = 0; i < NumElemsInLane; ++i) {
-    int SndLaneEltIdx = (NumLanes == 2) ?
-      SVOp->getMaskElt(i + NumElemsInLane) : -1;
-    int EltIdx = SVOp->getMaskElt(i);
+    int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
+    int EltIdx = MaskVals[i];
 
     if ((EltIdx < 0 || EltIdx == (int)i) &&
         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
@@ -6469,11 +8044,34 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
     if (((unsigned)EltIdx == (i + NumElems)) &&
         (SndLaneEltIdx < 0 ||
          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
-      MaskValue |= (1<<i);
+      MaskValue |= (1 << i);
     else
-      return SDValue();
+      return false;
   }
 
+  if (MaskOut)
+    *MaskOut = MaskValue;
+  return true;
+}
+
+// Try to lower a shuffle node into a simple blend instruction.
+// This function assumes isBlendMask returns true for this
+// ShuffleVectorSDNode.
+static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+                                          unsigned MaskValue,
+                                          const X86Subtarget *Subtarget,
+                                          SelectionDAG &DAG) {
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
+                     Subtarget->hasInt256()) &&
+         "Trying to lower a VECTOR_SHUFFLE to a Blend but "
+         "with the wrong mask");
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  SDLoc dl(SVOp);
+  unsigned NumElems = VT.getVectorNumElements();
+
   // Convert i32 vectors to floating point if it is not AVX2.
   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
   MVT BlendVT = VT;
@@ -7450,8 +9048,9 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
          "unsupported vector type for insertps/pinsrd");
 
-  int FromV1 = std::count_if(Mask.begin(), Mask.end(),
-                             [](const int &i) { return i < 4; });
+  auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
+  auto FromV2Predicate = [](const int &i) { return i >= 4; };
+  int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
 
   SDValue From;
   SDValue To;
@@ -7459,23 +9058,26 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
   if (FromV1 == 1) {
     From = V1;
     To = V2;
-    DestIndex = std::find_if(Mask.begin(), Mask.end(),
-                             [](const int &i) { return i < 4; }) -
+    DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
                 Mask.begin();
   } else {
+    assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
+           "More than one element from V1 and from V2, or no elements from one "
+           "of the vectors. This case should not have returned true from "
+           "isINSERTPSMask");
    From = V2;
     To = V1;
-    DestIndex = std::find_if(Mask.begin(), Mask.end(),
-                             [](const int &i) { return i >= 4; }) -
-                Mask.begin();
+    DestIndex =
+        std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
  }
 
+  unsigned SrcIndex = Mask[DestIndex] % 4;
   if (MayFoldLoad(From)) {
     // Trivial case, when From comes from a load and is only used by the
     // shuffle. Make it use insertps from the vector that we need from that
     // load.
SDValue NewLoad = - NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG); + NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG); if (!NewLoad.getNode()) return SDValue(); @@ -7496,7 +9098,6 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, } // Vector-element-to-vector - unsigned SrcIndex = Mask[DestIndex] % 4; SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6); return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask); } @@ -7663,6 +9264,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool OptForSize = MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + // Check if we should use the experimental vector shuffle lowering. If so, + // delegate completely to that code path. + if (ExperimentalVectorShuffleLowering) + return lowerVectorShuffle(Op, Subtarget, DAG); + assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); if (V1IsUndef && V2IsUndef) @@ -7796,8 +9402,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool Commuted = false; // FIXME: This should also accept a bitcast of a splat? Be careful, not // 1,1,1,1 -> v8i16 though. - V1IsSplat = isSplatVector(V1.getNode()); - V2IsSplat = isSplatVector(V2.getNode()); + BitVector UndefElements; + if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode())) + if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) + V1IsSplat = true; + if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode())) + if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) + V2IsSplat = true; // Canonicalize the splat or undef, if present, to be on the RHS. if (!V2IsUndef && V1IsSplat && !V2IsSplat) { @@ -7873,6 +9484,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { getShufflePSHUFLWImmediate(SVOp), DAG); + unsigned MaskValue; + if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(), + &MaskValue)) + return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG); + if (isSHUFPMask(M, VT)) return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, getShuffleSHUFImmediate(SVOp), DAG); @@ -7910,10 +9526,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG); - if (BlendOp.getNode()) - return BlendOp; - if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT)) return getINSERTPS(SVOp, dl, DAG); @@ -8530,7 +10142,7 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { // global base reg. unsigned char OpFlag = 0; unsigned WrapperKind = X86ISD::Wrapper; - CodeModel::Model M = getTargetMachine().getCodeModel(); + CodeModel::Model M = DAG.getTarget().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) @@ -8563,7 +10175,7 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // global base reg. unsigned char OpFlag = 0; unsigned WrapperKind = X86ISD::Wrapper; - CodeModel::Model M = getTargetMachine().getCodeModel(); + CodeModel::Model M = DAG.getTarget().getCodeModel(); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) @@ -8596,7 +10208,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // global base reg. 
   unsigned char OpFlag = 0;
   unsigned WrapperKind = X86ISD::Wrapper;
-  CodeModel::Model M = getTargetMachine().getCodeModel();
+  CodeModel::Model M = DAG.getTarget().getCodeModel();
 
   if (Subtarget->isPICStyleRIPRel() &&
       (M == CodeModel::Small || M == CodeModel::Kernel)) {
@@ -8617,7 +10229,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
 
   // With PIC, the address is actually $g + Offset.
-  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+  if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
       !Subtarget->is64Bit()) {
     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                          DAG.getNode(X86ISD::GlobalBaseReg,
@@ -8639,7 +10251,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
   // Create the TargetBlockAddressAddress node.
   unsigned char OpFlags =
     Subtarget->ClassifyBlockAddressReference();
-  CodeModel::Model M = getTargetMachine().getCodeModel();
+  CodeModel::Model M = DAG.getTarget().getCodeModel();
   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
   SDLoc dl(Op);
@@ -8668,8 +10280,8 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
   // Create the TargetGlobalAddress node, folding in the constant
   // offset if it is legal.
   unsigned char OpFlags =
-      Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
-  CodeModel::Model M = getTargetMachine().getCodeModel();
+      Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
+  CodeModel::Model M = DAG.getTarget().getCodeModel();
   SDValue Result;
   if (OpFlags == X86II::MO_NO_FLAG &&
       X86::isOffsetSuitableForCodeModel(Offset, M)) {
@@ -8868,7 +10480,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   const GlobalValue *GV = GA->getGlobal();
 
   if (Subtarget->isTargetELF()) {
-    TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
 
     switch (model) {
       case TLSModel::GeneralDynamic:
@@ -8880,9 +10492,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
                                    Subtarget->is64Bit());
       case TLSModel::InitialExec:
      case TLSModel::LocalExec:
-        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
-                                   Subtarget->is64Bit(),
-                        getTargetMachine().getRelocationModel() == Reloc::PIC_);
+        return LowerToTLSExecModel(
+            GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
+            DAG.getTarget().getRelocationModel() == Reloc::PIC_);
     }
     llvm_unreachable("Unknown TLS model.");
   }
@@ -8895,8 +10507,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
     // global base reg.
-    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
-                 !Subtarget->is64Bit();
+    bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
+                 !Subtarget->is64Bit();
     if (PIC32)
       OpFlag = X86II::MO_TLVP_PIC_BASE;
     else
@@ -10050,10 +11662,27 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
     break;
   case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE:
-  case X86::COND_O: case X86::COND_NO:
-    NeedOF = true;
+  case X86::COND_O: case X86::COND_NO: {
+    // Check whether we really need to set the Overflow flag. If NoSignedWrap
+    // is present, it is not actually needed.
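+    // (With no signed wrap the arithmetic cannot overflow, so OF can be
+    // assumed clear and the EFLAGS produced by the operation match what a
+    // separate TEST would give.)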
+    switch (Op->getOpcode()) {
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::MUL:
+    case ISD::SHL: {
+      const BinaryWithFlagsSDNode *BinNode =
+          cast<BinaryWithFlagsSDNode>(Op.getNode());
+      if (BinNode->hasNoSignedWrap())
+        break;
+      // FALL THROUGH
+    }
+    default:
+      NeedOF = true;
+      break;
+    }
     break;
   }
+  }
   // See if we can use the EFLAGS value from the operand instead of
   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
   // we prove that the arithmetic won't overflow, we can't use OF or CF.
@@ -10115,14 +11744,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
     if (ConstantSDNode *C =
         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
       // An add of one will be selected as an INC.
-      if (C->getAPIntValue() == 1) {
+      if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
         Opcode = X86ISD::INC;
         NumOperands = 1;
         break;
       }
 
       // An add of negative one (subtract of one) will be selected as a DEC.
-      if (C->getAPIntValue().isAllOnesValue()) {
+      if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
         Opcode = X86ISD::DEC;
         NumOperands = 1;
         break;
@@ -10138,7 +11767,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
   // If we have a constant logical shift that's only used in a comparison
   // against zero turn it into an equivalent AND. This allows turning it into
   // a TEST instruction later.
-  if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
+  if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
      isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
    EVT VT = Op.getValueType();
     unsigned BitWidth = VT.getSizeInBits();
@@ -11469,8 +13098,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   }
 
   if (addTest) {
-    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
-    Cond = EmitTest(Cond, X86::COND_NE, dl, DAG);
+    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
+    CC = DAG.getConstant(X86Cond, MVT::i8);
+    Cond = EmitTest(Cond, X86Cond, dl, DAG);
   }
   Cond = ConvertCmpIfNecessary(Cond, DAG);
   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
@@ -11513,7 +13143,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
     Chain = SP.getValue(1);
     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
-    const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
+    const TargetFrameLowering &TFI = *DAG.getTarget().getFrameLowering();
     unsigned StackAlign = TFI.getStackAlignment();
     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
     if (Align > StackAlign)
@@ -11572,7 +13202,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
 
     const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+      static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
     unsigned SPReg = RegInfo->getStackRegister();
     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
     Chain = SP.getValue(1);
@@ -11681,7 +13311,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
 
   if (ArgMode == 2) {
     // Sanity Check: Make sure using fp_offset makes sense.
- assert(!getTargetMachine().Options.UseSoftFloat && + assert(!DAG.getTarget().Options.UseSoftFloat && !(DAG.getMachineFunction() .getFunction()->getAttributes() .hasAttribute(AttributeSet::FunctionIndex, @@ -12158,11 +13788,37 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx2_packusdw: + return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_pshuf_b_128: case Intrinsic::x86_avx2_pshuf_b: return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse2_pshuf_d: + return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pshufl_w: + return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pshufh_w: + return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_psign_b_128: case Intrinsic::x86_ssse3_psign_w_128: case Intrinsic::x86_ssse3_psign_d_128: @@ -12610,6 +14266,51 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, return SDValue(Res, 0); } +// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that +// read performance monitor counters (x86_rdpmc). +static void getReadPerformanceCounter(SDNode *N, SDLoc DL, + SelectionDAG &DAG, const X86Subtarget *Subtarget, + SmallVectorImpl<SDValue> &Results) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue LO, HI; + + // The ECX register is used to select the index of the performance counter + // to read. + SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, + N->getOperand(2)); + SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); + + // Reads the content of a 64-bit performance counter and returns it in the + // registers EDX:EAX. + if (Subtarget->is64Bit()) { + LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, + LO.getValue(2)); + } else { + LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, + LO.getValue(2)); + } + Chain = HI.getValue(1); + + if (Subtarget->is64Bit()) { + // The EAX register is loaded with the low-order 32 bits. The EDX register + // is loaded with the supported high-order bits of the counter. + SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, + DAG.getConstant(32, MVT::i8)); + Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); + Results.push_back(Chain); + return; + } + + // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
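+  // (i64 is not a legal type on 32-bit targets, so the value is returned as
+  // a BUILD_PAIR of two i32 halves for the legalizer to consume.)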
+ SDValue Ops[] = { LO, HI }; + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); + Results.push_back(Pair); + Results.push_back(Chain); +} + // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is // also used to custom lower READCYCLECOUNTER nodes. @@ -12674,7 +14375,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, } enum IntrinsicType { - GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST }; struct IntrinsicData { @@ -12768,6 +14469,8 @@ static void InitIntinsicsMap() { IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0))); IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp, IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_rdpmc, + IntrinsicData(RDPMC, X86ISD::RDPMC_DAG, 0))); Initialized = true; } @@ -12826,7 +14529,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, case PREFETCH: { SDValue Hint = Op.getOperand(6); unsigned HintVal; - if (dyn_cast<ConstantSDNode> (Hint) == 0 || + if (dyn_cast<ConstantSDNode> (Hint) == nullptr || (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1) llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1"); unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0); @@ -12843,6 +14546,12 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } + // Read Performance Monitoring Counters. + case RDPMC: { + SmallVector<SDValue, 2> Results; + getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); + return DAG.getMergeValues(Results, dl); + } // XTEST intrinsics. 
case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); @@ -12873,7 +14582,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, @@ -12895,7 +14604,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && @@ -12924,7 +14633,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); } @@ -12936,7 +14645,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && @@ -12983,7 +14692,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDLoc dl (Op); const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); - const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo* TRI = DAG.getTarget().getRegisterInfo(); if (Subtarget->is64Bit()) { SDValue OutChains[6]; @@ -13431,7 +15140,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons CLI.setDebugLoc(dl).setChain(InChain) .setCallee(getLibcallCallingConv(LC), static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), - Callee, &Args, 0) + Callee, std::move(Args), 0) .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); @@ -13448,7 +15157,7 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, (VT == MVT::v8i32 && Subtarget->hasInt256())); // Get the high parts. - const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8}; + const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1}; SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); @@ -13464,10 +15173,18 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1)); // Shuffle it back into the right order. 
-  const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15};
-  SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
-  const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14};
-  SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  SDValue Highs, Lows;
+  if (VT == MVT::v8i32) {
+    const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
+    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+    const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
+    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  } else {
+    const int HighMask[] = {1, 5, 3, 7};
+    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+    const int LowMask[] = {0, 4, 2, 6};
+    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  }
 
   // If we have a signed multiply but no PMULDQ, fix up the high parts of an
   // unsigned multiply.
@@ -13494,10 +15211,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
   SDValue Amt = Op.getOperand(1);
 
   // Optimize shl/srl/sra with constant shift amount.
-  if (isSplatVector(Amt.getNode())) {
-    SDValue SclrAmt = Amt->getOperand(0);
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
-      uint64_t ShiftAmt = C->getZExtValue();
+  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
+      uint64_t ShiftAmt = ShiftConst->getZExtValue();
 
       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
           (Subtarget->hasInt256() &&
@@ -13804,15 +15520,14 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
 
 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
                           SelectionDAG &DAG) {
-
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
   SDValue R = Op.getOperand(0);
   SDValue Amt = Op.getOperand(1);
   SDValue V;
 
-  if (!Subtarget->hasSSE2())
-    return SDValue();
+  assert(VT.isVector() && "Custom lowering only for vector shifts!");
+  assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
 
   V = LowerScalarImmediateShift(Op, DAG, Subtarget);
   if (V.getNode())
@@ -14254,7 +15969,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
     break;
   }
   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
-                                  Op.getOperand(2), SDValue());
+                                 Op.getOperand(2), SDValue());
   SDValue Ops[] = { cpIn.getValue(0),
                     Op.getOperand(1),
                     Op.getOperand(3),
@@ -14264,9 +15979,18 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                            Ops, T, MMO);
+
   SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T,
                                      Result.getValue(1));
-  return cpOut;
+  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
+                                      MVT::i32, cpOut.getValue(2));
+  SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
+                                DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
+
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
+  return SDValue();
 }
 
 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
@@ -14422,7 +16146,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
 
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
-    .setCallee(CallingConv::C, RetTy, Callee, &Args, 0);
+    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
 
   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
 
@@
-14446,7 +16170,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("Should not custom lower this!"); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); - case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG); + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: + return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); @@ -14528,8 +16253,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } static void ReplaceATOMIC_LOAD(SDNode *Node, - SmallVectorImpl<SDValue> &Results, - SelectionDAG &DAG) { + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) { SDLoc dl(Node); EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); @@ -14538,38 +16263,16 @@ static void ReplaceATOMIC_LOAD(SDNode *Node, // (The only way to get a 16-byte load is cmpxchg16b) // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. SDValue Zero = DAG.getConstant(0, VT); - SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT, - Node->getOperand(0), - Node->getOperand(1), Zero, Zero, - cast<AtomicSDNode>(Node)->getMemOperand(), - cast<AtomicSDNode>(Node)->getOrdering(), - cast<AtomicSDNode>(Node)->getOrdering(), - cast<AtomicSDNode>(Node)->getSynchScope()); + SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); + SDValue Swap = + DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs, + Node->getOperand(0), Node->getOperand(1), Zero, Zero, + cast<AtomicSDNode>(Node)->getMemOperand(), + cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getSynchScope()); Results.push_back(Swap.getValue(0)); - Results.push_back(Swap.getValue(1)); -} - -static void -ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, - SelectionDAG &DAG, unsigned NewOp) { - SDLoc dl(Node); - assert (Node->getValueType(0) == MVT::i64 && - "Only know how to expand i64 atomics"); - - SDValue Chain = Node->getOperand(0); - SDValue In1 = Node->getOperand(1); - SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(2), DAG.getIntPtrConstant(0)); - SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Node->getOperand(2), DAG.getIntPtrConstant(1)); - SDValue Ops[] = { Chain, In1, In2L, In2H }; - SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); - SDValue Result = - DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64, - cast<MemSDNode>(Node)->getMemOperand()); - SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF)); - Results.push_back(Result.getValue(2)); + Results.push_back(Swap.getValue(2)); } /// ReplaceNodeResults - Replace a node with an illegal result type @@ -14656,13 +16359,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case Intrinsic::x86_rdtscp: return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, Results); + case Intrinsic::x86_rdpmc: + return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); } } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); } - case ISD::ATOMIC_CMP_SWAP: { + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == 
MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; @@ -14704,61 +16409,33 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Regs64bit ? X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; + + SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, + MVT::i32, cpOutH.getValue(2)); + SDValue Success = + DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); + Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); - Results.push_back(cpOutH.getValue(1)); + Results.push_back(Success); + Results.push_back(EFLAGS.getValue(1)); return; } + case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_OR: - case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_XOR: - case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: - case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_SWAP: { - unsigned Opc; - switch (N->getOpcode()) { - default: llvm_unreachable("Unexpected opcode"); - case ISD::ATOMIC_LOAD_ADD: - Opc = X86ISD::ATOMADD64_DAG; - break; - case ISD::ATOMIC_LOAD_AND: - Opc = X86ISD::ATOMAND64_DAG; - break; - case ISD::ATOMIC_LOAD_NAND: - Opc = X86ISD::ATOMNAND64_DAG; - break; - case ISD::ATOMIC_LOAD_OR: - Opc = X86ISD::ATOMOR64_DAG; - break; - case ISD::ATOMIC_LOAD_SUB: - Opc = X86ISD::ATOMSUB64_DAG; - break; - case ISD::ATOMIC_LOAD_XOR: - Opc = X86ISD::ATOMXOR64_DAG; - break; - case ISD::ATOMIC_LOAD_MAX: - Opc = X86ISD::ATOMMAX64_DAG; - break; - case ISD::ATOMIC_LOAD_MIN: - Opc = X86ISD::ATOMMIN64_DAG; - break; - case ISD::ATOMIC_LOAD_UMAX: - Opc = X86ISD::ATOMUMAX64_DAG; - break; - case ISD::ATOMIC_LOAD_UMIN: - Opc = X86ISD::ATOMUMIN64_DAG; - break; - case ISD::ATOMIC_SWAP: - Opc = X86ISD::ATOMSWAP64_DAG; - break; - } - ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc); - return; - } + case ISD::ATOMIC_LOAD_UMAX: + // Delegate to generic TypeLegalization. Situations we can really handle + // should have already been dealt with by X86AtomicExpand.cpp. + break; case ISD::ATOMIC_LOAD: { ReplaceATOMIC_LOAD(N, Results, DAG); return; @@ -14779,6 +16456,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, MVT::v2f64, N->getOperand(0)); SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded); + if (ExperimentalVectorWideningLegalization) { + // If we are legalizing vectors by widening, we already have the desired + // legal vector type, just return it. 
+ Results.push_back(ToVecInt); + return; + } + SmallVector<SDValue, 8> Elts; for (unsigned i = 0, e = NumElts; i != e; ++i) Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, @@ -14810,6 +16494,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FST: return "X86ISD::FST"; case X86ISD::CALL: return "X86ISD::CALL"; case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; + case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; + case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; @@ -14863,12 +16549,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; - case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; - case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; - case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; - case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; - case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; - case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; + case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; case X86ISD::VZEXT: return "X86ISD::VZEXT"; @@ -14909,6 +16590,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::TESTM: return "X86ISD::TESTM"; case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; + case X86ISD::PACKSS: return "X86ISD::PACKSS"; + case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; @@ -15173,7 +16856,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, isUNPCKLMask(M, SVT, Subtarget->hasInt256()) || isUNPCKHMask(M, SVT, Subtarget->hasInt256()) || isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || - isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256())); + isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || + isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256())); } bool @@ -15256,685 +16940,6 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, return sinkMBB; } -// Get CMPXCHG opcode for the specified data type. -static unsigned getCmpXChgOpcode(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - case MVT::i8: return X86::LCMPXCHG8; - case MVT::i16: return X86::LCMPXCHG16; - case MVT::i32: return X86::LCMPXCHG32; - case MVT::i64: return X86::LCMPXCHG64; - default: - break; - } - llvm_unreachable("Invalid operand size!"); -} - -// Get LOAD opcode for the specified data type. -static unsigned getLoadOpcode(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - case MVT::i8: return X86::MOV8rm; - case MVT::i16: return X86::MOV16rm; - case MVT::i32: return X86::MOV32rm; - case MVT::i64: return X86::MOV64rm; - default: - break; - } - llvm_unreachable("Invalid operand size!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction. 
-static unsigned getNonAtomicOpcode(unsigned Opc) { - switch (Opc) { - case X86::ATOMAND8: return X86::AND8rr; - case X86::ATOMAND16: return X86::AND16rr; - case X86::ATOMAND32: return X86::AND32rr; - case X86::ATOMAND64: return X86::AND64rr; - case X86::ATOMOR8: return X86::OR8rr; - case X86::ATOMOR16: return X86::OR16rr; - case X86::ATOMOR32: return X86::OR32rr; - case X86::ATOMOR64: return X86::OR64rr; - case X86::ATOMXOR8: return X86::XOR8rr; - case X86::ATOMXOR16: return X86::XOR16rr; - case X86::ATOMXOR32: return X86::XOR32rr; - case X86::ATOMXOR64: return X86::XOR64rr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction with -// extra opcode. -static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc, - unsigned &ExtraOpc) { - switch (Opc) { - case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr; - case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr; - case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr; - case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr; - case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr; - case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr; - case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr; - case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr; - case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr; - case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr; - case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr; - case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr; - case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr; - case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr; - case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr; - case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr; - case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr; - case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr; - case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr; - case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction for -// 64-bit data type on 32-bit target. -static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) { - switch (Opc) { - case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr; - case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr; - case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr; - case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr; - case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr; - case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr; - case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr; - case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr; - case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr; - case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get opcode of the non-atomic one from the specified atomic instruction for -// 64-bit data type on 32-bit target with extra opcode. 
-static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc, - unsigned &HiOpc, - unsigned &ExtraOpc) { - switch (Opc) { - case X86::ATOMNAND6432: - ExtraOpc = X86::NOT32r; - HiOpc = X86::AND32rr; - return X86::AND32rr; - } - llvm_unreachable("Unhandled atomic-load-op opcode!"); -} - -// Get pseudo CMOV opcode from the specified data type. -static unsigned getPseudoCMOVOpc(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - case MVT::i8: return X86::CMOV_GR8; - case MVT::i16: return X86::CMOV_GR16; - case MVT::i32: return X86::CMOV_GR32; - default: - break; - } - llvm_unreachable("Unknown CMOV opcode!"); -} - -// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions. -// They will be translated into a spin-loop or compare-exchange loop from -// -// ... -// dst = atomic-fetch-op MI.addr, MI.val -// ... -// -// to -// -// ... -// t1 = LOAD MI.addr -// loop: -// t4 = phi(t1, t3 / loop) -// t2 = OP MI.val, t4 -// EAX = t4 -// LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined] -// t3 = EAX -// JNE loop -// sink: -// dst = t3 -// ... -MachineBasicBlock * -X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, - MachineBasicBlock *MBB) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); - - MachineFunction *MF = MBB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; - - assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && - "Unexpected number of operands"); - - assert(MI->hasOneMemOperand() && - "Expected atomic-load-op to have one memoperand"); - - // Memory Reference - MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); - MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); - - unsigned DstReg, SrcReg; - unsigned MemOpndSlot; - - unsigned CurOp = 0; - - DstReg = MI->getOperand(CurOp++).getReg(); - MemOpndSlot = CurOp; - CurOp += X86::AddrNumOperands; - SrcReg = MI->getOperand(CurOp++).getReg(); - - const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - MVT::SimpleValueType VT = *RC->vt_begin(); - unsigned t1 = MRI.createVirtualRegister(RC); - unsigned t2 = MRI.createVirtualRegister(RC); - unsigned t3 = MRI.createVirtualRegister(RC); - unsigned t4 = MRI.createVirtualRegister(RC); - unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT); - - unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT); - unsigned LOADOpc = getLoadOpcode(VT); - - // For the atomic load-arith operator, we generate - // - // thisMBB: - // t1 = LOAD [MI.addr] - // mainMBB: - // t4 = phi(t1 / thisMBB, t3 / mainMBB) - // t1 = OP MI.val, EAX - // EAX = t4 - // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] - // t3 = EAX - // JNE mainMBB - // sinkMBB: - // dst = t3 - - MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); - MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); - MF->insert(I, mainMBB); - MF->insert(I, sinkMBB); - - MachineInstrBuilder MIB; - - // Transfer the remainder of BB and its successor edges to sinkMBB. 
- sinkMBB->splice(sinkMBB->begin(), MBB, - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); - - // thisMBB: - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) { - unsigned flags = (*MMOI)->getFlags(); - flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad; - MachineMemOperand *MMO = - MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags, - (*MMOI)->getSize(), - (*MMOI)->getBaseAlignment(), - (*MMOI)->getTBAAInfo(), - (*MMOI)->getRanges()); - MIB.addMemOperand(MMO); - } - - thisMBB->addSuccessor(mainMBB); - - // mainMBB: - MachineBasicBlock *origMainMBB = mainMBB; - - // Add a PHI. - MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4) - .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); - - unsigned Opc = MI->getOpcode(); - switch (Opc) { - default: - llvm_unreachable("Unhandled atomic-load-op opcode!"); - case X86::ATOMAND8: - case X86::ATOMAND16: - case X86::ATOMAND32: - case X86::ATOMAND64: - case X86::ATOMOR8: - case X86::ATOMOR16: - case X86::ATOMOR32: - case X86::ATOMOR64: - case X86::ATOMXOR8: - case X86::ATOMXOR16: - case X86::ATOMXOR32: - case X86::ATOMXOR64: { - unsigned ARITHOpc = getNonAtomicOpcode(Opc); - BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg) - .addReg(t4); - break; - } - case X86::ATOMNAND8: - case X86::ATOMNAND16: - case X86::ATOMNAND32: - case X86::ATOMNAND64: { - unsigned Tmp = MRI.createVirtualRegister(RC); - unsigned NOTOpc; - unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc); - BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg) - .addReg(t4); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp); - break; - } - case X86::ATOMMAX8: - case X86::ATOMMAX16: - case X86::ATOMMAX32: - case X86::ATOMMAX64: - case X86::ATOMMIN8: - case X86::ATOMMIN16: - case X86::ATOMMIN32: - case X86::ATOMMIN64: - case X86::ATOMUMAX8: - case X86::ATOMUMAX16: - case X86::ATOMUMAX32: - case X86::ATOMUMAX64: - case X86::ATOMUMIN8: - case X86::ATOMUMIN16: - case X86::ATOMUMIN32: - case X86::ATOMUMIN64: { - unsigned CMPOpc; - unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc); - - BuildMI(mainMBB, DL, TII->get(CMPOpc)) - .addReg(SrcReg) - .addReg(t4); - - if (Subtarget->hasCMov()) { - if (VT != MVT::i8) { - // Native support - BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) - .addReg(SrcReg) - .addReg(t4); - } else { - // Promote i8 to i32 to use CMOV32 - const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); - const TargetRegisterClass *RC32 = - TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit); - unsigned SrcReg32 = MRI.createVirtualRegister(RC32); - unsigned AccReg32 = MRI.createVirtualRegister(RC32); - unsigned Tmp = MRI.createVirtualRegister(RC32); - - unsigned Undef = MRI.createVirtualRegister(RC32); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef); - - BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32) - .addReg(Undef) - .addReg(SrcReg) - .addImm(X86::sub_8bit); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32) - .addReg(Undef) - .addReg(t4) - .addImm(X86::sub_8bit); - - BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp) - .addReg(SrcReg32) - .addReg(AccReg32); - - BuildMI(mainMBB, DL, 
TII->get(TargetOpcode::COPY), t2) - .addReg(Tmp, 0, X86::sub_8bit); - } - } else { - // Use pseudo select and lower them. - assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) && - "Invalid atomic-load-op transformation!"); - unsigned SelOpc = getPseudoCMOVOpc(VT); - X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc); - assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!"); - MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2) - .addReg(SrcReg).addReg(t4) - .addImm(CC); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - // Replace the original PHI node as mainMBB is changed after CMOV - // lowering. - BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4) - .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); - Phi->eraseFromParent(); - } - break; - } - } - - // Copy PhyReg back from virtual register. - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg) - .addReg(t4); - - MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - MIB.addReg(t2); - MIB.setMemRefs(MMOBegin, MMOEnd); - - // Copy PhyReg back to virtual register. - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3) - .addReg(PhyReg); - - BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); - - mainMBB->addSuccessor(origMainMBB); - mainMBB->addSuccessor(sinkMBB); - - // sinkMBB: - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), DstReg) - .addReg(t3); - - MI->eraseFromParent(); - return sinkMBB; -} - -// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic -// instructions. They will be translated into a spin-loop or compare-exchange -// loop from -// -// ... -// dst = atomic-fetch-op MI.addr, MI.val -// ... -// -// to -// -// ... -// t1L = LOAD [MI.addr + 0] -// t1H = LOAD [MI.addr + 4] -// loop: -// t4L = phi(t1L, t3L / loop) -// t4H = phi(t1H, t3H / loop) -// t2L = OP MI.val.lo, t4L -// t2H = OP MI.val.hi, t4H -// EAX = t4L -// EDX = t4H -// EBX = t2L -// ECX = t2H -// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] -// t3L = EAX -// t3H = EDX -// JNE loop -// sink: -// dstL = t3L -// dstH = t3H -// ... 
-MachineBasicBlock * -X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, - MachineBasicBlock *MBB) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); - - MachineFunction *MF = MBB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; - - assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 && - "Unexpected number of operands"); - - assert(MI->hasOneMemOperand() && - "Expected atomic-load-op32 to have one memoperand"); - - // Memory Reference - MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); - MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); - - unsigned DstLoReg, DstHiReg; - unsigned SrcLoReg, SrcHiReg; - unsigned MemOpndSlot; - - unsigned CurOp = 0; - - DstLoReg = MI->getOperand(CurOp++).getReg(); - DstHiReg = MI->getOperand(CurOp++).getReg(); - MemOpndSlot = CurOp; - CurOp += X86::AddrNumOperands; - SrcLoReg = MI->getOperand(CurOp++).getReg(); - SrcHiReg = MI->getOperand(CurOp++).getReg(); - - const TargetRegisterClass *RC = &X86::GR32RegClass; - const TargetRegisterClass *RC8 = &X86::GR8RegClass; - - unsigned t1L = MRI.createVirtualRegister(RC); - unsigned t1H = MRI.createVirtualRegister(RC); - unsigned t2L = MRI.createVirtualRegister(RC); - unsigned t2H = MRI.createVirtualRegister(RC); - unsigned t3L = MRI.createVirtualRegister(RC); - unsigned t3H = MRI.createVirtualRegister(RC); - unsigned t4L = MRI.createVirtualRegister(RC); - unsigned t4H = MRI.createVirtualRegister(RC); - - unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; - unsigned LOADOpc = X86::MOV32rm; - - // For the atomic load-arith operator, we generate - // - // thisMBB: - // t1L = LOAD [MI.addr + 0] - // t1H = LOAD [MI.addr + 4] - // mainMBB: - // t4L = phi(t1L / thisMBB, t3L / mainMBB) - // t4H = phi(t1H / thisMBB, t3H / mainMBB) - // t2L = OP MI.val.lo, t4L - // t2H = OP MI.val.hi, t4H - // EBX = t2L - // ECX = t2H - // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] - // t3L = EAX - // t3H = EDX - // JNE loop - // sinkMBB: - // dstL = t3L - // dstH = t3H - - MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); - MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); - MF->insert(I, mainMBB); - MF->insert(I, sinkMBB); - - MachineInstrBuilder MIB; - - // Transfer the remainder of BB and its successor edges to sinkMBB. 
- sinkMBB->splice(sinkMBB->begin(), MBB, - std::next(MachineBasicBlock::iterator(MI)), MBB->end()); - sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); - - // thisMBB: - // Lo - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) { - unsigned flags = (*MMOI)->getFlags(); - flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad; - MachineMemOperand *MMO = - MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags, - (*MMOI)->getSize(), - (*MMOI)->getBaseAlignment(), - (*MMOI)->getTBAAInfo(), - (*MMOI)->getRanges()); - MIB.addMemOperand(MMO); - }; - MachineInstr *LowMI = MIB; - - // Hi - MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - if (i == X86::AddrDisp) { - MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32) - } else { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - } - MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end()); - - thisMBB->addSuccessor(mainMBB); - - // mainMBB: - MachineBasicBlock *origMainMBB = mainMBB; - - // Add PHIs. - MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L) - .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); - MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H) - .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); - - unsigned Opc = MI->getOpcode(); - switch (Opc) { - default: - llvm_unreachable("Unhandled atomic-load-op6432 opcode!"); - case X86::ATOMAND6432: - case X86::ATOMOR6432: - case X86::ATOMXOR6432: - case X86::ATOMADD6432: - case X86::ATOMSUB6432: { - unsigned HiOpc; - unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L) - .addReg(SrcLoReg); - BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H) - .addReg(SrcHiReg); - break; - } - case X86::ATOMNAND6432: { - unsigned HiOpc, NOTOpc; - unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc); - unsigned TmpL = MRI.createVirtualRegister(RC); - unsigned TmpH = MRI.createVirtualRegister(RC); - BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg) - .addReg(t4L); - BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg) - .addReg(t4H); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL); - BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH); - break; - } - case X86::ATOMMAX6432: - case X86::ATOMMIN6432: - case X86::ATOMUMAX6432: - case X86::ATOMUMIN6432: { - unsigned HiOpc; - unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - unsigned cL = MRI.createVirtualRegister(RC8); - unsigned cH = MRI.createVirtualRegister(RC8); - unsigned cL32 = MRI.createVirtualRegister(RC); - unsigned cH32 = MRI.createVirtualRegister(RC); - unsigned cc = MRI.createVirtualRegister(RC); - // cl := cmp src_lo, lo - BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) - .addReg(SrcLoReg).addReg(t4L); - BuildMI(mainMBB, DL, TII->get(LoOpc), cL); - BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL); - // ch := cmp src_hi, hi - BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) - .addReg(SrcHiReg).addReg(t4H); - BuildMI(mainMBB, DL, TII->get(HiOpc), cH); - BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH); - // cc := if 
(src_hi == hi) ? cl : ch; - if (Subtarget->hasCMov()) { - BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc) - .addReg(cH32).addReg(cL32); - } else { - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc) - .addReg(cH32).addReg(cL32) - .addImm(X86::COND_E); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - } - BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc); - if (Subtarget->hasCMov()) { - BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L) - .addReg(SrcLoReg).addReg(t4L); - BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H) - .addReg(SrcHiReg).addReg(t4H); - } else { - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L) - .addReg(SrcLoReg).addReg(t4L) - .addImm(X86::COND_NE); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the - // 2nd CMOV lowering. - mainMBB->addLiveIn(X86::EFLAGS); - MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H) - .addReg(SrcHiReg).addReg(t4H) - .addImm(X86::COND_NE); - mainMBB = EmitLoweredSelect(MIB, mainMBB); - // Replace the original PHI node as mainMBB is changed after CMOV - // lowering. - BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L) - .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB); - BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H) - .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB); - PhiL->eraseFromParent(); - PhiH->eraseFromParent(); - } - break; - } - case X86::ATOMSWAP6432: { - unsigned HiOpc; - unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); - BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg); - BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg); - break; - } - } - - // Copy EDX:EAX back from HiReg:LoReg - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H); - // Copy ECX:EBX from t1H:t1L - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H); - - MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); - for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { - MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); - if (NewMO.isReg()) - NewMO.setIsKill(false); - MIB.addOperand(NewMO); - } - MIB.setMemRefs(MMOBegin, MMOEnd); - - // Copy EDX:EAX back to t3H:t3L - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX); - BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX); - - BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); - - mainMBB->addSuccessor(origMainMBB); - mainMBB->addSuccessor(sinkMBB); - - // sinkMBB: - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), DstLoReg) - .addReg(t3L); - BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(TargetOpcode::COPY), DstHiReg) - .addReg(t3H); - - MI->eraseFromParent(); - return sinkMBB; -} - // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 // or XMM0_V32I8 in AVX all of this code can be replaced with that // in the .td file. 
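[Editor's illustration] The deletions above remove X86's hand-rolled MachineIR expansion of the ATOM* pseudo-instructions (EmitAtomicLoadArith and EmitAtomicLoadArith6432), matching the new ReplaceNodeResults comment that such cases "should have already been dealt with by X86AtomicExpand.cpp": atomic read-modify-write operations the target cannot handle natively are now rewritten into explicit compare-exchange loops before instruction selection. The same commit threads a success flag through cmpxchg lowering (ATOMIC_CMP_SWAP_WITH_SUCCESS), computed as a SETCC on X86::COND_E from EFLAGS. For orientation only, the loop the deleted code built out of LOAD/OP/LCMPXCHG/JNE has the shape of the std::atomic sketch below; the helper is hypothetical and is not code from this commit.

    #include <atomic>

    // Sketch of the cmpxchg retry loop that replaces the deleted ATOMNAND*
    // pseudo expansion. On x86, compare_exchange_weak compiles to LOCK CMPXCHG
    // with the expected value in EAX; its boolean result corresponds to the
    // ZF-derived "success" value that LowerCMP_SWAP now models via SETCC/COND_E.
    int atomic_fetch_nand(std::atomic<int> &Addr, int Val) {
      int Old = Addr.load();                           // t1 = LOAD [MI.addr]
      int New;
      do {
        New = ~(Old & Val);                            // t2 = OP MI.val, t4
      } while (!Addr.compare_exchange_weak(Old, New)); // LCMPXCHG; JNE loop
      return Old;                                      // dst = t3 (prior value)
    }

Note that compare_exchange_weak reloads the observed value into Old on failure, so the non-atomic operation is recomputed each iteration — the same role the t4 PHI played in the deleted mainMBB loop.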
@@ -16068,7 +17073,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); // Machine Information - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); @@ -16324,7 +17329,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned CountReg = MI->getOperand(0).getReg(); @@ -16407,7 +17412,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the @@ -16433,7 +17438,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. - const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo* TRI = BB->getParent()->getTarget().getRegisterInfo(); if (!MI->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); @@ -16474,9 +17479,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, bool Is64Bit) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = BB->getParent(); + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); @@ -16546,7 +17551,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, // Calls into a routine in libgcc to allocate more space from the heap. const uint32_t *RegMask = - getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); + MF->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C); if (Is64Bit) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); @@ -16594,8 +17599,8 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineBasicBlock * X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, - MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); assert(!Subtarget->isTargetMacho()); @@ -16651,10 +17656,10 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // our load from the relocation, sticking it in either RDI (x86-64) // or EAX and doing an indirect call. The return value will then // be in the normal return register. 
+ MachineFunction *F = BB->getParent(); const X86InstrInfo *TII - = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); + = static_cast<const X86InstrInfo*>(F->getTarget().getInstrInfo()); DebugLoc DL = MI->getDebugLoc(); - MachineFunction *F = BB->getParent(); assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); assert(MI->getOperand(3).isGlobal() && "This should be a global"); @@ -16663,7 +17668,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = - getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); + F->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) @@ -16675,7 +17680,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); - } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { + } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(0) @@ -16707,9 +17712,8 @@ MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); @@ -16771,8 +17775,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, unsigned PtrStoreOpc = 0; unsigned LabelReg = 0; const int64_t LabelOffset = 1 * PVT.getStoreSize(); - Reloc::Model RM = getTargetMachine().getRelocationModel(); - bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) && + Reloc::Model RM = MF->getTarget().getRelocationModel(); + bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); // Prepare IP either in reg or imm. @@ -16816,7 +17820,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, .addMBB(restoreMBB); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo()); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); @@ -16845,9 +17849,8 @@ MachineBasicBlock * X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference @@ -16863,7 +17866,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo()); unsigned FP = (PVT == MVT::i64) ? 
X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); @@ -17038,12 +18041,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::FP80_TO_INT16_IN_MEM: case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineFunction *F = BB->getParent(); + const TargetInstrInfo *TII = F->getTarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. - MachineFunction *F = BB->getParent(); int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); @@ -17123,7 +18126,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRM128MEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo()); + return EmitPCMPSTRM(MI, BB, BB->getParent()->getTarget().getInstrInfo()); // String/text processing lowering. case X86::PCMPISTRIREG: @@ -17136,71 +18139,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRIMEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo()); + return EmitPCMPSTRI(MI, BB, BB->getParent()->getTarget().getInstrInfo()); // Thread synchronization. case X86::MONITOR: - return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget); + return EmitMonitor(MI, BB, BB->getParent()->getTarget().getInstrInfo(), Subtarget); // xbegin case X86::XBEGIN: - return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo()); - - // Atomic Lowering. - case X86::ATOMAND8: - case X86::ATOMAND16: - case X86::ATOMAND32: - case X86::ATOMAND64: - // Fall through - case X86::ATOMOR8: - case X86::ATOMOR16: - case X86::ATOMOR32: - case X86::ATOMOR64: - // Fall through - case X86::ATOMXOR16: - case X86::ATOMXOR8: - case X86::ATOMXOR32: - case X86::ATOMXOR64: - // Fall through - case X86::ATOMNAND8: - case X86::ATOMNAND16: - case X86::ATOMNAND32: - case X86::ATOMNAND64: - // Fall through - case X86::ATOMMAX8: - case X86::ATOMMAX16: - case X86::ATOMMAX32: - case X86::ATOMMAX64: - // Fall through - case X86::ATOMMIN8: - case X86::ATOMMIN16: - case X86::ATOMMIN32: - case X86::ATOMMIN64: - // Fall through - case X86::ATOMUMAX8: - case X86::ATOMUMAX16: - case X86::ATOMUMAX32: - case X86::ATOMUMAX64: - // Fall through - case X86::ATOMUMIN8: - case X86::ATOMUMIN16: - case X86::ATOMUMIN32: - case X86::ATOMUMIN64: - return EmitAtomicLoadArith(MI, BB); - - // This group does 64-bit operations on a 32-bit host. - case X86::ATOMAND6432: - case X86::ATOMOR6432: - case X86::ATOMXOR6432: - case X86::ATOMNAND6432: - case X86::ATOMADD6432: - case X86::ATOMSUB6432: - case X86::ATOMMAX6432: - case X86::ATOMMIN6432: - case X86::ATOMUMAX6432: - case X86::ATOMUMIN6432: - case X86::ATOMSWAP6432: - return EmitAtomicLoadArith6432(MI, BB); + return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); @@ -17473,13 +18420,385 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// \brief Get the PSHUF-style mask from PSHUF node. 
+/// +/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 +/// PSHUF-style masks that can be reused with such instructions. +static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { + SmallVector<int, 4> Mask; + bool IsUnary; + bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary); + (void)HaveMask; + assert(HaveMask); + + switch (N.getOpcode()) { + case X86ISD::PSHUFD: + return Mask; + case X86ISD::PSHUFLW: + Mask.resize(4); + return Mask; + case X86ISD::PSHUFHW: + Mask.erase(Mask.begin(), Mask.begin() + 4); + for (int &M : Mask) + M -= 4; + return Mask; + default: + llvm_unreachable("No valid shuffle instruction found!"); + } +} + +/// \brief Search for a combinable shuffle across a chain ending in pshufd. +/// +/// We walk up the chain and look for a combinable shuffle, skipping over +/// shuffles that we could hoist this shuffle's transformation past without +/// altering anything. +static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + assert(N.getOpcode() == X86ISD::PSHUFD && + "Called with something other than an x86 128-bit half shuffle!"); + SDLoc DL(N); + + // Walk up a single-use chain looking for a combinable shuffle. + SDValue V = N.getOperand(0); + for (; V.hasOneUse(); V = V.getOperand(0)) { + switch (V.getOpcode()) { + default: + return false; // Nothing combined! + + case ISD::BITCAST: + // Skip bitcasts as we always know the type for the target specific + // instructions. + continue; + + case X86ISD::PSHUFD: + // Found another dword shuffle. + break; + + case X86ISD::PSHUFLW: + // Check that the low words (being shuffled) are the identity in the + // dword shuffle, and the high words are self-contained. + if (Mask[0] != 0 || Mask[1] != 1 || + !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) + return false; + + continue; + + case X86ISD::PSHUFHW: + // Check that the high words (being shuffled) are the identity in the + // dword shuffle, and the low words are self-contained. + if (Mask[2] != 2 || Mask[3] != 3 || + !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) + return false; + + continue; + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword + // shuffle into a preceding word shuffle. + if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16) + return false; + + // Search for a half-shuffle which we can combine with. + unsigned CombineOp = + V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; + if (V.getOperand(0) != V.getOperand(1) || + !V->isOnlyUserOf(V.getOperand(0).getNode())) + return false; + V = V.getOperand(0); + do { + switch (V.getOpcode()) { + default: + return false; // Nothing to combine. + + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + if (V.getOpcode() == CombineOp) + break; + + // Fallthrough! + case ISD::BITCAST: + V = V.getOperand(0); + continue; + } + break; + } while (V.hasOneUse()); + break; + } + // Break out of the loop if we break out of the switch. + break; + } + + if (!V.hasOneUse()) + // We fell out of the loop without finding a viable combining instruction. + return false; + + // Record the old value to use in RAUW-ing. + SDValue Old = V; + + // Merge this node's mask and our incoming mask. 
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); + for (int &M : Mask) + M = VMask[M]; + V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // It is possible that one of the combinable shuffles was completely absorbed + // by the other, just replace it and revisit all users in that case. + if (Old.getNode() == V.getNode()) { + DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true); + return true; + } + + // Replace N with its operand as we're going to combine that shuffle away. + DAG.ReplaceAllUsesWith(N, N.getOperand(0)); + + // Replace the combinable shuffle with the combined one, updating all users + // so that we re-evaluate the chain here. + DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); + return true; +} + +/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw. +/// +/// We walk up the chain, skipping shuffles of the other half and looking +/// through shuffles which switch halves trying to find a shuffle of the same +/// pair of dwords. +static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + assert( + (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && + "Called with something other than an x86 128-bit half shuffle!"); + SDLoc DL(N); + unsigned CombineOpcode = N.getOpcode(); + + // Walk up a single-use chain looking for a combinable shuffle. + SDValue V = N.getOperand(0); + for (; V.hasOneUse(); V = V.getOperand(0)) { + switch (V.getOpcode()) { + default: + return false; // Nothing combined! + + case ISD::BITCAST: + // Skip bitcasts as we always know the type for the target specific + // instructions. + continue; + + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + if (V.getOpcode() == CombineOpcode) + break; + + // Other-half shuffles are no-ops. + continue; + + case X86ISD::PSHUFD: { + // We can only handle pshufd if the half we are combining either stays in + // its half, or switches to the other half. Bail if one of these isn't + // true. + SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); + int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2; + if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) || + (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2))) + return false; + + // Map the mask through the pshufd and keep walking up the chain. + for (int i = 0; i < 4; ++i) + Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2; + + // Switch halves if the pshufd does. + CombineOpcode = + VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; + continue; + } + } + // Break out of the loop if we break out of the switch. + break; + } + + if (!V.hasOneUse()) + // We fell out of the loop without finding a viable combining instruction. + return false; + + // Record the old value to use in RAUW-ing. + SDValue Old = V; + + // Merge this node's mask and our incoming mask (adjusted to account for all + // the pshufd instructions encountered). + SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); + for (int &M : Mask) + M = VMask[M]; + V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // Replace N with its operand as we're going to combine that shuffle away. + DAG.ReplaceAllUsesWith(N, N.getOperand(0)); + + // Replace the combinable shuffle with the combined one, updating all users + // so that we re-evaluate the chain here. 
+ DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); + return true; +} + +/// \brief Try to combine x86 target specific shuffles. +static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + MVT VT = N.getSimpleValueType(); + SmallVector<int, 4> Mask; + + switch (N.getOpcode()) { + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + Mask = getPSHUFShuffleMask(N); + assert(Mask.size() == 4); + break; + default: + return SDValue(); + } + + // Nuke no-op shuffles that show up after combining. + if (isNoopShuffleMask(Mask)) + return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); + + // Look for simplifications involving one or two shuffle instructions. + SDValue V = N.getOperand(0); + switch (N.getOpcode()) { + default: + break; + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + assert(VT == MVT::v8i16); + (void)VT; + + if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) + return SDValue(); // We combined away this shuffle, so we're done. + + // See if this reduces to a PSHUFD which is no more expensive and can + // combine with more operations. + if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 && + areAdjacentMasksSequential(Mask)) { + int DMask[] = {-1, -1, -1, -1}; + int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; + DMask[DOffset + 0] = DOffset + Mask[0] / 2; + DMask[DOffset + 1] = DOffset + Mask[2] / 2; + V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); + DCI.AddToWorklist(V.getNode()); + V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, + getV4X86ShuffleImm8ForMask(DMask, DAG)); + DCI.AddToWorklist(V.getNode()); + return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + } + + // Look for shuffle patterns which can be implemented as a single unpack. + // FIXME: This doesn't handle the location of the PSHUFD generically, and + // only works when we have a PSHUFD followed by two half-shuffles. + if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && + (V.getOpcode() == X86ISD::PSHUFLW || + V.getOpcode() == X86ISD::PSHUFHW) && + V.getOpcode() != N.getOpcode() && + V.hasOneUse()) { + SDValue D = V.getOperand(0); + while (D.getOpcode() == ISD::BITCAST && D.hasOneUse()) + D = D.getOperand(0); + if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { + SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); + SmallVector<int, 4> DMask = getPSHUFShuffleMask(D); + int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; + int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; + int WordMask[8]; + for (int i = 0; i < 4; ++i) { + WordMask[i + NOffset] = Mask[i] + NOffset; + WordMask[i + VOffset] = VMask[i] + VOffset; + } + // Map the word mask through the DWord mask. + int MappedMask[8]; + for (int i = 0; i < 8; ++i) + MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; + const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3}; + const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7}; + if (std::equal(std::begin(MappedMask), std::end(MappedMask), + std::begin(UnpackLoMask)) || + std::equal(std::begin(MappedMask), std::end(MappedMask), + std::begin(UnpackHiMask))) { + // We can replace all three shuffles with an unpack. + V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0)); + DCI.AddToWorklist(V.getNode()); + return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL + : X86ISD::UNPCKH, + DL, MVT::v8i16, V, V); + } + } + } + + break; + + case X86ISD::PSHUFD: + if (combineRedundantDWordShuffle(N, Mask, DAG, DCI)) + return SDValue(); // We combined away this shuffle. 
+ + break; + } + + return SDValue(); +} + /// PerformShuffleCombine - Performs several different shuffle combines. static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { SDLoc dl(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + // Canonicalize shuffles that perform 'addsub' on packed float vectors + // according to the rule: + // (shuffle (FADD A, B), (FSUB A, B), Mask) -> + // (shuffle (FSUB A, -B), (FADD A, -B), Mask) + // + // Where 'Mask' is: + // <0,5,2,7> -- for v4f32 and v4f64 shuffles; + // <0,3> -- for v2f64 shuffles; + // <0,9,2,11,4,13,6,15> -- for v8f32 shuffles. + // + // This helps pattern-matching more SSE3/AVX ADDSUB instructions + // during ISel stage. + if (N->getOpcode() == ISD::VECTOR_SHUFFLE && + ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && + N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB && + // Operands to the FADD and FSUB must be the same. + ((N0->getOperand(0) == N1->getOperand(0) && + N0->getOperand(1) == N1->getOperand(1)) || + // FADD is commutable. See if by commuting the operands of the FADD + // we would still be able to match the operands of the FSUB dag node. + (N0->getOperand(1) == N1->getOperand(0) && + N0->getOperand(0) == N1->getOperand(1))) && + N0->getOperand(0)->getOpcode() != ISD::UNDEF && + N0->getOperand(1)->getOpcode() != ISD::UNDEF) { + + ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N); + unsigned NumElts = VT.getVectorNumElements(); + ArrayRef<int> Mask = SV->getMask(); + bool CanFold = true; + + for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) + CanFold = Mask[i] == (int)((i & 1) ? i + NumElts : i); + + if (CanFold) { + SDValue Op0 = N1->getOperand(0); + SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1)); + SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1); + SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1); + return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask); + } + } + // Don't create instructions with illegal types after legalize types has run. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) @@ -17490,6 +18809,57 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI, Subtarget); + // During Type Legalization, when promoting illegal vector types, + // the backend might introduce new shuffle dag nodes and bitcasts. + // + // This code performs the following transformation: + // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) -> + // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>) + // + // We do this only if both the bitcast and the BINOP dag nodes have + // one use. Also, perform this transformation only if the new binary + // operation is legal. This is to avoid introducing dag nodes that + // potentially need to be further expanded (or custom lowered) into a + // less optimal sequence of dag nodes. 
+ if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && + N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() && + N0.getOpcode() == ISD::BITCAST) { + SDValue BC0 = N0.getOperand(0); + EVT SVT = BC0.getValueType(); + unsigned Opcode = BC0.getOpcode(); + unsigned NumElts = VT.getVectorNumElements(); + + if (BC0.hasOneUse() && SVT.isVector() && + SVT.getVectorNumElements() * 2 == NumElts && + TLI.isOperationLegal(Opcode, VT)) { + bool CanFold = false; + switch (Opcode) { + default : break; + case ISD::ADD : + case ISD::FADD : + case ISD::SUB : + case ISD::FSUB : + case ISD::MUL : + case ISD::FMUL : + CanFold = true; + } + + unsigned SVTNumElts = SVT.getVectorNumElements(); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) + CanFold = SVOp->getMaskElt(i) == (int)(i * 2); + for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) + CanFold = SVOp->getMaskElt(i) < 0; + + if (CanFold) { + SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0)); + SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1)); + SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); + return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]); + } + } + } + // Only handle 128 wide vector from here on. if (!VT.is128BitVector()) return SDValue(); @@ -17501,7 +18871,18 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); - return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); + SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); + if (LD.getNode()) + return LD; + + if (isTargetShuffle(N->getOpcode())) { + SDValue Shuffle = + PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); + if (Shuffle.getNode()) + return Shuffle; + } + + return SDValue(); } /// PerformTruncateCombine - Converts truncate operation to @@ -18155,28 +19536,34 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); - // If the RHS is a constant we have to reverse the const canonicalization. - // x > C-1 ? x+-C : 0 --> subus x, C - if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && - isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) { - APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); - if (CondRHS.getConstantOperandVal(0) == -A-1) - return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, - DAG.getConstant(-A, VT)); - } - - // Another special case: If C was a sign bit, the sub has been - // canonicalized into a xor. - // FIXME: Would it be better to use computeKnownBits to determine whether - // it's safe to decanonicalize the xor? - // x s< 0 ? x^C : 0 --> subus x, C - if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && - ISD::isBuildVectorAllZeros(CondRHS.getNode()) && - isSplatVector(OpRHS.getNode())) { - APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); - if (A.isSignBit()) - return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); - } + if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) + if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { + if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS)) + if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode()) + // If the RHS is a constant we have to reverse the const + // canonicalization. 
+ // x > C-1 ? x+-C : 0 --> subus x, C + if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && + CondRHSConst->getAPIntValue() == + (-OpRHSConst->getAPIntValue() - 1)) + return DAG.getNode( + X86ISD::SUBUS, DL, VT, OpLHS, + DAG.getConstant(-OpRHSConst->getAPIntValue(), VT)); + + // Another special case: If C was a sign bit, the sub has been + // canonicalized into a xor. + // FIXME: Would it be better to use computeKnownBits to determine + // whether it's safe to decanonicalize the xor? + // x s< 0 ? x^C : 0 --> subus x, C + if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && + ISD::isBuildVectorAllZeros(CondRHS.getNode()) && + OpRHSConst->getAPIntValue().isSignBit()) + // Note that we have to rebuild the RHS constant here to ensure we + // don't rely on particular values of undef lanes. + return DAG.getNode( + X86ISD::SUBUS, DL, VT, OpLHS, + DAG.getConstant(OpRHSConst->getAPIntValue(), VT)); + } } } @@ -18743,6 +20130,8 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, if (C->isAllOnesValue()) return Op1; } + + return SDValue(); } // Packed SSE2/AVX2 arithmetic shift immediate intrinsics. @@ -18882,16 +20271,15 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { // vector operations in many cases. Also, on sandybridge ADD is faster than // shl. // (shl V, 1) -> add V,V - if (isSplatVector(N1.getNode())) { - assert(N0.getValueType().isVector() && "Invalid vector shift type"); - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0)); - // We shift all of the values by one. In many cases we do not have - // hardware support for this operation. This is better expressed as an ADD - // of two values. - if (N1C && (1 == N1C->getZExtValue())) { - return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); + if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1)) + if (auto *N1SplatC = N1BV->getConstantSplatNode()) { + assert(N0.getValueType().isVector() && "Invalid vector shift type"); + // We shift all of the values by one. In many cases we do not have + // hardware support for this operation. This is better expressed as an ADD + // of two values. + if (N1SplatC->getZExtValue() == 1) + return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } - } return SDValue(); } @@ -18910,10 +20298,9 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, SDValue Amt = N->getOperand(1); SDLoc DL(N); - if (isSplatVector(Amt.getNode())) { - SDValue SclrAmt = Amt->getOperand(0); - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { - APInt ShiftAmt = C->getAPIntValue(); + if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) + if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { + APInt ShiftAmt = AmtSplat->getAPIntValue(); unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); // SSE2/AVX2 logical shifts always return a vector of 0s @@ -18923,7 +20310,6 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, if (ShiftAmt.trunc(8).uge(MaxAmount)) return getZeroVector(VT, Subtarget, DAG, DL); } - } return SDValue(); } @@ -19117,9 +20503,10 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, // The right side has to be a 'trunc' or a constant vector. 
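// Backing up to the PSUBUS rewrites in PerformSELECTCombine above: both
// matched forms are plain unsigned saturating subtraction on each lane.
// A scalar sanity check over all uint8_t inputs; the splat constant C = 25
// is an arbitrary example, not a value taken from this patch:
#include <cassert>
#include <cstdint>

// psubus semantics on a single unsigned 8-bit lane.
static uint8_t subus(uint8_t x, uint8_t c) { return x >= c ? x - c : 0; }

int main() {
  const uint8_t C = 25;
  for (unsigned v = 0; v != 256; ++v) {
    uint8_t x = static_cast<uint8_t>(v);
    // Form 1: x > C-1 ? x + (-C) : 0 --> subus x, C (the add of -C wraps
    // back to x - C, and the guard rules out underflow).
    uint8_t AddForm = x > static_cast<uint8_t>(C - 1)
                          ? static_cast<uint8_t>(x + static_cast<uint8_t>(-C))
                          : 0;
    assert(AddForm == subus(x, C));
    // Form 2: when C is the sign bit, the sub was canonicalized to xor:
    // x s< 0 ? x ^ 0x80 : 0 --> subus x, 0x80.
    uint8_t XorForm = static_cast<int8_t>(x) < 0
                          ? static_cast<uint8_t>(x ^ 0x80)
                          : 0;
    assert(XorForm == subus(x, 0x80));
  }
  return 0;
}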
bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; - bool RHSConst = (isSplatVector(N1.getNode()) && - isa<ConstantSDNode>(N1->getOperand(0))); - if (!RHSTrunc && !RHSConst) + ConstantSDNode *RHSConstSplat = nullptr; + if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1)) + RHSConstSplat = RHSBV->getConstantSplatNode(); + if (!RHSTrunc && !RHSConstSplat) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -19129,9 +20516,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, // Set N0 and N1 to hold the inputs to the new wide operation. N0 = N0->getOperand(0); - if (RHSConst) { + if (RHSConstSplat) { N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), - N1->getOperand(0)); + SDValue(RHSConstSplat, 0)); SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); } else if (RHSTrunc) { @@ -19277,12 +20664,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); unsigned SraAmt = ~0; if (Mask.getOpcode() == ISD::SRA) { - SDValue Amt = Mask.getOperand(1); - if (isSplatVector(Amt.getNode())) { - SDValue SclrAmt = Amt->getOperand(0); - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) - SraAmt = C->getZExtValue(); - } + if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1))) + if (auto *AmtConst = AmtBV->getConstantSplatNode()) + SraAmt = AmtConst->getZExtValue(); } else if (Mask.getOpcode() == X86ISD::VSRAI) { SDValue SraC = Mask.getOperand(1); SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); @@ -20642,6 +22026,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget); case X86ISD::INSERTPS: return PerformINSERTPSCombine(N, DAG, Subtarget); + case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget); } return SDValue(); @@ -21146,8 +22531,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, const GlobalValue *GV = GA->getGlobal(); // If we require an extra load to get this address, as in PIC mode, we // can't accept it. - if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, - getTargetMachine()))) + if (isGlobalStubReference( + Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()))) return; Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), @@ -21425,3 +22810,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, return AM.Scale != 0; return -1; } + +bool X86TargetLowering::isTargetFTOL() const { + return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit(); +} diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 9f51b53a00c4..c8cdce7c7664 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -15,13 +15,13 @@ #ifndef X86ISELLOWERING_H #define X86ISELLOWERING_H -#include "X86Subtarget.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" namespace llvm { + class X86Subtarget; class X86TargetMachine; namespace X86ISD { @@ -86,6 +86,9 @@ namespace llvm { /// X86 Read Time-Stamp Counter and Processor ID. RDTSCP_DAG, + /// X86 Read Performance Monitoring Counters. + RDPMC_DAG, + /// X86 compare and logical compare instructions. CMP, COMI, UCOMI, @@ -315,6 +318,8 @@ namespace llvm { KORTEST, // Several flavors of instructions with vector shuffle behaviors. 
+ PACKSS, + PACKUS, PALIGNR, PSHUFD, PSHUFHW, @@ -400,23 +405,8 @@ namespace llvm { // XTEST - Test if in transactional execution. XTEST, - // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG, - // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG - - // Atomic 64-bit binary operations. - ATOMADD64_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, - ATOMSUB64_DAG, - ATOMOR64_DAG, - ATOMXOR64_DAG, - ATOMAND64_DAG, - ATOMNAND64_DAG, - ATOMMAX64_DAG, - ATOMMIN64_DAG, - ATOMUMAX64_DAG, - ATOMUMIN64_DAG, - ATOMSWAP64_DAG, - // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap. - LCMPXCHG_DAG, + LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, LCMPXCHG8_DAG, LCMPXCHG16_DAG, @@ -766,9 +756,7 @@ namespace llvm { /// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine /// for fptoui. - bool isTargetFTOL() const { - return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit(); - } + bool isTargetFTOL() const; /// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be /// used for fptoui to the given type. @@ -808,6 +796,9 @@ namespace llvm { /// \brief Reset the operation actions based on target options. void resetOperationActions() override; + /// \brief Customize the preferred legalization strategy for certain types. + LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; + protected: std::pair<const TargetRegisterClass*, uint8_t> findRepresentativeClass(MVT VT) const override; diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 37bcc5235e4f..41e900ed11a4 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -476,6 +476,28 @@ defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem, loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +multiclass avx512_int_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + RegisterClass KRC> { + let mayLoad = 1 in { + def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + []>, EVEX; + def krm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask, + x86memop:$src), + !strconcat(OpcodeStr, + " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + []>, EVEX, EVEX_KZ; + } +} + +defm VBROADCASTI32X4 : avx512_int_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", + i128mem, loadv2i64, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTI64X4 : avx512_int_subvec_broadcast_rm<0x5b, "vbroadcasti64x4", + i256mem, loadv4i64, VK16WM>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT4>; + def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))), (VPBROADCASTDZrr VR128X:$src)>; def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))), @@ -517,10 +539,12 @@ def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src), []>, EVEX; } +let Predicates = [HasCDI] in { defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512, VK16, v16i32, v16i1>, EVEX_V512; defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512, VK8, v8i64, v8i1>, EVEX_V512, VEX_W; +} //===----------------------------------------------------------------------===// // AVX-512 - VPERM @@ -585,7 +609,7 @@ defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, // -- VPERM2I - 3 source operands form -- multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, RegisterClass RC, PatFrag mem_frag, X86MemOperand x86memop, - SDNode 
OpNode, ValueType OpVT> { + SDNode OpNode, ValueType OpVT, RegisterClass KRC> { let Constraints = "$src1 = $dst" in { def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), @@ -595,48 +619,107 @@ let Constraints = "$src1 = $dst" in { (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, EVEX_4V; + def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + " \t{$src3, $src2, $dst {${mask}}|" + "$dst {${mask}}, $src2, $src3}"), + [(set RC:$dst, (OpVT (vselect KRC:$mask, + (OpNode RC:$src1, RC:$src2, + RC:$src3), + RC:$src1)))]>, + EVEX_4V, EVEX_K; + + let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> + def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + " \t{$src3, $src2, $dst {${mask}} {z} |", + "$dst {${mask}} {z}, $src2, $src3}"), + [(set RC:$dst, (OpVT (vselect KRC:$mask, + (OpNode RC:$src1, RC:$src2, + RC:$src3), + (OpVT (bitconvert + (v16i32 immAllZerosV))))))]>, + EVEX_4V, EVEX_KZ; + def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, " \t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, - (OpVT (OpNode RC:$src1, RC:$src2, + (OpVT (OpNode RC:$src1, RC:$src2, (mem_frag addr:$src3))))]>, EVEX_4V; + + def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + " \t{$src3, $src2, $dst {${mask}}|" + "$dst {${mask}}, $src2, $src3}"), + [(set RC:$dst, + (OpVT (vselect KRC:$mask, + (OpNode RC:$src1, RC:$src2, + (mem_frag addr:$src3)), + RC:$src1)))]>, + EVEX_4V, EVEX_K; + + let AddedComplexity = 10 in // Prefer over the rrkz variant + def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + " \t{$src3, $src2, $dst {${mask}} {z}|" + "$dst {${mask}} {z}, $src2, $src3}"), + [(set RC:$dst, + (OpVT (vselect KRC:$mask, + (OpNode RC:$src1, RC:$src2, + (mem_frag addr:$src3)), + (OpVT (bitconvert + (v16i32 immAllZerosV))))))]>, + EVEX_4V, EVEX_KZ; } } -defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem, - X86VPermiv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem, - X86VPermiv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, i512mem, - X86VPermiv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem, - X86VPermiv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPERMT2D : avx512_perm_3src<0x7E, "vpermt2d", VR512, memopv16i32, i512mem, - X86VPermv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMT2Q : avx512_perm_3src<0x7E, "vpermt2q", VR512, memopv8i64, i512mem, - X86VPermv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps", VR512, memopv16f32, i512mem, - X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd", VR512, memopv8f64, i512mem, - X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -def : Pat<(v16f32 (int_x86_avx512_mask_vpermt_ps_512 (v16i32 VR512:$idx), - (v16f32 VR512:$src1), (v16f32 VR512:$src2), (i16 -1))), - (VPERMT2PSrr VR512:$src1, VR512:$idx, VR512:$src2)>; - -def : Pat<(v16i32 (int_x86_avx512_mask_vpermt_d_512 (v16i32 VR512:$idx), - 
(v16i32 VR512:$src1), (v16i32 VR512:$src2), (i16 -1))), - (VPERMT2Drr VR512:$src1, VR512:$idx, VR512:$src2)>; - -def : Pat<(v8f64 (int_x86_avx512_mask_vpermt_pd_512 (v8i64 VR512:$idx), - (v8f64 VR512:$src1), (v8f64 VR512:$src2), (i8 -1))), - (VPERMT2PDrr VR512:$src1, VR512:$idx, VR512:$src2)>; - -def : Pat<(v8i64 (int_x86_avx512_mask_vpermt_q_512 (v8i64 VR512:$idx), - (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))), - (VPERMT2Qrr VR512:$src1, VR512:$idx, VR512:$src2)>; +defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, + i512mem, X86VPermiv3, v16i32, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, + i512mem, X86VPermiv3, v8i64, VK8WM>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, + i512mem, X86VPermiv3, v16f32, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, + i512mem, X86VPermiv3, v8f64, VK8WM>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +multiclass avx512_perm_table_3src<bits<8> opc, string Suffix, RegisterClass RC, + PatFrag mem_frag, X86MemOperand x86memop, + SDNode OpNode, ValueType OpVT, RegisterClass KRC, + ValueType MaskVT, RegisterClass MRC> : + avx512_perm_3src<opc, "vpermt2"##Suffix, RC, mem_frag, x86memop, OpNode, + OpVT, KRC> { + def : Pat<(OpVT (!cast<Intrinsic>("int_x86_avx512_mask_vpermt_"##Suffix##"_512") + VR512:$idx, VR512:$src1, VR512:$src2, -1)), + (!cast<Instruction>(NAME#rr) VR512:$src1, VR512:$idx, VR512:$src2)>; + + def : Pat<(OpVT (!cast<Intrinsic>("int_x86_avx512_mask_vpermt_"##Suffix##"_512") + VR512:$idx, VR512:$src1, VR512:$src2, MRC:$mask)), + (!cast<Instruction>(NAME#rrk) VR512:$src1, + (MaskVT (COPY_TO_REGCLASS MRC:$mask, KRC)), VR512:$idx, VR512:$src2)>; +} + +defm VPERMT2D : avx512_perm_table_3src<0x7E, "d", VR512, memopv16i32, i512mem, + X86VPermv3, v16i32, VK16WM, v16i1, GR16>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMT2Q : avx512_perm_table_3src<0x7E, "q", VR512, memopv8i64, i512mem, + X86VPermv3, v8i64, VK8WM, v8i1, GR8>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps", VR512, memopv16f32, i512mem, + X86VPermv3, v16f32, VK16WM, v16i1, GR16>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd", VR512, memopv8f64, i512mem, + X86VPermv3, v8f64, VK8WM, v8i1, GR8>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + //===----------------------------------------------------------------------===// // AVX-512 - BLEND using mask // @@ -790,52 +873,61 @@ def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; -multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC, +multiclass avx512_icmp_cc<bits<8> opc, RegisterClass WMRC, RegisterClass KRC, RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, - SDNode OpNode, ValueType vt, Operand CC, string asm, - string asm_alt> { + SDNode OpNode, ValueType vt, Operand CC, string Suffix> { def rri : AVX512AIi8<opc, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RR>, EVEX_4V; def rmi : AVX512AIi8<opc, MRMSrcMem, - (outs KRC:$dst), (ins 
RC:$src1, x86memop:$src2, CC:$cc), asm, + (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V; // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : AVX512AIi8<opc, MRMSrcReg, (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), - asm_alt, [], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + def rrik_alt : AVX512AIi8<opc, MRMSrcReg, + (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, RC:$src2, i8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; def rmi_alt : AVX512AIi8<opc, MRMSrcMem, (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), - asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rmik_alt : AVX512AIi8<opc, MRMSrcMem, + (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, x86memop:$src2, i8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; } } -defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16, VR512, i512mem, memopv16i32, - X86cmpm, v16i32, AVXCC, - "vpcmp${cc}d\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vpcmpd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16, VR512, i512mem, memopv16i32, - X86cmpmu, v16i32, AVXCC, - "vpcmp${cc}ud\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vpcmpud\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8, VR512, i512mem, memopv8i64, - X86cmpm, v8i64, AVXCC, - "vpcmp${cc}q\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vpcmpq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8, VR512, i512mem, memopv8i64, - X86cmpmu, v8i64, AVXCC, - "vpcmp${cc}uq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vpcmpuq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; - -// avx512_cmp_packed - sse 1 & 2 compare packed instructions +defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16WM, VK16, VR512, i512mem, memopv16i32, + X86cmpm, v16i32, AVXCC, "d">, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16WM, VK16, VR512, i512mem, memopv16i32, + X86cmpmu, v16i32, AVXCC, "ud">, + EVEX_V512, EVEX_CD8<32, CD8VF>; + +defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8WM, VK8, VR512, i512mem, memopv8i64, + X86cmpm, v8i64, AVXCC, "q">, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; +defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8WM, VK8, VR512, i512mem, memopv8i64, + X86cmpmu, v8i64, AVXCC, "uq">, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; + +// avx512_cmp_packed - compare packed instructions multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC, X86MemOperand x86memop, ValueType vt, string suffix, Domain d> { @@ -859,11 +951,11 @@ multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC, // Accept explicit immediate argument form instead of comparison code. 
let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : AVX512PIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), + (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), !strconcat("vcmp", suffix, " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), + (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), !strconcat("vcmp", suffix, " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; } @@ -1788,6 +1880,46 @@ def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; //===----------------------------------------------------------------------===// +// AVX-512 - Non-temporals +//===----------------------------------------------------------------------===// + +def VMOVNTDQAZrm : AVX5128I<0x2A, MRMSrcMem, (outs VR512:$dst), + (ins i512mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR512:$dst, + (int_x86_avx512_movntdqa addr:$src))]>, + EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; + +// Prefer non-temporal over temporal versions +let AddedComplexity = 400, SchedRW = [WriteStore] in { + +def VMOVNTPSZmr : AVX512PSI<0x2B, MRMDestMem, (outs), + (ins f512mem:$dst, VR512:$src), + "vmovntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v16f32 VR512:$src), + addr:$dst)], + IIC_SSE_MOVNT>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; + +def VMOVNTPDZmr : AVX512PDI<0x2B, MRMDestMem, (outs), + (ins f512mem:$dst, VR512:$src), + "vmovntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f64 VR512:$src), + addr:$dst)], + IIC_SSE_MOVNT>, + EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + + +def VMOVNTDQZmr : AVX512BI<0xE7, MRMDestMem, (outs), + (ins i512mem:$dst, VR512:$src), + "vmovntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8i64 VR512:$src), + addr:$dst)], + IIC_SSE_MOVNT>, + EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; +} + +//===----------------------------------------------------------------------===// // AVX-512 - Integer arithmetic // multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -3161,6 +3293,10 @@ def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; +def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr + (v8i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_ymm)>; + def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src), (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)), (VCVTDQ2PSZrrb VR512:$src, imm:$rc)>; @@ -4343,6 +4479,37 @@ def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1, (VPCONFLICTQrrk VR512:$src1, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; +let Predicates = [HasCDI] in { +defm VPLZCNTD : avx512_conflict<0x44, "vplzcntd", VR512, VK16WM, + i512mem, i32mem, "{1to16}">, + EVEX_V512, EVEX_CD8<32, CD8VF>; + + +defm VPLZCNTQ : avx512_conflict<0x44, "vplzcntq", VR512, VK8WM, + i512mem, i64mem, "{1to8}">, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +} + +def : Pat<(int_x86_avx512_mask_lzcnt_d_512 VR512:$src2, VR512:$src1, + GR16:$mask), + (VPLZCNTDrrk VR512:$src1, + (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>; + +def : Pat<(int_x86_avx512_mask_lzcnt_q_512 VR512:$src2, VR512:$src1, + GR8:$mask), + (VPLZCNTQrrk VR512:$src1, + (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; + +def 
: Pat<(v16i32 (ctlz (memopv16i32 addr:$src))), + (VPLZCNTDrm addr:$src)>; +def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))), + (VPLZCNTDrr VR512:$src)>; +def : Pat<(v8i64 (ctlz (memopv8i64 addr:$src))), + (VPLZCNTQrm addr:$src)>; +def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))), + (VPLZCNTQrr VR512:$src)>; + def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index 368e14b91f40..f2574cc3700e 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1278,8 +1278,10 @@ let isCompare = 1 in { def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>; // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the - // register class is constrained to GR8_NOREX. - let isPseudo = 1 in + // register class is constrained to GR8_NOREX. This pseudo is explicitly + // marked side-effect free, since it doesn't have an isel pattern like + // other test instructions. + let isPseudo = 1, hasSideEffects = 0 in def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; } // Defs = [EFLAGS] diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 34d8fb9e1817..ca4f608a6b80 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -110,7 +110,7 @@ let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in // When using segmented stacks these are lowered into instructions which first // check if the current stacklet has enough free memory. If it does, memory is -// allocated by bumping the stack pointer. Otherwise memory is allocated from +// allocated by bumping the stack pointer. Otherwise memory is allocated from // the heap. let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in @@ -197,6 +197,26 @@ let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in { } //===----------------------------------------------------------------------===// +// Pseudo instructions used by unwind info. +// +let isPseudo = 1 in { + def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg), + "#SEH_PushReg $reg", []>; + def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), + "#SEH_SaveReg $reg, $dst", []>; + def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), + "#SEH_SaveXMM $reg, $dst", []>; + def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size), + "#SEH_StackAlloc $size", []>; + def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset), + "#SEH_SetFrame $reg, $offset", []>; + def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode), + "#SEH_PushFrame $mode", []>; + def SEH_EndPrologue : I<0, Pseudo, (outs), (ins), + "#SEH_EndPrologue", []>; +} + +//===----------------------------------------------------------------------===// // Pseudo instructions used by segmented stacks. 
// @@ -371,7 +391,7 @@ let Defs = [RCX,RDI], isCodeGenOnly = 1 in { def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32, Requires<[In64BitMode]>; - + let Uses = [RAX,RCX,RDI] in def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}", [(X86rep_stos i64)], IIC_REP_STOS>, REP, @@ -502,83 +522,6 @@ def CMOV_RFP80 : I<0, Pseudo, //===----------------------------------------------------------------------===// -// Atomic Instruction Pseudo Instructions -//===----------------------------------------------------------------------===// - -// Pseudo atomic instructions - -multiclass PSEUDO_ATOMIC_LOAD_BINOP<string mnemonic> { - let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in { - let Defs = [EFLAGS, AL] in - def NAME#8 : I<0, Pseudo, (outs GR8:$dst), - (ins i8mem:$ptr, GR8:$val), - !strconcat(mnemonic, "8 PSEUDO!"), []>; - let Defs = [EFLAGS, AX] in - def NAME#16 : I<0, Pseudo,(outs GR16:$dst), - (ins i16mem:$ptr, GR16:$val), - !strconcat(mnemonic, "16 PSEUDO!"), []>; - let Defs = [EFLAGS, EAX] in - def NAME#32 : I<0, Pseudo, (outs GR32:$dst), - (ins i32mem:$ptr, GR32:$val), - !strconcat(mnemonic, "32 PSEUDO!"), []>; - let Defs = [EFLAGS, RAX] in - def NAME#64 : I<0, Pseudo, (outs GR64:$dst), - (ins i64mem:$ptr, GR64:$val), - !strconcat(mnemonic, "64 PSEUDO!"), []>; - } -} - -multiclass PSEUDO_ATOMIC_LOAD_BINOP_PATS<string name, string frag> { - def : Pat<(!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val), - (!cast<Instruction>(name # "8") addr:$ptr, GR8:$val)>; - def : Pat<(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val), - (!cast<Instruction>(name # "16") addr:$ptr, GR16:$val)>; - def : Pat<(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val), - (!cast<Instruction>(name # "32") addr:$ptr, GR32:$val)>; - def : Pat<(!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val), - (!cast<Instruction>(name # "64") addr:$ptr, GR64:$val)>; -} - -// Atomic exchange, and, or, xor -defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMAND">; -defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMOR">; -defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMXOR">; -defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMNAND">; -defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMAX">; -defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMIN">; -defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMAX">; -defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMIN">; - -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMAND", "atomic_load_and">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMOR", "atomic_load_or">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMXOR", "atomic_load_xor">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMNAND", "atomic_load_nand">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMAX", "atomic_load_max">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMIN", "atomic_load_min">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMAX", "atomic_load_umax">; -defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMIN", "atomic_load_umin">; - -multiclass PSEUDO_ATOMIC_LOAD_BINOP6432<string mnemonic> { - let usesCustomInserter = 1, Defs = [EFLAGS, EAX, EDX], - mayLoad = 1, mayStore = 1, hasSideEffects = 0 in - def NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), - (ins i64mem:$ptr, GR32:$val1, GR32:$val2), - !strconcat(mnemonic, "6432 PSEUDO!"), []>; -} - -defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMAND">; -defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMOR">; -defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMXOR">; -defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMNAND">; -defm ATOMADD 
: PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMADD">; -defm ATOMSUB : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSUB">; -defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMAX">; -defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMIN">; -defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMAX">; -defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMIN">; -defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">; - -//===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions //===----------------------------------------------------------------------===// @@ -1696,20 +1639,34 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; // Increment reg. -def : Pat<(add GR8 :$src, 1), (INC8r GR8 :$src)>; -def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>, Requires<[Not64BitMode]>; -def : Pat<(add GR16:$src, 1), (INC64_16r GR16:$src)>, Requires<[In64BitMode]>; -def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>, Requires<[Not64BitMode]>; -def : Pat<(add GR32:$src, 1), (INC64_32r GR32:$src)>, Requires<[In64BitMode]>; -def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; +// Do not make INC if it is slow +def : Pat<(add GR8:$src, 1), + (INC8r GR8:$src)>, Requires<[NotSlowIncDec]>; +def : Pat<(add GR16:$src, 1), + (INC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; +def : Pat<(add GR16:$src, 1), + (INC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; +def : Pat<(add GR32:$src, 1), + (INC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; +def : Pat<(add GR32:$src, 1), + (INC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; +def : Pat<(add GR64:$src, 1), + (INC64r GR64:$src)>, Requires<[NotSlowIncDec]>; // Decrement reg. -def : Pat<(add GR8 :$src, -1), (DEC8r GR8 :$src)>; -def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[Not64BitMode]>; -def : Pat<(add GR16:$src, -1), (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>; -def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[Not64BitMode]>; -def : Pat<(add GR32:$src, -1), (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>; -def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; +// Do not make DEC if it is slow +def : Pat<(add GR8:$src, -1), + (DEC8r GR8:$src)>, Requires<[NotSlowIncDec]>; +def : Pat<(add GR16:$src, -1), + (DEC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; +def : Pat<(add GR16:$src, -1), + (DEC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; +def : Pat<(add GR32:$src, -1), + (DEC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; +def : Pat<(add GR32:$src, -1), + (DEC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; +def : Pat<(add GR64:$src, -1), + (DEC64r GR64:$src)>, Requires<[NotSlowIncDec]>; // or reg/reg. 
def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 1582f4388192..6f0fa9462770 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -224,6 +224,10 @@ def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>; def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; +def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; +def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>; +def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>; + def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 6993577d19c0..0d3afc43c2bb 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/LLVMContext.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -97,14 +98,11 @@ struct X86OpTblEntry { // Pin the vtable to this file. void X86InstrInfo::anchor() {} -X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) - : X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit() - ? X86::ADJCALLSTACKDOWN64 - : X86::ADJCALLSTACKDOWN32), - (tm.getSubtarget<X86Subtarget>().is64Bit() - ? X86::ADJCALLSTACKUP64 - : X86::ADJCALLSTACKUP32)), - TM(tm), RI(tm) { +X86InstrInfo::X86InstrInfo(X86Subtarget &STI) + : X86GenInstrInfo( + (STI.is64Bit() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), + (STI.is64Bit() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), + Subtarget(STI), RI(STI) { static const X86OpTblEntry OpTbl2Addr[] = { { X86::ADC32ri, X86::ADC32mi, 0 }, @@ -1472,7 +1470,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, case X86::MOVSX32rr8: case X86::MOVZX32rr8: case X86::MOVSX64rr8: - if (!TM.getSubtarget<X86Subtarget>().is64Bit()) + if (!Subtarget.is64Bit()) // It's not always legal to reference the low 8-bit of the larger // register in 32-bit mode. return false; @@ -1950,7 +1948,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo(); unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass); unsigned Opc, leaInReg; - if (TM.getSubtarget<X86Subtarget>().is64Bit()) { + if (Subtarget.is64Bit()) { Opc = X86::LEA64_32r; leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); } else { @@ -2006,7 +2004,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, // just a single insert_subreg. addRegReg(MIB, leaInReg, true, leaInReg, false); } else { - if (TM.getSubtarget<X86Subtarget>().is64Bit()) + if (Subtarget.is64Bit()) leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); else leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); @@ -2076,13 +2074,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // we have better subtarget support, enable the 16-bit LEA generation here. // 16-bit LEA is also slow on Core2. 
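// Stepping back from this function: the recurring mechanical change in this
// file replaces TM.getSubtarget<X86Subtarget>() lookups with the Subtarget
// reference now captured by the X86InstrInfo constructor. A minimal sketch
// of that injection pattern, with hypothetical names standing in for the
// real classes:
class MySubtarget {
public:
  bool is64Bit() const { return true; } // placeholder feature query.
};

class MyInstrInfo {
  const MySubtarget &STI; // bound once at construction...
public:
  explicit MyInstrInfo(const MySubtarget &S) : STI(S) {}
  bool prefersWideLEA() const { return STI.is64Bit(); } // ...queried directly.
};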
bool DisableLEA16 = true; - bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit(); + bool is64Bit = Subtarget.is64Bit(); unsigned MIOpc = MI->getOpcode(); switch (MIOpc) { case X86::SHUFPSrri: { assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!"); - if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return nullptr; + if (!Subtarget.hasSSE2()) return nullptr; unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); @@ -2094,7 +2092,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } case X86::SHUFPDrri: { assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!"); - if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return nullptr; + if (!Subtarget.hasSSE2()) return nullptr; unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); @@ -2672,8 +2670,7 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) { /// getSETFromCond - Return a set opcode for the given condition and /// whether it has memory operand. -static unsigned getSETFromCond(X86::CondCode CC, - bool HasMemoryOperand) { +unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { static const uint16_t Opc[16][2] = { { X86::SETAr, X86::SETAm }, { X86::SETAEr, X86::SETAEm }, @@ -2693,14 +2690,14 @@ static unsigned getSETFromCond(X86::CondCode CC, { X86::SETSr, X86::SETSm } }; - assert(CC < 16 && "Can only handle standard cond codes"); + assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes"); return Opc[CC][HasMemoryOperand ? 1 : 0]; } /// getCMovFromCond - Return a cmov opcode for the given condition, /// register size in bytes, and operand type. -static unsigned getCMovFromCond(X86::CondCode CC, unsigned RegBytes, - bool HasMemoryOperand) { +unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes, + bool HasMemoryOperand) { static const uint16_t Opc[32][3] = { { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr }, { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr }, @@ -2976,7 +2973,7 @@ canInsertSelect(const MachineBasicBlock &MBB, unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { // Not all subtargets have cmov instructions. - if (!TM.getSubtarget<X86Subtarget>().hasCMov()) + if (!Subtarget.hasCMov()) return false; if (Cond.size() != 1) return false; @@ -3027,8 +3024,7 @@ static bool isHReg(unsigned Reg) { // Try and copy between VR128/VR64 and GR64 registers. static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, - const X86Subtarget& Subtarget) { - + const X86Subtarget &Subtarget) { // SrcReg(VR128) -> DestReg(GR64) // SrcReg(VR64) -> DestReg(GR64) @@ -3107,8 +3103,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { // First deal with the normal symmetric copies. - bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); - bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); unsigned Opc = 0; if (X86::GR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MOV64rr; @@ -3120,7 +3116,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copying to or from a physical H register on x86-64 requires a NOREX // move. Otherwise use a normal move. if ((isHReg(DestReg) || isHReg(SrcReg)) && - TM.getSubtarget<X86Subtarget>().is64Bit()) { + Subtarget.is64Bit()) { Opc = X86::MOV8rr_NOREX; // Both operands must be encodable without an REX prefix. 
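// Worth noting from earlier in this file: getSETFromCond and getCMovFromCond
// moved from static helpers into the X86 namespace, so other passes can map
// a condition code straight to an opcode. A hedged usage sketch -- the
// condition codes are arbitrary examples, and the resulting opcode names are
// inferred from the tables above:
//
//   unsigned SetOpc  = X86::getSETFromCond(X86::COND_E);          // SETEr
//   unsigned CMovOpc = X86::getCMovFromCond(X86::COND_B,
//                                           /*RegBytes=*/4);      // CMOVB32rr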
assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) && @@ -3137,7 +3133,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (X86::VR256RegClass.contains(DestReg, SrcReg)) Opc = X86::VMOVAPSYrr; if (!Opc) - Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, TM.getSubtarget<X86Subtarget>()); + Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget); if (Opc) { BuildMI(MBB, MI, DL, get(Opc), DestReg) @@ -3183,9 +3179,9 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, static unsigned getLoadStoreRegOpcode(unsigned Reg, const TargetRegisterClass *RC, bool isStackAligned, - const TargetMachine &TM, + const X86Subtarget &STI, bool load) { - if (TM.getSubtarget<X86Subtarget>().hasAVX512()) { + if (STI.hasAVX512()) { if (X86::VK8RegClass.hasSubClassEq(RC) || X86::VK16RegClass.hasSubClassEq(RC)) return load ? X86::KMOVWkm : X86::KMOVWmk; @@ -3197,13 +3193,13 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; } - bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + bool HasAVX = STI.hasAVX(); switch (RC->getSize()) { default: llvm_unreachable("Unknown spill size"); case 1: assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass"); - if (TM.getSubtarget<X86Subtarget>().is64Bit()) + if (STI.is64Bit()) // Copying to or from a physical H register on x86-64 requires a NOREX // move. Otherwise use a normal move. if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC)) @@ -3270,16 +3266,16 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, static unsigned getStoreRegOpcode(unsigned SrcReg, const TargetRegisterClass *RC, bool isStackAligned, - TargetMachine &TM) { - return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, TM, false); + const X86Subtarget &STI) { + return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false); } static unsigned getLoadRegOpcode(unsigned DestReg, const TargetRegisterClass *RC, bool isStackAligned, - const TargetMachine &TM) { - return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, TM, true); + const X86Subtarget &STI) { + return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true); } void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, @@ -3291,9 +3287,10 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && "Stack slot too small for store"); unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); - bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) || - RI.canRealignStack(MF); - unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); + bool isAligned = + (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx) .addReg(SrcReg, getKillRegState(isKill)); @@ -3309,7 +3306,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; - unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); for (unsigned i = 0, e = Addr.size(); i != e; ++i) @@ -3327,9 +3324,10 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const 
TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); - bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) || - RI.canRealignStack(MF); - unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM); + bool isAligned = + (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx); } @@ -3343,7 +3341,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; - unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM); + unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); for (unsigned i = 0, e = Addr.size(); i != e; ++i) @@ -3741,7 +3739,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, continue; // EFLAGS is used by this instruction. - X86::CondCode OldCC; + X86::CondCode OldCC = X86::COND_INVALID; bool OpcIsSET = false; if (IsCmpZero || IsSwapped) { // We decode the condition code from opcode. @@ -3964,7 +3962,7 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB, } bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); switch (MI->getOpcode()) { case X86::MOV32r0: @@ -4075,7 +4073,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, unsigned Size, unsigned Align) const { const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; - bool isCallRegIndirect = TM.getSubtarget<X86Subtarget>().callRegIndirect(); + bool isCallRegIndirect = Subtarget.callRegIndirect(); bool isTwoAddrFold = false; // Atom favors register form of call. So, we do not fold loads into calls @@ -4316,7 +4314,7 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, if (X86::VR128RegClass.contains(Reg)) { // These instructions are all floating point domain, so xorps is the best // choice. - bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + bool HasAVX = Subtarget.hasAVX(); unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr; BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); @@ -4352,7 +4350,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, // If the function stack isn't realigned we don't want to fold instructions // that need increased alignment. if (!RI.needsStackRealignment(MF)) - Alignment = std::min(Alignment, TM.getFrameLowering()->getStackAlignment()); + Alignment = std::min( + Alignment, MF.getTarget().getFrameLowering()->getStackAlignment()); if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; unsigned RCSize = 0; @@ -4453,14 +4452,14 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Create a constant-pool entry and operands to load from it. // Medium and large mode can't fold loads this way. 
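// One small fix above deserves a note: optimizeCompareInstr now initializes
// OldCC to X86::COND_INVALID instead of leaving it indeterminate. A toy
// version of the same defensive pattern; the enum and the opcode values
// (0x74/0x75, i.e. short JE/JNE encodings) are illustrative only:
#include <cassert>

enum CondCode { COND_E, COND_NE, COND_INVALID };

static CondCode decodeCond(unsigned Opcode) {
  CondCode CC = COND_INVALID; // sentinel: never read uninitialized.
  if (Opcode == 0x74)
    CC = COND_E;
  else if (Opcode == 0x75)
    CC = COND_NE;
  assert(CC != COND_INVALID && "unexpected opcode"); // failure is visible.
  return CC;
}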
- if (TM.getCodeModel() != CodeModel::Small && - TM.getCodeModel() != CodeModel::Kernel) + if (MF.getTarget().getCodeModel() != CodeModel::Small && + MF.getTarget().getCodeModel() != CodeModel::Kernel) return nullptr; // x86-32 PIC requires a PIC base register for constant pools. unsigned PICBase = 0; - if (TM.getRelocationModel() == Reloc::PIC_) { - if (TM.getSubtarget<X86Subtarget>().is64Bit()) + if (MF.getTarget().getRelocationModel() == Reloc::PIC_) { + if (Subtarget.is64Bit()) PICBase = X86::RIP; else // FIXME: PICBase = getGlobalBaseReg(&MF); @@ -4600,7 +4599,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); if (!MI->hasOneMemOperand() && RC == &X86::VR128RegClass && - !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast()) + !Subtarget.isUnalignedMemAccessFast()) // Without memoperands, loadRegFromAddr and storeRegToStackSlot will // conservatively assume the address is unaligned. That's bad for // performance. @@ -4748,13 +4747,13 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast<MachineSDNode>(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast()) + !Subtarget.isUnalignedMemAccessFast()) // Do not introduce a slow unaligned load. return false; unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; - Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl, + Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, VT, MVT::Other, AddrOps); NewNodes.push_back(Load); @@ -4791,15 +4790,15 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast<MachineSDNode>(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast()) + !Subtarget.isUnalignedMemAccessFast()) // Do not introduce a slow unaligned store. return false; unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; - SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC, - isAligned, TM), - dl, MVT::Other, AddrOps); + SDNode *Store = + DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget), + dl, MVT::Other, AddrOps); NewNodes.push_back(Store); // Preserve memory reference information. @@ -4960,7 +4959,7 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, default: // XMM registers. In 64-bit mode we can be a bit more aggressive since we // have 16 of them to play with. - if (TM.getSubtargetImpl()->is64Bit()) { + if (Subtarget.is64Bit()) { if (NumLoads >= 3) return false; } else if (NumLoads) { @@ -4986,7 +4985,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, // Check if this processor supports macro-fusion. Since this is a minor // heuristic, we haven't specifically reserved a feature. hasAVX is a decent // proxy for SandyBridge+. 
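// The hasAVX check just below is only a scheduling heuristic: it
// approximates "SandyBridge or newer", the cores where a flag-producing
// TEST/CMP fuses with the following conditional branch. A toy model of the
// pairing test, using hypothetical opcode tags rather than the real opcode
// switch from this function:
enum InstKind { Test, Cmp, CondBranch, Other };

static bool wantAdjacent(InstKind First, InstKind Second) {
  // Keep a flag producer next to its consuming JCC so the decoder can fuse
  // the pair into a single macro-op.
  return (First == Test || First == Cmp) && Second == CondBranch;
}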
- if (!TM.getSubtarget<X86Subtarget>().hasAVX()) + if (!Subtarget.hasAVX()) return false; enum { @@ -5038,6 +5037,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, case X86::TEST16rm: case X86::TEST32rm: case X86::TEST64rm: + case X86::TEST8ri_NOREX: case X86::AND16i16: case X86::AND16ri: case X86::AND16ri8: @@ -5168,7 +5168,7 @@ isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { /// TODO: Eliminate this and move the code to X86MachineFunctionInfo. /// unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { - assert(!TM.getSubtarget<X86Subtarget>().is64Bit() && + assert(!Subtarget.is64Bit() && "X86-64 PIC uses RIP relative addressing"); X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); @@ -5271,7 +5271,7 @@ static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { std::pair<uint16_t, uint16_t> X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const { uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; - bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2(); + bool hasAVX2 = Subtarget.hasAVX2(); uint16_t validDomains = 0; if (domain && lookup(MI->getOpcode(), domain)) validDomains = 0xe; @@ -5286,7 +5286,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { assert(dom && "Not an SSE instruction"); const uint16_t *table = lookup(MI->getOpcode(), dom); if (!table) { // try the other table - assert((TM.getSubtarget<X86Subtarget>().hasAVX2() || Domain < 3) && + assert((Subtarget.hasAVX2() || Domain < 3) && "256-bit vector operations only available in AVX2"); table = lookupAVX2(MI->getOpcode(), dom); } @@ -5299,6 +5299,16 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } +void X86InstrInfo::getUnconditionalBranch( + MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const { + Branch.setOpcode(X86::JMP_4); + Branch.addOperand(MCOperand::CreateExpr(BranchTarget)); +} + +void X86InstrInfo::getTrap(MCInst &MI) const { + MI.setOpcode(X86::TRAP); +} + bool X86InstrInfo::isHighLatencyDef(int opc) const { switch (opc) { default: return false; diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 5f3491536d8a..c177e3a5c7c7 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -24,7 +24,7 @@ namespace llvm { class X86RegisterInfo; - class X86TargetMachine; + class X86Subtarget; namespace X86 { // X86 specific condition code. These correspond to X86_*_COND in @@ -46,6 +46,7 @@ namespace X86 { COND_O = 13, COND_P = 14, COND_S = 15, + LAST_VALID_COND = COND_S, // Artificial condition codes. These are used by AnalyzeBranch // to indicate a block terminated with two conditional branches to @@ -61,12 +62,21 @@ namespace X86 { // Turn condition code into conditional branch opcode. unsigned GetCondBranchFromCond(CondCode CC); + /// \brief Return a set opcode for the given condition and whether it has + /// a memory operand. + unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); + + /// \brief Return a cmov opcode for the given condition, register size in + /// bytes, and operand type. + unsigned getCMovFromCond(CondCode CC, unsigned RegBytes, + bool HasMemoryOperand = false); + // Turn CMov opcode into condition code. CondCode getCondFromCMovOpc(unsigned Opc); /// GetOppositeBranchCondition - Return the inverse of the specified cond, /// e.g. turning COND_E to COND_NE. 
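// The two TargetInstrInfo hooks implemented in X86InstrInfo.cpp above
// (getUnconditionalBranch and getTrap) let target-independent code build
// bare MCInsts without knowing X86 opcodes. A hedged usage sketch; the
// caller, the MCContext/MCSymbol setup, and the function name are assumed
// context, not part of this patch:
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;

void emitJumpOrTrap(const TargetInstrInfo &TII, MCContext &Ctx,
                    MCSymbol *Dest, bool HaveDest,
                    SmallVectorImpl<MCInst> &Out) {
  MCInst MI;
  if (HaveDest)
    TII.getUnconditionalBranch(MI, MCSymbolRefExpr::Create(Dest, Ctx));
  else
    TII.getTrap(MI); // unreachable fallthrough becomes a trap.
  Out.push_back(MI);
}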
- CondCode GetOppositeBranchCondition(X86::CondCode CC); + CondCode GetOppositeBranchCondition(CondCode CC); } // end namespace X86; @@ -129,7 +139,7 @@ inline static bool isMem(const MachineInstr *MI, unsigned Op) { } class X86InstrInfo final : public X86GenInstrInfo { - X86TargetMachine &TM; + X86Subtarget &Subtarget; const X86RegisterInfo RI; /// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1, @@ -156,7 +166,7 @@ class X86InstrInfo final : public X86GenInstrInfo { virtual void anchor(); public: - explicit X86InstrInfo(X86TargetMachine &tm); + explicit X86InstrInfo(X86Subtarget &STI); /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should @@ -396,6 +406,12 @@ public: const SmallVectorImpl<MachineOperand> &MOs, unsigned Size, unsigned Alignment) const; + void + getUnconditionalBranch(MCInst &Branch, + const MCSymbolRefExpr *BranchTarget) const override; + + void getTrap(MCInst &MI) const override; + bool isHighLatencyDef(int opc) const override; bool hasHighOperandLatency(const InstrItineraryData *ItinData, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 0d97669b2259..e7b532c6af81 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -155,27 +155,6 @@ def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; -def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomSub64 : SDNode<"X86ISD::ATOMSUB64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomOr64 : SDNode<"X86ISD::ATOMOR64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomXor64 : SDNode<"X86ISD::ATOMXOR64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomAnd64 : SDNode<"X86ISD::ATOMAND64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomNand64 : SDNode<"X86ISD::ATOMNAND64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; -def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary, - [SDNPHasChain, SDNPMayStore, - SDNPMayLoad, SDNPMemOperand]>; def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; @@ -208,6 +187,8 @@ def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; @@ -795,6 +776,7 @@ def OptForSpeed : Predicate<"!OptForSize">; def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">; +def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. 
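The new NotSlowIncDec predicate above is what gates the INC/DEC selection
patterns added earlier in X86InstrCompiler.td. Restated as a toy C++ chooser;
the opcode names are illustrative, and the partial-flags rationale in the
comment is background knowledge about INC/DEC (which leave CF untouched)
rather than something this patch states:

enum class Opc { INC32r, ADD32ri8 };

// Prefer "add r, 1" over "inc r" when the subtarget reports slow INC/DEC,
// since ADD rewrites all flags and avoids a potential flags-merge penalty.
Opc selectIncrement(bool SubtargetSlowIncDec) {
  return SubtargetSlowIncDec ? Opc::ADD32ri8 : Opc::INC32r;
}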
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 1eb04851b721..f9a5ae1a3d52 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -4337,20 +4337,6 @@ defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, SSE_INTALU_ITINS_P, 0>; //===---------------------------------------------------------------------===// -// SSE2 - Packed Integer Pack Instructions -//===---------------------------------------------------------------------===// - -defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128, - int_x86_avx2_packsswb, - SSE_INTALU_ITINS_SHUFF_P, 0>; -defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128, - int_x86_avx2_packssdw, - SSE_INTALU_ITINS_SHUFF_P, 0>; -defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128, - int_x86_avx2_packuswb, - SSE_INTALU_ITINS_SHUFF_P, 0>; - -//===---------------------------------------------------------------------===// // SSE2 - Packed Integer Shuffle Instructions //===---------------------------------------------------------------------===// @@ -4432,6 +4418,136 @@ let Predicates = [UseSSE2] in { } //===---------------------------------------------------------------------===// +// Packed Integer Pack Instructions (SSE & AVX) +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, + bit Is2Addr = 1> { + def rr : PDI<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>, + Sched<[WriteShuffle]>; + def rm : PDI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode VR128:$src1, + (bc_frag (memopv2i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { + def Yrr : PDI<opc, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>, + Sched<[WriteShuffle]>; + def Yrm : PDI<opc, MRMSrcMem, + (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode VR256:$src1, + (bc_frag (memopv4i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, + bit Is2Addr = 1> { + def rr : SS48I<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>, + Sched<[WriteShuffle]>; + def rm : SS48I<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, 
"\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode VR128:$src1, + (bc_frag (memopv2i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { + def Yrr : SS48I<opc, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>, + Sched<[WriteShuffle]>; + def Yrm : SS48I<opc, MRMSrcMem, + (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode VR256:$src1, + (bc_frag (memopv4i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, + bc_v8i16, 0>, VEX_4V; + defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, + bc_v4i32, 0>, VEX_4V; + + defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, + bc_v8i16, 0>, VEX_4V; + defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, + bc_v4i32, 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { + defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss, + bc_v16i16>, VEX_4V, VEX_L; + defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, + bc_v8i32>, VEX_4V, VEX_L; + + defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus, + bc_v16i16>, VEX_4V, VEX_L; + defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, + bc_v8i32>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, + bc_v8i16>; + defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, + bc_v4i32>; + + defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, + bc_v8i16>; + + let Predicates = [HasSSE41] in + defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, + bc_v4i32>; +} +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// // SSE2 - Packed Integer Unpack Instructions //===---------------------------------------------------------------------===// @@ -5239,6 +5355,60 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { f128mem, SSE_ALU_F64P>, PD; } +// Patterns used to select 'addsub' instructions. +let Predicates = [HasAVX] in { + // Constant 170 corresponds to the binary mask '10101010'. + // When used as a blend mask, it allows selecting eight elements from two + // input vectors as follow: + // - Even-numbered values in the destination are copied from + // the corresponding elements in the first input vector; + // - Odd-numbered values in the destination are copied from + // the corresponding elements in the second input vector. + + def : Pat<(v8f32 (X86Blendi (v8f32 (fsub VR256:$lhs, VR256:$rhs)), + (v8f32 (fadd VR256:$lhs, VR256:$rhs)), (i32 170))), + (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; + + // Constant 10 corresponds to the binary mask '1010'. + // In the two pattens below, constant 10 is used as a blend mask to select + // - the 1st and 3rd element from the first input vector (the 'fsub' node); + // - the 2nd and 4th element from the second input vector (the 'fadd' node). 
+
+  def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)),
+                   (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))),
+            (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)),
+                   (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
+            (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
+                   (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))),
+            (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
+                             (v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
+            (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+}

+let Predicates = [UseSSE3] in {
+  // Constant 10 corresponds to the binary mask '1010'.
+  // In the pattern below, it is used as a blend mask to select:
+  // - the 1st and 3rd element from the first input vector (the fsub node);
+  // - the 2nd and 4th element from the second input vector (the fadd node).

+  def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)),
+                   (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
+            (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;

+  def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
+                   (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))),
+            (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+  def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
+                             (v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
+            (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+}
+
 //===---------------------------------------------------------------------===//
 // SSE3 Instructions
 //===---------------------------------------------------------------------===//
@@ -7053,8 +7223,6 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,

 let Predicates = [HasAVX] in {
   let isCommutable = 0 in
-  defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
-                                      0, DEFAULT_ITINS_SHUFFLESCHED>, VEX_4V;
   defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
                                   loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                   VEX_4V;
@@ -7086,9 +7254,6 @@ let Predicates = [HasAVX2] in {
   let isCommutable = 0 in
-  defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
-                                        int_x86_avx2_packusdw, WriteShuffle>,
-                                        VEX_4V, VEX_L;
   defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
                                   loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                   VEX_4V, VEX_L;
@@ -7120,8 +7285,6 @@ let Constraints = "$src1 = $dst" in {
   let isCommutable = 0 in
-  defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw,
-                                     1, DEFAULT_ITINS_SHUFFLESCHED>;
   defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
                                  memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
@@ -7969,6 +8132,16 @@ class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX;

+class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                           X86MemOperand x86memop, ValueType VT,
+                           PatFrag ld_frag, SchedWrite Sched> :
+  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+        [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
+        Sched<[Sched]>, VEX {
+  let mayLoad = 1;
+}
+
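[Editor's sketch, not part of the patch.] The blend-mask comments in the 'addsub' patterns above are easy to sanity-check with a scalar model: blending an fsub result with an fadd result under mask '1010' keeps the subtraction in even lanes and the addition in odd lanes, which is exactly what ADDSUBPS computes. A standalone illustration:

  #include <array>
  #include <cassert>
  #include <cstdio>

  // Scalar model of the v4f32 case. Lane i of the blend takes the second
  // input (the fadd) when bit i of the mask is set, else the first (the fsub).
  static std::array<float, 4> addsub_v4f32(const std::array<float, 4> &a,
                                           const std::array<float, 4> &b) {
    const unsigned Mask = 10; // binary 1010, the (i32 10) in the patterns
    std::array<float, 4> r;
    for (int i = 0; i < 4; ++i)
      r[i] = ((Mask >> i) & 1) ? (a[i] + b[i]) : (a[i] - b[i]);
    return r;
  }

  int main() {
    auto r = addsub_v4f32({1, 2, 3, 4}, {10, 20, 30, 40});
    // Even lanes: a - b; odd lanes: a + b.
    assert(r[0] == -9 && r[1] == 22 && r[2] == -27 && r[3] == 44);
    std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);
  }

The v8f32 pattern is the same idea with mask 170 (binary 10101010); the v2f64 patterns use mask 2 (binary 10).

 // AVX2 adds register forms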
class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC, Intrinsic Int, SchedWrite Sched> : @@ -7977,16 +8150,15 @@ class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC, [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem, - int_x86_avx_vbroadcast_ss, WriteLoad>; - def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem, - int_x86_avx_vbroadcast_ss_256, - WriteFShuffleLd>, VEX_L; + def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128, + f32mem, v4f32, loadf32, WriteLoad>; + def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256, + f32mem, v8f32, loadf32, + WriteFShuffleLd>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, - int_x86_avx_vbroadcast_sd_256, - WriteFShuffleLd>, VEX_L; +def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem, + v4f64, loadf64, WriteFShuffleLd>, VEX_L; def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, int_x86_avx_vbroadcastf128_pd_256, WriteFShuffleLd>, VEX_L; @@ -8366,6 +8538,21 @@ let Predicates = [HasF16C] in { (VCVTPH2PSrm addr:$src)>; } +// Patterns for matching conversions from float to half-float and vice versa. +let Predicates = [HasF16C] in { + def : Pat<(f32_to_f16 FR32:$src), + (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr + (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>; + + def : Pat<(f16_to_f32 GR16:$src), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr + (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >; + + def : Pat<(f16_to_f32 (i16 (f32_to_f16 FR32:$src))), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr + (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >; +} + //===----------------------------------------------------------------------===// // AVX2 Instructions //===----------------------------------------------------------------------===// @@ -8543,13 +8730,6 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), } let Predicates = [HasAVX] in { -def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSYrm addr:$src)>; -def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), - (VBROADCASTSDYrm addr:$src)>; -def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSrm addr:$src)>; - // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. 
let AddedComplexity = 20 in { diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index b5595cbd3bbe..540278021f02 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -439,7 +439,10 @@ def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src), let SchedRW = [WriteSystem] in { def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB; def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB; -def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB; + +let Defs = [RAX, RDX], Uses = [ECX] in + def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>, + TB; def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB; diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp index e969ef2cf3d7..a082c4f8b0ef 100644 --- a/lib/Target/X86/X86JITInfo.cpp +++ b/lib/Target/X86/X86JITInfo.cpp @@ -432,7 +432,7 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { // SSE Callback should be called for SSE-enabled LLVM. return X86CompilationCallback_SSE; #else - if (Subtarget->hasSSE1()) + if (useSSE) return X86CompilationCallback_SSE; #endif #endif @@ -440,8 +440,8 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { return X86CompilationCallback; } -X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) { - Subtarget = &TM.getSubtarget<X86Subtarget>(); +X86JITInfo::X86JITInfo(bool UseSSE) { + useSSE = UseSSE; useGOT = 0; TLSOffset = nullptr; } diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h index 4d279de3425f..564343ffa3fa 100644 --- a/lib/Target/X86/X86JITInfo.h +++ b/lib/Target/X86/X86JITInfo.h @@ -19,16 +19,14 @@ #include "llvm/Target/TargetJITInfo.h" namespace llvm { - class X86TargetMachine; class X86Subtarget; class X86JITInfo : public TargetJITInfo { - X86TargetMachine &TM; - const X86Subtarget *Subtarget; uintptr_t PICBase; - char* TLSOffset; + char *TLSOffset; + bool useSSE; public: - explicit X86JITInfo(X86TargetMachine &tm); + explicit X86JITInfo(bool UseSSE); /// replaceMachineCodeForFunction - Make it so that calling the function /// whose machine code is at OLD turns into a call to NEW, perhaps by diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 0190080b935b..2bd70a96c432 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "X86AsmPrinter.h" +#include "X86RegisterInfo.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" #include "llvm/ADT/SmallString.h" @@ -779,6 +780,9 @@ static void LowerPATCHPOINT(MCStreamer &OS, StackMaps &SM, void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86MCInstLower MCInstLowering(*MF, *this); + const X86RegisterInfo *RI = + static_cast<const X86RegisterInfo *>(TM.getRegisterInfo()); + switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: llvm_unreachable("Should be handled target independently"); @@ -883,6 +887,37 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { .addReg(X86::R10) .addReg(X86::RAX)); return; + + case X86::SEH_PushReg: + OutStreamer.EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm())); + return; + + case X86::SEH_SaveReg: + OutStreamer.EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_SaveXMM: + 
OutStreamer.EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_StackAlloc: + OutStreamer.EmitWinCFIAllocStack(MI->getOperand(0).getImm()); + return; + + case X86::SEH_SetFrame: + OutStreamer.EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_PushFrame: + OutStreamer.EmitWinCFIPushFrame(MI->getOperand(0).getImm()); + return; + + case X86::SEH_EndPrologue: + OutStreamer.EmitWinCFIEndProlog(); + return; } MCInst TmpInst; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index a83e1e487b82..e8a7e84bb7e3 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -53,20 +53,18 @@ static cl::opt<bool> EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); -X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm) - : X86GenRegisterInfo((tm.getSubtarget<X86Subtarget>().is64Bit() - ? X86::RIP : X86::EIP), - X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false), - X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true), - (tm.getSubtarget<X86Subtarget>().is64Bit() - ? X86::RIP : X86::EIP)), - TM(tm) { +X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI) + : X86GenRegisterInfo( + (STI.is64Bit() ? X86::RIP : X86::EIP), + X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), false), + X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), true), + (STI.is64Bit() ? X86::RIP : X86::EIP)), + Subtarget(STI) { X86_MC::InitLLVM2SEHRegisterMapping(this); // Cache some information. - const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); - Is64Bit = Subtarget->is64Bit(); - IsWin64 = Subtarget->isTargetWin64(); + Is64Bit = Subtarget.is64Bit(); + IsWin64 = Subtarget.isTargetWin64(); if (Is64Bit) { SlotSize = 8; @@ -83,21 +81,6 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm) BasePtr = Is64Bit ? X86::RBX : X86::ESI; } -/// getCompactUnwindRegNum - This function maps the register to the number for -/// compact unwind encoding. Return -1 if the register isn't valid. -int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const { - switch (getLLVMRegNum(RegNum, isEH)) { - case X86::EBX: case X86::RBX: return 1; - case X86::ECX: case X86::R12: return 2; - case X86::EDX: case X86::R13: return 3; - case X86::EDI: case X86::R14: return 4; - case X86::ESI: case X86::R15: return 5; - case X86::EBP: case X86::RBP: return 6; - } - - return -1; -} - bool X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { // ExeDepsFixer and PostRAScheduler require liveness. @@ -173,9 +156,8 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ } const TargetRegisterClass * -X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) - const { - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); +X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { switch (Kind) { default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); case 0: // Normal GPRs. @@ -225,7 +207,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case X86::GR64RegClassID: return 12 - FPDiff; case X86::VR128RegClassID: - return TM.getSubtarget<X86Subtarget>().is64Bit() ? 10 : 4; + return Subtarget.is64Bit() ? 
10 : 4; case X86::VR64RegClassID: return 4; } @@ -233,8 +215,8 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const MCPhysReg * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); - bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); assert(MF && "MachineFunction required"); switch (MF->getFunction()->getCallingConv()) { @@ -287,8 +269,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const uint32_t* X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { - bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); - bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); switch (CC) { case CallingConv::GHC: @@ -406,7 +388,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(*AI); } } - if (!Is64Bit || !TM.getSubtarget<X86Subtarget>().hasAVX512()) { + if (!Is64Bit || !Subtarget.hasAVX512()) { for (unsigned n = 16; n != 32; ++n) { for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); @@ -459,7 +441,7 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *F = MF.getFunction(); - unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); + unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment(); bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 2289d91960c8..74efd1fe32d7 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -22,11 +22,11 @@ namespace llvm { class Type; class TargetInstrInfo; - class X86TargetMachine; + class X86Subtarget; class X86RegisterInfo final : public X86GenRegisterInfo { public: - X86TargetMachine &TM; + const X86Subtarget &Subtarget; private: /// Is64Bit - Is the target 64-bits. @@ -55,15 +55,11 @@ private: unsigned BasePtr; public: - X86RegisterInfo(X86TargetMachine &tm); + X86RegisterInfo(const X86Subtarget &STI); // FIXME: This should be tablegen'd like getDwarfRegNum is int getSEHRegNum(unsigned i) const; - /// getCompactUnwindRegNum - This function maps the register to the number for - /// compact unwind encoding. Return -1 if the register isn't valid. - int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const override; - /// Code Generation virtual methods... 
/// bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 744890d50c83..a83dd9b2eea7 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -11,21 +11,23 @@ // //===----------------------------------------------------------------------===// -#include "X86TargetMachine.h" +#include "X86InstrInfo.h" +#include "X86ISelLowering.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86SelectionDAGInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/Target/TargetLowering.h" + using namespace llvm; #define DEBUG_TYPE "x86-selectiondag-info" -X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) : - TargetSelectionDAGInfo(TM), - Subtarget(&TM.getSubtarget<X86Subtarget>()), - TLI(*TM.getTargetLowering()) { -} +X86SelectionDAGInfo::X86SelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) {} -X86SelectionDAGInfo::~X86SelectionDAGInfo() { -} +X86SelectionDAGInfo::~X86SelectionDAGInfo() {} SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, @@ -35,6 +37,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, bool isVolatile, MachinePointerInfo DstPtrInfo) const { ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>(); // If to a segment-relative address space, use the default lowering. if (DstPtrInfo.getAddrSpace() >= 256) @@ -43,16 +46,14 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, // If not DWORD aligned or size is more than the threshold, call the library. // The libc version is likely to be faster for these cases. It can use the // address value and run time information about the CPU. - if ((Align & 3) != 0 || - !ConstantSize || - ConstantSize->getZExtValue() > - Subtarget->getMaxInlineSizeThreshold()) { + if ((Align & 3) != 0 || !ConstantSize || + ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) { // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); if (const char *bzeroEntry = V && - V->isNullValue() ? Subtarget->getBZeroEntry() : nullptr) { - EVT IntPtr = TLI.getPointerTy(); + V->isNullValue() ? 
Subtarget.getBZeroEntry() : nullptr) { + EVT IntPtr = DAG.getTargetLoweringInfo().getPointerTy(); Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -65,10 +66,11 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0) + DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), + 0) .setDiscardResult(); - std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); + std::pair<SDValue,SDValue> CallResult = DAG.getTargetLoweringInfo().LowerCallTo(CLI); return CallResult.second; } @@ -99,7 +101,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, ValReg = X86::EAX; Val = (Val << 8) | Val; Val = (Val << 16) | Val; - if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned + if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned AVT = MVT::i64; ValReg = X86::RAX; Val = (Val << 32) | Val; @@ -128,13 +130,11 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, InFlag = Chain.getValue(1); } - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : - X86::ECX, - Count, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : - X86::EDI, - Dst, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, + Dst, InFlag); InFlag = Chain.getValue(1); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -182,10 +182,11 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, // This requires the copy size to be a constant, preferably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>(); if (!ConstantSize) return SDValue(); uint64_t SizeVal = ConstantSize->getZExtValue(); - if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold()) + if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold()) return SDValue(); /// If not DWORD aligned, it is more efficient to call the library. However @@ -218,7 +219,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, AVT = MVT::i32; else // QWORD aligned - AVT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; + AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32; unsigned UBytes = AVT.getSizeInBits() / 8; unsigned CountVal = SizeVal / UBytes; @@ -226,15 +227,15 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, unsigned BytesLeft = SizeVal % UBytes; SDValue InFlag; - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, Count, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, Dst, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? 
X86::RSI : X86::ESI, Src, InFlag); InFlag = Chain.getValue(1); diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h index 0d5dc38ae898..c12555a59617 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.h +++ b/lib/Target/X86/X86SelectionDAGInfo.h @@ -23,14 +23,8 @@ class X86TargetMachine; class X86Subtarget; class X86SelectionDAGInfo : public TargetSelectionDAGInfo { - /// Subtarget - Keep a pointer to the X86Subtarget around so that we can - /// make the right decision when generating code for different targets. - const X86Subtarget *Subtarget; - - const X86TargetLowering &TLI; - public: - explicit X86SelectionDAGInfo(const X86TargetMachine &TM); + explicit X86SelectionDAGInfo(const DataLayout &DL); ~X86SelectionDAGInfo(); SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 989e0d61b6fc..79b7e68c320b 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -291,13 +291,60 @@ void X86Subtarget::initializeEnvironment() { CallRegIndirect = false; LEAUsesAG = false; SlowLEA = false; + SlowIncDec = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? MaxInlineSizeThreshold = 128; } +static std::string computeDataLayout(const X86Subtarget &ST) { + // X86 is little endian + std::string Ret = "e"; + + Ret += DataLayout::getManglingComponent(ST.getTargetTriple()); + // X86 and x32 have 32 bit pointers. + if (ST.isTarget64BitILP32() || !ST.is64Bit()) + Ret += "-p:32:32"; + + // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. + if (ST.is64Bit() || ST.isOSWindows() || ST.isTargetNaCl()) + Ret += "-i64:64"; + else + Ret += "-f64:32:64"; + + // Some ABIs align long double to 128 bits, others to 32. + if (ST.isTargetNaCl()) + ; // No f80 + else if (ST.is64Bit() || ST.isTargetDarwin()) + Ret += "-f80:128"; + else + Ret += "-f80:32"; + + // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. + if (ST.is64Bit()) + Ret += "-n8:16:32:64"; + else + Ret += "-n8:16:32"; + + // The stack is aligned to 32 bits on some ABIs and 128 bits on others. + if (!ST.is64Bit() && ST.isOSWindows()) + Ret += "-S32"; + else + Ret += "-S128"; + + return Ret; +} + +X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { + initializeEnvironment(); + resetSubtargetFeatures(CPU, FS); + return *this; +} + X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, unsigned StackAlignOverride) + const std::string &FS, X86TargetMachine &TM, + unsigned StackAlignOverride) : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others), PICStyle(PICStyles::None), TargetTriple(TT), StackAlignOverride(StackAlignOverride), @@ -305,10 +352,12 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, In32BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() != Triple::CODE16), In16BitMode(TargetTriple.getArch() == Triple::x86 && - TargetTriple.getEnvironment() == Triple::CODE16) { - initializeEnvironment(); - resetSubtargetFeatures(CPU, FS); -} + TargetTriple.getEnvironment() == Triple::CODE16), + DL(computeDataLayout(*this)), TSInfo(DL), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM), + FrameLowering(TargetFrameLowering::StackGrowsDown, getStackAlignment(), + is64Bit() ? 
-8 : -4), + JITInfo(hasSSE1()) {} bool X86Subtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel, diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 703559a47648..09db0ebc5a9b 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -14,6 +14,11 @@ #ifndef X86SUBTARGET_H #define X86SUBTARGET_H +#include "X86FrameLowering.h" +#include "X86ISelLowering.h" +#include "X86InstrInfo.h" +#include "X86JITInfo.h" +#include "X86SelectionDAGInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/CallingConv.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -40,6 +45,7 @@ enum Style { } class X86Subtarget final : public X86GenSubtargetInfo { + protected: enum X86SSEEnum { NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F @@ -181,6 +187,9 @@ protected: /// SlowLEA - True if the LEA instruction with certain arguments is slow bool SlowLEA; + /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags + bool SlowIncDec; + /// Processor has AVX-512 PreFetch Instructions bool HasPFI; @@ -217,14 +226,31 @@ private: /// In16BitMode - True if compiling for 16-bit, false for 32-bit or 64-bit. bool In16BitMode; + // Calculates type size & alignment + const DataLayout DL; + X86SelectionDAGInfo TSInfo; + // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which + // X86TargetLowering needs. + X86InstrInfo InstrInfo; + X86TargetLowering TLInfo; + X86FrameLowering FrameLowering; + X86JITInfo JITInfo; + public: /// This constructor initializes the data members to match that /// of the specified triple. /// X86Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, + const std::string &FS, X86TargetMachine &TM, unsigned StackAlignOverride); + const X86TargetLowering *getTargetLowering() const { return &TLInfo; } + const X86InstrInfo *getInstrInfo() const { return &InstrInfo; } + const DataLayout *getDataLayout() const { return &DL; } + const X86FrameLowering *getFrameLowering() const { return &FrameLowering; } + const X86SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + X86JITInfo *getJITInfo() { return &JITInfo; } + /// getStackAlignment - Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. @@ -241,6 +267,9 @@ public: /// \brief Reset the features for the X86 target. void resetSubtargetFeatures(const MachineFunction *MF) override; private: + /// \brief Initialize the full set of dependencies so we can use an initializer + /// list for X86Subtarget. 
+ X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void initializeEnvironment(); void resetSubtargetFeatures(StringRef CPU, StringRef FS); public: @@ -319,6 +348,7 @@ public: bool callRegIndirect() const { return CallRegIndirect; } bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } + bool slowIncDec() const { return SlowIncDec; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 93760efe666d..f12140f1f161 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -29,61 +29,14 @@ extern "C" void LLVMInitializeX86Target() { void X86TargetMachine::anchor() { } -static std::string computeDataLayout(const X86Subtarget &ST) { - // X86 is little endian - std::string Ret = "e"; - - Ret += DataLayout::getManglingComponent(ST.getTargetTriple()); - // X86 and x32 have 32 bit pointers. - if (ST.isTarget64BitILP32() || !ST.is64Bit()) - Ret += "-p:32:32"; - - // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. - if (ST.is64Bit() || ST.isTargetCygMing() || ST.isTargetKnownWindowsMSVC() || - ST.isTargetNaCl()) - Ret += "-i64:64"; - else - Ret += "-f64:32:64"; - - // Some ABIs align long double to 128 bits, others to 32. - if (ST.isTargetNaCl()) - ; // No f80 - else if (ST.is64Bit() || ST.isTargetDarwin()) - Ret += "-f80:128"; - else - Ret += "-f80:32"; - - // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. - if (ST.is64Bit()) - Ret += "-n8:16:32:64"; - else - Ret += "-n8:16:32"; - - // The stack is aligned to 32 bits on some ABIs and 128 bits on others. - if (!ST.is64Bit() && (ST.isTargetCygMing() || ST.isTargetKnownWindowsMSVC())) - Ret += "-S32"; - else - Ret += "-S128"; - - return Ret; -} - /// X86TargetMachine ctor - Create an X86 target. /// -X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, +X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, Options.StackAlignmentOverride), - FrameLowering(*this, Subtarget), - InstrItins(Subtarget.getInstrItineraryData()), - DL(computeDataLayout(*getSubtargetImpl())), - InstrInfo(*this), - TLInfo(*this), - TSInfo(*this), - JITInfo(*this) { + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) { // Determine the PICStyle based on the target selected. if (getRelocationModel() == Reloc::Static) { // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. @@ -158,6 +111,7 @@ public: return *getX86TargetMachine().getSubtargetImpl(); } + void addIRPasses() override; bool addInstSelector() override; bool addILPOpts() override; bool addPreRegAlloc() override; @@ -170,6 +124,12 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { return new X86PassConfig(this, PM); } +void X86PassConfig::addIRPasses() { + addPass(createX86AtomicExpandPass(&getX86TargetMachine())); + + TargetPassConfig::addIRPasses(); +} + bool X86PassConfig::addInstSelector() { // Install an instruction selector. 
addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel())); diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 57e6eda6bc15..41d51570b9ab 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -13,12 +13,7 @@ #ifndef X86TARGETMACHINE_H #define X86TARGETMACHINE_H - -#include "X86FrameLowering.h" -#include "X86ISelLowering.h" #include "X86InstrInfo.h" -#include "X86JITInfo.h" -#include "X86SelectionDAGInfo.h" #include "X86Subtarget.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" @@ -30,13 +25,6 @@ class StringRef; class X86TargetMachine final : public LLVMTargetMachine { virtual void anchor(); X86Subtarget Subtarget; - X86FrameLowering FrameLowering; - InstrItineraryData InstrItins; - const DataLayout DL; // Calculates type size & alignment - X86InstrInfo InstrInfo; - X86TargetLowering TLInfo; - X86SelectionDAGInfo TSInfo; - X86JITInfo JITInfo; public: X86TargetMachine(const Target &T, StringRef TT, @@ -44,28 +32,28 @@ public: Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); - const DataLayout *getDataLayout() const override { return &DL; } + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); + } const X86InstrInfo *getInstrInfo() const override { - return &InstrInfo; + return getSubtargetImpl()->getInstrInfo(); } const TargetFrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - X86JITInfo *getJITInfo() override { - return &JITInfo; + return getSubtargetImpl()->getFrameLowering(); } + X86JITInfo *getJITInfo() override { return Subtarget.getJITInfo(); } const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; } const X86TargetLowering *getTargetLowering() const override { - return &TLInfo; + return getSubtargetImpl()->getTargetLowering(); } const X86SelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; + return getSubtargetImpl()->getSelectionDAGInfo(); } const X86RegisterInfo *getRegisterInfo() const override { return &getInstrInfo()->getRegisterInfo(); } const InstrItineraryData *getInstrItineraryData() const override { - return &InstrItins; + return &getSubtargetImpl()->getInstrItineraryData(); } /// \brief Register X86 analysis passes with a pass manager. diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 91b9d40f8ef7..c961e2f5b2c8 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -102,6 +102,8 @@ public: unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const override; + unsigned getIntImmCost(int64_t) const; + unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, @@ -142,13 +144,17 @@ unsigned X86TTI::getNumberOfRegisters(bool Vector) const { if (Vector && !ST->hasSSE1()) return 0; - if (ST->is64Bit()) + if (ST->is64Bit()) { + if (Vector && ST->hasAVX512()) + return 32; return 16; + } return 8; } unsigned X86TTI::getRegisterBitWidth(bool Vector) const { if (Vector) { + if (ST->hasAVX512()) return 512; if (ST->hasAVX()) return 256; if (ST->hasSSE1()) return 128; return 0; @@ -400,17 +406,117 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const { - // We only estimate the cost of reverse shuffles. 
-  if (Kind != SK_Reverse)
+  // We only estimate the cost of reverse and alternate shuffles.
+  if (Kind != SK_Reverse && Kind != SK_Alternate)
     return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);

-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
-  unsigned Cost = 1;
-  if (LT.second.getSizeInBits() > 128)
-    Cost = 3; // Extract + insert + copy.
+  if (Kind == SK_Reverse) {
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+    unsigned Cost = 1;
+    if (LT.second.getSizeInBits() > 128)
+      Cost = 3; // Extract + insert + copy.
+
+    // Multiply by the number of parts.
+    return Cost * LT.first;
+  }
+
+  if (Kind == SK_Alternate) {
+    // 64-bit packed float vectors (v2f32) are widened to type v4f32.
+    // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+    // The backend knows how to generate a single VEX.256 version of
+    // instruction VPBLENDW if the target supports AVX2.
+    if (ST->hasAVX2() && LT.second == MVT::v16i16)
+      return LT.first;
+
+    static const CostTblEntry<MVT::SimpleValueType> AVXAltShuffleTbl[] = {
+      {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1},  // vblendpd
+      {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1},  // vblendpd
+
+      {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1},  // vblendps
+      {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1},  // vblendps
+
+      // This shuffle is custom lowered into a sequence of:
+      // 2x vextractf128 , 2x vpblendw , 1x vinsertf128
+      {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},
+
+      // This shuffle is custom lowered into a long sequence of:
+      // 2x vextractf128 , 4x vpshufb , 2x vpor , 1x vinsertf128
+      {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
+    };
+
+    if (ST->hasAVX()) {
+      int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+      if (Idx != -1)
+        return LT.first * AVXAltShuffleTbl[Idx].Cost;
+    }
+
+    static const CostTblEntry<MVT::SimpleValueType> SSE41AltShuffleTbl[] = {
+      // These are lowered into movsd.
+      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+      // packed float vectors with four elements are lowered into BLENDI dag
+      // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
+      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+
+      // This shuffle generates a single pshufw.
+      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+
+      // There is no instruction that matches a v16i8 alternate shuffle.
+      // The backend will expand it into the sequence 'pshufb + pshufb + or'.
+      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
+    };
+
+    if (ST->hasSSE41()) {
+      int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+      if (Idx != -1)
+        return LT.first * SSE41AltShuffleTbl[Idx].Cost;
+    }
+
+    static const CostTblEntry<MVT::SimpleValueType> SSSE3AltShuffleTbl[] = {
+      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // movsd
+      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // movsd
+
+      // SSE3 doesn't have 'blendps'. The following shuffles are expanded into
+      // the sequence 'shufps + pshufd'
+      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
-  // Multiple by the number of parts.
-  return Cost * LT.first;
+      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3},  // pshufb + pshufb + or
+      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}   // pshufb + pshufb + or
+    };
+
+    if (ST->hasSSSE3()) {
+      int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+      if (Idx != -1)
+        return LT.first * SSSE3AltShuffleTbl[Idx].Cost;
+    }
+
+    static const CostTblEntry<MVT::SimpleValueType> SSEAltShuffleTbl[] = {
+      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // movsd
+      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // movsd
+
+      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},  // shufps + pshufd
+      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},  // shufps + pshufd
+
+      // This is expanded into a long sequence of four extract + four insert.
+      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8},  // 4 x pextrw + 4 pinsrw.
+
+      // 8 x (pinsrw + pextrw + and + movb + movzb + or)
+      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
+    };
+
+    // Fall-back (SSE3 and SSE2).
+    int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx != -1)
+      return LT.first * SSEAltShuffleTbl[Idx].Cost;
+    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+  }
+
+  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 }

 unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
@@ -808,6 +914,19 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
   return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
 }

+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+unsigned X86TTI::getIntImmCost(int64_t Val) const {
+  if (Val == 0)
+    return TCC_Free;
+
+  if (isInt<32>(Val))
+    return TCC_Basic;
+
+  return 2 * TCC_Basic;
+}
+
 unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
   assert(Ty->isIntegerTy());

@@ -825,11 +944,21 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
   if (Imm == 0)
     return TCC_Free;

-  if (Imm.getBitWidth() <= 64 &&
-      (isInt<32>(Imm.getSExtValue()) || isUInt<32>(Imm.getZExtValue())))
-    return TCC_Basic;
-  else
-    return 2 * TCC_Basic;
+  // Sign-extend all constants to a multiple of 64-bit.
+  APInt ImmVal = Imm;
+  if (BitSize & 0x3f)
+    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+  // Split the constant into 64-bit chunks and calculate the cost for each
+  // chunk.
+  unsigned Cost = 0;
+  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+    int64_t Val = Tmp.getSExtValue();
+    Cost += getIntImmCost(Val);
+  }
+  // We need at least one instruction to materialize the constant.
+  return std::max(1U, Cost);
 }

 unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
@@ -889,9 +1018,13 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
     break;
   }

-  if ((Idx == ImmIdx) &&
-      Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
-    return TCC_Free;
+  if (Idx == ImmIdx) {
+    unsigned NumConstants = (BitSize + 63) / 64;
+    unsigned Cost = X86TTI::getIntImmCost(Imm, Ty);
+    return (Cost <= NumConstants * TCC_Basic)
+               ?
static_cast<unsigned>(TCC_Free) + : Cost; + } return X86TTI::getIntImmCost(Imm, Ty); } diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp index 5499aba351c5..e6947369c2d7 100644 --- a/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/lib/Target/XCore/XCoreFrameLowering.cpp @@ -228,7 +228,9 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const { const XCoreInstrInfo &TII = *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo()); XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc dl; if (MFI->getMaxAlignment() > getStackAlignment()) report_fatal_error("emitPrologue unsupported alignment: " @@ -416,7 +418,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(*MF); DebugLoc DL; - if (MI != MBB.end()) + if (MI != MBB.end() && !MI->isDebugValue()) DL = MI->getDebugLoc(); for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin(); diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 9d785864952b..be7ef6420193 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -68,10 +68,9 @@ getTargetNodeName(unsigned Opcode) const } } -XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) - : TargetLowering(XTM, new XCoreTargetObjectFile()), - TM(XTM), - Subtarget(*XTM.getSubtargetImpl()) { +XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM) + : TargetLowering(TM, new XCoreTargetObjectFile()), TM(TM), + Subtarget(TM.getSubtarget<XCoreSubtarget>()) { // Set up the register classes. addRegisterClass(MVT::i32, &XCore::GRRegsRegClass); @@ -92,15 +91,12 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) // XCore does not have the NodeTypes below. 
setOperationAction(ISD::BR_CC, MVT::i32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); setOperationAction(ISD::ADDC, MVT::i32, Expand); setOperationAction(ISD::ADDE, MVT::i32, Expand); setOperationAction(ISD::SUBC, MVT::i32, Expand); setOperationAction(ISD::SUBE, MVT::i32, Expand); - // Stop the combiner recombining select and set_cc - setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); - // 64bit setOperationAction(ISD::ADD, MVT::i64, Custom); setOperationAction(ISD::SUB, MVT::i64, Custom); @@ -217,7 +213,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::SMUL_LOHI: return LowerSMUL_LOHI(Op, DAG); @@ -258,33 +253,21 @@ void XCoreTargetLowering::ReplaceNodeResults(SDNode *N, // Misc Lower Operation implementation //===----------------------------------------------------------------------===// -SDValue XCoreTargetLowering:: -LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const -{ - SDLoc dl(Op); - SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i32, Op.getOperand(2), - Op.getOperand(3), Op.getOperand(4)); - return DAG.getNode(ISD::SELECT, dl, MVT::i32, Cond, Op.getOperand(0), - Op.getOperand(1)); -} - SDValue XCoreTargetLowering::getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV, SelectionDAG &DAG) const { // FIXME there is no actual debug info here SDLoc dl(GA); - const GlobalValue *UnderlyingGV = GV; - // If GV is an alias then use the aliasee to determine the wrapper type - if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) - UnderlyingGV = GA->getAliasee(); - if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(UnderlyingGV)) { - if ((GVar->isConstant() && GV->hasLocalLinkage()) || - (GVar->hasSection() && - StringRef(GVar->getSection()).startswith(".cp."))) - return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA); - return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA); - } - return DAG.getNode(XCoreISD::PCRelativeWrapper, dl, MVT::i32, GA); + + if (GV->getType()->getElementType()->isFunctionTy()) + return DAG.getNode(XCoreISD::PCRelativeWrapper, dl, MVT::i32, GA); + + const auto *GVar = dyn_cast<GlobalVariable>(GV); + if ((GV->hasSection() && StringRef(GV->getSection()).startswith(".cp.")) || + (GVar && GVar->isConstant() && GV->hasLocalLinkage())) + return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA); + + return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA); } static bool IsSmallObject(const GlobalValue *GV, const XCoreTargetLowering &XTL) { @@ -508,7 +491,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { CLI.setDebugLoc(DL).setChain(Chain) .setCallee(CallingConv::C, IntPtrTy, DAG.getExternalSymbol("__misaligned_load", getPointerTy()), - &Args, 0); + std::move(Args), 0); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); SDValue Ops[] = { CallResult.first, CallResult.second }; @@ -568,7 +551,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), DAG.getExternalSymbol("__misaligned_store", getPointerTy()), - &Args, 0); + std::move(Args), 0); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.second; diff --git 
a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h index d28715b7178f..62b89c348dc7 100644 --- a/lib/Target/XCore/XCoreISelLowering.h +++ b/lib/Target/XCore/XCoreISelLowering.h @@ -94,7 +94,7 @@ namespace llvm { { public: - explicit XCoreTargetLowering(XCoreTargetMachine &TM); + explicit XCoreTargetLowering(const TargetMachine &TM); using TargetLowering::isZExtFree; bool isZExtFree(SDValue Val, EVT VT2) const override; @@ -123,7 +123,7 @@ namespace llvm { bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; private: - const XCoreTargetMachine &TM; + const TargetMachine &TM; const XCoreSubtarget &Subtarget; // Lower Operand helpers @@ -157,7 +157,6 @@ namespace llvm { SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp index 984f0cd9c4d3..36ea9a087da5 100644 --- a/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/lib/Target/XCore/XCoreInstrInfo.cpp @@ -373,7 +373,8 @@ void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { DebugLoc DL; - if (I != MBB.end()) DL = I->getDebugLoc(); + if (I != MBB.end() && !I->isDebugValue()) + DL = I->getDebugLoc(); MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); MachineMemOperand *MMO = @@ -395,7 +396,8 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { DebugLoc DL; - if (I != MBB.end()) DL = I->getDebugLoc(); + if (I != MBB.end() && !I->isDebugValue()) + DL = I->getDebugLoc(); MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); MachineMemOperand *MMO = @@ -440,7 +442,8 @@ MachineBasicBlock::iterator XCoreInstrInfo::loadImmediate( MachineBasicBlock::iterator MI, unsigned Reg, uint64_t Value) const { DebugLoc dl; - if (MI != MBB.end()) dl = MI->getDebugLoc(); + if (MI != MBB.end() && !MI->isDebugValue()) + dl = MI->getDebugLoc(); if (isImmMskBitp(Value)) { int N = Log2_32(Value) + 1; return BuildMI(MBB, MI, dl, get(XCore::MKMSK_rus), Reg).addImm(N); diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp index 5a6bbe7b1d3e..91b33fd6559c 100644 --- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp +++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp @@ -16,9 +16,8 @@ using namespace llvm; #define DEBUG_TYPE "xcore-selectiondag-info" -XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const XCoreTargetMachine &TM) - : TargetSelectionDAGInfo(TM) { -} +XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const DataLayout &DL) + : TargetSelectionDAGInfo(&DL) {} XCoreSelectionDAGInfo::~XCoreSelectionDAGInfo() { } @@ -47,7 +46,7 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY), Type::getVoidTy(*DAG.getContext()), DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy()), - &Args, 0) + std::move(Args), 0) .setDiscardResult(); std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.h b/lib/Target/XCore/XCoreSelectionDAGInfo.h index 
index ea6af980c01f..0079de1798b2 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -22,7 +22,7 @@ class XCoreTargetMachine;

 class XCoreSelectionDAGInfo : public TargetSelectionDAGInfo {
 public:
-  explicit XCoreSelectionDAGInfo(const XCoreTargetMachine &TM);
+  explicit XCoreSelectionDAGInfo(const DataLayout &DL);
   ~XCoreSelectionDAGInfo();

   SDValue
diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp
index 89ea03a88f6d..7227411ba560 100644
--- a/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/lib/Target/XCore/XCoreSubtarget.cpp
@@ -25,8 +25,8 @@ using namespace llvm;

 void XCoreSubtarget::anchor() { }

-XCoreSubtarget::XCoreSubtarget(const std::string &TT,
-                               const std::string &CPU, const std::string &FS)
-  : XCoreGenSubtargetInfo(TT, CPU, FS)
-{
-}
+XCoreSubtarget::XCoreSubtarget(const std::string &TT, const std::string &CPU,
+                               const std::string &FS, const TargetMachine &TM)
+    : XCoreGenSubtargetInfo(TT, CPU, FS),
+      DL("e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32"),
+      InstrInfo(), FrameLowering(*this), TLInfo(TM), TSInfo(DL) {}
diff --git a/lib/Target/XCore/XCoreSubtarget.h b/lib/Target/XCore/XCoreSubtarget.h
index 5ac4dbc4bc07..1e9810bb89ef 100644
--- a/lib/Target/XCore/XCoreSubtarget.h
+++ b/lib/Target/XCore/XCoreSubtarget.h
@@ -14,6 +14,11 @@
 #ifndef XCORESUBTARGET_H
 #define XCORESUBTARGET_H

+#include "XCoreFrameLowering.h"
+#include "XCoreISelLowering.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <string>
@@ -26,17 +31,31 @@ class StringRef;

 class XCoreSubtarget : public XCoreGenSubtargetInfo {
   virtual void anchor();
+  const DataLayout DL;       // Calculates type size & alignment
+  XCoreInstrInfo InstrInfo;
+  XCoreFrameLowering FrameLowering;
+  XCoreTargetLowering TLInfo;
+  XCoreSelectionDAGInfo TSInfo;

 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
   ///
   XCoreSubtarget(const std::string &TT, const std::string &CPU,
-                 const std::string &FS);
+                 const std::string &FS, const TargetMachine &TM);

   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  const XCoreInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  const XCoreFrameLowering *getFrameLowering() const { return &FrameLowering; }
+  const XCoreTargetLowering *getTargetLowering() const { return &TLInfo; }
+  const XCoreSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+  const TargetRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const DataLayout *getDataLayout() const { return &DL; }
 };

 } // End llvm namespace
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 0fb21c5d7dfb..8d8bb3800ea5 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -25,13 +25,8 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
                                        const TargetOptions &Options,
                                        Reloc::Model RM, CodeModel::Model CM,
                                        CodeGenOpt::Level OL)
-  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
-    Subtarget(TT, CPU, FS),
-    DL("e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32"),
-    InstrInfo(),
-    FrameLowering(Subtarget),
-    TLInfo(*this),
-    TSInfo(*this) {
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
 }

diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index a57ca55f3c10..14c43bf151f4 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -14,46 +14,38 @@
 #ifndef XCORETARGETMACHINE_H
 #define XCORETARGETMACHINE_H

-#include "XCoreFrameLowering.h"
-#include "XCoreISelLowering.h"
-#include "XCoreInstrInfo.h"
-#include "XCoreSelectionDAGInfo.h"
 #include "XCoreSubtarget.h"
-#include "llvm/IR/DataLayout.h"
 #include "llvm/Target/TargetMachine.h"

 namespace llvm {

 class XCoreTargetMachine : public LLVMTargetMachine {
   XCoreSubtarget Subtarget;
-  const DataLayout DL;       // Calculates type size & alignment
-  XCoreInstrInfo InstrInfo;
-  XCoreFrameLowering FrameLowering;
-  XCoreTargetLowering TLInfo;
-  XCoreSelectionDAGInfo TSInfo;
 public:
   XCoreTargetMachine(const Target &T, StringRef TT,
                      StringRef CPU, StringRef FS, const TargetOptions &Options,
                      Reloc::Model RM, CodeModel::Model CM,
                      CodeGenOpt::Level OL);

-  const XCoreInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const XCoreInstrInfo *getInstrInfo() const override {
+    return getSubtargetImpl()->getInstrInfo();
+  }
   const XCoreFrameLowering *getFrameLowering() const override {
-    return &FrameLowering;
+    return getSubtargetImpl()->getFrameLowering();
   }
   const XCoreSubtarget *getSubtargetImpl() const override { return &Subtarget; }
   const XCoreTargetLowering *getTargetLowering() const override {
-    return &TLInfo;
+    return getSubtargetImpl()->getTargetLowering();
   }
-
   const XCoreSelectionDAGInfo* getSelectionDAGInfo() const override {
-    return &TSInfo;
+    return getSubtargetImpl()->getSelectionDAGInfo();
   }
-
   const TargetRegisterInfo *getRegisterInfo() const override {
-    return &InstrInfo.getRegisterInfo();
+    return getSubtargetImpl()->getRegisterInfo();
+  }
+  const DataLayout *getDataLayout() const override {
+    return getSubtargetImpl()->getDataLayout();
   }
-  const DataLayout *getDataLayout() const override { return &DL; }

   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
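The XCoreSubtarget/XCoreTargetMachine hunks above follow a common LLVM refactoring pattern: the subtarget becomes the sole owner of the per-target objects (instruction info, frame lowering, DAG lowering, DataLayout), and the target machine's accessors merely delegate to it. Below is a minimal standalone sketch of that pattern; the Widget* names are illustrative stand-ins, not real LLVM classes.

// Sketch of the "subtarget owns everything" pattern (illustrative names).
class WidgetInstrInfo {};
class WidgetFrameLowering {};

class WidgetSubtarget {
  // Declaration order fixes construction order, so later members can
  // safely depend on earlier ones in the constructor initializer list.
  WidgetInstrInfo InstrInfo;
  WidgetFrameLowering FrameLowering;

public:
  const WidgetInstrInfo *getInstrInfo() const { return &InstrInfo; }
  const WidgetFrameLowering *getFrameLowering() const { return &FrameLowering; }
};

class WidgetTargetMachine {
  WidgetSubtarget Subtarget; // single owner of target-specific state

public:
  const WidgetSubtarget *getSubtargetImpl() const { return &Subtarget; }
  // Accessors delegate to the subtarget instead of holding duplicate members.
  const WidgetInstrInfo *getInstrInfo() const {
    return getSubtargetImpl()->getInstrInfo();
  }
  const WidgetFrameLowering *getFrameLowering() const {
    return getSubtargetImpl()->getFrameLowering();
  }
};

int main() {
  WidgetTargetMachine TM;
  // Both access paths resolve to the same subtarget-owned object.
  return TM.getInstrInfo() == TM.getSubtargetImpl()->getInstrInfo() ? 0 : 1;
}

The same ordering concern explains the diff itself: in XCoreSubtarget, DL is declared before TLInfo and TSInfo, so the initializer list TLInfo(TM), TSInfo(DL) runs after DL is fully constructed.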