7 files changed, 75 insertions, 16 deletions
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 3688db943d5..4151175a019 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -4135,6 +4135,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     case ARM::VLD3d32Pseudo:
     case ARM::VLD1d64TPseudo:
     case ARM::VLD1d64TPseudoWB_fixed:
+    case ARM::VLD1d64TPseudoWB_register:
     case ARM::VLD3d8Pseudo_UPD:
     case ARM::VLD3d16Pseudo_UPD:
     case ARM::VLD3d32Pseudo_UPD:
@@ -4152,6 +4153,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     case ARM::VLD4d32Pseudo:
     case ARM::VLD1d64QPseudo:
     case ARM::VLD1d64QPseudoWB_fixed:
+    case ARM::VLD1d64QPseudoWB_register:
     case ARM::VLD4d8Pseudo_UPD:
     case ARM::VLD4d16Pseudo_UPD:
     case ARM::VLD4d32Pseudo_UPD:
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index be7afae5ea4..a488eee92f9 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -156,8 +156,10 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
 
 { ARM::VLD1d64QPseudo,      ARM::VLD1d64Q,     true,  false, false, SingleSpc,  4, 1 ,false},
 { ARM::VLD1d64QPseudoWB_fixed,  ARM::VLD1d64Qwb_fixed,   true,  true, false, SingleSpc,  4, 1 ,false},
+{ ARM::VLD1d64QPseudoWB_register,  ARM::VLD1d64Qwb_register,   true,  true, true, SingleSpc,  4, 1 ,false},
 { ARM::VLD1d64TPseudo,      ARM::VLD1d64T,     true,  false, false, SingleSpc,  3, 1 ,false},
 { ARM::VLD1d64TPseudoWB_fixed,  ARM::VLD1d64Twb_fixed,   true,  true, false, SingleSpc,  3, 1 ,false},
+{ ARM::VLD1d64TPseudoWB_register,  ARM::VLD1d64Twb_register, true, true, true,  SingleSpc,  3, 1 ,false},
 
 { ARM::VLD2LNd16Pseudo,     ARM::VLD2LNd16,     true, false, false, SingleSpc,  2, 4 ,true},
 { ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true,  SingleSpc,  2, 4 ,true},
@@ -1496,6 +1498,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     case ARM::VLD3d32Pseudo:
     case ARM::VLD1d64TPseudo:
     case ARM::VLD1d64TPseudoWB_fixed:
+    case ARM::VLD1d64TPseudoWB_register:
     case ARM::VLD3d8Pseudo_UPD:
     case ARM::VLD3d16Pseudo_UPD:
     case ARM::VLD3d32Pseudo_UPD:
@@ -1513,6 +1516,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     case ARM::VLD4d32Pseudo:
     case ARM::VLD1d64QPseudo:
     case ARM::VLD1d64QPseudoWB_fixed:
+    case ARM::VLD1d64QPseudoWB_register:
     case ARM::VLD4d8Pseudo_UPD:
     case ARM::VLD4d16Pseudo_UPD:
     case ARM::VLD4d32Pseudo_UPD:
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index efeef6c9943..387e8dc582c 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1932,15 +1932,17 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
     Ops.push_back(Align);
     if (isUpdating) {
       SDValue Inc = N->getOperand(AddrOpIdx + 1);
-      // FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0
-      // case entirely when the rest are updated to that form, too.
       bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs);
-      if ((NumVecs <= 2) && !IsImmUpdate)
-        Opc = getVLDSTRegisterUpdateOpcode(Opc);
-      // FIXME: We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
-      // check for that explicitly too. Horribly hacky, but temporary.
-      if ((NumVecs > 2 && !isVLDfixed(Opc)) || !IsImmUpdate)
-        Ops.push_back(IsImmUpdate ? Reg0 : Inc);
+      if (!IsImmUpdate) {
+        // We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
+        // check for the opcode rather than the number of vector elements.
+        if (isVLDfixed(Opc))
+          Opc = getVLDSTRegisterUpdateOpcode(Opc);
+        Ops.push_back(Inc);
+      // VLD1/VLD2 fixed increment does not need Reg0 so only include it in
+      // the operands if not such an opcode.
+      } else if (!isVLDfixed(Opc))
+        Ops.push_back(Reg0);
     }
     Ops.push_back(Pred);
     Ops.push_back(Reg0);
@@ -2086,16 +2088,17 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
     Ops.push_back(Align);
     if (isUpdating) {
       SDValue Inc = N->getOperand(AddrOpIdx + 1);
-      // FIXME: VST1/VST2 fixed increment doesn't need Reg0. Remove the reg0
-      // case entirely when the rest are updated to that form, too.
       bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs);
-      if (NumVecs <= 2 && !IsImmUpdate)
-        Opc = getVLDSTRegisterUpdateOpcode(Opc);
-      // FIXME: We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so
-      // check for that explicitly too. Horribly hacky, but temporary.
-      if  (!IsImmUpdate)
+      if (!IsImmUpdate) {
+        // We use a VST1 for v1i64 even if the pseudo says VST2/3/4, so
+        // check for the opcode rather than the number of vector elements.
+        if (isVSTfixed(Opc))
+          Opc = getVLDSTRegisterUpdateOpcode(Opc);
         Ops.push_back(Inc);
-      else if (NumVecs > 2 && !isVSTfixed(Opc))
+      }
+      // VST1/VST2 fixed increment does not need Reg0 so only include it in
+      // the operands if not such an opcode.
+      else if (!isVSTfixed(Opc))
         Ops.push_back(Reg0);
     }
     Ops.push_back(SrcReg);
diff --git a/test/CodeGen/ARM/vld3.ll b/test/CodeGen/ARM/vld3.ll
index 0eaad0f9003..46e17c97e6e 100644
--- a/test/CodeGen/ARM/vld3.ll
+++ b/test/CodeGen/ARM/vld3.ll
@@ -96,6 +96,19 @@ define <1 x i64> @vld3i64_update(i64** %ptr, i64* %A) nounwind {
         ret <1 x i64> %tmp4
 }
 
+define <1 x i64> @vld3i64_reg_update(i64** %ptr, i64* %A) nounwind {
+;CHECK-LABEL: vld3i64_reg_update:
+;CHECK: vld1.64	{d16, d17, d18}, [{{r[0-9]+|lr}}:64], {{r[0-9]+|lr}}
+        %tmp0 = bitcast i64* %A to i8*
+        %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64.p0i8(i8* %tmp0, i32 16)
+        %tmp5 = getelementptr i64, i64* %A, i32 1
+        store i64* %tmp5, i64** %ptr
+        %tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0
+        %tmp3 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 2
+        %tmp4 = add <1 x i64> %tmp2, %tmp3
+        ret <1 x i64> %tmp4
+}
+
 define <16 x i8> @vld3Qi8(i8* %A) nounwind {
 ;CHECK-LABEL: vld3Qi8:
 ;Check the alignment value.  Max for this instruction is 64 bits:
diff --git a/test/CodeGen/ARM/vld4.ll b/test/CodeGen/ARM/vld4.ll
index 5663e6d41f0..679c9ae4450 100644
--- a/test/CodeGen/ARM/vld4.ll
+++ b/test/CodeGen/ARM/vld4.ll
@@ -96,6 +96,19 @@ define <1 x i64> @vld4i64_update(i64** %ptr, i64* %A) nounwind {
         ret <1 x i64> %tmp4
 }
 
+define <1 x i64> @vld4i64_reg_update(i64** %ptr, i64* %A) nounwind {
+;CHECK-LABEL: vld4i64_reg_update:
+;CHECK: vld1.64 {d16, d17, d18, d19}, [{{r[0-9]+|lr}}:256], {{r[0-9]+|lr}}
+        %tmp0 = bitcast i64* %A to i8*
+        %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64.p0i8(i8* %tmp0, i32 64)
+        %tmp5 = getelementptr i64, i64* %A, i32 1
+        store i64* %tmp5, i64** %ptr
+        %tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0
+        %tmp3 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 2
+        %tmp4 = add <1 x i64> %tmp2, %tmp3
+        ret <1 x i64> %tmp4
+}
+
 define <16 x i8> @vld4Qi8(i8* %A) nounwind {
 ;CHECK-LABEL: vld4Qi8:
 ;Check the alignment value.  Max for this instruction is 256 bits:
diff --git a/test/CodeGen/ARM/vst3.ll b/test/CodeGen/ARM/vst3.ll
index d70d5957900..1723304e82b 100644
--- a/test/CodeGen/ARM/vst3.ll
+++ b/test/CodeGen/ARM/vst3.ll
@@ -73,6 +73,18 @@ define void @vst3i64_update(i64** %ptr, <1 x i64>* %B) nounwind {
         ret void
 }
 
+define void @vst3i64_reg_update(i64** %ptr, <1 x i64>* %B) nounwind {
+;CHECK-LABEL: vst3i64_reg_update
+;CHECK: vst1.64	{d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}], r{{.*}}
+        %A = load i64*, i64** %ptr
+        %tmp0 = bitcast i64* %A to i8*
+        %tmp1 = load <1 x i64>, <1 x i64>* %B
+        call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
+        %tmp2 = getelementptr i64, i64* %A, i32 1
+        store i64* %tmp2, i64** %ptr
+        ret void
+}
+
 define void @vst3Qi8(i8* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: vst3Qi8:
 ;Check the alignment value.  Max for this instruction is 64 bits:
diff --git a/test/CodeGen/ARM/vst4.ll b/test/CodeGen/ARM/vst4.ll
index afa4321c91a..ca9e5e7a59d 100644
--- a/test/CodeGen/ARM/vst4.ll
+++ b/test/CodeGen/ARM/vst4.ll
@@ -72,6 +72,18 @@ define void @vst4i64_update(i64** %ptr, <1 x i64>* %B) nounwind {
         ret void
 }
 
+define void @vst4i64_reg_update(i64** %ptr, <1 x i64>* %B) nounwind {
+;CHECK-LABEL: vst4i64_reg_update:
+;CHECK: vst1.64	{d16, d17, d18, d19}, [r{{[0-9]+}}], r{{[0-9]+}}
+        %A = load i64*, i64** %ptr
+        %tmp0 = bitcast i64* %A to i8*
+        %tmp1 = load <1 x i64>, <1 x i64>* %B
+        call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
+        %tmp2 = getelementptr i64, i64* %A, i32 1
+        store i64* %tmp2, i64** %ptr
+        ret void
+}
+
 define void @vst4Qi8(i8* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: vst4Qi8:
 ;Check the alignment value.  Max for this instruction is 256 bits: