-rw-r--r--  include/clang/Basic/BuiltinsAArch64.def            7
-rw-r--r--  include/clang/Basic/TargetBuiltins.h                3
-rw-r--r--  include/clang/Basic/arm_neon.td                   110
-rw-r--r--  include/clang/Sema/Sema.h                           2
-rw-r--r--  lib/AST/ItaniumMangle.cpp                          75
-rw-r--r--  lib/Basic/Targets.cpp                              39
-rw-r--r--  lib/CodeGen/CGBuiltin.cpp                         197
-rw-r--r--  lib/Driver/Tools.cpp                               14
-rw-r--r--  lib/Driver/Tools.h                                  2
-rw-r--r--  lib/Sema/SemaChecking.cpp                          87
-rw-r--r--  lib/Sema/SemaType.cpp                              47
-rw-r--r--  test/CodeGen/aarch64-neon-intrinsics.c           3023
-rw-r--r--  test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp    85
-rw-r--r--  test/CodeGenCXX/mangle-neon-vectors.cpp            17
-rw-r--r--  test/Preprocessor/aarch64-target-features.c         3
-rw-r--r--  test/Sema/aarch64-neon-vector-types.c              34
-rw-r--r--  utils/TableGen/NeonEmitter.cpp                    574
17 files changed, 4169 insertions, 150 deletions
diff --git a/include/clang/Basic/BuiltinsAArch64.def b/include/clang/Basic/BuiltinsAArch64.def
index 768e4bb26c..aafd202aae 100644
--- a/include/clang/Basic/BuiltinsAArch64.def
+++ b/include/clang/Basic/BuiltinsAArch64.def
@@ -16,3 +16,10 @@
// In libgcc
BUILTIN(__clear_cache, "vv*v*", "i")
+// NEON
+#define GET_NEON_AARCH64_BUILTINS
+#include "clang/Basic/arm_neon.inc"
+#undef GET_NEON_AARCH64_BUILTINS
+#undef GET_NEON_BUILTINS
+
+#undef BUILTIN
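
Note: the entries pulled in from arm_neon.inc under GET_NEON_AARCH64_BUILTINS
follow the same BUILTIN(...) shape as the hand-written __clear_cache entry
above. A hypothetical example of the generated form (illustrative only; the
exact signature strings come from utils/TableGen/NeonEmitter.cpp):

    // Illustrative only -- shape of a NeonEmitter-generated entry. The
    // trailing 'i' parameter carries the NeonTypeFlags value that Sema and
    // CodeGen later use to pick the concrete overload.
    BUILTIN(__builtin_neon_vmulx_v, "V8ScV8ScV8Sci", "n")
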
diff --git a/include/clang/Basic/TargetBuiltins.h b/include/clang/Basic/TargetBuiltins.h
index 66e378fa9b..4202a4a502 100644
--- a/include/clang/Basic/TargetBuiltins.h
+++ b/include/clang/Basic/TargetBuiltins.h
@@ -91,7 +91,8 @@ namespace clang {
Poly8,
Poly16,
Float16,
- Float32
+ Float32,
+ Float64
};
NeonTypeFlags(unsigned F) : Flags(F) {}
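
Note: a minimal sketch of how a flags value decodes, assuming the bitfield
layout declared elsewhere in this class (low 4 bits = element type, 0x10 =
unsigned, 0x20 = quad). Float64 plus the quad bit therefore names a
float64x2_t:

    #include "clang/Basic/TargetBuiltins.h"

    // Sketch: decode a NeonTypeFlags word (layout assumed as described above).
    unsigned Flags = clang::NeonTypeFlags::Float64 | 0x20; // quad f64 vector
    bool IsQuad   = Flags & 0x20; // true -> 128-bit variant (2 x f64)
    unsigned Elt  = Flags & 0xf;  // == NeonTypeFlags::Float64
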
diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td
index 77bc797c50..ea03e8fe8a 100644
--- a/include/clang/Basic/arm_neon.td
+++ b/include/clang/Basic/arm_neon.td
@@ -69,6 +69,7 @@ def OP_REINT : Op;
def OP_ABDL : Op;
def OP_ABA : Op;
def OP_ABAL : Op;
+def OP_DIV : Op;
class Inst <string n, string p, string t, Op o> {
string Name = n;
@@ -77,6 +78,7 @@ class Inst <string n, string p, string t, Op o> {
Op Operand = o;
bit isShift = 0;
bit isVCVT_N = 0;
+ bit isA64 = 0;
// Certain intrinsics have different names than their representative
// instructions. This field allows us to handle this correctly when we
@@ -145,6 +147,7 @@ class NoTestOpInst<string n, string p, string t, Op o> : Inst<n, p, t, o> {}
// l: long
// f: float
// h: half-float
+// d: double
// size modifiers:
// U: unsigned
@@ -452,3 +455,110 @@ def VREINTERPRET
// Vector fused multiply-add operations
def VFMA : SInst<"vfma", "dddd", "fQf">;
+
+////////////////////////////////////////////////////////////////////////////////
+// AArch64 Intrinsics
+
+let isA64 = 1 in {
+
+////////////////////////////////////////////////////////////////////////////////
+// Addition
+// With additional Qd type.
+def ADD : IOpInst<"vadd", "ddd", "csilfUcUsUiUlQcQsQiQlQfQUcQUsQUiQUlQd", OP_ADD>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Subtraction
+// With additional Qd type.
+def SUB : IOpInst<"vsub", "ddd", "csilfUcUsUiUlQcQsQiQlQfQUcQUsQUiQUlQd", OP_SUB>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Multiplication
+// With additional Qd type.
+def MUL : IOpInst<"vmul", "ddd", "csifUcUsUiQcQsQiQfQUcQUsQUiQd", OP_MUL>;
+def MLA : IOpInst<"vmla", "dddd", "csifUcUsUiQcQsQiQfQUcQUsQUiQd", OP_MLA>;
+def MLS : IOpInst<"vmls", "dddd", "csifUcUsUiQcQsQiQfQUcQUsQUiQd", OP_MLS>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Multiplication Extended
+def MULX : SInst<"vmulx", "ddd", "fQfQd">;
+
+////////////////////////////////////////////////////////////////////////////////
+// Division
+def FDIV : IOpInst<"vdiv", "ddd", "fQfQd", OP_DIV>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector fused multiply-add operations
+// With additional Qd type.
+def FMLA : SInst<"vfma", "dddd", "fQfQd">;
+def FMLS : SInst<"vfms", "dddd", "fQfQd">;
+
+////////////////////////////////////////////////////////////////////////////////
+// Logical operations
+// With additional Qd type.
+def BSL : SInst<"vbsl", "dudd", "csilUcUsUiUlfPcPsQcQsQiQlQUcQUsQUiQUlQfQPcQPsQd">;
+
+////////////////////////////////////////////////////////////////////////////////
+// Absolute Difference
+// With additional Qd type.
+def ABD : SInst<"vabd", "ddd", "csiUcUsUifQcQsQiQUcQUsQUiQfQd">;
+
+////////////////////////////////////////////////////////////////////////////////
+// Reciprocal/Sqrt
+// With additional Qd type.
+def FRECPS : IInst<"vrecps", "ddd", "fQfQd">;
+def FRSQRTS : IInst<"vrsqrts", "ddd", "fQfQd">;
+
+////////////////////////////////////////////////////////////////////////////////
+// Comparison
+// With additional Qd type.
+def FCAGE : IInst<"vcage", "udd", "fQfQd">;
+def FCAGT : IInst<"vcagt", "udd", "fQfQd">;
+def FCALE : IInst<"vcale", "udd", "fQfQd">;
+def FCALT : IInst<"vcalt", "udd", "fQfQd">;
+// With additional Ql, QUl, Qd types.
+def CMTST : WInst<"vtst", "udd", "csiUcUsUiPcQcQsQiQlQUcQUsQUiQUlQPc">;
+def CFMEQ : SOpInst<"vceq", "udd",
+ "csifUcUsUiPcQcQsQiQlQfQUcQUsQUiQUlQPcQd", OP_EQ>;
+def CFMGE : SOpInst<"vcge", "udd", "csifUcUsUiQcQsQiQlQfQUcQUsQUiQUlQd", OP_GE>;
+def CFMLE : SOpInst<"vcle", "udd", "csifUcUsUiQcQsQiQlQfQUcQUsQUiQUlQd", OP_LE>;
+def CFMGT : SOpInst<"vcgt", "udd", "csifUcUsUiQcQsQiQlQfQUcQUsQUiQUlQd", OP_GT>;
+def CFMLT : SOpInst<"vclt", "udd", "csifUcUsUiQcQsQiQlQfQUcQUsQUiQUlQd", OP_LT>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Max/Min Integer
+// With additional Qd type.
+def MAX : SInst<"vmax", "ddd", "csiUcUsUifQcQsQiQUcQUsQUiQfQd">;
+def MIN : SInst<"vmin", "ddd", "csiUcUsUifQcQsQiQUcQUsQUiQfQd">;
+
+////////////////////////////////////////////////////////////////////////////////
+// MaxNum/MinNum Floating Point
+def FMAXNM : SInst<"vmaxnm", "ddd", "fQfQd">;
+def FMINNM : SInst<"vminnm", "ddd", "fQfQd">;
+
+////////////////////////////////////////////////////////////////////////////////
+// Pairwise Max/Min
+// With additional Qc Qs Qi QUc QUs QUi Qf Qd types.
+def MAXP : SInst<"vpmax", "ddd", "csiUcUsUifQcQsQiQUcQUsQUiQfQd">;
+def MINP : SInst<"vpmin", "ddd", "csiUcUsUifQcQsQiQUcQUsQUiQfQd">;
+
+////////////////////////////////////////////////////////////////////////////////
+// Pairwise MaxNum/MinNum Floating Point
+def FMAXNMP : SInst<"vpmaxnm", "ddd", "fQfQd">;
+def FMINNMP : SInst<"vpminnm", "ddd", "fQfQd">;
+
+////////////////////////////////////////////////////////////////////////////////
+// Pairwise Addition
+// With additional Qc Qs Qi QUc QUs QUi Qf Qd types.
+def ADDP : IInst<"vpadd", "ddd", "csiUcUsUifQcQsQiQUcQUsQUiQfQd">;
+
+////////////////////////////////////////////////////////////////////////////////
+// Scalar Arithmetic
+
+// Scalar Addition
+
+def SCALAR_ADD : Inst<"vaddd", "ddd", "lUl", OP_ADD>;
+
+// Scalar Subtraction
+def SCALAR_SUB : Inst<"vsubd", "ddd", "lUl", OP_SUB>;
+
+}
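
Note: as a usage-level illustration of the new scalar defs, the "ddd"
prototype over type string "lUl" produces 64-bit scalar intrinsics, which the
tests added later in this patch exercise (see test_vaddd_s64 and
test_vsubd_u64 in aarch64-neon-intrinsics.c):

    #include <arm_neon.h>

    // Sketch: the scalar forms generated from SCALAR_ADD/SCALAR_SUB above.
    int64x1_t  add_d(int64x1_t a, int64x1_t b)   { return vaddd_s64(a, b); } // add d, d, d
    uint64x1_t sub_d(uint64x1_t a, uint64x1_t b) { return vsubd_u64(a, b); } // sub d, d, d
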
diff --git a/include/clang/Sema/Sema.h b/include/clang/Sema/Sema.h
index 752dd985a9..d14b38eb0c 100644
--- a/include/clang/Sema/Sema.h
+++ b/include/clang/Sema/Sema.h
@@ -7557,7 +7557,7 @@ private:
bool CheckARMBuiltinExclusiveCall(unsigned BuiltinID, CallExpr *TheCall);
bool CheckARMBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
-
+ bool CheckAArch64BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
bool CheckMipsBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
bool SemaBuiltinVAStart(CallExpr *TheCall);
diff --git a/lib/AST/ItaniumMangle.cpp b/lib/AST/ItaniumMangle.cpp
index 76a8bf4ecc..7016ee1d65 100644
--- a/lib/AST/ItaniumMangle.cpp
+++ b/lib/AST/ItaniumMangle.cpp
@@ -360,6 +360,7 @@ private:
void mangleBareFunctionType(const FunctionType *T,
bool MangleReturnType);
void mangleNeonVectorType(const VectorType *T);
+ void mangleAArch64NeonVectorType(const VectorType *T);
void mangleIntegerLiteral(QualType T, const llvm::APSInt &Value);
void mangleMemberExpr(const Expr *base, bool isArrow,
@@ -2174,7 +2175,9 @@ void CXXNameMangler::mangleNeonVectorType(const VectorType *T) {
case BuiltinType::LongLong: EltName = "int64_t"; break;
case BuiltinType::ULongLong: EltName = "uint64_t"; break;
case BuiltinType::Float: EltName = "float32_t"; break;
- default: llvm_unreachable("unexpected Neon vector element type");
+    case BuiltinType::Half: EltName = "float16_t"; break;
+ default:
+ llvm_unreachable("unexpected Neon vector element type");
}
}
const char *BaseName = 0;
@@ -2190,6 +2193,70 @@ void CXXNameMangler::mangleNeonVectorType(const VectorType *T) {
Out << BaseName << EltName;
}
+static StringRef mangleAArch64VectorBase(const BuiltinType *EltType) {
+ switch (EltType->getKind()) {
+ case BuiltinType::SChar:
+ return "Int8";
+ case BuiltinType::Short:
+ return "Int16";
+ case BuiltinType::Int:
+ return "Int32";
+ case BuiltinType::LongLong:
+ return "Int64";
+ case BuiltinType::UChar:
+ return "Uint8";
+ case BuiltinType::UShort:
+ return "Uint16";
+ case BuiltinType::UInt:
+ return "Uint32";
+ case BuiltinType::ULongLong:
+ return "Uint64";
+ case BuiltinType::Half:
+ return "Float16";
+ case BuiltinType::Float:
+ return "Float32";
+ case BuiltinType::Double:
+ return "Float64";
+ default:
+ llvm_unreachable("Unexpected vector element base type");
+ }
+}
+
+// AArch64's ABI for Neon vector types specifies that they should be mangled as
+// the equivalent internal name. The vector type must be one of the special
+// types predefined by ARM.
+void CXXNameMangler::mangleAArch64NeonVectorType(const VectorType *T) {
+ QualType EltType = T->getElementType();
+ assert(EltType->isBuiltinType() && "Neon vector element not a BuiltinType");
+ unsigned BitSize =
+ (T->getNumElements() * getASTContext().getTypeSize(EltType));
+
+ assert((BitSize == 64 || BitSize == 128) &&
+ "Neon vector type not 64 or 128 bits");
+
+ assert(getASTContext().getTypeSize(EltType) != BitSize &&
+ "Vector of 1 element not permitted");
+
+ StringRef EltName;
+ if (T->getVectorKind() == VectorType::NeonPolyVector) {
+ switch (cast<BuiltinType>(EltType)->getKind()) {
+ case BuiltinType::UChar:
+ EltName = "Poly8";
+ break;
+ case BuiltinType::UShort:
+ EltName = "Poly16";
+ break;
+ default:
+ llvm_unreachable("unexpected Neon polynomial vector element type");
+ }
+ } else
+ EltName = mangleAArch64VectorBase(cast<BuiltinType>(EltType));
+
+ std::string TypeName =
+ ("__" + EltName + "x" + llvm::utostr(T->getNumElements()) + "_t").str();
+ Out << TypeName.length() << TypeName;
+}
+
// GNU extension: vector types
// <type> ::= <vector-type>
// <vector-type> ::= Dv <positive dimension number> _
@@ -2201,7 +2268,11 @@ void CXXNameMangler::mangleNeonVectorType(const VectorType *T) {
void CXXNameMangler::mangleType(const VectorType *T) {
if ((T->getVectorKind() == VectorType::NeonVector ||
T->getVectorKind() == VectorType::NeonPolyVector)) {
- mangleNeonVectorType(T);
+ if (getASTContext().getTargetInfo().getTriple().getArch() ==
+ llvm::Triple::aarch64)
+ mangleAArch64NeonVectorType(T);
+ else
+ mangleNeonVectorType(T);
return;
}
Out << "Dv" << T->getNumElements() << '_';
diff --git a/lib/Basic/Targets.cpp b/lib/Basic/Targets.cpp
index 596eb8cb16..718f3bb223 100644
--- a/lib/Basic/Targets.cpp
+++ b/lib/Basic/Targets.cpp
@@ -3177,7 +3177,14 @@ class AArch64TargetInfo : public TargetInfo {
static const char * const GCCRegNames[];
static const TargetInfo::GCCRegAlias GCCRegAliases[];
+ enum FPUModeEnum {
+ FPUMode,
+ NeonMode
+ };
+
+ unsigned FPU;
static const Builtin::Info BuiltinInfo[];
+
public:
AArch64TargetInfo(const llvm::Triple &Triple) : TargetInfo(Triple) {
BigEndian = false;
@@ -3242,7 +3249,14 @@ public:
Opts.ShortEnums ? "1" : "4");
if (BigEndian)
- Builder.defineMacro("__ARM_BIG_ENDIAN");
+ Builder.defineMacro("__AARCH_BIG_ENDIAN");
+
+ if (FPU == NeonMode) {
+ Builder.defineMacro("__AARCH_FEATURE_ADVSIMD");
+
+ // 64-bit NEON supports half, single and double precision operations.
+ Builder.defineMacro("__AARCH_ADVSIMD_FP", "0xe");
+ }
}
virtual void getTargetBuiltins(const Builtin::Info *&Records,
unsigned &NumRecords) const {
@@ -3250,9 +3264,28 @@ public:
NumRecords = clang::AArch64::LastTSBuiltin-Builtin::FirstTSBuiltin;
}
virtual bool hasFeature(StringRef Feature) const {
- return Feature == "aarch64";
+ return Feature == "aarch64" || (Feature == "neon" && FPU == NeonMode);
}
- virtual void getGCCRegNames(const char * const *&Names,
+
+ virtual bool setFeatureEnabled(llvm::StringMap<bool> &Features,
+ StringRef Name, bool Enabled) const {
+ if (Name == "neon") {
+ Features[Name] = Enabled;
+ return true;
+ }
+
+ return false;
+ }
+
+ virtual void HandleTargetFeatures(std::vector<std::string> &Features) {
+ FPU = FPUMode;
+ for (unsigned i = 0, e = Features.size(); i != e; ++i) {
+ if (Features[i] == "+neon")
+ FPU = NeonMode;
+ }
+ }
+
+ virtual void getGCCRegNames(const char *const *&Names,
unsigned &NumNames) const;
virtual void getGCCRegAliases(const GCCRegAlias *&Aliases,
unsigned &NumAliases) const;
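
Note: sketch of the user-visible effect. When cc1 is passed
-target-feature +neon, HandleTargetFeatures sets FPU = NeonMode and the
macros above get defined, so code can gate NEON usage on them (macro names
as spelled by this patch):

    // Sketch: guard NEON code on the feature macro this patch defines.
    #if defined(__AARCH_FEATURE_ADVSIMD)
    #include <arm_neon.h>
    #endif
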
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index 5b41237585..d1dd7a0958 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -1614,6 +1614,8 @@ static llvm::VectorType *GetNeonType(CodeGenFunction *CGF,
return llvm::VectorType::get(CGF->Int64Ty, 1 << IsQuad);
case NeonTypeFlags::Float32:
return llvm::VectorType::get(CGF->FloatTy, 2 << IsQuad);
+ case NeonTypeFlags::Float64:
+ return llvm::VectorType::get(CGF->DoubleTy, 1 << IsQuad);
}
llvm_unreachable("Invalid NeonTypeFlags element type!");
}
@@ -1718,7 +1720,200 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
}
- return 0;
+ SmallVector<Value *, 4> Ops;
+ for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
+ Ops.push_back(EmitScalarExpr(E->getArg(i)));
+ }
+
+ // Get the last argument, which specifies the vector type.
+ llvm::APSInt Result;
+ const Expr *Arg = E->getArg(E->getNumArgs() - 1);
+ if (!Arg->isIntegerConstantExpr(Result, getContext()))
+ return 0;
+
+ // Determine the type of this overloaded NEON intrinsic.
+ NeonTypeFlags Type(Result.getZExtValue());
+ bool usgn = Type.isUnsigned();
+
+ llvm::VectorType *VTy = GetNeonType(this, Type);
+ llvm::Type *Ty = VTy;
+ if (!Ty)
+ return 0;
+
+ unsigned Int;
+ switch (BuiltinID) {
+ default:
+ return 0;
+
+ // AArch64 builtins mapping to legacy ARM v7 builtins.
+ // FIXME: the mapped builtins listed correspond to what has been tested
+ // in aarch64-neon-intrinsics.c so far.
+ case AArch64::BI__builtin_neon_vmul_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmul_v, E);
+ case AArch64::BI__builtin_neon_vmulq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmulq_v, E);
+ case AArch64::BI__builtin_neon_vabd_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vabd_v, E);
+ case AArch64::BI__builtin_neon_vabdq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vabdq_v, E);
+ case AArch64::BI__builtin_neon_vfma_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vfma_v, E);
+ case AArch64::BI__builtin_neon_vfmaq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vfmaq_v, E);
+ case AArch64::BI__builtin_neon_vbsl_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vbsl_v, E);
+ case AArch64::BI__builtin_neon_vbslq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vbslq_v, E);
+ case AArch64::BI__builtin_neon_vrsqrts_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrsqrts_v, E);
+ case AArch64::BI__builtin_neon_vrsqrtsq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrsqrtsq_v, E);
+ case AArch64::BI__builtin_neon_vrecps_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrecps_v, E);
+ case AArch64::BI__builtin_neon_vrecpsq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrecpsq_v, E);
+ case AArch64::BI__builtin_neon_vcage_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcage_v, E);
+ case AArch64::BI__builtin_neon_vcale_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcale_v, E);
+ case AArch64::BI__builtin_neon_vcaleq_v:
+ std::swap(Ops[0], Ops[1]);
+ case AArch64::BI__builtin_neon_vcageq_v: {
+ Function *F;
+ if (VTy->getElementType()->isIntegerTy(64))
+ F = CGM.getIntrinsic(Intrinsic::aarch64_neon_vacgeq);
+ else
+ F = CGM.getIntrinsic(Intrinsic::arm_neon_vacgeq);
+ return EmitNeonCall(F, Ops, "vcage");
+ }
+ case AArch64::BI__builtin_neon_vcalt_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcalt_v, E);
+ case AArch64::BI__builtin_neon_vcagt_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcagt_v, E);
+ case AArch64::BI__builtin_neon_vcaltq_v:
+ std::swap(Ops[0], Ops[1]);
+ case AArch64::BI__builtin_neon_vcagtq_v: {
+ Function *F;
+ if (VTy->getElementType()->isIntegerTy(64))
+ F = CGM.getIntrinsic(Intrinsic::aarch64_neon_vacgtq);
+ else
+ F = CGM.getIntrinsic(Intrinsic::arm_neon_vacgtq);
+ return EmitNeonCall(F, Ops, "vcagt");
+ }
+ case AArch64::BI__builtin_neon_vtst_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vtst_v, E);
+ case AArch64::BI__builtin_neon_vtstq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vtstq_v, E);
+ case AArch64::BI__builtin_neon_vhadd_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vhadd_v, E);
+ case AArch64::BI__builtin_neon_vhaddq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vhaddq_v, E);
+ case AArch64::BI__builtin_neon_vhsub_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vhsub_v, E);
+ case AArch64::BI__builtin_neon_vhsubq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vhsubq_v, E);
+ case AArch64::BI__builtin_neon_vrhadd_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrhadd_v, E);
+ case AArch64::BI__builtin_neon_vrhaddq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrhaddq_v, E);
+ case AArch64::BI__builtin_neon_vqadd_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqadd_v, E);
+ case AArch64::BI__builtin_neon_vqaddq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqaddq_v, E);
+ case AArch64::BI__builtin_neon_vqsub_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqsub_v, E);
+ case AArch64::BI__builtin_neon_vqsubq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqsubq_v, E);
+ case AArch64::BI__builtin_neon_vshl_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vshl_v, E);
+ case AArch64::BI__builtin_neon_vshlq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vshlq_v, E);
+ case AArch64::BI__builtin_neon_vqshl_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqshl_v, E);
+ case AArch64::BI__builtin_neon_vqshlq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqshlq_v, E);
+ case AArch64::BI__builtin_neon_vrshl_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrshl_v, E);
+ case AArch64::BI__builtin_neon_vrshlq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrshlq_v, E);
+ case AArch64::BI__builtin_neon_vqrshl_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqrshl_v, E);
+ case AArch64::BI__builtin_neon_vqrshlq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqrshlq_v, E);
+ case AArch64::BI__builtin_neon_vmax_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmax_v, E);
+ case AArch64::BI__builtin_neon_vmaxq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmaxq_v, E);
+ case AArch64::BI__builtin_neon_vmin_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmin_v, E);
+ case AArch64::BI__builtin_neon_vminq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vminq_v, E);
+ case AArch64::BI__builtin_neon_vpmax_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vpmax_v, E);
+ case AArch64::BI__builtin_neon_vpmin_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vpmin_v, E);
+ case AArch64::BI__builtin_neon_vpadd_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vpadd_v, E);
+ case AArch64::BI__builtin_neon_vqdmulh_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmulh_v, E);
+ case AArch64::BI__builtin_neon_vqdmulhq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmulhq_v, E);
+ case AArch64::BI__builtin_neon_vqrdmulh_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqrdmulh_v, E);
+ case AArch64::BI__builtin_neon_vqrdmulhq_v:
+ return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqrdmulhq_v, E);
+
+ // AArch64-only builtins
+ case AArch64::BI__builtin_neon_vfms_v:
+ case AArch64::BI__builtin_neon_vfmsq_v: {
+ Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
+ Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+ Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+ Ops[1] = Builder.CreateFNeg(Ops[1]);
+ Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+
+ // LLVM's fma intrinsic puts the accumulator in the last position, but the
+ // AArch64 intrinsic has it first.
+ return Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
+ }
+ case AArch64::BI__builtin_neon_vmaxnm_v:
+ case AArch64::BI__builtin_neon_vmaxnmq_v: {
+ Int = Intrinsic::aarch64_neon_vmaxnm;
+ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
+ }
+ case AArch64::BI__builtin_neon_vminnm_v:
+ case AArch64::BI__builtin_neon_vminnmq_v: {
+ Int = Intrinsic::aarch64_neon_vminnm;
+ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
+ }
+ case AArch64::BI__builtin_neon_vpmaxnm_v:
+ case AArch64::BI__builtin_neon_vpmaxnmq_v: {
+ Int = Intrinsic::aarch64_neon_vpmaxnm;
+ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm");
+ }
+ case AArch64::BI__builtin_neon_vpminnm_v:
+ case AArch64::BI__builtin_neon_vpminnmq_v: {
+ Int = Intrinsic::aarch64_neon_vpminnm;
+ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
+ }
+ case AArch64::BI__builtin_neon_vpmaxq_v: {
+ Int = usgn ? Intrinsic::arm_neon_vpmaxu : Intrinsic::arm_neon_vpmaxs;
+ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
+ }
+ case AArch64::BI__builtin_neon_vpminq_v: {
+ Int = usgn ? Intrinsic::arm_neon_vpminu : Intrinsic::arm_neon_vpmins;
+ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
+ }
+ case AArch64::BI__builtin_neon_vpaddq_v: {
+ Int = Intrinsic::arm_neon_vpadd;
+ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpadd");
+ }
+ case AArch64::BI__builtin_neon_vmulx_v:
+ case AArch64::BI__builtin_neon_vmulxq_v: {
+ Int = Intrinsic::aarch64_neon_vmulx;
+ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
+ }
+ }
}
Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
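
Note: to make the accumulator-position comment in the vfms case concrete,
here is a per-lane sketch of the emitted operation, using std::fma as a
stand-in for llvm.fma:

    #include <cmath>

    // Sketch: vfms(a, b, c) lowers to fma(-b, c, a), i.e. a - b*c; the
    // accumulator (Ops[0]) is moved to llvm.fma's last operand.
    float vfms_lane(float a, float b, float c) {
      return std::fma(-b, c, a);
    }
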
diff --git a/lib/Driver/Tools.cpp b/lib/Driver/Tools.cpp
index 50630b6989..bc003facba 100644
--- a/lib/Driver/Tools.cpp
+++ b/lib/Driver/Tools.cpp
@@ -1407,6 +1407,14 @@ void Clang::AddHexagonTargetArgs(const ArgList &Args,
CmdArgs.push_back ("-machine-sink-split=0");
}
+void Clang::AddAArch64TargetArgs(const ArgList &Args,
+ ArgStringList &CmdArgs) const {
+ const Driver &D = getToolChain().getDriver();
+ // Honor -mfpu=.
+ if (const Arg *A = Args.getLastArg(options::OPT_mfpu_EQ))
+ addFPUArgs(D, A, Args, CmdArgs);
+}
+
static bool
shouldUseExceptionTablesForObjCExceptions(const ObjCRuntime &runtime,
const llvm::Triple &Triple) {
@@ -2498,9 +2506,11 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
case llvm::Triple::hexagon:
AddHexagonTargetArgs(Args, CmdArgs);
break;
- }
-
+ case llvm::Triple::aarch64:
+ AddAArch64TargetArgs(Args, CmdArgs);
+ break;
+ }
// Pass the linker version in use.
if (Arg *A = Args.getLastArg(options::OPT_mlinker_version_EQ)) {
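
Note: assumed behaviour, mirroring the existing ARM path through addFPUArgs
(the helper itself is not shown in this hunk): the driver lowers -mfpu=neon
into a cc1 target feature.

    // Assumption: addFPUArgs maps -mfpu=neon to a cc1 feature flag, roughly:
    //   clang -target aarch64-none-linux-gnu -mfpu=neon -c foo.c
    //     => ... -cc1 ... -target-feature +neon ...
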
diff --git a/lib/Driver/Tools.h b/lib/Driver/Tools.h
index 1dd4d5edfe..9138ddf88b 100644
--- a/lib/Driver/Tools.h
+++ b/lib/Driver/Tools.h
@@ -65,6 +65,8 @@ using llvm::opt::ArgStringList;
llvm::opt::ArgStringList &CmdArgs) const;
void AddHexagonTargetArgs(const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs) const;
+ void AddAArch64TargetArgs(const llvm::opt::ArgList &Args,
+ llvm::opt::ArgStringList &CmdArgs) const;
enum RewriteKind { RK_None, RK_Fragile, RK_NonFragile };
diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp
index e417949013..9f2dc34575 100644
--- a/lib/Sema/SemaChecking.cpp
+++ b/lib/Sema/SemaChecking.cpp
@@ -306,6 +306,10 @@ Sema::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
if (CheckARMBuiltinFunctionCall(BuiltinID, TheCall))
return ExprError();
break;
+ case llvm::Triple::aarch64:
+ if (CheckAArch64BuiltinFunctionCall(BuiltinID, TheCall))
+ return ExprError();
+ break;
case llvm::Triple::mips:
case llvm::Triple::mipsel:
case llvm::Triple::mips64:
@@ -342,6 +346,9 @@ static unsigned RFT(unsigned t, bool shift = false) {
case NeonTypeFlags::Float32:
assert(!shift && "cannot shift float types!");
return (2 << IsQuad) - 1;
+ case NeonTypeFlags::Float64:
+ assert(!shift && "cannot shift float types!");
+ return (1 << IsQuad) - 1;
}
llvm_unreachable("Invalid NeonTypeFlag!");
}
@@ -367,10 +374,90 @@ static QualType getNeonEltType(NeonTypeFlags Flags, ASTContext &Context) {
return Context.UnsignedShortTy;
case NeonTypeFlags::Float32:
return Context.FloatTy;
+ case NeonTypeFlags::Float64:
+ return Context.DoubleTy;
}
llvm_unreachable("Invalid NeonTypeFlag!");
}
+bool Sema::CheckAArch64BuiltinFunctionCall(unsigned BuiltinID,
+ CallExpr *TheCall) {
+
+ llvm::APSInt Result;
+
+ uint64_t mask = 0;
+ unsigned TV = 0;
+ int PtrArgNum = -1;
+ bool HasConstPtr = false;
+ switch (BuiltinID) {
+#define GET_NEON_AARCH64_OVERLOAD_CHECK
+#include "clang/Basic/arm_neon.inc"
+#undef GET_NEON_AARCH64_OVERLOAD_CHECK
+ }
+
+ // For NEON intrinsics which are overloaded on vector element type, validate
+ // the immediate which specifies which variant to emit.
+ unsigned ImmArg = TheCall->getNumArgs() - 1;
+ if (mask) {
+ if (SemaBuiltinConstantArg(TheCall, ImmArg, Result))
+ return true;
+
+ TV = Result.getLimitedValue(64);
+ if ((TV > 63) || (mask & (1ULL << TV)) == 0)
+ return Diag(TheCall->getLocStart(), diag::err_invalid_neon_type_code)
+ << TheCall->getArg(ImmArg)->getSourceRange();
+ }
+
+ if (PtrArgNum >= 0) {
+ // Check that pointer arguments have the specified type.
+ Expr *Arg = TheCall->getArg(PtrArgNum);
+ if (ImplicitCastExpr *ICE = dyn_cast<ImplicitCastExpr>(Arg))
+ Arg = ICE->getSubExpr();
+ ExprResult RHS = DefaultFunctionArrayLvalueConversion(Arg);
+ QualType RHSTy = RHS.get()->getType();
+ QualType EltTy = getNeonEltType(NeonTypeFlags(TV), Context);
+ if (HasConstPtr)
+ EltTy = EltTy.withConst();
+ QualType LHSTy = Context.getPointerType(EltTy);
+ AssignConvertType ConvTy;
+ ConvTy = CheckSingleAssignmentConstraints(LHSTy, RHS);
+ if (RHS.isInvalid())
+ return true;
+ if (DiagnoseAssignmentResult(ConvTy, Arg->getLocStart(), LHSTy, RHSTy,
+ RHS.get(), AA_Assigning))
+ return true;
+ }
+
+ // For NEON intrinsics which take an immediate value as part of the
+ // instruction, range check them here.
+ unsigned i = 0, l = 0, u = 0;
+ switch (BuiltinID) {
+ default:
+ return false;
+#define GET_NEON_AARCH64_IMMEDIATE_CHECK
+#include "clang/Basic/arm_neon.inc"
+#undef GET_NEON_AARCH64_IMMEDIATE_CHECK
+  }
+
+ // We can't check the value of a dependent argument.
+ if (TheCall->getArg(i)->isTypeDependent() ||
+ TheCall->getArg(i)->isValueDependent())
+ return false;
+
+ // Check that the immediate argument is actually a constant.
+ if (SemaBuiltinConstantArg(TheCall, i, Result))
+ return true;
+
+  // Range check against the upper/lower values for this instruction.
+ unsigned Val = Result.getZExtValue();
+ if (Val < l || Val > (u + l))
+ return Diag(TheCall->getLocStart(), diag::err_argument_invalid_range)
+ << l << u + l << TheCall->getArg(i)->getSourceRange();
+
+ return false;
+}
+
bool Sema::CheckARMBuiltinExclusiveCall(unsigned BuiltinID, CallExpr *TheCall) {
assert((BuiltinID == ARM::BI__builtin_arm_ldrex ||
BuiltinID == ARM::BI__builtin_arm_strex) &&
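
Note: a small sketch of the range test above; l is the lower bound and u the
width of the accepted range, so a valid immediate satisfies l <= Val <= l + u:

    // Sketch of the immediate range check performed by the code above.
    static bool immInRange(unsigned Val, unsigned l, unsigned u) {
      return Val >= l && Val <= u + l;
    }
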
diff --git a/lib/Sema/SemaType.cpp b/lib/Sema/SemaType.cpp
index 0106a679b9..bc6b6a52fa 100644
--- a/lib/Sema/SemaType.cpp
+++ b/lib/Sema/SemaType.cpp
@@ -4603,6 +4603,42 @@ static void HandleExtVectorTypeAttr(QualType &CurType,
CurType = T;
}
+static bool isPermittedNeonBaseType(QualType &Ty,
+ VectorType::VectorKind VecKind,
+ bool IsAArch64) {
+ const BuiltinType *BTy = Ty->getAs<BuiltinType>();
+ if (!BTy)
+ return false;
+
+ if (VecKind == VectorType::NeonPolyVector) {
+ if (IsAArch64) {
+ // AArch64 polynomial vectors are unsigned
+ return BTy->getKind() == BuiltinType::UChar ||
+ BTy->getKind() == BuiltinType::UShort;
+ } else {
+      // AArch32 polynomial vectors are signed.
+ return BTy->getKind() == BuiltinType::SChar ||
+ BTy->getKind() == BuiltinType::Short;
+ }
+ }
+
+ // Non-polynomial vector types: the usual suspects are allowed, as well as
+ // float64_t on AArch64.
+ if (IsAArch64 && BTy->getKind() == BuiltinType::Double)
+ return true;
+
+ return BTy->getKind() == BuiltinType::SChar ||
+ BTy->getKind() == BuiltinType::UChar ||
+ BTy->getKind() == BuiltinType::Short ||
+ BTy->getKind() == BuiltinType::UShort ||
+ BTy->getKind() == BuiltinType::Int ||
+ BTy->getKind() == BuiltinType::UInt ||
+ BTy->getKind() == BuiltinType::LongLong ||
+ BTy->getKind() == BuiltinType::ULongLong ||
+ BTy->getKind() == BuiltinType::Float ||
+ BTy->getKind() == BuiltinType::Half;
+}
+
/// HandleNeonVectorTypeAttr - The "neon_vector_type" and
/// "neon_polyvector_type" attributes are used to create vector types that
/// are mangled according to ARM's ABI. Otherwise, these types are identical
@@ -4646,9 +4682,14 @@ static void HandleNeonVectorTypeAttr(QualType& CurType,
BTy->getKind() != BuiltinType::LongLong &&
BTy->getKind() != BuiltinType::ULongLong &&
BTy->getKind() != BuiltinType::Float)) {
- S.Diag(Attr.getLoc(), diag::err_attribute_invalid_vector_type) <<CurType;
- Attr.setInvalid();
- return;
+ llvm::Triple::ArchType Arch =
+ S.Context.getTargetInfo().getTriple().getArch();
+ if (!isPermittedNeonBaseType(CurType, VecKind,
+ Arch == llvm::Triple::aarch64)) {
+ S.Diag(Attr.getLoc(), diag::err_attribute_invalid_vector_type) << CurType;
+ Attr.setInvalid();
+ return;
+ }
}
// The total size of the vector must be 64 or 128 bits.
unsigned typeSize = static_cast<unsigned>(S.Context.getTypeSize(CurType));
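
Note: a minimal example of what the relaxed base-type check now accepts on
AArch64; this matches the attribute spelling arm_neon.h uses for the new
double-element vector types:

    // Accepted on AArch64 after this patch (total size must be 64 or 128 bits).
    typedef __attribute__((neon_vector_type(2))) double float64x2_t; // 128 bits
    typedef __attribute__((neon_vector_type(1))) double float64x1_t; //  64 bits
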
diff --git a/test/CodeGen/aarch64-neon-intrinsics.c b/test/CodeGen/aarch64-neon-intrinsics.c
new file mode 100644
index 0000000000..7ed8a39a02
--- /dev/null
+++ b/test/CodeGen/aarch64-neon-intrinsics.c
@@ -0,0 +1,3023 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
+// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+
+// Test new AArch64 intrinsics and types
+
+#include <arm_neon.h>
+
+int8x8_t test_vadd_s8(int8x8_t v1, int8x8_t v2) {
+ // CHECK: test_vadd_s8
+ return vadd_s8(v1, v2);
+ // CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vadd_s16(int16x4_t v1, int16x4_t v2) {
+ // CHECK: test_vadd_s16
+ return vadd_s16(v1, v2);
+ // CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vadd_s32(int32x2_t v1, int32x2_t v2) {
+ // CHECK: test_vadd_s32
+ return vadd_s32(v1, v2);
+ // CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int64x1_t test_vadd_s64(int64x1_t v1, int64x1_t v2) {
+ // CHECK: test_vadd_s64
+ return vadd_s64(v1, v2);
+ // CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+float32x2_t test_vadd_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vadd_f32
+ return vadd_f32(v1, v2);
+ // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vadd_u8(uint8x8_t v1, uint8x8_t v2) {
+ // CHECK: test_vadd_u8
+ return vadd_u8(v1, v2);
+ // CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vadd_u16(uint16x4_t v1, uint16x4_t v2) {
+ // CHECK: test_vadd_u16
+ return vadd_u16(v1, v2);
+ // CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vadd_u32(uint32x2_t v1, uint32x2_t v2) {
+ // CHECK: test_vadd_u32
+ return vadd_u32(v1, v2);
+ // CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint64x1_t test_vadd_u64(uint64x1_t v1, uint64x1_t v2) {
+ // CHECK: test_vadd_u64
+ return vadd_u64(v1, v2);
+ // CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+int8x16_t test_vaddq_s8(int8x16_t v1, int8x16_t v2) {
+ // CHECK: test_vaddq_s8
+ return vaddq_s8(v1, v2);
+ // CHECK: add {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vaddq_s16(int16x8_t v1, int16x8_t v2) {
+ // CHECK: test_vaddq_s16
+ return vaddq_s16(v1, v2);
+ // CHECK: add {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vaddq_s32(int32x4_t v1, int32x4_t v2) {
+ // CHECK: test_vaddq_s32
+ return vaddq_s32(v1, v2);
+ // CHECK: add {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+int64x2_t test_vaddq_s64(int64x2_t v1, int64x2_t v2) {
+ // CHECK: test_vaddq_s64
+ return vaddq_s64(v1, v2);
+ // CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+float32x4_t test_vaddq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vaddq_f32
+ return vaddq_f32(v1, v2);
+ // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vaddq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vaddq_f64
+ return vaddq_f64(v1, v2);
+ // CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint8x16_t test_vaddq_u8(uint8x16_t v1, uint8x16_t v2) {
+ // CHECK: test_vaddq_u8
+ return vaddq_u8(v1, v2);
+ // CHECK: add {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vaddq_u16(uint16x8_t v1, uint16x8_t v2) {
+ // CHECK: test_vaddq_u16
+ return vaddq_u16(v1, v2);
+ // CHECK: add {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vaddq_u32(uint32x4_t v1, uint32x4_t v2) {
+  // CHECK: test_vaddq_u32
+ return vaddq_u32(v1, v2);
+ // CHECK: add {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint64x2_t test_vaddq_u64(uint64x2_t v1, uint64x2_t v2) {
+ // CHECK: test_vaddq_u64
+ return vaddq_u64(v1, v2);
+ // CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+int8x8_t test_vsub_s8(int8x8_t v1, int8x8_t v2) {
+ // CHECK: test_vsub_s8
+ return vsub_s8(v1, v2);
+ // CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+int16x4_t test_vsub_s16(int16x4_t v1, int16x4_t v2) {
+ // CHECK: test_vsub_s16
+ return vsub_s16(v1, v2);
+ // CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+int32x2_t test_vsub_s32(int32x2_t v1, int32x2_t v2) {
+ // CHECK: test_vsub_s32
+ return vsub_s32(v1, v2);
+ // CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int64x1_t test_vsub_s64(int64x1_t v1, int64x1_t v2) {
+ // CHECK: test_vsub_s64
+ return vsub_s64(v1, v2);
+ // CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+float32x2_t test_vsub_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vsub_f32
+ return vsub_f32(v1, v2);
+ // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vsub_u8(uint8x8_t v1, uint8x8_t v2) {
+ // CHECK: test_vsub_u8
+ return vsub_u8(v1, v2);
+ // CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vsub_u16(uint16x4_t v1, uint16x4_t v2) {
+ // CHECK: test_vsub_u16
+ return vsub_u16(v1, v2);
+ // CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vsub_u32(uint32x2_t v1, uint32x2_t v2) {
+ // CHECK: test_vsub_u32
+ return vsub_u32(v1, v2);
+ // CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint64x1_t test_vsub_u64(uint64x1_t v1, uint64x1_t v2) {
+ // CHECK: test_vsub_u64
+ return vsub_u64(v1, v2);
+ // CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+int8x16_t test_vsubq_s8(int8x16_t v1, int8x16_t v2) {
+ // CHECK: test_vsubq_s8
+ return vsubq_s8(v1, v2);
+ // CHECK: sub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vsubq_s16(int16x8_t v1, int16x8_t v2) {
+ // CHECK: test_vsubq_s16
+ return vsubq_s16(v1, v2);
+ // CHECK: sub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vsubq_s32(int32x4_t v1, int32x4_t v2) {
+ // CHECK: test_vsubq_s32
+ return vsubq_s32(v1, v2);
+ // CHECK: sub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+int64x2_t test_vsubq_s64(int64x2_t v1, int64x2_t v2) {
+ // CHECK: test_vsubq_s64
+ return vsubq_s64(v1, v2);
+ // CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+float32x4_t test_vsubq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vsubq_f32
+ return vsubq_f32(v1, v2);
+ // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vsubq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vsubq_f64
+ return vsubq_f64(v1, v2);
+ // CHECK: fsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint8x16_t test_vsubq_u8(uint8x16_t v1, uint8x16_t v2) {
+ // CHECK: test_vsubq_u8
+ return vsubq_u8(v1, v2);
+ // CHECK: sub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vsubq_u16(uint16x8_t v1, uint16x8_t v2) {
+ // CHECK: test_vsubq_u16
+ return vsubq_u16(v1, v2);
+ // CHECK: sub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vsubq_u32(uint32x4_t v1, uint32x4_t v2) {
+  // CHECK: test_vsubq_u32
+ return vsubq_u32(v1, v2);
+ // CHECK: sub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint64x2_t test_vsubq_u64(uint64x2_t v1, uint64x2_t v2) {
+ // CHECK: test_vsubq_u64
+ return vsubq_u64(v1, v2);
+ // CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+int8x8_t test_vmul_s8(int8x8_t v1, int8x8_t v2) {
+ // CHECK: test_vmul_s8
+ return vmul_s8(v1, v2);
+ // CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vmul_s16(int16x4_t v1, int16x4_t v2) {
+ // CHECK: test_vmul_s16
+ return vmul_s16(v1, v2);
+ // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vmul_s32(int32x2_t v1, int32x2_t v2) {
+ // CHECK: test_vmul_s32
+ return vmul_s32(v1, v2);
+ // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x2_t test_vmul_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vmul_f32
+ return vmul_f32(v1, v2);
+ // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vmul_u8(uint8x8_t v1, uint8x8_t v2) {
+ // CHECK: test_vmul_u8
+ return vmul_u8(v1, v2);
+ // CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vmul_u16(uint16x4_t v1, uint16x4_t v2) {
+ // CHECK: test_vmul_u16
+ return vmul_u16(v1, v2);
+ // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vmul_u32(uint32x2_t v1, uint32x2_t v2) {
+ // CHECK: test_vmul_u32
+ return vmul_u32(v1, v2);
+ // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int8x16_t test_vmulq_s8(int8x16_t v1, int8x16_t v2) {
+ // CHECK: test_vmulq_s8
+ return vmulq_s8(v1, v2);
+ // CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vmulq_s16(int16x8_t v1, int16x8_t v2) {
+ // CHECK: test_vmulq_s16
+ return vmulq_s16(v1, v2);
+ // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vmulq_s32(int32x4_t v1, int32x4_t v2) {
+ // CHECK: test_vmulq_s32
+ return vmulq_s32(v1, v2);
+ // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vmulq_u8(uint8x16_t v1, uint8x16_t v2) {
+ // CHECK: test_vmulq_u8
+ return vmulq_u8(v1, v2);
+ // CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vmulq_u16(uint16x8_t v1, uint16x8_t v2) {
+ // CHECK: test_vmulq_u16
+ return vmulq_u16(v1, v2);
+ // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vmulq_u32(uint32x4_t v1, uint32x4_t v2) {
+ // CHECK: test_vmulq_u32
+ return vmulq_u32(v1, v2);
+ // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float32x4_t test_vmulq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vmulq_f32
+ return vmulq_f32(v1, v2);
+ // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vmulq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vmulq_f64
+ return vmulq_f64(v1, v2);
+ // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+poly8x8_t test_vmul_p8(poly8x8_t v1, poly8x8_t v2) {
+  // CHECK: test_vmul_p8
+  return vmul_p8(v1, v2);
+  // CHECK: pmul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+poly8x16_t test_vmulq_p8(poly8x16_t v1, poly8x16_t v2) {
+  // CHECK: test_vmulq_p8
+  return vmulq_p8(v1, v2);
+  // CHECK: pmul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int8x8_t test_vmla_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
+ // CHECK: test_vmla_s8
+ return vmla_s8(v1, v2, v3);
+ // CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vmla_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
+ // CHECK: test_vmla_s16
+ return vmla_s16(v1, v2, v3);
+ // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vmla_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
+ // CHECK: test_vmla_s32
+ return vmla_s32(v1, v2, v3);
+ // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x2_t test_vmla_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
+ // CHECK: test_vmla_f32
+ return vmla_f32(v1, v2, v3);
+ // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vmla_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
+ // CHECK: test_vmla_u8
+ return vmla_u8(v1, v2, v3);
+ // CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vmla_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
+ // CHECK: test_vmla_u16
+ return vmla_u16(v1, v2, v3);
+ // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vmla_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
+ // CHECK: test_vmla_u32
+ return vmla_u32(v1, v2, v3);
+ // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int8x16_t test_vmlaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
+ // CHECK: test_vmlaq_s8
+ return vmlaq_s8(v1, v2, v3);
+ // CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vmlaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
+ // CHECK: test_vmlaq_s16
+ return vmlaq_s16(v1, v2, v3);
+ // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vmlaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
+ // CHECK: test_vmlaq_s32
+ return vmlaq_s32(v1, v2, v3);
+ // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float32x4_t test_vmlaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
+ // CHECK: test_vmlaq_f32
+ return vmlaq_f32(v1, v2, v3);
+ // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vmlaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
+ // CHECK: test_vmlaq_u8
+ return vmlaq_u8(v1, v2, v3);
+ // CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vmlaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
+ // CHECK: test_vmlaq_u16
+ return vmlaq_u16(v1, v2, v3);
+ // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vmlaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
+ // CHECK: test_vmlaq_u32
+ return vmlaq_u32(v1, v2, v3);
+ // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vmlaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
+ // CHECK: test_vmlaq_f64
+ return vmlaq_f64(v1, v2, v3);
+ // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+int8x8_t test_vmls_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
+ // CHECK: test_vmls_s8
+ return vmls_s8(v1, v2, v3);
+ // CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vmls_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
+ // CHECK: test_vmls_s16
+ return vmls_s16(v1, v2, v3);
+ // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vmls_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
+ // CHECK: test_vmls_s32
+ return vmls_s32(v1, v2, v3);
+ // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x2_t test_vmls_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
+ // CHECK: test_vmls_f32
+ return vmls_f32(v1, v2, v3);
+ // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vmls_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
+ // CHECK: test_vmls_u8
+ return vmls_u8(v1, v2, v3);
+ // CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vmls_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
+ // CHECK: test_vmls_u16
+ return vmls_u16(v1, v2, v3);
+ // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vmls_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
+ // CHECK: test_vmls_u32
+ return vmls_u32(v1, v2, v3);
+ // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+int8x16_t test_vmlsq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
+ // CHECK: test_vmlsq_s8
+ return vmlsq_s8(v1, v2, v3);
+ // CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vmlsq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
+ // CHECK: test_vmlsq_s16
+ return vmlsq_s16(v1, v2, v3);
+ // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vmlsq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
+ // CHECK: test_vmlsq_s32
+ return vmlsq_s32(v1, v2, v3);
+ // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float32x4_t test_vmlsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
+ // CHECK: test_vmlsq_f32
+ return vmlsq_f32(v1, v2, v3);
+ // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+uint8x16_t test_vmlsq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
+ // CHECK: test_vmlsq_u8
+ return vmlsq_u8(v1, v2, v3);
+ // CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vmlsq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
+ // CHECK: test_vmlsq_u16
+ return vmlsq_u16(v1, v2, v3);
+ // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vmlsq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
+ // CHECK: test_vmlsq_u32
+ return vmlsq_u32(v1, v2, v3);
+ // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
+ // CHECK: test_vmlsq_f64
+ return vmlsq_f64(v1, v2, v3);
+ // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
+ // CHECK: test_vfma_f32
+ return vfma_f32(v1, v2, v3);
+ // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
+ // CHECK: test_vfmaq_f32
+ return vfmaq_f32(v1, v2, v3);
+ // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
+ // CHECK: test_vfmaq_f64
+ return vfmaq_f64(v1, v2, v3);
+ // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+float32x2_t test_vfms_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
+ // CHECK: test_vfms_f32
+ return vfms_f32(v1, v2, v3);
+ // CHECK: fmls v0.2s, v1.2s, v2.2s
+}
+
+float32x4_t test_vfmsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
+ // CHECK: test_vfmsq_f32
+ return vfmsq_f32(v1, v2, v3);
+ // CHECK: fmls v0.4s, v1.4s, v2.4s
+}
+
+float64x2_t test_vfmsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
+  // CHECK: test_vfmsq_f64
+ return vfmsq_f64(v1, v2, v3);
+ // CHECK: fmls v0.2d, v1.2d, v2.2d
+}
+
+float64x2_t test_vdivq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vdivq_f64
+ return vdivq_f64(v1, v2);
+ // CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+float32x4_t test_vdivq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vdivq_f32
+ return vdivq_f32(v1, v2);
+ // CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vdiv_f32
+ return vdiv_f32(v1, v2);
+ // CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint64x1_t test_vaddd_u64(uint64x1_t v1, uint64x1_t v2) {
+ // CHECK: test_vaddd_u64
+ return vaddd_u64(v1, v2);
+ // CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+int64x1_t test_vaddd_s64(int64x1_t v1, int64x1_t v2) {
+ // CHECK: test_vaddd_s64
+ return vaddd_s64(v1, v2);
+ // CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+uint64x1_t test_vsubd_u64(uint64x1_t v1, uint64x1_t v2) {
+ // CHECK: test_vsubd_u64
+ return vsubd_u64(v1, v2);
+ // CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+int64x1_t test_vsubd_s64(int64x1_t v1, int64x1_t v2) {
+ // CHECK: test_vsubd_s64
+ return vsubd_s64(v1, v2);
+ // CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+int8x8_t test_vaba_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
+ // CHECK: test_vaba_s8
+ return vaba_s8(v1, v2, v3);
+ // CHECK: saba {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vaba_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
+ // CHECK: test_vaba_s16
+ return vaba_s16(v1, v2, v3);
+ // CHECK: saba {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vaba_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
+ // CHECK: test_vaba_s32
+ return vaba_s32(v1, v2, v3);
+ // CHECK: saba {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vaba_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
+ // CHECK: test_vaba_u8
+ return vaba_u8(v1, v2, v3);
+ // CHECK: uaba {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vaba_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
+ // CHECK: test_vaba_u16
+ return vaba_u16(v1, v2, v3);
+ // CHECK: uaba {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vaba_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
+ // CHECK: test_vaba_u32
+ return vaba_u32(v1, v2, v3);
+ // CHECK: uaba {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int8x16_t test_vabaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
+ // CHECK: test_vabaq_s8
+ return vabaq_s8(v1, v2, v3);
+ // CHECK: saba {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vabaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
+ // CHECK: test_vabaq_s16
+ return vabaq_s16(v1, v2, v3);
+ // CHECK: saba {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vabaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
+ // CHECK: test_vabaq_s32
+ return vabaq_s32(v1, v2, v3);
+ // CHECK: saba {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vabaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
+ // CHECK: test_vabaq_u8
+ return vabaq_u8(v1, v2, v3);
+ // CHECK: uaba {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vabaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
+ // CHECK: test_vabaq_u16
+ return vabaq_u16(v1, v2, v3);
+ // CHECK: uaba {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vabaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
+ // CHECK: test_vabaq_u32
+ return vabaq_u32(v1, v2, v3);
+ // CHECK: uaba {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+int8x8_t test_vabd_s8(int8x8_t v1, int8x8_t v2) {
+ // CHECK: test_vabd_s8
+ return vabd_s8(v1, v2);
+ // CHECK: sabd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vabd_s16(int16x4_t v1, int16x4_t v2) {
+ // CHECK: test_vabd_s16
+ return vabd_s16(v1, v2);
+ // CHECK: sabd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vabd_s32(int32x2_t v1, int32x2_t v2) {
+ // CHECK: test_vabd_s32
+ return vabd_s32(v1, v2);
+ // CHECK: sabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vabd_u8(uint8x8_t v1, uint8x8_t v2) {
+ // CHECK: test_vabd_u8
+ return vabd_u8(v1, v2);
+ // CHECK: uabd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vabd_u16(uint16x4_t v1, uint16x4_t v2) {
+ // CHECK: test_vabd_u16
+ return vabd_u16(v1, v2);
+ // CHECK: uabd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vabd_u32(uint32x2_t v1, uint32x2_t v2) {
+ // CHECK: test_vabd_u32
+ return vabd_u32(v1, v2);
+ // CHECK: uabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x2_t test_vabd_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vabd_f32
+ return vabd_f32(v1, v2);
+ // CHECK: fabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int8x16_t test_vabdq_s8(int8x16_t v1, int8x16_t v2) {
+ // CHECK: test_vabdq_s8
+ return vabdq_s8(v1, v2);
+ // CHECK: sabd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vabdq_s16(int16x8_t v1, int16x8_t v2) {
+ // CHECK: test_vabdq_s16
+ return vabdq_s16(v1, v2);
+ // CHECK: sabd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vabdq_s32(int32x4_t v1, int32x4_t v2) {
+ // CHECK: test_vabdq_s32
+ return vabdq_s32(v1, v2);
+ // CHECK: sabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vabdq_u8(uint8x16_t v1, uint8x16_t v2) {
+ // CHECK: test_vabdq_u8
+ return vabdq_u8(v1, v2);
+ // CHECK: uabd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vabdq_u16(uint16x8_t v1, uint16x8_t v2) {
+ // CHECK: test_vabdq_u16
+ return vabdq_u16(v1, v2);
+ // CHECK: uabd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vabdq_u32(uint32x4_t v1, uint32x4_t v2) {
+ // CHECK: test_vabdq_u32
+ return vabdq_u32(v1, v2);
+ // CHECK: uabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float32x4_t test_vabdq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vabdq_f32
+ return vabdq_f32(v1, v2);
+ // CHECK: fabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vabdq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vabdq_f64
+ return vabdq_f64(v1, v2);
+ // CHECK: fabd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+int8x8_t test_vbsl_s8(uint8x8_t v1, int8x8_t v2, int8x8_t v3) {
+ // CHECK: test_vbsl_s8
+ return vbsl_s8(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vbsl_s16(uint16x4_t v1, int16x4_t v2, int16x4_t v3) {
+ // CHECK: test_vbsl_s16
+ return vbsl_s16(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int32x2_t test_vbsl_s32(uint32x2_t v1, int32x2_t v2, int32x2_t v3) {
+ // CHECK: test_vbsl_s32
+ return vbsl_s32(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int64x1_t test_vbsl_s64(uint64x1_t v1, int64x1_t v2, int64x1_t v3) {
+ // CHECK: test_vbsl_s64
+ return vbsl_s64(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint8x8_t test_vbsl_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
+ // CHECK: test_vbsl_u8
+ return vbsl_u8(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vbsl_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
+ // CHECK: test_vbsl_u16
+ return vbsl_u16(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint32x2_t test_vbsl_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
+ // CHECK: test_vbsl_u32
+ return vbsl_u32(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint64x1_t test_vbsl_u64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) {
+ // CHECK: test_vbsl_u64
+ return vbsl_u64(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+float32x2_t test_vbsl_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
+ // CHECK: test_vbsl_f32
+ return vbsl_f32(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+poly8x8_t test_vbsl_p8(uint8x8_t v1, poly8x8_t v2, poly8x8_t v3) {
+ // CHECK: test_vbsl_p8
+ return vbsl_p8(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+poly16x4_t test_vbsl_p16(uint16x4_t v1, poly16x4_t v2, poly16x4_t v3) {
+ // CHECK: test_vbsl_p16
+ return vbsl_p16(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int8x16_t test_vbslq_s8(uint8x16_t v1, int8x16_t v2, int8x16_t v3) {
+ // CHECK: test_vbslq_s8
+ return vbslq_s8(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vbslq_s16(uint16x8_t v1, int16x8_t v2, int16x8_t v3) {
+ // CHECK: test_vbslq_s16
+ return vbslq_s16(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int32x4_t test_vbslq_s32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) {
+ // CHECK: test_vbslq_s32
+ return vbslq_s32(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int64x2_t test_vbslq_s64(uint64x2_t v1, int64x2_t v2, int64x2_t v3) {
+ // CHECK: test_vbslq_s64
+ return vbslq_s64(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint8x16_t test_vbslq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
+ // CHECK: test_vbslq_u8
+ return vbslq_u8(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vbslq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
+ // CHECK: test_vbslq_u16
+ return vbslq_u16(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint32x4_t test_vbslq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
+  // CHECK: test_vbslq_u32
+  return vbslq_u32(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint64x2_t test_vbslq_u64(uint64x2_t v1, uint64x2_t v2, uint64x2_t v3) {
+ // CHECK: test_vbslq_u64
+ return vbslq_u64(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+float32x4_t test_vbslq_f32(uint32x4_t v1, float32x4_t v2, float32x4_t v3) {
+ // CHECK: test_vbslq_f32
+ return vbslq_f32(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+poly8x16_t test_vbslq_p8(uint8x16_t v1, poly8x16_t v2, poly8x16_t v3) {
+ // CHECK: test_vbslq_p8
+ return vbslq_p8(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+poly16x8_t test_vbslq_p16(uint16x8_t v1, poly16x8_t v2, poly16x8_t v3) {
+ // CHECK: test_vbslq_p16
+ return vbslq_p16(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+float64x2_t test_vbslq_f64(uint64x2_t v1, float64x2_t v2, float64x2_t v3) {
+ // CHECK: test_vbslq_f64
+ return vbslq_f64(v1, v2, v3);
+ // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+float32x2_t test_vrecps_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vrecps_f32
+ return vrecps_f32(v1, v2);
+ // CHECK: frecps {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x4_t test_vrecpsq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vrecpsq_f32
+ return vrecpsq_f32(v1, v2);
+ // CHECK: frecps {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vrecpsq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vrecpsq_f64
+ return vrecpsq_f64(v1, v2);
+ // CHECK: frecps {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+float32x2_t test_vrsqrts_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vrsqrts_f32
+ return vrsqrts_f32(v1, v2);
+ // CHECK: frsqrts {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x4_t test_vrsqrtsq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vrsqrtsq_f32
+ return vrsqrtsq_f32(v1, v2);
+ // CHECK: frsqrts {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vrsqrtsq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vrsqrtsq_f64
+ return vrsqrtsq_f64(v1, v2);
+ // CHECK: frsqrts {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint32x2_t test_vcage_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vcage_f32
+ return vcage_f32(v1, v2);
+ // CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint32x4_t test_vcageq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vcageq_f32
+ return vcageq_f32(v1, v2);
+ // CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint64x2_t test_vcageq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vcageq_f64
+ return vcageq_f64(v1, v2);
+ // CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint32x2_t test_vcagt_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vcagt_f32
+ return vcagt_f32(v1, v2);
+ // CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint32x4_t test_vcagtq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vcagtq_f32
+ return vcagtq_f32(v1, v2);
+ // CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint64x2_t test_vcagtq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vcagtq_f64
+ return vcagtq_f64(v1, v2);
+ // CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
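+// Notes about vcale/vcalt:
+// The absolute less-than comparisons are implemented as facge/facgt with
+// reversed operands, so the checks below expect the source registers swapped.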
+uint32x2_t test_vcale_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vcale_f32
+ return vcale_f32(v1, v2);
+  // Using registers other than v0 and v1 is possible, but would be odd.
+ // CHECK: facge {{v[0-9]+}}.2s, v1.2s, v0.2s
+}
+
+uint32x4_t test_vcaleq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vcaleq_f32
+ return vcaleq_f32(v1, v2);
+  // Using registers other than v0 and v1 is possible, but would be odd.
+ // CHECK: facge {{v[0-9]+}}.4s, v1.4s, v0.4s
+}
+
+uint64x2_t test_vcaleq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vcaleq_f64
+ return vcaleq_f64(v1, v2);
+  // Using registers other than v0 and v1 is possible, but would be odd.
+ // CHECK: facge {{v[0-9]+}}.2d, v1.2d, v0.2d
+}
+
+uint32x2_t test_vcalt_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vcalt_f32
+ return vcalt_f32(v1, v2);
+  // Using registers other than v0 and v1 is possible, but would be odd.
+ // CHECK: facgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+}
+
+uint32x4_t test_vcaltq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vcaltq_f32
+ return vcaltq_f32(v1, v2);
+  // Using registers other than v0 and v1 is possible, but would be odd.
+ // CHECK: facgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+}
+
+uint64x2_t test_vcaltq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vcaltq_f64
+ return vcaltq_f64(v1, v2);
+  // Using registers other than v0 and v1 is possible, but would be odd.
+ // CHECK: facgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+}
+
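+// Notes about vtst:
+// cmtst sets each result element to all ones if the bitwise AND of the
+// corresponding source elements is non-zero, and to zero otherwise.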
+uint8x8_t test_vtst_s8(int8x8_t v1, int8x8_t v2) {
+ // CHECK: test_vtst_s8
+ return vtst_s8(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vtst_s16(int16x4_t v1, int16x4_t v2) {
+ // CHECK: test_vtst_s16
+ return vtst_s16(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vtst_s32(int32x2_t v1, int32x2_t v2) {
+ // CHECK: test_vtst_s32
+ return vtst_s32(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vtst_u8(uint8x8_t v1, uint8x8_t v2) {
+ // CHECK: test_vtst_u8
+ return vtst_u8(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vtst_u16(uint16x4_t v1, uint16x4_t v2) {
+ // CHECK: test_vtst_u16
+ return vtst_u16(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vtst_u32(uint32x2_t v1, uint32x2_t v2) {
+ // CHECK: test_vtst_u32
+ return vtst_u32(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x16_t test_vtstq_s8(int8x16_t v1, int8x16_t v2) {
+ // CHECK: test_vtstq_s8
+ return vtstq_s8(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vtstq_s16(int16x8_t v1, int16x8_t v2) {
+ // CHECK: test_vtstq_s16
+ return vtstq_s16(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vtstq_s32(int32x4_t v1, int32x4_t v2) {
+ // CHECK: test_vtstq_s32
+ return vtstq_s32(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vtstq_u8(uint8x16_t v1, uint8x16_t v2) {
+ // CHECK: test_vtstq_u8
+ return vtstq_u8(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vtstq_u16(uint16x8_t v1, uint16x8_t v2) {
+ // CHECK: test_vtstq_u16
+ return vtstq_u16(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vtstq_u32(uint32x4_t v1, uint32x4_t v2) {
+ // CHECK: test_vtstq_u32
+ return vtstq_u32(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint64x2_t test_vtstq_s64(int64x2_t v1, int64x2_t v2) {
+ // CHECK: test_vtstq_s64
+ return vtstq_s64(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint64x2_t test_vtstq_u64(uint64x2_t v1, uint64x2_t v2) {
+ // CHECK: test_vtstq_u64
+ return vtstq_u64(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint8x8_t test_vtst_p8(poly8x8_t v1, poly8x8_t v2) {
+ // CHECK: test_vtst_p8
+ return vtst_p8(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint8x16_t test_vtstq_p8(poly8x16_t v1, poly8x16_t v2) {
+ // CHECK: test_vtstq_p8
+ return vtstq_p8(v1, v2);
+ // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+
+uint8x8_t test_vceq_s8(int8x8_t v1, int8x8_t v2) {
+ // CHECK: test_vceq_s8
+ return vceq_s8(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vceq_s16(int16x4_t v1, int16x4_t v2) {
+ // CHECK: test_vceq_s16
+ return vceq_s16(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vceq_s32(int32x2_t v1, int32x2_t v2) {
+ // CHECK: test_vceq_s32
+ return vceq_s32(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint32x2_t test_vceq_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vceq_f32
+ return vceq_f32(v1, v2);
+ // CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vceq_u8(uint8x8_t v1, uint8x8_t v2) {
+ // CHECK: test_vceq_u8
+ return vceq_u8(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vceq_u16(uint16x4_t v1, uint16x4_t v2) {
+ // CHECK: test_vceq_u16
+ return vceq_u16(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vceq_u32(uint32x2_t v1, uint32x2_t v2) {
+ // CHECK: test_vceq_u32
+ return vceq_u32(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vceq_p8(poly8x8_t v1, poly8x8_t v2) {
+ // CHECK: test_vceq_p8
+ return vceq_p8(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint8x16_t test_vceqq_s8(int8x16_t v1, int8x16_t v2) {
+ // CHECK: test_vceqq_s8
+ return vceqq_s8(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vceqq_s16(int16x8_t v1, int16x8_t v2) {
+ // CHECK: test_vceqq_s16
+ return vceqq_s16(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vceqq_s32(int32x4_t v1, int32x4_t v2) {
+ // CHECK: test_vceqq_s32
+ return vceqq_s32(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint32x4_t test_vceqq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vceqq_f32
+ return vceqq_f32(v1, v2);
+ // CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vceqq_u8(uint8x16_t v1, uint8x16_t v2) {
+ // CHECK: test_vceqq_u8
+ return vceqq_u8(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vceqq_u16(uint16x8_t v1, uint16x8_t v2) {
+ // CHECK: test_vceqq_u16
+ return vceqq_u16(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vceqq_u32(uint32x4_t v1, uint32x4_t v2) {
+ // CHECK: test_vceqq_u32
+ return vceqq_u32(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vceqq_p8(poly8x16_t v1, poly8x16_t v2) {
+ // CHECK: test_vceqq_p8
+ return vceqq_p8(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+
+uint64x2_t test_vceqq_s64(int64x2_t v1, int64x2_t v2) {
+ // CHECK: test_vceqq_s64
+ return vceqq_s64(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint64x2_t test_vceqq_u64(uint64x2_t v1, uint64x2_t v2) {
+ // CHECK: test_vceqq_u64
+ return vceqq_u64(v1, v2);
+ // CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint64x2_t test_vceqq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vceqq_f64
+ return vceqq_f64(v1, v2);
+ // CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint8x8_t test_vcge_s8(int8x8_t v1, int8x8_t v2) {
+// CHECK: test_vcge_s8
+ return vcge_s8(v1, v2);
+// CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vcge_s16(int16x4_t v1, int16x4_t v2) {
+// CHECK: test_vcge_s16
+ return vcge_s16(v1, v2);
+// CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vcge_s32(int32x2_t v1, int32x2_t v2) {
+// CHECK: test_vcge_s32
+ return vcge_s32(v1, v2);
+// CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint32x2_t test_vcge_f32(float32x2_t v1, float32x2_t v2) {
+// CHECK: test_vcge_f32
+ return vcge_f32(v1, v2);
+// CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vcge_u8(uint8x8_t v1, uint8x8_t v2) {
+// CHECK: test_vcge_u8
+ return vcge_u8(v1, v2);
+// CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vcge_u16(uint16x4_t v1, uint16x4_t v2) {
+// CHECK: test_vcge_u16
+ return vcge_u16(v1, v2);
+// CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vcge_u32(uint32x2_t v1, uint32x2_t v2) {
+// CHECK: test_vcge_u32
+ return vcge_u32(v1, v2);
+// CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x16_t test_vcgeq_s8(int8x16_t v1, int8x16_t v2) {
+// CHECK: test_vcgeq_s8
+ return vcgeq_s8(v1, v2);
+// CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vcgeq_s16(int16x8_t v1, int16x8_t v2) {
+// CHECK: test_vcgeq_s16
+ return vcgeq_s16(v1, v2);
+// CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vcgeq_s32(int32x4_t v1, int32x4_t v2) {
+// CHECK: test_vcgeq_s32
+ return vcgeq_s32(v1, v2);
+// CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint32x4_t test_vcgeq_f32(float32x4_t v1, float32x4_t v2) {
+// CHECK: test_vcgeq_f32
+ return vcgeq_f32(v1, v2);
+// CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vcgeq_u8(uint8x16_t v1, uint8x16_t v2) {
+// CHECK: test_vcgeq_u8
+ return vcgeq_u8(v1, v2);
+// CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vcgeq_u16(uint16x8_t v1, uint16x8_t v2) {
+// CHECK: test_vcgeq_u16
+ return vcgeq_u16(v1, v2);
+// CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vcgeq_u32(uint32x4_t v1, uint32x4_t v2) {
+// CHECK: test_vcgeq_u32
+ return vcgeq_u32(v1, v2);
+// CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint64x2_t test_vcgeq_s64(int64x2_t v1, int64x2_t v2) {
+// CHECK: test_vcgeq_s64
+ return vcgeq_s64(v1, v2);
+// CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint64x2_t test_vcgeq_u64(uint64x2_t v1, uint64x2_t v2) {
+// CHECK: test_vcgeq_u64
+ return vcgeq_u64(v1, v2);
+// CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint64x2_t test_vcgeq_f64(float64x2_t v1, float64x2_t v2) {
+// CHECK: test_vcgeq_f64
+ return vcgeq_f64(v1, v2);
+// CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+// Notes about vcle:
+// The LE condition predicate is implemented as GE with reversed operands,
+// so the checks below expect the source registers swapped.
+// Using registers other than v0 and v1 is possible, but would be odd.
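+// For example, vcle_s8(v1, v2) yields the same result as vcge_s8(v2, v1).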
+uint8x8_t test_vcle_s8(int8x8_t v1, int8x8_t v2) {
+ // CHECK: test_vcle_s8
+ return vcle_s8(v1, v2);
+ // CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b
+}
+
+uint16x4_t test_vcle_s16(int16x4_t v1, int16x4_t v2) {
+ // CHECK: test_vcle_s16
+ return vcle_s16(v1, v2);
+ // CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h
+}
+
+uint32x2_t test_vcle_s32(int32x2_t v1, int32x2_t v2) {
+ // CHECK: test_vcle_s32
+ return vcle_s32(v1, v2);
+ // CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+}
+
+uint32x2_t test_vcle_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vcle_f32
+ return vcle_f32(v1, v2);
+ // CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+}
+
+uint8x8_t test_vcle_u8(uint8x8_t v1, uint8x8_t v2) {
+ // CHECK: test_vcle_u8
+ return vcle_u8(v1, v2);
+ // CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
+}
+
+uint16x4_t test_vcle_u16(uint16x4_t v1, uint16x4_t v2) {
+ // CHECK: test_vcle_u16
+ return vcle_u16(v1, v2);
+ // CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
+}
+
+uint32x2_t test_vcle_u32(uint32x2_t v1, uint32x2_t v2) {
+ // CHECK: test_vcle_u32
+ return vcle_u32(v1, v2);
+ // CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
+}
+
+uint8x16_t test_vcleq_s8(int8x16_t v1, int8x16_t v2) {
+ // CHECK: test_vcleq_s8
+ return vcleq_s8(v1, v2);
+ // CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b
+}
+
+uint16x8_t test_vcleq_s16(int16x8_t v1, int16x8_t v2) {
+ // CHECK: test_vcleq_s16
+ return vcleq_s16(v1, v2);
+ // CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h
+}
+
+uint32x4_t test_vcleq_s32(int32x4_t v1, int32x4_t v2) {
+ // CHECK: test_vcleq_s32
+ return vcleq_s32(v1, v2);
+ // CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+}
+
+uint32x4_t test_vcleq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vcleq_f32
+ return vcleq_f32(v1, v2);
+ // CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+}
+
+uint8x16_t test_vcleq_u8(uint8x16_t v1, uint8x16_t v2) {
+ // CHECK: test_vcleq_u8
+ return vcleq_u8(v1, v2);
+ // CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
+}
+
+uint16x8_t test_vcleq_u16(uint16x8_t v1, uint16x8_t v2) {
+ // CHECK: test_vcleq_u16
+ return vcleq_u16(v1, v2);
+ // CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
+}
+
+uint32x4_t test_vcleq_u32(uint32x4_t v1, uint32x4_t v2) {
+ // CHECK: test_vcleq_u32
+ return vcleq_u32(v1, v2);
+ // CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
+}
+
+uint64x2_t test_vcleq_s64(int64x2_t v1, int64x2_t v2) {
+ // CHECK: test_vcleq_s64
+ return vcleq_s64(v1, v2);
+ // CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+}
+
+uint64x2_t test_vcleq_u64(uint64x2_t v1, uint64x2_t v2) {
+ // CHECK: test_vcleq_u64
+ return vcleq_u64(v1, v2);
+ // CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
+}
+
+uint64x2_t test_vcleq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vcleq_f64
+ return vcleq_f64(v1, v2);
+ // CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+}
+
+
+uint8x8_t test_vcgt_s8(int8x8_t v1, int8x8_t v2) {
+ // CHECK: test_vcgt_s8
+ return vcgt_s8(v1, v2);
+ // CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vcgt_s16(int16x4_t v1, int16x4_t v2) {
+ // CHECK: test_vcgt_s16
+ return vcgt_s16(v1, v2);
+ // CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vcgt_s32(int32x2_t v1, int32x2_t v2) {
+ // CHECK: test_vcgt_s32
+ return vcgt_s32(v1, v2);
+ // CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint32x2_t test_vcgt_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vcgt_f32
+ return vcgt_f32(v1, v2);
+ // CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vcgt_u8(uint8x8_t v1, uint8x8_t v2) {
+ // CHECK: test_vcgt_u8
+ return vcgt_u8(v1, v2);
+ // CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vcgt_u16(uint16x4_t v1, uint16x4_t v2) {
+ // CHECK: test_vcgt_u16
+ return vcgt_u16(v1, v2);
+ // CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vcgt_u32(uint32x2_t v1, uint32x2_t v2) {
+ // CHECK: test_vcgt_u32
+ return vcgt_u32(v1, v2);
+ // CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x16_t test_vcgtq_s8(int8x16_t v1, int8x16_t v2) {
+ // CHECK: test_vcgtq_s8
+ return vcgtq_s8(v1, v2);
+ // CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vcgtq_s16(int16x8_t v1, int16x8_t v2) {
+ // CHECK: test_vcgtq_s16
+ return vcgtq_s16(v1, v2);
+ // CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vcgtq_s32(int32x4_t v1, int32x4_t v2) {
+ // CHECK: test_vcgtq_s32
+ return vcgtq_s32(v1, v2);
+ // CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint32x4_t test_vcgtq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vcgtq_f32
+ return vcgtq_f32(v1, v2);
+ // CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vcgtq_u8(uint8x16_t v1, uint8x16_t v2) {
+ // CHECK: test_vcgtq_u8
+ return vcgtq_u8(v1, v2);
+ // CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vcgtq_u16(uint16x8_t v1, uint16x8_t v2) {
+ // CHECK: test_vcgtq_u16
+ return vcgtq_u16(v1, v2);
+ // CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vcgtq_u32(uint32x4_t v1, uint32x4_t v2) {
+ // CHECK: test_vcgtq_u32
+ return vcgtq_u32(v1, v2);
+ // CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint64x2_t test_vcgtq_s64(int64x2_t v1, int64x2_t v2) {
+ // CHECK: test_vcgtq_s64
+ return vcgtq_s64(v1, v2);
+ // CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint64x2_t test_vcgtq_u64(uint64x2_t v1, uint64x2_t v2) {
+ // CHECK: test_vcgtq_u64
+ return vcgtq_u64(v1, v2);
+ // CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint64x2_t test_vcgtq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vcgtq_f64
+ return vcgtq_f64(v1, v2);
+ // CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+
+// Notes about vclt:
+// The LT condition predicate is implemented as GT with reversed operands,
+// so the checks below expect the source registers swapped.
+// Using registers other than v0 and v1 is possible, but would be odd.
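+// For example, vclt_s8(v1, v2) yields the same result as vcgt_s8(v2, v1).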
+
+uint8x8_t test_vclt_s8(int8x8_t v1, int8x8_t v2) {
+ // CHECK: test_vclt_s8
+ return vclt_s8(v1, v2);
+ // CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b
+}
+
+uint16x4_t test_vclt_s16(int16x4_t v1, int16x4_t v2) {
+ // CHECK: test_vclt_s16
+ return vclt_s16(v1, v2);
+ // CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h
+}
+
+uint32x2_t test_vclt_s32(int32x2_t v1, int32x2_t v2) {
+ // CHECK: test_vclt_s32
+ return vclt_s32(v1, v2);
+ // CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+}
+
+uint32x2_t test_vclt_f32(float32x2_t v1, float32x2_t v2) {
+ // CHECK: test_vclt_f32
+ return vclt_f32(v1, v2);
+ // CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+}
+
+uint8x8_t test_vclt_u8(uint8x8_t v1, uint8x8_t v2) {
+ // CHECK: test_vclt_u8
+ return vclt_u8(v1, v2);
+ // CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b
+}
+
+uint16x4_t test_vclt_u16(uint16x4_t v1, uint16x4_t v2) {
+ // CHECK: test_vclt_u16
+ return vclt_u16(v1, v2);
+ // CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
+}
+
+uint32x2_t test_vclt_u32(uint32x2_t v1, uint32x2_t v2) {
+ // CHECK: test_vclt_u32
+ return vclt_u32(v1, v2);
+ // CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
+}
+
+uint8x16_t test_vcltq_s8(int8x16_t v1, int8x16_t v2) {
+ // CHECK: test_vcltq_s8
+ return vcltq_s8(v1, v2);
+ // CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b
+}
+
+uint16x8_t test_vcltq_s16(int16x8_t v1, int16x8_t v2) {
+ // CHECK: test_vcltq_s16
+ return vcltq_s16(v1, v2);
+ // CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h
+}
+
+uint32x4_t test_vcltq_s32(int32x4_t v1, int32x4_t v2) {
+ // CHECK: test_vcltq_s32
+ return vcltq_s32(v1, v2);
+ // CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+}
+
+uint32x4_t test_vcltq_f32(float32x4_t v1, float32x4_t v2) {
+ // CHECK: test_vcltq_f32
+ return vcltq_f32(v1, v2);
+ // CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+}
+
+uint8x16_t test_vcltq_u8(uint8x16_t v1, uint8x16_t v2) {
+ // CHECK: test_vcltq_u8
+ return vcltq_u8(v1, v2);
+ // CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
+}
+
+uint16x8_t test_vcltq_u16(uint16x8_t v1, uint16x8_t v2) {
+ // CHECK: test_vcltq_u16
+ return vcltq_u16(v1, v2);
+ // CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
+}
+
+uint32x4_t test_vcltq_u32(uint32x4_t v1, uint32x4_t v2) {
+ // CHECK: test_vcltq_u32
+ return vcltq_u32(v1, v2);
+ // CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
+}
+
+uint64x2_t test_vcltq_s64(int64x2_t v1, int64x2_t v2) {
+ // CHECK: test_vcltq_s64
+ return vcltq_s64(v1, v2);
+ // CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+}
+
+uint64x2_t test_vcltq_u64(uint64x2_t v1, uint64x2_t v2) {
+ // CHECK: test_vcltq_u64
+ return vcltq_u64(v1, v2);
+ // CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
+}
+
+uint64x2_t test_vcltq_f64(float64x2_t v1, float64x2_t v2) {
+ // CHECK: test_vcltq_f64
+ return vcltq_f64(v1, v2);
+ // CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+}
+
+
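+// Notes about vhadd/vhsub:
+// The halving operations compute the sum or difference in a wider
+// intermediate and shift right by one, so the intermediate cannot overflow.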
+int8x8_t test_vhadd_s8(int8x8_t v1, int8x8_t v2) {
+// CHECK: test_vhadd_s8
+ return vhadd_s8(v1, v2);
+ // CHECK: shadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vhadd_s16(int16x4_t v1, int16x4_t v2) {
+// CHECK: test_vhadd_s16
+ return vhadd_s16(v1, v2);
+ // CHECK: shadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vhadd_s32(int32x2_t v1, int32x2_t v2) {
+// CHECK: test_vhadd_s32
+ return vhadd_s32(v1, v2);
+ // CHECK: shadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vhadd_u8(uint8x8_t v1, uint8x8_t v2) {
+// CHECK: test_vhadd_u8
+ return vhadd_u8(v1, v2);
+ // CHECK: uhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vhadd_u16(uint16x4_t v1, uint16x4_t v2) {
+// CHECK: test_vhadd_u16
+ return vhadd_u16(v1, v2);
+ // CHECK: uhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vhadd_u32(uint32x2_t v1, uint32x2_t v2) {
+// CHECK: test_vhadd_u32
+ return vhadd_u32(v1, v2);
+ // CHECK: uhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int8x16_t test_vhaddq_s8(int8x16_t v1, int8x16_t v2) {
+// CHECK: test_vhaddq_s8
+ return vhaddq_s8(v1, v2);
+ // CHECK: shadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vhaddq_s16(int16x8_t v1, int16x8_t v2) {
+// CHECK: test_vhaddq_s16
+ return vhaddq_s16(v1, v2);
+ // CHECK: shadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vhaddq_s32(int32x4_t v1, int32x4_t v2) {
+// CHECK: test_vhaddq_s32
+ return vhaddq_s32(v1, v2);
+ // CHECK: shadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
+// CHECK: test_vhaddq_u8
+ return vhaddq_u8(v1, v2);
+ // CHECK: uhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
+// CHECK: test_vhaddq_u16
+ return vhaddq_u16(v1, v2);
+ // CHECK: uhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
+// CHECK: test_vhaddq_u32
+ return vhaddq_u32(v1, v2);
+ // CHECK: uhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+
+int8x8_t test_vhsub_s8(int8x8_t v1, int8x8_t v2) {
+// CHECK: test_vhsub_s8
+ return vhsub_s8(v1, v2);
+ // CHECK: shsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vhsub_s16(int16x4_t v1, int16x4_t v2) {
+// CHECK: test_vhsub_s16
+ return vhsub_s16(v1, v2);
+ // CHECK: shsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vhsub_s32(int32x2_t v1, int32x2_t v2) {
+// CHECK: test_vhsub_s32
+ return vhsub_s32(v1, v2);
+ // CHECK: shsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vhsub_u8(uint8x8_t v1, uint8x8_t v2) {
+// CHECK: test_vhsub_u8
+ return vhsub_u8(v1, v2);
+ // CHECK: uhsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vhsub_u16(uint16x4_t v1, uint16x4_t v2) {
+// CHECK: test_vhsub_u16
+ return vhsub_u16(v1, v2);
+ // CHECK: uhsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vhsub_u32(uint32x2_t v1, uint32x2_t v2) {
+// CHECK: test_vhsub_u32
+ return vhsub_u32(v1, v2);
+ // CHECK: uhsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int8x16_t test_vhsubq_s8(int8x16_t v1, int8x16_t v2) {
+// CHECK: test_vhsubq_s8
+ return vhsubq_s8(v1, v2);
+ // CHECK: shsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vhsubq_s16(int16x8_t v1, int16x8_t v2) {
+// CHECK: test_vhsubq_s16
+ return vhsubq_s16(v1, v2);
+ // CHECK: shsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vhsubq_s32(int32x4_t v1, int32x4_t v2) {
+// CHECK: test_vhsubq_s32
+ return vhsubq_s32(v1, v2);
+ // CHECK: shsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vhsubq_u8(uint8x16_t v1, uint8x16_t v2) {
+// CHECK: test_vhsubq_u8
+ return vhsubq_u8(v1, v2);
+ // CHECK: uhsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vhsubq_u16(uint16x8_t v1, uint16x8_t v2) {
+// CHECK: test_vhsubq_u16
+ return vhsubq_u16(v1, v2);
+ // CHECK: uhsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vhsubq_u32(uint32x4_t v1, uint32x4_t v2) {
+// CHECK: test_vhsubq_u32
+ return vhsubq_u32(v1, v2);
+ // CHECK: uhsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+
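+// Notes about vrhadd:
+// The rounding variants add one to the intermediate sum before the halving
+// shift, rounding the result to nearest instead of truncating.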
+int8x8_t test_vrhadd_s8(int8x8_t v1, int8x8_t v2) {
+// CHECK: test_vrhadd_s8
+ return vrhadd_s8(v1, v2);
+// CHECK: srhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vrhadd_s16(int16x4_t v1, int16x4_t v2) {
+// CHECK: test_vrhadd_s16
+ return vrhadd_s16(v1, v2);
+// CHECK: srhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vrhadd_s32(int32x2_t v1, int32x2_t v2) {
+// CHECK: test_vrhadd_s32
+ return vrhadd_s32(v1, v2);
+// CHECK: srhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vrhadd_u8(uint8x8_t v1, uint8x8_t v2) {
+// CHECK: test_vrhadd_u8
+ return vrhadd_u8(v1, v2);
+// CHECK: urhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vrhadd_u16(uint16x4_t v1, uint16x4_t v2) {
+// CHECK: test_vrhadd_u16
+ return vrhadd_u16(v1, v2);
+// CHECK: urhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vrhadd_u32(uint32x2_t v1, uint32x2_t v2) {
+// CHECK: test_vrhadd_u32
+ return vrhadd_u32(v1, v2);
+// CHECK: urhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int8x16_t test_vrhaddq_s8(int8x16_t v1, int8x16_t v2) {
+// CHECK: test_vrhaddq_s8
+ return vrhaddq_s8(v1, v2);
+// CHECK: srhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vrhaddq_s16(int16x8_t v1, int16x8_t v2) {
+// CHECK: test_vrhaddq_s16
+ return vrhaddq_s16(v1, v2);
+// CHECK: srhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vrhaddq_s32(int32x4_t v1, int32x4_t v2) {
+// CHECK: test_vrhaddq_s32
+ return vrhaddq_s32(v1, v2);
+// CHECK: srhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vrhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
+// CHECK: test_vrhaddq_u8
+ return vrhaddq_u8(v1, v2);
+// CHECK: urhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vrhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
+// CHECK: test_vrhaddq_u16
+ return vrhaddq_u16(v1, v2);
+// CHECK: urhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vrhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
+// CHECK: test_vrhaddq_u32
+ return vrhaddq_u32(v1, v2);
+// CHECK: urhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
+// CHECK: test_vqadd_s8
+ return vqadd_s8(a, b);
+ // CHECK: sqadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
+// CHECK: test_vqadd_s16
+ return vqadd_s16(a, b);
+ // CHECK: sqadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
+// CHECK: test_vqadd_s32
+ return vqadd_s32(a, b);
+ // CHECK: sqadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
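+// The 64x1 variants operate on a single 64-bit lane, so they are checked
+// against the scalar D-register form of the instruction.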
+int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
+// CHECK: test_vqadd_s64
+ return vqadd_s64(a, b);
+// CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
+// CHECK: test_vqadd_u8
+ return vqadd_u8(a, b);
+ // CHECK: uqadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
+// CHECK: test_vqadd_u16
+ return vqadd_u16(a, b);
+ // CHECK: uqadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
+// CHECK: test_vqadd_u32
+ return vqadd_u32(a, b);
+ // CHECK: uqadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
+// CHECK: test_vqadd_u64
+ return vqadd_u64(a, b);
+// CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
+// CHECK: test_vqaddq_s8
+ return vqaddq_s8(a, b);
+ // CHECK: sqadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
+// CHECK: test_vqaddq_s16
+ return vqaddq_s16(a, b);
+ // CHECK: sqadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
+// CHECK: test_vqaddq_s32
+ return vqaddq_s32(a, b);
+ // CHECK: sqadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
+// CHECK: test_vqaddq_s64
+ return vqaddq_s64(a, b);
+// CHECK: sqadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
+// CHECK: test_vqaddq_u8
+ return vqaddq_u8(a, b);
+ // CHECK: uqadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
+// CHECK: test_vqaddq_u16
+ return vqaddq_u16(a, b);
+ // CHECK: uqadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
+// CHECK: test_vqaddq_u32
+ return vqaddq_u32(a, b);
+ // CHECK: uqadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
+// CHECK: test_vqaddq_u64
+ return vqaddq_u64(a, b);
+// CHECK: uqadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+
+int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
+// CHECK: test_vqsub_s8
+ return vqsub_s8(a, b);
+ // CHECK: sqsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
+// CHECK: test_vqsub_s16
+ return vqsub_s16(a, b);
+ // CHECK: sqsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
+// CHECK: test_vqsub_s32
+ return vqsub_s32(a, b);
+ // CHECK: sqsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
+// CHECK: test_vqsub_s64
+ return vqsub_s64(a, b);
+// CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
+// CHECK: test_vqsub_u8
+ return vqsub_u8(a, b);
+ // CHECK: uqsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
+// CHECK: test_vqsub_u16
+ return vqsub_u16(a, b);
+ // CHECK: uqsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
+// CHECK: test_vqsub_u32
+ return vqsub_u32(a, b);
+ // CHECK: uqsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
+// CHECK: test_vqsub_u64
+ return vqsub_u64(a, b);
+// CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
+// CHECK: test_vqsubq_s8
+ return vqsubq_s8(a, b);
+ // CHECK: sqsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
+// CHECK: test_vqsubq_s16
+ return vqsubq_s16(a, b);
+ // CHECK: sqsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
+// CHECK: test_vqsubq_s32
+ return vqsubq_s32(a, b);
+ // CHECK: sqsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
+// CHECK: test_vqsubq_s64
+ return vqsubq_s64(a, b);
+// CHECK: sqsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
+// CHECK: test_vqsubq_u8
+ return vqsubq_u8(a, b);
+ // CHECK: uqsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
+// CHECK: test_vqsubq_u16
+ return vqsubq_u16(a, b);
+ // CHECK: uqsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
+// CHECK: test_vqsubq_u32
+ return vqsubq_u32(a, b);
+ // CHECK: uqsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
+// CHECK: test_vqsubq_u64
+ return vqsubq_u64(a, b);
+ // CHECK: uqsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+
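+// Notes about vshl and the other variable shifts:
+// The per-element shift count is signed, so even the unsigned variants take
+// a signed vector as the second operand; negative elements shift right.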
+int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
+// CHECK: test_vshl_s8
+ return vshl_s8(a, b);
+// CHECK: sshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
+// CHECK: test_vshl_s16
+ return vshl_s16(a, b);
+// CHECK: sshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
+// CHECK: test_vshl_s32
+ return vshl_s32(a, b);
+// CHECK: sshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
+// CHECK: test_vshl_s64
+ return vshl_s64(a, b);
+// CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
+// CHECK: test_vshl_u8
+ return vshl_u8(a, b);
+// CHECK: ushl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
+// CHECK: test_vshl_u16
+ return vshl_u16(a, b);
+// CHECK: ushl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
+// CHECK: test_vshl_u32
+ return vshl_u32(a, b);
+// CHECK: ushl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
+// CHECK: test_vshl_u64
+ return vshl_u64(a, b);
+// CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
+// CHECK: test_vshlq_s8
+ return vshlq_s8(a, b);
+// CHECK: sshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
+// CHECK: test_vshlq_s16
+ return vshlq_s16(a, b);
+// CHECK: sshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
+// CHECK: test_vshlq_s32
+ return vshlq_s32(a, b);
+// CHECK: sshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
+// CHECK: test_vshlq_s64
+ return vshlq_s64(a, b);
+// CHECK: sshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
+// CHECK: test_vshlq_u8
+ return vshlq_u8(a, b);
+// CHECK: ushl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
+// CHECK: test_vshlq_u16
+ return vshlq_u16(a, b);
+// CHECK: ushl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
+// CHECK: test_vshlq_u32
+ return vshlq_u32(a, b);
+// CHECK: ushl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
+// CHECK: test_vshlq_u64
+ return vshlq_u64(a, b);
+// CHECK: ushl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+
+int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
+// CHECK: test_vqshl_s8
+ return vqshl_s8(a, b);
+// CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
+// CHECK: test_vqshl_s16
+ return vqshl_s16(a, b);
+// CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
+// CHECK: test_vqshl_s32
+ return vqshl_s32(a, b);
+// CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
+// CHECK: test_vqshl_s64
+ return vqshl_s64(a, b);
+// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
+// CHECK: test_vqshl_u8
+ return vqshl_u8(a, b);
+// CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
+// CHECK: test_vqshl_u16
+ return vqshl_u16(a, b);
+// CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
+// CHECK: test_vqshl_u32
+ return vqshl_u32(a, b);
+// CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
+// CHECK: test_vqshl_u64
+ return vqshl_u64(a, b);
+// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
+// CHECK: test_vqshlq_s8
+ return vqshlq_s8(a, b);
+// CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
+// CHECK: test_vqshlq_s16
+ return vqshlq_s16(a, b);
+// CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
+// CHECK: test_vqshlq_s32
+ return vqshlq_s32(a, b);
+// CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
+// CHECK: test_vqshlq_s64
+ return vqshlq_s64(a, b);
+// CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
+// CHECK: test_vqshlq_u8
+ return vqshlq_u8(a, b);
+// CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
+// CHECK: test_vqshlq_u16
+ return vqshlq_u16(a, b);
+// CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
+// CHECK: test_vqshlq_u32
+ return vqshlq_u32(a, b);
+// CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
+// CHECK: test_vqshlq_u64
+ return vqshlq_u64(a, b);
+// CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
+// CHECK: test_vrshl_s8
+ return vrshl_s8(a, b);
+// CHECK: srshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
+// CHECK: test_vrshl_s16
+ return vrshl_s16(a, b);
+// CHECK: srshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
+// CHECK: test_vrshl_s32
+ return vrshl_s32(a, b);
+// CHECK: srshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
+// CHECK: test_vrshl_s64
+ return vrshl_s64(a, b);
+// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
+// CHECK: test_vrshl_u8
+ return vrshl_u8(a, b);
+// CHECK: urshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
+// CHECK: test_vrshl_u16
+ return vrshl_u16(a, b);
+// CHECK: urshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
+// CHECK: test_vrshl_u32
+ return vrshl_u32(a, b);
+// CHECK: urshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
+// CHECK: test_vrshl_u64
+ return vrshl_u64(a, b);
+// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
+// CHECK: test_vrshlq_s8
+ return vrshlq_s8(a, b);
+// CHECK: srshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
+// CHECK: test_vrshlq_s16
+ return vrshlq_s16(a, b);
+// CHECK: srshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
+// CHECK: test_vrshlq_s32
+ return vrshlq_s32(a, b);
+// CHECK: srshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
+// CHECK: test_vrshlq_s64
+ return vrshlq_s64(a, b);
+// CHECK: srshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
+// CHECK: test_vrshlq_u8
+ return vrshlq_u8(a, b);
+// CHECK: urshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
+// CHECK: test_vrshlq_u16
+ return vrshlq_u16(a, b);
+// CHECK: urshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
+// CHECK: test_vrshlq_u32
+ return vrshlq_u32(a, b);
+// CHECK: urshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
+// CHECK: test_vrshlq_u64
+ return vrshlq_u64(a, b);
+// CHECK: urshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+
+int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
+// CHECK: test_vqrshl_s8
+ return vqrshl_s8(a, b);
+// CHECK: sqrshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
+// CHECK: test_vqrshl_s16
+ return vqrshl_s16(a, b);
+// CHECK: sqrshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
+// CHECK: test_vqrshl_s32
+ return vqrshl_s32(a, b);
+// CHECK: sqrshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
+// CHECK: test_vqrshl_s64
+ return vqrshl_s64(a, b);
+// CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
+// CHECK: test_vqrshl_u8
+ return vqrshl_u8(a, b);
+// CHECK: uqrshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
+// CHECK: test_vqrshl_u16
+ return vqrshl_u16(a, b);
+// CHECK: uqrshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
+// CHECK: test_vqrshl_u32
+ return vqrshl_u32(a, b);
+// CHECK: uqrshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
+// CHECK: test_vqrshl_u64
+ return vqrshl_u64(a, b);
+// CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+}
+
+int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
+// CHECK: test_vqrshlq_s8
+ return vqrshlq_s8(a, b);
+// CHECK: sqrshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
+// CHECK: test_vqrshlq_s16
+ return vqrshlq_s16(a, b);
+// CHECK: sqrshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
+// CHECK: test_vqrshlq_s32
+ return vqrshlq_s32(a, b);
+// CHECK: sqrshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
+// CHECK: test_vqrshlq_s64
+ return vqrshlq_s64(a, b);
+// CHECK: sqrshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
+// CHECK: test_vqrshlq_u8
+  return vqrshlq_u8(a, b);
+// CHECK: uqrshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
+// CHECK: test_vqrshlq_u16
+ return vqrshlq_u16(a, b);
+// CHECK: uqrshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
+// CHECK: test_vqrshlq_u32
+ return vqrshlq_u32(a, b);
+// CHECK: uqrshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
+// CHECK: test_vqrshlq_u64
+ return vqrshlq_u64(a, b);
+// CHECK: uqrshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
+// CHECK: test_vmax_s8
+ return vmax_s8(a, b);
+// CHECK: smax {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
+// CHECK: test_vmax_s16
+ return vmax_s16(a, b);
+// CHECK: smax {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
+// CHECK: test_vmax_s32
+ return vmax_s32(a, b);
+// CHECK: smax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
+// CHECK: test_vmax_u8
+ return vmax_u8(a, b);
+// CHECK: umax {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
+// CHECK: test_vmax_u16
+ return vmax_u16(a, b);
+// CHECK: umax {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
+// CHECK: test_vmax_u32
+ return vmax_u32(a, b);
+// CHECK: umax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
+// CHECK: test_vmax_f32
+ return vmax_f32(a, b);
+// CHECK: fmax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
+// CHECK: test_vmaxq_s8
+ return vmaxq_s8(a, b);
+// CHECK: smax {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
+// CHECK: test_vmaxq_s16
+ return vmaxq_s16(a, b);
+// CHECK: smax {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
+// CHECK: test_vmaxq_s32
+ return vmaxq_s32(a, b);
+// CHECK: smax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
+// CHECK: test_vmaxq_u8
+ return vmaxq_u8(a, b);
+// CHECK: umax {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
+// CHECK: test_vmaxq_u16
+ return vmaxq_u16(a, b);
+// CHECK: umax {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
+// CHECK: test_vmaxq_u32
+ return vmaxq_u32(a, b);
+// CHECK: umax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
+// CHECK: test_vmaxq_f32
+ return vmaxq_f32(a, b);
+// CHECK: fmax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vmaxq_f64(float64x2_t a, float64x2_t b) {
+// CHECK: test_vmaxq_f64
+ return vmaxq_f64(a, b);
+// CHECK: fmax {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+
+int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
+// CHECK: test_vmin_s8
+ return vmin_s8(a, b);
+// CHECK: smin {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
+// CHECK: test_vmin_s16
+ return vmin_s16(a, b);
+// CHECK: smin {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
+// CHECK: test_vmin_s32
+ return vmin_s32(a, b);
+// CHECK: smin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
+// CHECK: test_vmin_u8
+ return vmin_u8(a, b);
+// CHECK: umin {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
+// CHECK: test_vmin_u16
+ return vmin_u16(a, b);
+// CHECK: umin {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
+// CHECK: test_vmin_u32
+ return vmin_u32(a, b);
+// CHECK: umin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
+// CHECK: test_vmin_f32
+ return vmin_f32(a, b);
+// CHECK: fmin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
+// CHECK: test_vminq_s8
+ return vminq_s8(a, b);
+// CHECK: smin {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
+// CHECK: test_vminq_s16
+ return vminq_s16(a, b);
+// CHECK: smin {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
+// CHECK: test_vminq_s32
+ return vminq_s32(a, b);
+// CHECK: smin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
+// CHECK: test_vminq_u8
+ return vminq_u8(a, b);
+// CHECK: umin {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
+// CHECK: test_vminq_u16
+ return vminq_u16(a, b);
+// CHECK: umin {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
+// CHECK: test_vminq_u32
+ return vminq_u32(a, b);
+// CHECK: umin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
+// CHECK: test_vminq_f32
+ return vminq_f32(a, b);
+// CHECK: fmin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vminq_f64(float64x2_t a, float64x2_t b) {
+// CHECK: test_vminq_f64
+ return vminq_f64(a, b);
+// CHECK: fmin {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
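+// Notes about vmaxnm/vminnm:
+// fmaxnm and fminnm implement the IEEE 754-2008 maxNum/minNum semantics,
+// returning the numeric operand when exactly one input is a quiet NaN.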
+float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
+// CHECK: test_vmaxnm_f32
+ return vmaxnm_f32(a, b);
+// CHECK: fmaxnm {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
+// CHECK: test_vmaxnmq_f32
+ return vmaxnmq_f32(a, b);
+// CHECK: fmaxnm {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vmaxnmq_f64(float64x2_t a, float64x2_t b) {
+// CHECK: test_vmaxnmq_f64
+ return vmaxnmq_f64(a, b);
+// CHECK: fmaxnm {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
+// CHECK: test_vminnm_f32
+ return vminnm_f32(a, b);
+// CHECK: fminnm {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) {
+// CHECK: test_vminnmq_f32
+ return vminnmq_f32(a, b);
+// CHECK: fminnm {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vminnmq_f64(float64x2_t a, float64x2_t b) {
+// CHECK: test_vminnmq_f64
+ return vminnmq_f64(a, b);
+// CHECK: fminnm {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
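+// Notes about the pairwise operations:
+// smaxp, uminp, addp, and friends concatenate the two source vectors and
+// reduce each adjacent pair of elements, rather than operating lane-by-lane.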
+int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
+// CHECK: test_vpmax_s8
+ return vpmax_s8(a, b);
+// CHECK: smaxp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
+// CHECK: test_vpmax_s16
+ return vpmax_s16(a, b);
+// CHECK: smaxp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
+// CHECK: test_vpmax_s32
+ return vpmax_s32(a, b);
+// CHECK: smaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
+// CHECK: test_vpmax_u8
+ return vpmax_u8(a, b);
+// CHECK: umaxp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
+// CHECK: test_vpmax_u16
+ return vpmax_u16(a, b);
+// CHECK: umaxp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
+// CHECK: test_vpmax_u32
+ return vpmax_u32(a, b);
+// CHECK: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
+// CHECK: test_vpmax_f32
+ return vpmax_f32(a, b);
+// CHECK: fmaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int8x16_t test_vpmaxq_s8(int8x16_t a, int8x16_t b) {
+// CHECK: test_vpmaxq_s8
+ return vpmaxq_s8(a, b);
+// CHECK: smaxp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vpmaxq_s16(int16x8_t a, int16x8_t b) {
+// CHECK: test_vpmaxq_s16
+ return vpmaxq_s16(a, b);
+// CHECK: smaxp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vpmaxq_s32(int32x4_t a, int32x4_t b) {
+// CHECK: test_vpmaxq_s32
+ return vpmaxq_s32(a, b);
+// CHECK: smaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vpmaxq_u8(uint8x16_t a, uint8x16_t b) {
+// CHECK: test_vpmaxq_u8
+ return vpmaxq_u8(a, b);
+// CHECK: umaxp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vpmaxq_u16(uint16x8_t a, uint16x8_t b) {
+// CHECK: test_vpmaxq_u16
+ return vpmaxq_u16(a, b);
+// CHECK: umaxp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vpmaxq_u32(uint32x4_t a, uint32x4_t b) {
+// CHECK: test_vpmaxq_u32
+ return vpmaxq_u32(a, b);
+// CHECK: umaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float32x4_t test_vpmaxq_f32(float32x4_t a, float32x4_t b) {
+// CHECK: test_vpmaxq_f32
+ return vpmaxq_f32(a, b);
+// CHECK: fmaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vpmaxq_f64(float64x2_t a, float64x2_t b) {
+// CHECK: test_vpmaxq_f64
+ return vpmaxq_f64(a, b);
+// CHECK: fmaxp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
+// CHECK: test_vpmin_s8
+ return vpmin_s8(a, b);
+// CHECK: sminp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
+// CHECK: test_vpmin_s16
+ return vpmin_s16(a, b);
+// CHECK: sminp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
+// CHECK: test_vpmin_s32
+ return vpmin_s32(a, b);
+// CHECK: sminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
+// CHECK: test_vpmin_u8
+ return vpmin_u8(a, b);
+// CHECK: uminp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
+// CHECK: test_vpmin_u16
+ return vpmin_u16(a, b);
+// CHECK: uminp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
+// CHECK: test_vpmin_u32
+ return vpmin_u32(a, b);
+// CHECK: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
+// CHECK: test_vpmin_f32
+ return vpmin_f32(a, b);
+// CHECK: fminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int8x16_t test_vpminq_s8(int8x16_t a, int8x16_t b) {
+// CHECK: test_vpminq_s8
+ return vpminq_s8(a, b);
+// CHECK: sminp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vpminq_s16(int16x8_t a, int16x8_t b) {
+// CHECK: test_vpminq_s16
+ return vpminq_s16(a, b);
+// CHECK: sminp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vpminq_s32(int32x4_t a, int32x4_t b) {
+// CHECK: test_vpminq_s32
+ return vpminq_s32(a, b);
+// CHECK: sminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vpminq_u8(uint8x16_t a, uint8x16_t b) {
+// CHECK: test_vpminq_u8
+ return vpminq_u8(a, b);
+// CHECK: uminp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vpminq_u16(uint16x8_t a, uint16x8_t b) {
+// CHECK: test_vpminq_u16
+ return vpminq_u16(a, b);
+// CHECK: uminp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vpminq_u32(uint32x4_t a, uint32x4_t b) {
+// CHECK: test_vpminq_u32
+ return vpminq_u32(a, b);
+// CHECK: uminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float32x4_t test_vpminq_f32(float32x4_t a, float32x4_t b) {
+// CHECK: test_vpminq_f32
+ return vpminq_f32(a, b);
+// CHECK: fminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vpminq_f64(float64x2_t a, float64x2_t b) {
+// CHECK: test_vpminq_f64
+ return vpminq_f64(a, b);
+// CHECK: fminp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+float32x2_t test_vpmaxnm_f32(float32x2_t a, float32x2_t b) {
+// CHECK: test_vpmaxnm_f32
+ return vpmaxnm_f32(a, b);
+// CHECK: fmaxnmp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x4_t test_vpmaxnmq_f32(float32x4_t a, float32x4_t b) {
+// CHECK: test_vpmaxnmq_f32
+ return vpmaxnmq_f32(a, b);
+// CHECK: fmaxnmp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vpmaxnmq_f64(float64x2_t a, float64x2_t b) {
+// CHECK: test_vpmaxnmq_f64
+ return vpmaxnmq_f64(a, b);
+// CHECK: fmaxnmp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+float32x2_t test_vpminnm_f32(float32x2_t a, float32x2_t b) {
+// CHECK: test_vpminnm_f32
+ return vpminnm_f32(a, b);
+// CHECK: fminnmp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x4_t test_vpminnmq_f32(float32x4_t a, float32x4_t b) {
+// CHECK: test_vpminnmq_f32
+ return vpminnmq_f32(a, b);
+// CHECK: fminnmp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vpminnmq_f64(float64x2_t a, float64x2_t b) {
+// CHECK: test_vpminnmq_f64
+ return vpminnmq_f64(a, b);
+// CHECK: fminnmp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
+// CHECK: test_vpadd_s8
+ return vpadd_s8(a, b);
+// CHECK: addp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
+// CHECK: test_vpadd_s16
+ return vpadd_s16(a, b);
+// CHECK: addp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
+// CHECK: test_vpadd_s32
+ return vpadd_s32(a, b);
+// CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
+// CHECK: test_vpadd_u8
+ return vpadd_u8(a, b);
+// CHECK: addp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
+
+uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
+// CHECK: test_vpadd_u16
+ return vpadd_u16(a, b);
+// CHECK: addp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
+// CHECK: test_vpadd_u32
+ return vpadd_u32(a, b);
+// CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
+// CHECK: test_vpadd_f32
+ return vpadd_f32(a, b);
+// CHECK: faddp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int8x16_t test_vpaddq_s8(int8x16_t a, int8x16_t b) {
+// CHECK: test_vpaddq_s8
+ return vpaddq_s8(a, b);
+// CHECK: addp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+int16x8_t test_vpaddq_s16(int16x8_t a, int16x8_t b) {
+// CHECK: test_vpaddq_s16
+ return vpaddq_s16(a, b);
+// CHECK: addp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vpaddq_s32(int32x4_t a, int32x4_t b) {
+// CHECK: test_vpaddq_s32
+ return vpaddq_s32(a, b);
+// CHECK: addp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+uint8x16_t test_vpaddq_u8(uint8x16_t a, uint8x16_t b) {
+// CHECK: test_vpaddq_u8
+ return vpaddq_u8(a, b);
+// CHECK: addp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+}
+
+uint16x8_t test_vpaddq_u16(uint16x8_t a, uint16x8_t b) {
+// CHECK: test_vpaddq_u16
+ return vpaddq_u16(a, b);
+// CHECK: addp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+uint32x4_t test_vpaddq_u32(uint32x4_t a, uint32x4_t b) {
+// CHECK: test_vpaddq_u32
+ return vpaddq_u32(a, b);
+// CHECK: addp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) {
+// CHECK: test_vpaddq_f32
+ return vpaddq_f32(a, b);
+// CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) {
+// CHECK: test_vpaddq_f64
+ return vpaddq_f64(a, b);
+// CHECK: faddp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
+// CHECK: test_vqdmulh_s16
+ return vqdmulh_s16(a, b);
+// CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
+// CHECK: test_vqdmulh_s32
+ return vqdmulh_s32(a, b);
+// CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
+// CHECK: test_vqdmulhq_s16
+ return vqdmulhq_s16(a, b);
+// CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
+// CHECK: test_vqdmulhq_s32
+ return vqdmulhq_s32(a, b);
+// CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
+// CHECK: test_vqrdmulh_s16
+ return vqrdmulh_s16(a, b);
+// CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+}
+
+int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
+// CHECK: test_vqrdmulh_s32
+ return vqrdmulh_s32(a, b);
+// CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
+// CHECK: test_vqrdmulhq_s16
+ return vqrdmulhq_s16(a, b);
+// CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+}
+
+int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
+// CHECK: test_vqrdmulhq_s32
+ return vqrdmulhq_s32(a, b);
+// CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+
+float32x2_t test_vmulx_f32(float32x2_t a, float32x2_t b) {
+// CHECK: test_vmulx_f32
+ return vmulx_f32(a, b);
+// CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+}
+
+float32x4_t test_vmulxq_f32(float32x4_t a, float32x4_t b) {
+// CHECK: test_vmulxq_f32
+ return vmulxq_f32(a, b);
+// CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+}
+
+float64x2_t test_vmulxq_f64(float64x2_t a, float64x2_t b) {
+// CHECK: test_vmulxq_f64
+ return vmulxq_f64(a, b);
+// CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
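All of the pairwise tests above share one data pattern: the AdvSIMD pairwise
instructions (addp, smaxp, sminp, and friends) reduce adjacent element pairs of
the first operand, then of the second, and concatenate the two halves. A
minimal scalar model of vpadd_s8, assuming an AArch64 target where <arm_neon.h>
provides these types (the reference function itself is hypothetical, not part
of this patch):

    #include <arm_neon.h>

    /* Hypothetical reference model: lanes 0..3 are pair sums from a,
       lanes 4..7 are pair sums from b, matching the addp instruction. */
    static int8x8_t ref_vpadd_s8(int8x8_t a, int8x8_t b) {
      int8_t r[8];
      for (int i = 0; i != 4; ++i) {
        r[i]     = (int8_t)(a[2 * i] + a[2 * i + 1]);
        r[i + 4] = (int8_t)(b[2 * i] + b[2 * i + 1]);
      }
      return vld1_s8(r);
    }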
diff --git a/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp b/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp
new file mode 100644
index 0000000000..86509a0739
--- /dev/null
+++ b/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp
@@ -0,0 +1,85 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu %s -emit-llvm -o - | FileCheck %s
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef signed char int8_t;
+typedef signed short int16_t;
+typedef signed long long int64_t;
+typedef unsigned long long uint64_t;
+typedef unsigned char poly8_t;
+typedef unsigned short poly16_t;
+typedef __fp16 float16_t;
+typedef float float32_t;
+typedef double float64_t;
+
+typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
+typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
+typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
+typedef __attribute__((neon_vector_type(8))) int16_t int16x8_t;
+typedef __attribute__((neon_vector_type(2))) int int32x2_t;
+typedef __attribute__((neon_vector_type(4))) int int32x4_t;
+typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
+typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
+typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
+typedef __attribute__((neon_vector_type(4))) uint16_t uint16x4_t;
+typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
+typedef __attribute__((neon_vector_type(2))) unsigned int uint32x2_t;
+typedef __attribute__((neon_vector_type(4))) unsigned int uint32x4_t;
+typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
+typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
+typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
+typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
+typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
+typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
+typedef __attribute__((neon_polyvector_type(8))) poly8_t poly8x8_t;
+typedef __attribute__((neon_polyvector_type(16))) poly8_t poly8x16_t;
+typedef __attribute__((neon_polyvector_type(4))) poly16_t poly16x4_t;
+typedef __attribute__((neon_polyvector_type(8))) poly16_t poly16x8_t;
+
+// CHECK: 10__Int8x8_t
+void f1(int8x8_t) {}
+// CHECK: 11__Int16x4_t
+void f2(int16x4_t) {}
+// CHECK: 11__Int32x2_t
+void f3(int32x2_t) {}
+// CHECK: 11__Uint8x8_t
+void f4(uint8x8_t) {}
+// CHECK: 12__Uint16x4_t
+void f5(uint16x4_t) {}
+// CHECK: 13__Float16x4_t
+void f6(float16x4_t) {}
+// CHECK: 13__Float16x8_t
+void f7(float16x8_t) {}
+// CHECK: 12__Uint32x2_t
+void f8(uint32x2_t) {}
+// CHECK: 13__Float32x2_t
+void f9(float32x2_t) {}
+// CHECK: 13__Float32x4_t
+void f10(float32x4_t) {}
+// CHECK: 11__Poly8x8_t
+void f11(poly8x8_t v) {}
+// CHECK: 12__Poly16x4_t
+void f12(poly16x4_t v) {}
+// CHECK: 12__Poly8x16_t
+void f13(poly8x16_t v) {}
+// CHECK: 12__Poly16x8_t
+void f14(poly16x8_t v) {}
+// CHECK: 11__Int8x16_t
+void f15(int8x16_t) {}
+// CHECK: 11__Int16x8_t
+void f16(int16x8_t) {}
+// CHECK: 11__Int32x4_t
+void f17(int32x4_t) {}
+// CHECK: 12__Uint8x16_t
+void f18(uint8x16_t) {}
+// CHECK: 12__Uint16x8_t
+void f19(uint16x8_t) {}
+// CHECK: 12__Uint32x4_t
+void f20(uint32x4_t) {}
+// CHECK: 11__Int64x2_t
+void f21(int64x2_t) {}
+// CHECK: 12__Uint64x2_t
+void f22(uint64x2_t) {}
+// CHECK: 13__Float64x2_t
+void f23(float64x2_t) {}
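The CHECK fragments above are the distinctive substring of full Itanium
manglings: on AArch64, each NEON vector type is mangled as a vendor builtin
type via a length-prefixed source name. Two complete manglings derived from the
fragments above (shown in comments; a sketch, not additional test content):

    void f1(int8x8_t);      // mangles to _Z2f110__Int8x8_t
    void f23(float64x2_t);  // mangles to _Z3f2313__Float64x2_t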
diff --git a/test/CodeGenCXX/mangle-neon-vectors.cpp b/test/CodeGenCXX/mangle-neon-vectors.cpp
index 3723deb192..793c89803f 100644
--- a/test/CodeGenCXX/mangle-neon-vectors.cpp
+++ b/test/CodeGenCXX/mangle-neon-vectors.cpp
@@ -1,6 +1,7 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple arm-none-linux-gnueabi %s -emit-llvm -o - | FileCheck %s
typedef float float32_t;
+typedef __fp16 float16_t;
typedef signed char poly8_t;
typedef short poly16_t;
typedef unsigned long long uint64_t;
@@ -11,8 +12,10 @@ typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
-typedef __attribute__((neon_polyvector_type(16))) poly8_t poly8x16_t;
-typedef __attribute__((neon_polyvector_type(8))) poly16_t poly16x8_t;
+typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
+typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
+typedef __attribute__((neon_polyvector_type(16))) poly8_t poly8x16_t;
+typedef __attribute__((neon_polyvector_type(8))) poly16_t poly16x8_t;
// CHECK: 16__simd64_int32_t
void f1(int32x2_t v) { }
@@ -26,7 +29,11 @@ void f4(uint64x2_t v) { }
void f5(float32x2_t v) { }
// CHECK: 19__simd128_float32_t
void f6(float32x4_t v) { }
+// CHECK: 18__simd64_float16_t
+void f7(float16x4_t v) {}
+// CHECK: 19__simd128_float16_t
+void f8(float16x8_t v) {}
// CHECK: 17__simd128_poly8_t
-void f7(poly8x16_t v) { }
+void f9(poly8x16_t v) {}
// CHECK: 18__simd128_poly16_t
-void f8(poly16x8_t v) { }
+void f10(poly16x8_t v) {}
diff --git a/test/Preprocessor/aarch64-target-features.c b/test/Preprocessor/aarch64-target-features.c
index 8bb8427c0d..25bdb71bc3 100644
--- a/test/Preprocessor/aarch64-target-features.c
+++ b/test/Preprocessor/aarch64-target-features.c
@@ -30,3 +30,6 @@
// RUN: %clang -target aarch64-none-linux-gnu -fshort-enums -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SHORTENUMS %s
// CHECK-SHORTENUMS: __ARM_SIZEOF_MINIMAL_ENUM 1
+// RUN: %clang -target aarch64-none-linux-gnu -mfpu=neon -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-NEON %s
+// CHECK-NEON: __AARCH_ADVSIMD_FP
+// CHECK-NEON: __AARCH_FEATURE_ADVSIMD
diff --git a/test/Sema/aarch64-neon-vector-types.c b/test/Sema/aarch64-neon-vector-types.c
new file mode 100644
index 0000000000..f4d58ffd09
--- /dev/null
+++ b/test/Sema/aarch64-neon-vector-types.c
@@ -0,0 +1,34 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 %s -triple aarch64-none-linux-gnu -fsyntax-only -verify
+
+typedef float float32_t;
+typedef unsigned char poly8_t;
+typedef unsigned short poly16_t;
+typedef unsigned long long uint64_t;
+
+// Define some valid Neon types.
+typedef __attribute__((neon_vector_type(2))) int int32x2_t;
+typedef __attribute__((neon_vector_type(4))) int int32x4_t;
+typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
+typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
+typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
+typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
+typedef __attribute__((neon_polyvector_type(16))) poly8_t poly8x16_t;
+typedef __attribute__((neon_polyvector_type(8))) poly16_t poly16x8_t;
+
+// The attributes must have a single argument.
+typedef __attribute__((neon_vector_type(2, 4))) int only_one_arg; // expected-error{{attribute takes one argument}}
+
+// The number of elements must be an ICE.
+typedef __attribute__((neon_vector_type(2.0))) int non_int_width; // expected-error{{attribute requires an integer constant}}
+
+// Only certain element types are allowed.
+typedef __attribute__((neon_vector_type(2))) double double_elt;
+typedef __attribute__((neon_vector_type(4))) void* ptr_elt; // expected-error{{invalid vector element type}}
+typedef __attribute__((neon_polyvector_type(4))) float32_t bad_poly_elt; // expected-error{{invalid vector element type}}
+struct aggr { signed char c; };
+typedef __attribute__((neon_vector_type(8))) struct aggr aggregate_elt; // expected-error{{invalid vector element type}}
+
+// The total vector size must be 64 or 128 bits.
+typedef __attribute__((neon_vector_type(1))) int int32x1_t; // expected-error{{Neon vector size must be 64 or 128 bits}}
+typedef __attribute__((neon_vector_type(3))) int int32x3_t; // expected-error{{Neon vector size must be 64 or 128 bits}}
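The 64/128-bit rule is just element width times lane count. Two worked cases
(illustrative typedefs, not part of the test above):

    /* 4 lanes x 16 bits = 64 bits:  accepted */
    typedef __attribute__((neon_vector_type(4))) short good64;
    /* 16 lanes x 8 bits = 128 bits: accepted */
    typedef __attribute__((neon_vector_type(16))) char good128;
    /* 3 lanes x 32 bits = 96 bits would be rejected, as int32x3_t shows. */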
diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp
index bb505de95d..411aa7e4ab 100644
--- a/utils/TableGen/NeonEmitter.cpp
+++ b/utils/TableGen/NeonEmitter.cpp
@@ -90,7 +90,8 @@ enum OpKind {
OpReinterpret,
OpAbdl,
OpAba,
- OpAbal
+ OpAbal,
+ OpDiv
};
enum ClassKind {
@@ -127,7 +128,8 @@ public:
Poly8,
Poly16,
Float16,
- Float32
+ Float32,
+ Float64
};
NeonTypeFlags(unsigned F) : Flags(F) {}
@@ -205,6 +207,7 @@ public:
OpMap["OP_ABDL"] = OpAbdl;
OpMap["OP_ABA"] = OpAba;
OpMap["OP_ABAL"] = OpAbal;
+ OpMap["OP_DIV"] = OpDiv;
Record *SI = R.getClass("SInst");
Record *II = R.getClass("IInst");
@@ -235,7 +238,18 @@ public:
void runTests(raw_ostream &o);
private:
- void emitIntrinsic(raw_ostream &OS, Record *R);
+ void emitIntrinsic(raw_ostream &OS, Record *R,
+ StringMap<ClassKind> &EmittedMap);
+ void genBuiltinsDef(raw_ostream &OS, StringMap<ClassKind> &A64IntrinsicMap,
+ bool isA64GenBuiltinDef);
+ void genOverloadTypeCheckCode(raw_ostream &OS,
+ StringMap<ClassKind> &A64IntrinsicMap,
+ bool isA64TypeCheck);
+ void genIntrinsicRangeCheckCode(raw_ostream &OS,
+ StringMap<ClassKind> &A64IntrinsicMap,
+ bool isA64RangeCheck);
+ void genTargetTest(raw_ostream &OS, StringMap<OpKind> &EmittedMap,
+ bool isA64TestGen);
};
} // end anonymous namespace
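Each of the new private generators declared above takes the same trailing bool
selecting the target and, as runHeader() further down in this patch shows, is
simply invoked twice. A condensed sketch of the call pattern:

    // First pass emits the ARM section, second pass the AArch64 section.
    // A64IntrinsicMap lets the AArch64 pass skip ARM intrinsics that
    // AArch64 redefines with new types.
    genBuiltinsDef(OS, A64IntrinsicMap, /*isA64GenBuiltinDef=*/false);
    genBuiltinsDef(OS, A64IntrinsicMap, /*isA64GenBuiltinDef=*/true);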
@@ -259,6 +273,7 @@ static void ParseTypes(Record *r, std::string &s,
case 'l':
case 'h':
case 'f':
+ case 'd':
break;
default:
PrintFatalError(r->getLoc(),
@@ -347,6 +362,8 @@ static char ModType(const char mod, char type, bool &quad, bool &poly,
poly = false;
if (type == 'f')
type = 'i';
+ if (type == 'd')
+ type = 'l';
break;
case 'x':
usgn = false;
@@ -470,6 +487,13 @@ static std::string TypeString(const char mod, StringRef typestr) {
break;
s += quad ? "x4" : "x2";
break;
+ case 'd':
+ s += "float64";
+ if (scal)
+ break;
+ s += quad ? "x2" : "x1";
+ break;
+
default:
PrintFatalError("unhandled type!");
}
@@ -647,6 +671,18 @@ static void InstructionTypeCode(const StringRef &typeStr,
default: break;
}
break;
+ case 'd':
+ switch (ck) {
+ case ClassS:
+ case ClassI:
+ typeCode += "f64";
+ break;
+ case ClassW:
+ PrintFatalError("unhandled type!");
+ default:
+ break;
+ }
+ break;
default:
PrintFatalError("unhandled type!");
}
@@ -1252,6 +1288,9 @@ static unsigned GetNumElements(StringRef typestr, bool &quad) {
case 'l': nElts = 1; break;
case 'h': nElts = 4; break;
case 'f': nElts = 2; break;
+ case 'd':
+ nElts = 1;
+ break;
default:
PrintFatalError("unhandled type!");
}
@@ -1488,6 +1527,9 @@ static std::string GenOpString(OpKind op, const std::string &proto,
}
break;
}
+ case OpDiv:
+ s += "__a / __b;";
+ break;
default:
PrintFatalError("unknown OpKind!");
}
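OP_DIV therefore lowers to a plain division on the vector-extension types in
the generated header. For a 2 x f64 intrinsic built on it, the emitted
definition is roughly of this shape (a sketch; the intrinsic name is
illustrative, and the __ai attribute macro is defined elsewhere in the
generated arm_neon.h):

    __ai float64x2_t vdivq_f64(float64x2_t __a, float64x2_t __b) {
      return __a / __b; /* element-wise division via the vector extension */
    }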
@@ -1533,6 +1575,9 @@ static unsigned GetNeonEnum(const std::string &proto, StringRef typestr) {
case 'f':
ET = NeonTypeFlags::Float32;
break;
+ case 'd':
+ ET = NeonTypeFlags::Float64;
+ break;
default:
PrintFatalError("unhandled type!");
}
@@ -1776,7 +1821,7 @@ void NeonEmitter::run(raw_ostream &OS) {
OS << "#ifndef __ARM_NEON_H\n";
OS << "#define __ARM_NEON_H\n\n";
- OS << "#ifndef __ARM_NEON__\n";
+ OS << "#if !defined(__ARM_NEON__) && !defined(__AARCH_FEATURE_ADVSIMD)\n";
OS << "#error \"NEON support not enabled\"\n";
OS << "#endif\n\n";
@@ -1784,19 +1829,39 @@ void NeonEmitter::run(raw_ostream &OS) {
// Emit NEON-specific scalar typedefs.
OS << "typedef float float32_t;\n";
+ OS << "typedef __fp16 float16_t;\n";
+
+ OS << "#ifdef __aarch64__\n";
+ OS << "typedef double float64_t;\n";
+ OS << "#endif\n\n";
+
+  // For now, the signedness of polynomial types depends on the target.
+ OS << "#ifdef __aarch64__\n";
+ OS << "typedef uint8_t poly8_t;\n";
+ OS << "typedef uint16_t poly16_t;\n";
+ OS << "#else\n";
OS << "typedef int8_t poly8_t;\n";
OS << "typedef int16_t poly16_t;\n";
- OS << "typedef uint16_t float16_t;\n";
+ OS << "#endif\n";
// Emit Neon vector typedefs.
- std::string TypedefTypes("cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfPcQPcPsQPs");
+ std::string TypedefTypes(
+ "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfQdPcQPcPsQPs");
SmallVector<StringRef, 24> TDTypeVec;
ParseTypes(0, TypedefTypes, TDTypeVec);
// Emit vector typedefs.
for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) {
bool dummy, quad = false, poly = false;
- (void) ClassifyType(TDTypeVec[i], quad, poly, dummy);
+ char type = ClassifyType(TDTypeVec[i], quad, poly, dummy);
+ bool isA64 = false;
+
+ if (type == 'd' && quad)
+ isA64 = true;
+
+ if (isA64)
+ OS << "#ifdef __aarch64__\n";
+
if (poly)
OS << "typedef __attribute__((neon_polyvector_type(";
else
@@ -1809,19 +1874,37 @@ void NeonEmitter::run(raw_ostream &OS) {
OS << TypeString('s', TDTypeVec[i]);
OS << " " << TypeString('d', TDTypeVec[i]) << ";\n";
+
+ if (isA64)
+ OS << "#endif\n";
}
OS << "\n";
// Emit struct typedefs.
for (unsigned vi = 2; vi != 5; ++vi) {
for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) {
+ bool dummy, quad = false, poly = false;
+ char type = ClassifyType(TDTypeVec[i], quad, poly, dummy);
+ bool isA64 = false;
+
+ if (type == 'd' && quad)
+ isA64 = true;
+
+ if (isA64)
+ OS << "#ifdef __aarch64__\n";
+
std::string ts = TypeString('d', TDTypeVec[i]);
std::string vs = TypeString('0' + vi, TDTypeVec[i]);
OS << "typedef struct " << vs << " {\n";
OS << " " << ts << " val";
OS << "[" << utostr(vi) << "]";
OS << ";\n} ";
- OS << vs << ";\n\n";
+ OS << vs << ";\n";
+
+ if (isA64)
+ OS << "#endif\n";
+
+ OS << "\n";
}
}
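For the struct-of-vectors types the loop now produces entries such as these
(a sketch; only the 2 x f64 variants get the guard, since type 'd' always
implies quad here):

    typedef struct int8x8x2_t {
      int8x8_t val[2];
    } int8x8x2_t;

    #ifdef __aarch64__
    typedef struct float64x2x2_t {
      float64x2_t val[2];
    } float64x2x2_t;
    #endif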
@@ -1829,30 +1912,58 @@ void NeonEmitter::run(raw_ostream &OS) {
std::vector<Record*> RV = Records.getAllDerivedDefinitions("Inst");
+ StringMap<ClassKind> EmittedMap;
+
// Emit vmovl, vmull and vabd intrinsics first so they can be used by other
// intrinsics. (Some of the saturating multiply instructions are also
// used to implement the corresponding "_lane" variants, but tablegen
// sorts the records into alphabetical order so that the "_lane" variants
// come after the intrinsics they use.)
- emitIntrinsic(OS, Records.getDef("VMOVL"));
- emitIntrinsic(OS, Records.getDef("VMULL"));
- emitIntrinsic(OS, Records.getDef("VABD"));
-
+ emitIntrinsic(OS, Records.getDef("VMOVL"), EmittedMap);
+ emitIntrinsic(OS, Records.getDef("VMULL"), EmittedMap);
+ emitIntrinsic(OS, Records.getDef("VABD"), EmittedMap);
+
+ // ARM intrinsics must be emitted before AArch64 intrinsics to ensure
+ // common intrinsics appear only once in the output stream.
+  // The check for uniqueness is done in emitIntrinsic.
+ // Emit ARM intrinsics.
for (unsigned i = 0, e = RV.size(); i != e; ++i) {
Record *R = RV[i];
- if (R->getName() != "VMOVL" &&
- R->getName() != "VMULL" &&
+
+ // Skip AArch64 intrinsics; they will be emitted at the end.
+ bool isA64 = R->getValueAsBit("isA64");
+ if (isA64)
+ continue;
+
+ if (R->getName() != "VMOVL" && R->getName() != "VMULL" &&
R->getName() != "VABD")
- emitIntrinsic(OS, R);
+ emitIntrinsic(OS, R, EmittedMap);
}
+ // Emit AArch64-specific intrinsics.
+ OS << "#ifdef __aarch64__\n";
+
+ for (unsigned i = 0, e = RV.size(); i != e; ++i) {
+ Record *R = RV[i];
+
+ // Skip ARM intrinsics already included above.
+ bool isA64 = R->getValueAsBit("isA64");
+ if (!isA64)
+ continue;
+
+ emitIntrinsic(OS, R, EmittedMap);
+ }
+
+ OS << "#endif\n\n";
+
OS << "#undef __ai\n\n";
OS << "#endif /* __ARM_NEON_H */\n";
}
/// emitIntrinsic - Write out the arm_neon.h header file definitions for the
-/// intrinsics specified by record R.
-void NeonEmitter::emitIntrinsic(raw_ostream &OS, Record *R) {
+/// intrinsics specified by record R, checking for intrinsic uniqueness.
+void NeonEmitter::emitIntrinsic(raw_ostream &OS, Record *R,
+ StringMap<ClassKind> &EmittedMap) {
std::string name = R->getValueAsString("Name");
std::string Proto = R->getValueAsString("Prototype");
std::string Types = R->getValueAsString("Types");
@@ -1879,12 +1990,20 @@ void NeonEmitter::emitIntrinsic(raw_ostream &OS, Record *R) {
(void)ClassifyType(TypeVec[srcti], inQuad, dummy, dummy);
if (srcti == ti || inQuad != outQuad)
continue;
- OS << GenIntrinsic(name, Proto, TypeVec[ti], TypeVec[srcti],
- OpCast, ClassS);
+ std::string s = GenIntrinsic(name, Proto, TypeVec[ti], TypeVec[srcti],
+ OpCast, ClassS);
+ if (EmittedMap.count(s))
+ continue;
+ EmittedMap[s] = ClassS;
+ OS << s;
}
} else {
- OS << GenIntrinsic(name, Proto, TypeVec[ti], TypeVec[ti],
- kind, classKind);
+ std::string s =
+ GenIntrinsic(name, Proto, TypeVec[ti], TypeVec[ti], kind, classKind);
+ if (EmittedMap.count(s))
+ continue;
+ EmittedMap[s] = classKind;
+ OS << s;
}
}
OS << "\n";
@@ -1912,56 +2031,151 @@ static unsigned RangeFromType(const char mod, StringRef typestr) {
}
}
-/// runHeader - Emit a file with sections defining:
-/// 1. the NEON section of BuiltinsARM.def.
-/// 2. the SemaChecking code for the type overload checking.
-/// 3. the SemaChecking code for validation of intrinsic immediate arguments.
-void NeonEmitter::runHeader(raw_ostream &OS) {
- std::vector<Record*> RV = Records.getAllDerivedDefinitions("Inst");
-
+/// Generate the ARM and AArch64 intrinsic range checking code for
+/// shift/lane immediates, checking for unique declarations.
+void
+NeonEmitter::genIntrinsicRangeCheckCode(raw_ostream &OS,
+ StringMap<ClassKind> &A64IntrinsicMap,
+ bool isA64RangeCheck) {
+ std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
StringMap<OpKind> EmittedMap;
- // Generate BuiltinsARM.def for NEON
- OS << "#ifdef GET_NEON_BUILTINS\n";
+ // Generate the intrinsic range checking code for shift/lane immediates.
+ if (isA64RangeCheck)
+ OS << "#ifdef GET_NEON_AARCH64_IMMEDIATE_CHECK\n";
+ else
+ OS << "#ifdef GET_NEON_IMMEDIATE_CHECK\n";
+
for (unsigned i = 0, e = RV.size(); i != e; ++i) {
Record *R = RV[i];
+
OpKind k = OpMap[R->getValueAsDef("Operand")->getName()];
if (k != OpNone)
continue;
+ std::string name = R->getValueAsString("Name");
std::string Proto = R->getValueAsString("Prototype");
+ std::string Types = R->getValueAsString("Types");
// Functions with 'a' (the splat code) in the type prototype should not get
// their own builtin as they use the non-splat variant.
if (Proto.find('a') != std::string::npos)
continue;
- std::string Types = R->getValueAsString("Types");
+ // Functions which do not have an immediate do not need to have range
+ // checking code emitted.
+ size_t immPos = Proto.find('i');
+ if (immPos == std::string::npos)
+ continue;
+
SmallVector<StringRef, 16> TypeVec;
ParseTypes(R, Types, TypeVec);
if (R->getSuperClasses().size() < 2)
PrintFatalError(R->getLoc(), "Builtin has no class kind");
- std::string name = R->getValueAsString("Name");
ClassKind ck = ClassMap[R->getSuperClasses()[1]];
+ // Do not include AArch64 range checks if not generating code for AArch64.
+ bool isA64 = R->getValueAsBit("isA64");
+ if (!isA64RangeCheck && isA64)
+ continue;
+
+ // Include ARM range checks in AArch64 but only if ARM intrinsics are not
+ // redefined by AArch64 to handle new types.
+ if (isA64RangeCheck && !isA64 && A64IntrinsicMap.count(name)) {
+ ClassKind &A64CK = A64IntrinsicMap[name];
+ if (A64CK == ck && ck != ClassNone)
+ continue;
+ }
+
for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
- // Generate the BuiltinsARM.def declaration for this builtin, ensuring
- // that each unique BUILTIN() macro appears only once in the output
- // stream.
- std::string bd = GenBuiltinDef(name, Proto, TypeVec[ti], ck);
- if (EmittedMap.count(bd))
+ std::string namestr, shiftstr, rangestr;
+
+ if (R->getValueAsBit("isVCVT_N")) {
+ // VCVT between floating- and fixed-point values takes an immediate
+ // in the range 1 to 32.
+ ck = ClassB;
+ rangestr = "l = 1; u = 31"; // upper bound = l + u
+ } else if (Proto.find('s') == std::string::npos) {
+ // Builtins which are overloaded by type will need to have their upper
+ // bound computed at Sema time based on the type constant.
+ ck = ClassB;
+ if (R->getValueAsBit("isShift")) {
+ shiftstr = ", true";
+
+ // Right shifts have an 'r' in the name, left shifts do not.
+ if (name.find('r') != std::string::npos)
+ rangestr = "l = 1; ";
+ }
+ rangestr += "u = RFT(TV" + shiftstr + ")";
+ } else {
+ // The immediate generally refers to a lane in the preceding argument.
+ assert(immPos > 0 && "unexpected immediate operand");
+ rangestr =
+ "u = " + utostr(RangeFromType(Proto[immPos - 1], TypeVec[ti]));
+ }
+ // Make sure cases appear only once by uniquing them in a string map.
+ namestr = MangleName(name, TypeVec[ti], ck);
+ if (EmittedMap.count(namestr))
continue;
+ EmittedMap[namestr] = OpNone;
- EmittedMap[bd] = OpNone;
- OS << bd << "\n";
+ // Calculate the index of the immediate that should be range checked.
+ unsigned immidx = 0;
+
+ // Builtins that return a struct of multiple vectors have an extra
+ // leading arg for the struct return.
+ if (Proto[0] >= '2' && Proto[0] <= '4')
+ ++immidx;
+
+ // Add one to the index for each argument until we reach the immediate
+ // to be checked. Structs of vectors are passed as multiple arguments.
+ for (unsigned ii = 1, ie = Proto.size(); ii != ie; ++ii) {
+ switch (Proto[ii]) {
+ default:
+ immidx += 1;
+ break;
+ case '2':
+ immidx += 2;
+ break;
+ case '3':
+ immidx += 3;
+ break;
+ case '4':
+ immidx += 4;
+ break;
+ case 'i':
+ ie = ii + 1;
+ break;
+ }
+ }
+ if (isA64RangeCheck)
+ OS << "case AArch64::BI__builtin_neon_";
+ else
+ OS << "case ARM::BI__builtin_neon_";
+ OS << MangleName(name, TypeVec[ti], ck) << ": i = " << immidx << "; "
+ << rangestr << "; break;\n";
}
}
OS << "#endif\n\n";
+}
+
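The emitted cases feed Sema's immediate-range validation. For a type-overloaded
right shift such as vshr_n, the ARM pass would emit roughly the following
(illustrative; the exact case list depends on the .td definitions):

    // i = argument index of the immediate, l/u = allowed range;
    // RFT(TV, true) computes the upper bound from the type constant.
    case ARM::BI__builtin_neon_vshr_n_v: i = 1; l = 1; u = RFT(TV, true); break;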
+/// Generate the ARM and AArch64 overloaded type checking code for
+/// SemaChecking.cpp, checking for unique builtin declarations.
+void
+NeonEmitter::genOverloadTypeCheckCode(raw_ostream &OS,
+ StringMap<ClassKind> &A64IntrinsicMap,
+ bool isA64TypeCheck) {
+ std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
+ StringMap<OpKind> EmittedMap;
// Generate the overloaded type checking code for SemaChecking.cpp
- OS << "#ifdef GET_NEON_OVERLOAD_CHECK\n";
+ if (isA64TypeCheck)
+ OS << "#ifdef GET_NEON_AARCH64_OVERLOAD_CHECK\n";
+ else
+ OS << "#ifdef GET_NEON_OVERLOAD_CHECK\n";
+
for (unsigned i = 0, e = RV.size(); i != e; ++i) {
Record *R = RV[i];
OpKind k = OpMap[R->getValueAsDef("Operand")->getName()];
@@ -1988,6 +2202,21 @@ void NeonEmitter::runHeader(raw_ostream &OS) {
if (R->getSuperClasses().size() < 2)
PrintFatalError(R->getLoc(), "Builtin has no class kind");
+ // Do not include AArch64 type checks if not generating code for AArch64.
+ bool isA64 = R->getValueAsBit("isA64");
+ if (!isA64TypeCheck && isA64)
+ continue;
+
+  // Include ARM type checks in AArch64, but only if the ARM intrinsics are
+  // not redefined in AArch64 to handle new types; e.g. "vabd" is an SInst
+  // redefined in AArch64 to handle an additional 2 x f64 type.
+ ClassKind ck = ClassMap[R->getSuperClasses()[1]];
+ if (isA64TypeCheck && !isA64 && A64IntrinsicMap.count(name)) {
+ ClassKind &A64CK = A64IntrinsicMap[name];
+ if (A64CK == ck && ck != ClassNone)
+ continue;
+ }
+
int si = -1, qi = -1;
uint64_t mask = 0, qmask = 0;
for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
@@ -2035,9 +2264,12 @@ void NeonEmitter::runHeader(raw_ostream &OS) {
}
if (mask) {
- OS << "case ARM::BI__builtin_neon_"
- << MangleName(name, TypeVec[si], ClassB)
- << ": mask = " << "0x" << utohexstr(mask) << "ULL";
+ if (isA64TypeCheck)
+ OS << "case AArch64::BI__builtin_neon_";
+ else
+ OS << "case ARM::BI__builtin_neon_";
+ OS << MangleName(name, TypeVec[si], ClassB) << ": mask = "
+ << "0x" << utohexstr(mask) << "ULL";
if (PtrArgNum >= 0)
OS << "; PtrArgNum = " << PtrArgNum;
if (HasConstPtr)
@@ -2045,9 +2277,12 @@ void NeonEmitter::runHeader(raw_ostream &OS) {
OS << "; break;\n";
}
if (qmask) {
- OS << "case ARM::BI__builtin_neon_"
- << MangleName(name, TypeVec[qi], ClassB)
- << ": mask = " << "0x" << utohexstr(qmask) << "ULL";
+ if (isA64TypeCheck)
+ OS << "case AArch64::BI__builtin_neon_";
+ else
+ OS << "case ARM::BI__builtin_neon_";
+ OS << MangleName(name, TypeVec[qi], ClassB) << ": mask = "
+ << "0x" << utohexstr(qmask) << "ULL";
if (PtrArgNum >= 0)
OS << "; PtrArgNum = " << PtrArgNum;
if (HasConstPtr)
@@ -2056,31 +2291,37 @@ void NeonEmitter::runHeader(raw_ostream &OS) {
}
}
OS << "#endif\n\n";
+}
+
+/// genBuiltinsDef: Generate the BuiltinsARM.def and BuiltinsAArch64.def
+/// declarations of builtins, checking for unique builtin declarations.
+void NeonEmitter::genBuiltinsDef(raw_ostream &OS,
+ StringMap<ClassKind> &A64IntrinsicMap,
+ bool isA64GenBuiltinDef) {
+ std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
+ StringMap<OpKind> EmittedMap;
+
+ // Generate BuiltinsARM.def and BuiltinsAArch64.def
+ if (isA64GenBuiltinDef)
+ OS << "#ifdef GET_NEON_AARCH64_BUILTINS\n";
+ else
+ OS << "#ifdef GET_NEON_BUILTINS\n";
- // Generate the intrinsic range checking code for shift/lane immediates.
- OS << "#ifdef GET_NEON_IMMEDIATE_CHECK\n";
for (unsigned i = 0, e = RV.size(); i != e; ++i) {
Record *R = RV[i];
-
OpKind k = OpMap[R->getValueAsDef("Operand")->getName()];
if (k != OpNone)
continue;
- std::string name = R->getValueAsString("Name");
std::string Proto = R->getValueAsString("Prototype");
- std::string Types = R->getValueAsString("Types");
+ std::string name = R->getValueAsString("Name");
// Functions with 'a' (the splat code) in the type prototype should not get
// their own builtin as they use the non-splat variant.
if (Proto.find('a') != std::string::npos)
continue;
- // Functions which do not have an immediate do not need to have range
- // checking code emitted.
- size_t immPos = Proto.find('i');
- if (immPos == std::string::npos)
- continue;
-
+ std::string Types = R->getValueAsString("Types");
SmallVector<StringRef, 16> TypeVec;
ParseTypes(R, Types, TypeVec);
@@ -2089,70 +2330,90 @@ void NeonEmitter::runHeader(raw_ostream &OS) {
ClassKind ck = ClassMap[R->getSuperClasses()[1]];
- for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
- std::string namestr, shiftstr, rangestr;
-
- if (R->getValueAsBit("isVCVT_N")) {
- // VCVT between floating- and fixed-point values takes an immediate
- // in the range 1 to 32.
- ck = ClassB;
- rangestr = "l = 1; u = 31"; // upper bound = l + u
- } else if (Proto.find('s') == std::string::npos) {
- // Builtins which are overloaded by type will need to have their upper
- // bound computed at Sema time based on the type constant.
- ck = ClassB;
- if (R->getValueAsBit("isShift")) {
- shiftstr = ", true";
+    // Do not include AArch64 BUILTIN() macros if not generating
+    // code for AArch64.
+ bool isA64 = R->getValueAsBit("isA64");
+ if (!isA64GenBuiltinDef && isA64)
+ continue;
- // Right shifts have an 'r' in the name, left shifts do not.
- if (name.find('r') != std::string::npos)
- rangestr = "l = 1; ";
- }
- rangestr += "u = RFT(TV" + shiftstr + ")";
- } else {
- // The immediate generally refers to a lane in the preceding argument.
- assert(immPos > 0 && "unexpected immediate operand");
- rangestr = "u = " + utostr(RangeFromType(Proto[immPos-1], TypeVec[ti]));
- }
- // Make sure cases appear only once by uniquing them in a string map.
- namestr = MangleName(name, TypeVec[ti], ck);
- if (EmittedMap.count(namestr))
+    // Include ARM BUILTIN() macros in AArch64, but only if the ARM intrinsics
+    // are not redefined in AArch64 to handle new types; e.g. "vabd" is an
+    // SInst redefined in AArch64 to handle an additional 2 x f64 type.
+ if (isA64GenBuiltinDef && !isA64 && A64IntrinsicMap.count(name)) {
+ ClassKind &A64CK = A64IntrinsicMap[name];
+ if (A64CK == ck && ck != ClassNone)
continue;
- EmittedMap[namestr] = OpNone;
-
- // Calculate the index of the immediate that should be range checked.
- unsigned immidx = 0;
+ }
- // Builtins that return a struct of multiple vectors have an extra
- // leading arg for the struct return.
- if (Proto[0] >= '2' && Proto[0] <= '4')
- ++immidx;
+ for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
+ // Generate the declaration for this builtin, ensuring
+ // that each unique BUILTIN() macro appears only once in the output
+ // stream.
+ std::string bd = GenBuiltinDef(name, Proto, TypeVec[ti], ck);
+ if (EmittedMap.count(bd))
+ continue;
- // Add one to the index for each argument until we reach the immediate
- // to be checked. Structs of vectors are passed as multiple arguments.
- for (unsigned ii = 1, ie = Proto.size(); ii != ie; ++ii) {
- switch (Proto[ii]) {
- default: immidx += 1; break;
- case '2': immidx += 2; break;
- case '3': immidx += 3; break;
- case '4': immidx += 4; break;
- case 'i': ie = ii + 1; break;
- }
- }
- OS << "case ARM::BI__builtin_neon_" << MangleName(name, TypeVec[ti], ck)
- << ": i = " << immidx << "; " << rangestr << "; break;\n";
+ EmittedMap[bd] = OpNone;
+ OS << bd << "\n";
}
}
OS << "#endif\n\n";
}
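Each unique BUILTIN() line lands in the .def section selected by the pass;
schematically (builtin name illustrative; the middle argument, elided here as
"...", is the encoded prototype string produced by GenBuiltinDef):

    // Under GET_NEON_AARCH64_BUILTINS in BuiltinsAArch64.def:
    BUILTIN(__builtin_neon_vpaddq_v, "...", "n")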
+/// runHeader - Emit a file with sections defining:
+/// 1. the NEON section of BuiltinsARM.def and BuiltinsAArch64.def.
+/// 2. the SemaChecking code for the type overload checking.
+/// 3. the SemaChecking code for validation of intrinsic immediate arguments.
+void NeonEmitter::runHeader(raw_ostream &OS) {
+ std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
+
+  // Build a map of AArch64 intrinsics to be used in uniqueness checks.
+ StringMap<ClassKind> A64IntrinsicMap;
+ for (unsigned i = 0, e = RV.size(); i != e; ++i) {
+ Record *R = RV[i];
+
+ bool isA64 = R->getValueAsBit("isA64");
+ if (!isA64)
+ continue;
+
+ ClassKind CK = ClassNone;
+ if (R->getSuperClasses().size() >= 2)
+ CK = ClassMap[R->getSuperClasses()[1]];
+
+ std::string Name = R->getValueAsString("Name");
+ if (A64IntrinsicMap.count(Name))
+ continue;
+ A64IntrinsicMap[Name] = CK;
+ }
+
+ // Generate BuiltinsARM.def for ARM
+ genBuiltinsDef(OS, A64IntrinsicMap, false);
+
+ // Generate BuiltinsAArch64.def for AArch64
+ genBuiltinsDef(OS, A64IntrinsicMap, true);
+
+ // Generate ARM overloaded type checking code for SemaChecking.cpp
+ genOverloadTypeCheckCode(OS, A64IntrinsicMap, false);
+
+ // Generate AArch64 overloaded type checking code for SemaChecking.cpp
+ genOverloadTypeCheckCode(OS, A64IntrinsicMap, true);
+
+ // Generate ARM range checking code for shift/lane immediates.
+ genIntrinsicRangeCheckCode(OS, A64IntrinsicMap, false);
+
+ // Generate the AArch64 range checking code for shift/lane immediates.
+ genIntrinsicRangeCheckCode(OS, A64IntrinsicMap, true);
+}
+
/// GenTest - Write out a test for the intrinsic specified by the name and
/// type strings, including the embedded patterns for FileCheck to match.
static std::string GenTest(const std::string &name,
const std::string &proto,
StringRef outTypeStr, StringRef inTypeStr,
bool isShift, bool isHiddenLOp,
- ClassKind ck, const std::string &InstName) {
+ ClassKind ck, const std::string &InstName,
+ bool isA64,
+                           std::string &testFuncProto) {
assert(!proto.empty() && "");
std::string s;
@@ -2167,12 +2428,17 @@ static std::string GenTest(const std::string &name,
mangledName = MangleName(mangledName, inTypeNoQuad, ClassS);
}
+  // TODO: GenerateChecksForIntrinsic does not generate CHECK patterns
+  // for AArch64 instructions yet.
std::vector<std::string> FileCheckPatterns;
- GenerateChecksForIntrinsic(name, proto, outTypeStr, inTypeStr, ck, InstName,
- isHiddenLOp, FileCheckPatterns);
+ if (!isA64) {
+ GenerateChecksForIntrinsic(name, proto, outTypeStr, inTypeStr, ck, InstName,
+ isHiddenLOp, FileCheckPatterns);
+    s += "// CHECK_ARM: test_" + mangledName + "\n";
+ }
+ s += "// CHECK_AARCH64: test_" + mangledName + "\n";
// Emit the FileCheck patterns.
- s += "// CHECK: test_" + mangledName + "\n";
// If for any reason we do not want to emit a check, mangledInst
// will be the empty string.
if (FileCheckPatterns.size()) {
@@ -2180,23 +2446,27 @@ static std::string GenTest(const std::string &name,
e = FileCheckPatterns.end();
i != e;
++i) {
- s += "// CHECK: " + *i + "\n";
+ s += "// CHECK_ARM: " + *i + "\n";
}
}
// Emit the start of the test function.
- s += TypeString(proto[0], outTypeStr) + " test_" + mangledName + "(";
+
+ testFuncProto = TypeString(proto[0], outTypeStr) + " test_" + mangledName + "(";
char arg = 'a';
std::string comma;
for (unsigned i = 1, e = proto.size(); i != e; ++i, ++arg) {
// Do not create arguments for values that must be immediate constants.
if (proto[i] == 'i')
continue;
- s += comma + TypeString(proto[i], inTypeStr) + " ";
- s.push_back(arg);
+ testFuncProto += comma + TypeString(proto[i], inTypeStr) + " ";
+ testFuncProto.push_back(arg);
comma = ", ";
}
- s += ") {\n ";
+ testFuncProto += ")";
+
+  s += testFuncProto;
+  s += " {\n  ";
if (proto[0] != 'v')
s += "return ";
@@ -2220,20 +2490,14 @@ static std::string GenTest(const std::string &name,
return s;
}
-/// runTests - Write out a complete set of tests for all of the Neon
-/// intrinsics.
-void NeonEmitter::runTests(raw_ostream &OS) {
- OS <<
- "// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\\\n"
- "// RUN: -target-cpu swift -ffreestanding -Os -S -o - %s\\\n"
- "// RUN: | FileCheck %s\n"
- "\n"
- "// REQUIRES: long_tests\n"
- "\n"
- "#include <arm_neon.h>\n"
- "\n";
+/// Write out all intrinsic tests for the specified target, checking
+/// for intrinsic test uniqueness.
+void NeonEmitter::genTargetTest(raw_ostream &OS, StringMap<OpKind> &EmittedMap,
+ bool isA64GenTest) {
+ if (isA64GenTest)
+ OS << "#ifdef __aarch64__\n";
- std::vector<Record*> RV = Records.getAllDerivedDefinitions("Inst");
+ std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
for (unsigned i = 0, e = RV.size(); i != e; ++i) {
Record *R = RV[i];
std::string name = R->getValueAsString("Name");
@@ -2242,6 +2506,12 @@ void NeonEmitter::runTests(raw_ostream &OS) {
bool isShift = R->getValueAsBit("isShift");
std::string InstName = R->getValueAsString("InstName");
bool isHiddenLOp = R->getValueAsBit("isHiddenLInst");
+ bool isA64 = R->getValueAsBit("isA64");
+
+ // do not include AArch64 intrinsic test if not generating
+ // code for AArch64
+ if (!isA64GenTest && isA64)
+ continue;
SmallVector<StringRef, 16> TypeVec;
ParseTypes(R, Types, TypeVec);
@@ -2261,16 +2531,56 @@ void NeonEmitter::runTests(raw_ostream &OS) {
(void)ClassifyType(TypeVec[srcti], inQuad, dummy, dummy);
if (srcti == ti || inQuad != outQuad)
continue;
- OS << GenTest(name, Proto, TypeVec[ti], TypeVec[srcti],
- isShift, isHiddenLOp, ck, InstName);
+ std::string testFuncProto;
+ std::string s = GenTest(name, Proto, TypeVec[ti], TypeVec[srcti],
+ isShift, isHiddenLOp, ck, InstName, isA64,
+ testFuncProto);
+ if (EmittedMap.count(testFuncProto))
+ continue;
+ EmittedMap[testFuncProto] = kind;
+ OS << s << "\n";
}
} else {
- OS << GenTest(name, Proto, TypeVec[ti], TypeVec[ti],
- isShift, isHiddenLOp, ck, InstName);
+ std::string testFuncProto;
+ std::string s = GenTest(name, Proto, TypeVec[ti], TypeVec[ti], isShift,
+ isHiddenLOp, ck, InstName, isA64, testFuncProto);
+ if (EmittedMap.count(testFuncProto))
+ continue;
+ EmittedMap[testFuncProto] = kind;
+ OS << s << "\n";
}
}
- OS << "\n";
}
+
+ if (isA64GenTest)
+ OS << "#endif\n";
+}
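For an AArch64-only intrinsic, genTargetTest() therefore emits entries of this
shape into the guarded region (a sketch; CHECK_ARM lines and instruction
patterns are only produced for ARM intrinsics):

    #ifdef __aarch64__
    // CHECK_AARCH64: test_vpaddq_f64
    float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) {
      return vpaddq_f64(a, b);
    }
    #endif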
+/// runTests - Write out a complete set of tests for all of the Neon
+/// intrinsics.
+void NeonEmitter::runTests(raw_ostream &OS) {
+ OS << "// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi "
+ "apcs-gnu\\\n"
+ "// RUN: -target-cpu swift -ffreestanding -Os -S -o - %s\\\n"
+ "// RUN: | FileCheck %s -check-prefix=CHECK_ARM\n"
+ "\n"
+ "// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \\\n"
+        "// RUN: -target-feature +neon -ffreestanding -S -o - %s \\\n"
+ "// RUN: | FileCheck %s -check-prefix=CHECK_AARCH64\n"
+ "\n"
+ "// REQUIRES: long_tests\n"
+ "\n"
+ "#include <arm_neon.h>\n"
+ "\n";
+
+ // ARM tests must be emitted before AArch64 tests to ensure
+ // tests for intrinsics that are common to ARM and AArch64
+ // appear only once in the output stream.
+ // The check for uniqueness is done in genTargetTest.
+ StringMap<OpKind> EmittedMap;
+
+ genTargetTest(OS, EmittedMap, false);
+
+ genTargetTest(OS, EmittedMap, true);
}
namespace clang {