author     Pavel Iliin <Pavel.Iliin@arm.com>          2022-10-19 14:07:26 +0100
committer  Pirama Arumuga Nainar <pirama@google.com>  2022-11-30 05:04:27 +0000
commit     91fdeab43d29b1f228113859da8ee238bc8c2f16 (patch)
tree       28d683a8957973b0291ba54157248813208d755a
parent     ecc27f8f2bfef88a9a579b39a64898326e36bfc0 (diff)
[patches] Cherry pick CLs for: truncating buildvectors into truncates,
support for FMA intrinsics to shouldSinkOperands, vector FP cmp, and
maximum VF with shouldMaximizeVectorBandwidth  (tag: llvm-r450784)

Change-Id: If19bb09ce7675c18bb07586d215a61c82e4dbac6
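
For orientation, the first change on that list rewrites IR that assembles a
narrow vector lane by lane from truncated extracts into one concat-plus-truncate.
A minimal sketch of the equivalent form (illustrative function name; the full
per-lane pattern appears verbatim in the neon-extracttruncate.ll test added
below):

; The extract/trunc/insert chain over two <4 x i16> inputs is equivalent to
; this, which AArch64 then selects as a single lane-narrowing shuffle:
;   uzp1 v0.8b, v0.8b, v1.8b
define <8 x i8> @trunc_concat(<4 x i16> %a, <4 x i16> %b) {
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t = trunc <8 x i16> %c to <8 x i8>
  ret <8 x i8> %t
}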
-rw-r--r--   patches/PATCHES.json                                                  140
-rw-r--r--   patches/cherry/786c687810a5e3db4c64312018de25c65527c40c.patch         245
-rw-r--r--   patches/cherry/7a605ab7bfbc681c34335684f45b7da32d495db1.patch         727
-rw-r--r--   patches/cherry/84ccd015e7dd3ca57c4a9366ecd2b9a7430f505d.patch         509
-rw-r--r--   patches/cherry/86617256864ebcbda03b6ce843deeb6a41a85800.patch         206
-rw-r--r--   [-rwxr-xr-x]  patches/cherry/AArch64-Use-Tbl.patch                      0
-rw-r--r--   patches/cherry/Loop-Vectorizer-shouldMaximizeVectorBandwidth.patch     527
-rw-r--r--   patches/cherry/a8de8cab7006bc885804e8a2c0a6902702521cfe.patch         1910
-rw-r--r--   patches/cherry/a9a012086a917dff367bb63de2d63782b23111fc.patch           72
-rw-r--r--   patches/cherry/bb362d890f0d51c250818711d4a9b0b51cea7bc6.patch         1507
-rw-r--r--   patches/cherry/bf268a05cd9294854ffccc3158c0e673069bed4a.patch          609
-rw-r--r--   patches/cherry/d9633d149022054bdac90bd3d03a240dbdb46f7e.patch          408
12 files changed, 6860 insertions, 0 deletions
diff --git a/patches/PATCHES.json b/patches/PATCHES.json
index 497732f..382e45e 100644
--- a/patches/PATCHES.json
+++ b/patches/PATCHES.json
@@ -1094,5 +1094,145 @@
"from": 450784,
"until": null
}
+ },
+ {
+ "metadata": {
+ "info": [],
+ "title": "[UPSTREAM] [AArch64] Add extra fptoint_sat tests for larger than legal types. NFC"
+ },
+ "platforms": [
+ "android"
+ ],
+ "rel_patch_path": "cherry/bb362d890f0d51c250818711d4a9b0b51cea7bc6.patch",
+ "version_range": {
+ "from": 450784,
+ "until": null
+ }
+ },
+ {
+ "metadata": {
+ "info": [],
+ "title": "[UPSTREAM] [AArch64] Use simd mov to materialize big fp constants"
+ },
+ "platforms": [
+ "android"
+ ],
+ "rel_patch_path": "cherry/7a605ab7bfbc681c34335684f45b7da32d495db1.patch",
+ "version_range": {
+ "from": 450784,
+ "until": null
+ }
+ },
+ {
+ "metadata": {
+ "info": [],
+ "title": "[UPSTREAM] [AArch64] Some tests to show reconstructing truncates. NFC"
+ },
+ "platforms": [
+ "android"
+ ],
+ "rel_patch_path": "cherry/84ccd015e7dd3ca57c4a9366ecd2b9a7430f505d.patch",
+ "version_range": {
+ "from": 450784,
+ "until": null
+ }
+ },
+ {
+ "metadata": {
+ "info": [],
+ "title": "[UPSTREAM] [AArch64] Turn truncating buildvectors into truncates."
+ },
+ "platforms": [
+ "android"
+ ],
+ "rel_patch_path": "cherry/d9633d149022054bdac90bd3d03a240dbdb46f7e.patch",
+ "version_range": {
+ "from": 450784,
+ "until": null
+ }
+ },
+ {
+ "metadata": {
+ "info": [],
+ "title": "[UPSTREAM] [AArch64] Add tests with free shuffles for indexed fma variants."
+ },
+ "platforms": [
+ "android"
+ ],
+ "rel_patch_path": "cherry/86617256864ebcbda03b6ce843deeb6a41a85800.patch",
+ "version_range": {
+ "from": 450784,
+ "until": null
+ }
+ },
+ {
+ "metadata": {
+ "info": [],
+ "title": "[UPSTREAM] [AArch64] Add additional tests for sinking free shuffles for FMAs."
+ },
+ "platforms": [
+ "android"
+ ],
+ "rel_patch_path": "cherry/a9a012086a917dff367bb63de2d63782b23111fc.patch",
+ "version_range": {
+ "from": 450784,
+ "until": null
+ }
+ },
+ {
+ "metadata": {
+ "info": [],
+ "title": "[UPSTRREAM] [AArch64] Add support for FMA intrinsics to shouldSinkOperands."
+ },
+ "platforms": [
+ "android"
+ ],
+ "rel_patch_path": "cherry/786c687810a5e3db4c64312018de25c65527c40c.patch",
+ "version_range": {
+ "from": 450784,
+ "until": null
+ }
+ },
+ {
+ "metadata": {
+ "info": [],
+ "title": "[UPSTRREAM] [AArch64] Add fcmp fast math tests"
+ },
+ "platforms": [
+ "android"
+ ],
+ "rel_patch_path": "cherry/a8de8cab7006bc885804e8a2c0a6902702521cfe.patch",
+ "version_range": {
+ "from": 450784,
+ "until": null
+ }
+ },
+ {
+ "metadata": {
+ "info": [],
+ "title": "[UPSTRREAM] [PATCH] [AArch64] Emit vector FP cmp when LE is used with fast-math"
+ },
+ "platforms": [
+ "android"
+ ],
+ "rel_patch_path": "cherry/bf268a05cd9294854ffccc3158c0e673069bed4a.patch",
+ "version_range": {
+ "from": 450784,
+ "until": null
+ }
+ },
+ {
+ "metadata": {
+ "info": [],
+ "title": "[MERGED] [UPSTREAM] [AArch64] Set maximum VF with shouldMaximizeVectorBandwidth"
+ },
+ "platforms": [
+ "android"
+ ],
+ "rel_patch_path": "cherry/Loop-Vectorizer-shouldMaximizeVectorBandwidth.patch",
+ "version_range": {
+ "from": 450784,
+ "until": null
+ }
}
]
diff --git a/patches/cherry/786c687810a5e3db4c64312018de25c65527c40c.patch b/patches/cherry/786c687810a5e3db4c64312018de25c65527c40c.patch
new file mode 100644
index 0000000..3e06f7b
--- /dev/null
+++ b/patches/cherry/786c687810a5e3db4c64312018de25c65527c40c.patch
@@ -0,0 +1,245 @@
+From 786c687810a5e3db4c64312018de25c65527c40c Mon Sep 17 00:00:00 2001
+From: Florian Hahn <flo@fhahn.com>
+Date: Fri, 27 May 2022 10:37:02 +0100
+Subject: [PATCH] [AArch64] Add support for FMA intrinsics to
+ shouldSinkOperands.
+
+If the fma operates on a legal vector type, the indexed variants can be
+used, if the second operand is a splat of a valid index.
+
+Reviewed By: dmgreen
+
+Differential Revision: https://reviews.llvm.org/D126234
+---
+ .../Target/AArch64/AArch64ISelLowering.cpp | 6 +-
+ .../AArch64/sink-free-instructions.ll | 127 +++++++++++-------
+ 2 files changed, 81 insertions(+), 52 deletions(-)
+
+diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+index e31a58da0831..d31008496ea4 100644
+--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+@@ -12545,6 +12545,11 @@ bool AArch64TargetLowering::shouldSinkOperands(
+ }
+ LLVM_FALLTHROUGH;
+
++ case Intrinsic::fma:
++ if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
++ !Subtarget->hasFullFP16())
++ return false;
++ LLVM_FALLTHROUGH;
+ case Intrinsic::aarch64_neon_sqdmull:
+ case Intrinsic::aarch64_neon_sqdmulh:
+ case Intrinsic::aarch64_neon_sqrdmulh:
+@@ -12568,7 +12573,6 @@ bool AArch64TargetLowering::shouldSinkOperands(
+ Ops.push_back(&II->getArgOperandUse(0));
+ Ops.push_back(&II->getArgOperandUse(1));
+ return true;
+-
+ default:
+ return false;
+ }
+diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
+index 5d7a26f65784..fc60b119225c 100644
+--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
++++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
+@@ -1,5 +1,6 @@
+ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+-; RUN: opt < %s -codegenprepare -S | FileCheck %s
++; RUN: opt < %s -codegenprepare -S | FileCheck --check-prefixes=CHECK,NOFP16 %s
++; RUN: opt < %s -codegenprepare -S -mattr=+fullfp16 | FileCheck --check-prefixes=CHECK,FULLFP16 %s
+
+ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+ target triple = "aarch64-unknown"
+@@ -498,29 +499,53 @@ if.else:
+ declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
+
+ define <8 x half> @sink_shufflevector_fma_v8f16(i1 %c, <8 x half> %a, <8 x half> %b) {
+-; CHECK-LABEL: @sink_shufflevector_fma_v8f16(
+-; CHECK-NEXT: entry:
+-; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer
+-; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+-; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+-; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+-; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+-; CHECK-NEXT: [[S5:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+-; CHECK-NEXT: [[S6:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+-; CHECK-NEXT: [[S7:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+-; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+-; CHECK: if.then:
+-; CHECK-NEXT: [[R_0:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[S0]], <8 x half> [[B]])
+-; CHECK-NEXT: [[R_1:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_0]], <8 x half> [[S1]], <8 x half> [[B]])
+-; CHECK-NEXT: [[R_2:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_1]], <8 x half> [[S2]], <8 x half> [[B]])
+-; CHECK-NEXT: [[R_3:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_2]], <8 x half> [[S3]], <8 x half> [[B]])
+-; CHECK-NEXT: ret <8 x half> [[R_3]]
+-; CHECK: if.else:
+-; CHECK-NEXT: [[R_4:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[S4]], <8 x half> [[B]])
+-; CHECK-NEXT: [[R_5:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_4]], <8 x half> [[S5]], <8 x half> [[B]])
+-; CHECK-NEXT: [[R_6:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_5]], <8 x half> [[S6]], <8 x half> [[B]])
+-; CHECK-NEXT: [[R_7:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_6]], <8 x half> [[S7]], <8 x half> [[B]])
+-; CHECK-NEXT: ret <8 x half> [[R_7]]
++; NOFP16-LABEL: @sink_shufflevector_fma_v8f16(
++; NOFP16-NEXT: entry:
++; NOFP16-NEXT: [[S0:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer
++; NOFP16-NEXT: [[S1:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
++; NOFP16-NEXT: [[S2:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
++; NOFP16-NEXT: [[S3:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
++; NOFP16-NEXT: [[S4:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
++; NOFP16-NEXT: [[S5:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
++; NOFP16-NEXT: [[S6:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
++; NOFP16-NEXT: [[S7:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
++; NOFP16-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
++; NOFP16: if.then:
++; NOFP16-NEXT: [[R_0:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[S0]], <8 x half> [[B]])
++; NOFP16-NEXT: [[R_1:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_0]], <8 x half> [[S1]], <8 x half> [[B]])
++; NOFP16-NEXT: [[R_2:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_1]], <8 x half> [[S2]], <8 x half> [[B]])
++; NOFP16-NEXT: [[R_3:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_2]], <8 x half> [[S3]], <8 x half> [[B]])
++; NOFP16-NEXT: ret <8 x half> [[R_3]]
++; NOFP16: if.else:
++; NOFP16-NEXT: [[R_4:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[S4]], <8 x half> [[B]])
++; NOFP16-NEXT: [[R_5:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_4]], <8 x half> [[S5]], <8 x half> [[B]])
++; NOFP16-NEXT: [[R_6:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_5]], <8 x half> [[S6]], <8 x half> [[B]])
++; NOFP16-NEXT: [[R_7:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_6]], <8 x half> [[S7]], <8 x half> [[B]])
++; NOFP16-NEXT: ret <8 x half> [[R_7]]
++;
++; FULLFP16-LABEL: @sink_shufflevector_fma_v8f16(
++; FULLFP16-NEXT: entry:
++; FULLFP16-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
++; FULLFP16: if.then:
++; FULLFP16-NEXT: [[TMP0:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer
++; FULLFP16-NEXT: [[R_0:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[TMP0]], <8 x half> [[B]])
++; FULLFP16-NEXT: [[TMP1:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
++; FULLFP16-NEXT: [[R_1:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_0]], <8 x half> [[TMP1]], <8 x half> [[B]])
++; FULLFP16-NEXT: [[TMP2:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
++; FULLFP16-NEXT: [[R_2:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_1]], <8 x half> [[TMP2]], <8 x half> [[B]])
++; FULLFP16-NEXT: [[TMP3:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
++; FULLFP16-NEXT: [[R_3:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_2]], <8 x half> [[TMP3]], <8 x half> [[B]])
++; FULLFP16-NEXT: ret <8 x half> [[R_3]]
++; FULLFP16: if.else:
++; FULLFP16-NEXT: [[TMP4:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
++; FULLFP16-NEXT: [[R_4:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[TMP4]], <8 x half> [[B]])
++; FULLFP16-NEXT: [[TMP5:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
++; FULLFP16-NEXT: [[R_5:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_4]], <8 x half> [[TMP5]], <8 x half> [[B]])
++; FULLFP16-NEXT: [[TMP6:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
++; FULLFP16-NEXT: [[R_6:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_5]], <8 x half> [[TMP6]], <8 x half> [[B]])
++; FULLFP16-NEXT: [[TMP7:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
++; FULLFP16-NEXT: [[R_7:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_6]], <8 x half> [[TMP7]], <8 x half> [[B]])
++; FULLFP16-NEXT: ret <8 x half> [[R_7]]
+ ;
+ entry:
+ %s0 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> zeroinitializer
+@@ -553,18 +578,18 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+ define <4 x float> @sink_shufflevector_fma_v4f32(i1 %c, <8 x float> %a, <4 x float> %b) {
+ ; CHECK-LABEL: @sink_shufflevector_fma_v4f32(
+ ; CHECK-NEXT: entry:
+-; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> zeroinitializer
+-; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+-; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+-; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+ ; CHECK: if.then:
+-; CHECK-NEXT: [[R_0:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[S0]], <4 x float> [[B]])
+-; CHECK-NEXT: [[R_1:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_0]], <4 x float> [[S1]], <4 x float> [[B]])
++; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> zeroinitializer
++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[TMP0]], <4 x float> [[B]])
++; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_0]], <4 x float> [[TMP1]], <4 x float> [[B]])
+ ; CHECK-NEXT: ret <4 x float> [[R_1]]
+ ; CHECK: if.else:
+-; CHECK-NEXT: [[R_2:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[S2]], <4 x float> [[B]])
+-; CHECK-NEXT: [[R_3:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_2]], <4 x float> [[S3]], <4 x float> [[B]])
++; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
++; CHECK-NEXT: [[R_2:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[TMP2]], <4 x float> [[B]])
++; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
++; CHECK-NEXT: [[R_3:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_2]], <4 x float> [[TMP3]], <4 x float> [[B]])
+ ; CHECK-NEXT: ret <4 x float> [[R_3]]
+ ;
+ entry:
+@@ -588,18 +613,18 @@ if.else:
+ define <4 x float> @sink_shufflevector_first_arg_fma_v4f3(i1 %c, <8 x float> %a, <4 x float> %b) {
+ ; CHECK-LABEL: @sink_shufflevector_first_arg_fma_v4f3(
+ ; CHECK-NEXT: entry:
+-; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> zeroinitializer
+-; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+-; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+-; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+ ; CHECK: if.then:
+-; CHECK-NEXT: [[R_0:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S0]], <4 x float> [[B:%.*]], <4 x float> [[B]])
+-; CHECK-NEXT: [[R_1:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S1]], <4 x float> [[R_0]], <4 x float> [[B]])
++; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> zeroinitializer
++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP0]], <4 x float> [[B:%.*]], <4 x float> [[B]])
++; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[R_0]], <4 x float> [[B]])
+ ; CHECK-NEXT: ret <4 x float> [[R_1]]
+ ; CHECK: if.else:
+-; CHECK-NEXT: [[R_2:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S2]], <4 x float> [[B]], <4 x float> [[B]])
+-; CHECK-NEXT: [[R_3:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S3]], <4 x float> [[R_2]], <4 x float> [[B]])
++; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
++; CHECK-NEXT: [[R_2:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP2]], <4 x float> [[B]], <4 x float> [[B]])
++; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
++; CHECK-NEXT: [[R_3:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP3]], <4 x float> [[R_2]], <4 x float> [[B]])
+ ; CHECK-NEXT: ret <4 x float> [[R_3]]
+ ;
+ entry:
+@@ -627,14 +652,14 @@ declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+ define <2 x double> @sink_shufflevector_fma_v2f64(i1 %c, <2 x double> %a, <2 x double> %b) {
+ ; CHECK-LABEL: @sink_shufflevector_fma_v2f64(
+ ; CHECK-NEXT: entry:
+-; CHECK-NEXT: [[S0:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> zeroinitializer
+-; CHECK-NEXT: [[S1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+ ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+ ; CHECK: if.then:
+-; CHECK-NEXT: [[R_0:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B:%.*]], <2 x double> [[S0]], <2 x double> [[B]])
++; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> zeroinitializer
++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B:%.*]], <2 x double> [[TMP0]], <2 x double> [[B]])
+ ; CHECK-NEXT: ret <2 x double> [[R_0]]
+ ; CHECK: if.else:
+-; CHECK-NEXT: [[R_1:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[S1]], <2 x double> [[B]])
++; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[TMP1]], <2 x double> [[B]])
+ ; CHECK-NEXT: ret <2 x double> [[R_1]]
+ ;
+ entry:
+@@ -654,10 +679,10 @@ if.else:
+ define <4 x float> @do_not_sink_out_of_range_shufflevector_fma_v4f32(i1 %c, <8 x float> %a, <4 x float> %b) {
+ ; CHECK-LABEL: @do_not_sink_out_of_range_shufflevector_fma_v4f32(
+ ; CHECK-NEXT: entry:
+-; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+ ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+ ; CHECK: if.then:
+-; CHECK-NEXT: [[R:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[S4]], <4 x float> [[B]])
++; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
++; CHECK-NEXT: [[R:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[TMP0]], <4 x float> [[B]])
+ ; CHECK-NEXT: ret <4 x float> [[R]]
+ ; CHECK: if.else:
+ ; CHECK-NEXT: ret <4 x float> zeroinitializer
+@@ -679,20 +704,20 @@ declare <5 x float> @llvm.fma.v5f32(<5 x float>, <5 x float>, <5 x float>)
+ define <5 x float> @sink_shufflevector_fma_v5f32(i1 %c, <8 x float> %a, <5 x float> %b) {
+ ; CHECK-LABEL: @sink_shufflevector_fma_v5f32(
+ ; CHECK-NEXT: entry:
+-; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <5 x i32> zeroinitializer
+-; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 1, i32 1, i32 1, i32 1, i32 4>
++; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <5 x i32> <i32 1, i32 1, i32 1, i32 1, i32 4>
+ ; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 2, i32 2, i32 2, i32 2, i32 4>
+ ; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 3, i32 3, i32 3, i32 3, i32 4>
+-; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4>
+ ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+ ; CHECK: if.then:
+-; CHECK-NEXT: [[R_0:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[B:%.*]], <5 x float> [[S0]], <5 x float> [[B]])
++; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> zeroinitializer
++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[B:%.*]], <5 x float> [[TMP0]], <5 x float> [[B]])
+ ; CHECK-NEXT: [[R_1:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[R_0]], <5 x float> [[S1]], <5 x float> [[B]])
+ ; CHECK-NEXT: ret <5 x float> [[R_1]]
+ ; CHECK: if.else:
+ ; CHECK-NEXT: [[R_2:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[B]], <5 x float> [[S2]], <5 x float> [[B]])
+ ; CHECK-NEXT: [[R_3:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[R_2]], <5 x float> [[S3]], <5 x float> [[B]])
+-; CHECK-NEXT: [[R_4:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[R_3]], <5 x float> [[S4]], <5 x float> [[B]])
++; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4>
++; CHECK-NEXT: [[R_4:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[R_3]], <5 x float> [[TMP1]], <5 x float> [[B]])
+ ; CHECK-NEXT: ret <5 x float> [[R_4]]
+ ;
+ entry:
+--
+2.34.1
+
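
Stepping outside the diff for a moment: the practical effect of the patch
above is that CodeGenPrepare now duplicates a splat shufflevector into every
block where it feeds an llvm.fma call, so instruction selection can fold the
splat into an indexed FMLA. A minimal sketch of input IR that now gets this
treatment (illustrative function name; the committed coverage is
sink-free-instructions.ll above):

declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)

define <2 x double> @sink_sketch(i1 %c, <2 x double> %a, <2 x double> %b) {
entry:
  ; The splat is defined here but only used under %then.
  %s = shufflevector <2 x double> %a, <2 x double> poison, <2 x i32> zeroinitializer
  br i1 %c, label %then, label %else
then:
  ; With shouldSinkOperands returning true for llvm.fma, CodeGenPrepare
  ; clones %s into this block, and ISel can then pick the indexed form
  ; fmla v0.2d, vN.2d, vM.d[0] instead of materializing a separate dup.
  %r = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %s, <2 x double> %b)
  ret <2 x double> %r
else:
  ret <2 x double> %b
}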
diff --git a/patches/cherry/7a605ab7bfbc681c34335684f45b7da32d495db1.patch b/patches/cherry/7a605ab7bfbc681c34335684f45b7da32d495db1.patch
new file mode 100644
index 0000000..19568db
--- /dev/null
+++ b/patches/cherry/7a605ab7bfbc681c34335684f45b7da32d495db1.patch
@@ -0,0 +1,727 @@
+From 7a605ab7bfbc681c34335684f45b7da32d495db1 Mon Sep 17 00:00:00 2001
+From: zhongyunde <zhongyunde@huawei.com>
+Date: Fri, 4 Mar 2022 22:44:14 +0800
+Subject: [PATCH] [AArch64] Use simd mov to materialize big fp constants
+
+mov w8, #1325400064 + fmov s0, w8 ==> movi v0.2s, 0x4f, lsl 24
+Fix https://github.com/llvm/llvm-project/issues/53651
+
+Reviewed By: dmgreen, fhahn
+
+Differential Revision: https://reviews.llvm.org/D120452
+---
+ .../lib/Target/AArch64/AArch64InstrFormats.td | 14 ++
+ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 8 ++
+ llvm/test/CodeGen/AArch64/fabs.ll | 3 +-
+ llvm/test/CodeGen/AArch64/fcvt-fixed.ll | 136 ++++++++----------
+ llvm/test/CodeGen/AArch64/fpimm.ll | 5 +-
+ .../test/CodeGen/AArch64/fptosi-sat-scalar.ll | 20 ++-
+ .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 76 +++++-----
+ .../CodeGen/AArch64/remat-const-float-simd.ll | 33 +++++
+ .../AArch64/vecreduce-fadd-legalization.ll | 3 +-
+ 9 files changed, 157 insertions(+), 141 deletions(-)
+ create mode 100644 llvm/test/CodeGen/AArch64/remat-const-float-simd.ll
+
+diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+index 659d2a62b8c4..74dccb85a66e 100644
+--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
++++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+@@ -1178,6 +1178,13 @@ def fpimm32XForm : SDNodeXForm<fpimm, [{
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+ }]>;
+
++def fpimm32SIMDModImmType4XForm : SDNodeXForm<fpimm, [{
++ uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType4(N->getValueAPF()
++ .bitcastToAPInt()
++ .getZExtValue());
++ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
++ }]>;
++
+ def fpimm64XForm : SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = AArch64_AM::getFP64Imm(InVal);
+@@ -1199,6 +1206,13 @@ def fpimm32 : Operand<f32>,
+ let ParserMatchClass = FPImmOperand;
+ let PrintMethod = "printFPImmOperand";
+ }
++
++def fpimm32SIMDModImmType4 : FPImmLeaf<f32, [{
++ uint64_t Enc = Imm.bitcastToAPInt().getZExtValue();
++ return Enc != 0 && AArch64_AM::isAdvSIMDModImmType4(Enc << 32 | Enc);
++ }], fpimm32SIMDModImmType4XForm> {
++}
++
+ def fpimm64 : Operand<f64>,
+ FPImmLeaf<f64, [{
+ return AArch64_AM::getFP64Imm(Imm) != -1;
+diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+index 1152f8b20a7b..3b50a2e5ece4 100644
+--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
++++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+@@ -6145,6 +6145,14 @@ def : Pat<(v8i8 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+ let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+ defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
+
++let Predicates = [HasNEON] in {
++ // Using the MOVI to materialize fp constants.
++ def : Pat<(f32 fpimm32SIMDModImmType4:$in),
++ (EXTRACT_SUBREG (MOVIv2i32 (fpimm32SIMDModImmType4XForm f32:$in),
++ (i32 24)),
++ ssub)>;
++}
++
+ def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
+ def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+ def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
+diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll
+index bc6b32770d4c..23bf7a699195 100644
+--- a/llvm/test/CodeGen/AArch64/fabs.ll
++++ b/llvm/test/CodeGen/AArch64/fabs.ll
+@@ -22,9 +22,8 @@ define double @not_fabs(double %x) #0 {
+ define float @still_not_fabs(float %x) #0 {
+ ; CHECK-LABEL: still_not_fabs:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov w8, #-2147483648
++; CHECK-NEXT: movi v1.2s, #128, lsl #24
+ ; CHECK-NEXT: fneg s2, s0
+-; CHECK-NEXT: fmov s1, w8
+ ; CHECK-NEXT: fcmp s0, s1
+ ; CHECK-NEXT: fcsel s0, s0, s2, ge
+ ; CHECK-NEXT: ret
+diff --git a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
+index 79978af6f80e..296be831da76 100644
+--- a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
++++ b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
+@@ -87,9 +87,8 @@ define i64 @fcvtzs_f64_i64_64(double %dbl) {
+ define i32 @fcvtzs_f16_i32_7(half %flt) {
+ ; CHECK-NO16-LABEL: fcvtzs_f16_i32_7:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1124073472
++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -108,9 +107,8 @@ define i32 @fcvtzs_f16_i32_7(half %flt) {
+ define i32 @fcvtzs_f16_i32_15(half %flt) {
+ ; CHECK-NO16-LABEL: fcvtzs_f16_i32_15:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1191182336
++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -129,9 +127,8 @@ define i32 @fcvtzs_f16_i32_15(half %flt) {
+ define i64 @fcvtzs_f16_i64_7(half %flt) {
+ ; CHECK-NO16-LABEL: fcvtzs_f16_i64_7:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1124073472
++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -150,9 +147,8 @@ define i64 @fcvtzs_f16_i64_7(half %flt) {
+ define i64 @fcvtzs_f16_i64_15(half %flt) {
+ ; CHECK-NO16-LABEL: fcvtzs_f16_i64_15:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1191182336
++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -253,9 +249,8 @@ define i64 @fcvtzu_f64_i64_64(double %dbl) {
+ define i32 @fcvtzu_f16_i32_7(half %flt) {
+ ; CHECK-NO16-LABEL: fcvtzu_f16_i32_7:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1124073472
++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -274,9 +269,8 @@ define i32 @fcvtzu_f16_i32_7(half %flt) {
+ define i32 @fcvtzu_f16_i32_15(half %flt) {
+ ; CHECK-NO16-LABEL: fcvtzu_f16_i32_15:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1191182336
++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -295,9 +289,8 @@ define i32 @fcvtzu_f16_i32_15(half %flt) {
+ define i64 @fcvtzu_f16_i64_7(half %flt) {
+ ; CHECK-NO16-LABEL: fcvtzu_f16_i64_7:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1124073472
++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -316,9 +309,8 @@ define i64 @fcvtzu_f16_i64_7(half %flt) {
+ define i64 @fcvtzu_f16_i64_15(half %flt) {
+ ; CHECK-NO16-LABEL: fcvtzu_f16_i64_15:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1191182336
++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -419,12 +411,11 @@ define double @scvtf_f64_i64_64(i64 %long) {
+ define half @scvtf_f16_i32_7(i32 %int) {
+ ; CHECK-NO16-LABEL: scvtf_f16_i32_7:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: scvtf s0, w0
+-; CHECK-NO16-NEXT: mov w8, #1124073472
+-; CHECK-NO16-NEXT: fmov s1, w8
+-; CHECK-NO16-NEXT: fcvt h0, s0
+-; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fdiv s0, s0, s1
++; CHECK-NO16-NEXT: scvtf s1, w0
++; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
++; CHECK-NO16-NEXT: fcvt h1, s1
++; CHECK-NO16-NEXT: fcvt s1, h1
++; CHECK-NO16-NEXT: fdiv s0, s1, s0
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: ret
+ ;
+@@ -440,12 +431,11 @@ define half @scvtf_f16_i32_7(i32 %int) {
+ define half @scvtf_f16_i32_15(i32 %int) {
+ ; CHECK-NO16-LABEL: scvtf_f16_i32_15:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: scvtf s0, w0
+-; CHECK-NO16-NEXT: mov w8, #1191182336
+-; CHECK-NO16-NEXT: fmov s1, w8
+-; CHECK-NO16-NEXT: fcvt h0, s0
+-; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fdiv s0, s0, s1
++; CHECK-NO16-NEXT: scvtf s1, w0
++; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
++; CHECK-NO16-NEXT: fcvt h1, s1
++; CHECK-NO16-NEXT: fcvt s1, h1
++; CHECK-NO16-NEXT: fdiv s0, s1, s0
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: ret
+ ;
+@@ -461,12 +451,11 @@ define half @scvtf_f16_i32_15(i32 %int) {
+ define half @scvtf_f16_i64_7(i64 %long) {
+ ; CHECK-NO16-LABEL: scvtf_f16_i64_7:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: scvtf s0, x0
+-; CHECK-NO16-NEXT: mov w8, #1124073472
+-; CHECK-NO16-NEXT: fmov s1, w8
+-; CHECK-NO16-NEXT: fcvt h0, s0
+-; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fdiv s0, s0, s1
++; CHECK-NO16-NEXT: scvtf s1, x0
++; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
++; CHECK-NO16-NEXT: fcvt h1, s1
++; CHECK-NO16-NEXT: fcvt s1, h1
++; CHECK-NO16-NEXT: fdiv s0, s1, s0
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: ret
+ ;
+@@ -482,12 +471,11 @@ define half @scvtf_f16_i64_7(i64 %long) {
+ define half @scvtf_f16_i64_15(i64 %long) {
+ ; CHECK-NO16-LABEL: scvtf_f16_i64_15:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: scvtf s0, x0
+-; CHECK-NO16-NEXT: mov w8, #1191182336
+-; CHECK-NO16-NEXT: fmov s1, w8
+-; CHECK-NO16-NEXT: fcvt h0, s0
+-; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fdiv s0, s0, s1
++; CHECK-NO16-NEXT: scvtf s1, x0
++; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
++; CHECK-NO16-NEXT: fcvt h1, s1
++; CHECK-NO16-NEXT: fcvt s1, h1
++; CHECK-NO16-NEXT: fdiv s0, s1, s0
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: ret
+ ;
+@@ -585,12 +573,11 @@ define double @ucvtf_f64_i64_64(i64 %long) {
+ define half @ucvtf_f16_i32_7(i32 %int) {
+ ; CHECK-NO16-LABEL: ucvtf_f16_i32_7:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: ucvtf s0, w0
+-; CHECK-NO16-NEXT: mov w8, #1124073472
+-; CHECK-NO16-NEXT: fmov s1, w8
+-; CHECK-NO16-NEXT: fcvt h0, s0
+-; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fdiv s0, s0, s1
++; CHECK-NO16-NEXT: ucvtf s1, w0
++; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
++; CHECK-NO16-NEXT: fcvt h1, s1
++; CHECK-NO16-NEXT: fcvt s1, h1
++; CHECK-NO16-NEXT: fdiv s0, s1, s0
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: ret
+ ;
+@@ -606,12 +593,11 @@ define half @ucvtf_f16_i32_7(i32 %int) {
+ define half @ucvtf_f16_i32_15(i32 %int) {
+ ; CHECK-NO16-LABEL: ucvtf_f16_i32_15:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: ucvtf s0, w0
+-; CHECK-NO16-NEXT: mov w8, #1191182336
+-; CHECK-NO16-NEXT: fmov s1, w8
+-; CHECK-NO16-NEXT: fcvt h0, s0
+-; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fdiv s0, s0, s1
++; CHECK-NO16-NEXT: ucvtf s1, w0
++; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
++; CHECK-NO16-NEXT: fcvt h1, s1
++; CHECK-NO16-NEXT: fcvt s1, h1
++; CHECK-NO16-NEXT: fdiv s0, s1, s0
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: ret
+ ;
+@@ -627,12 +613,11 @@ define half @ucvtf_f16_i32_15(i32 %int) {
+ define half @ucvtf_f16_i64_7(i64 %long) {
+ ; CHECK-NO16-LABEL: ucvtf_f16_i64_7:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: ucvtf s0, x0
+-; CHECK-NO16-NEXT: mov w8, #1124073472
+-; CHECK-NO16-NEXT: fmov s1, w8
+-; CHECK-NO16-NEXT: fcvt h0, s0
+-; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fdiv s0, s0, s1
++; CHECK-NO16-NEXT: ucvtf s1, x0
++; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
++; CHECK-NO16-NEXT: fcvt h1, s1
++; CHECK-NO16-NEXT: fcvt s1, h1
++; CHECK-NO16-NEXT: fdiv s0, s1, s0
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: ret
+ ;
+@@ -648,12 +633,11 @@ define half @ucvtf_f16_i64_7(i64 %long) {
+ define half @ucvtf_f16_i64_15(i64 %long) {
+ ; CHECK-NO16-LABEL: ucvtf_f16_i64_15:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: ucvtf s0, x0
+-; CHECK-NO16-NEXT: mov w8, #1191182336
+-; CHECK-NO16-NEXT: fmov s1, w8
+-; CHECK-NO16-NEXT: fcvt h0, s0
+-; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fdiv s0, s0, s1
++; CHECK-NO16-NEXT: ucvtf s1, x0
++; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
++; CHECK-NO16-NEXT: fcvt h1, s1
++; CHECK-NO16-NEXT: fcvt s1, h1
++; CHECK-NO16-NEXT: fdiv s0, s1, s0
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: ret
+ ;
+@@ -749,9 +733,8 @@ define i64 @fcvtzs_sat_f64_i64_64(double %dbl) {
+ define i32 @fcvtzs_sat_f16_i32_7(half %dbl) {
+ ; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_7:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1124073472
++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -770,9 +753,8 @@ define i32 @fcvtzs_sat_f16_i32_7(half %dbl) {
+ define i32 @fcvtzs_sat_f16_i32_15(half %dbl) {
+ ; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_15:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1191182336
++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -791,9 +773,8 @@ define i32 @fcvtzs_sat_f16_i32_15(half %dbl) {
+ define i64 @fcvtzs_sat_f16_i64_7(half %dbl) {
+ ; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_7:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1124073472
++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -812,9 +793,8 @@ define i64 @fcvtzs_sat_f16_i64_7(half %dbl) {
+ define i64 @fcvtzs_sat_f16_i64_15(half %dbl) {
+ ; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_15:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1191182336
++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -912,9 +892,8 @@ define i64 @fcvtzu_sat_f64_i64_64(double %dbl) {
+ define i32 @fcvtzu_sat_f16_i32_7(half %dbl) {
+ ; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_7:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1124073472
++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -933,9 +912,8 @@ define i32 @fcvtzu_sat_f16_i32_7(half %dbl) {
+ define i32 @fcvtzu_sat_f16_i32_15(half %dbl) {
+ ; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_15:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1191182336
++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -954,9 +932,8 @@ define i32 @fcvtzu_sat_f16_i32_15(half %dbl) {
+ define i64 @fcvtzu_sat_f16_i64_7(half %dbl) {
+ ; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_7:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1124073472
++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+@@ -975,9 +952,8 @@ define i64 @fcvtzu_sat_f16_i64_7(half %dbl) {
+ define i64 @fcvtzu_sat_f16_i64_15(half %dbl) {
+ ; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_15:
+ ; CHECK-NO16: // %bb.0:
+-; CHECK-NO16-NEXT: mov w8, #1191182336
++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+-; CHECK-NO16-NEXT: fmov s1, w8
+ ; CHECK-NO16-NEXT: fmul s0, s0, s1
+ ; CHECK-NO16-NEXT: fcvt h0, s0
+ ; CHECK-NO16-NEXT: fcvt s0, h0
+diff --git a/llvm/test/CodeGen/AArch64/fpimm.ll b/llvm/test/CodeGen/AArch64/fpimm.ll
+index 4c732f589147..10233ded3236 100644
+--- a/llvm/test/CodeGen/AArch64/fpimm.ll
++++ b/llvm/test/CodeGen/AArch64/fpimm.ll
+@@ -1,5 +1,5 @@
+ ; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+-; RUN: llc -mtriple=aarch64-apple-darwin -code-model=large -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LARGE
++; RUN: llc -mtriple=aarch64-apple-darwin -code-model=large -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LARGE
+ ; RUN: llc -mtriple=aarch64-none-eabi -code-model=tiny -verify-machineinstrs < %s | FileCheck %s
+
+ @varf32 = global float 0.0
+@@ -15,8 +15,7 @@ define void @check_float() {
+
+ %newval2 = fadd float %val, 128.0
+ store volatile float %newval2, float* @varf32
+-; CHECK-DAG: mov [[W128:w[0-9]+]], #1124073472
+-; CHECK-DAG: fmov {{s[0-9]+}}, [[W128]]
++; CHECK-DAG: movi [[REG:v[0-9s]+]].2s, #67, lsl #24
+
+ ; CHECK: ret
+ ret void
+diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll
+index 70f9031123d7..729f531d3a50 100644
+--- a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll
++++ b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll
+@@ -131,11 +131,10 @@ define i100 @test_signed_i100_f32(float %f) nounwind {
+ ; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill
+ ; CHECK-NEXT: fmov s8, s0
+ ; CHECK-NEXT: bl __fixsfti
+-; CHECK-NEXT: mov w8, #-251658240
++; CHECK-NEXT: movi v0.2s, #241, lsl #24
++; CHECK-NEXT: mov w8, #1895825407
+ ; CHECK-NEXT: mov x10, #34359738367
+ ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
+-; CHECK-NEXT: fmov s0, w8
+-; CHECK-NEXT: mov w8, #1895825407
+ ; CHECK-NEXT: fcmp s8, s0
+ ; CHECK-NEXT: fmov s0, w8
+ ; CHECK-NEXT: mov x8, #-34359738368
+@@ -160,11 +159,10 @@ define i128 @test_signed_i128_f32(float %f) nounwind {
+ ; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill
+ ; CHECK-NEXT: fmov s8, s0
+ ; CHECK-NEXT: bl __fixsfti
+-; CHECK-NEXT: mov w8, #-16777216
++; CHECK-NEXT: movi v0.2s, #255, lsl #24
++; CHECK-NEXT: mov w8, #2130706431
+ ; CHECK-NEXT: mov x10, #9223372036854775807
+ ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
+-; CHECK-NEXT: fmov s0, w8
+-; CHECK-NEXT: mov w8, #2130706431
+ ; CHECK-NEXT: fcmp s8, s0
+ ; CHECK-NEXT: fmov s0, w8
+ ; CHECK-NEXT: mov x8, #-9223372036854775808
+@@ -575,11 +573,10 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
+ ; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill
+ ; CHECK-NEXT: fmov s0, s8
+ ; CHECK-NEXT: bl __fixsfti
+-; CHECK-NEXT: mov w8, #-251658240
++; CHECK-NEXT: movi v0.2s, #241, lsl #24
++; CHECK-NEXT: mov w8, #1895825407
+ ; CHECK-NEXT: mov x10, #34359738367
+ ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
+-; CHECK-NEXT: fmov s0, w8
+-; CHECK-NEXT: mov w8, #1895825407
+ ; CHECK-NEXT: fcmp s8, s0
+ ; CHECK-NEXT: fmov s0, w8
+ ; CHECK-NEXT: mov x8, #-34359738368
+@@ -605,11 +602,10 @@ define i128 @test_signed_i128_f16(half %f) nounwind {
+ ; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill
+ ; CHECK-NEXT: fmov s0, s8
+ ; CHECK-NEXT: bl __fixsfti
+-; CHECK-NEXT: mov w8, #-16777216
++; CHECK-NEXT: movi v0.2s, #255, lsl #24
++; CHECK-NEXT: mov w8, #2130706431
+ ; CHECK-NEXT: mov x10, #9223372036854775807
+ ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
+-; CHECK-NEXT: fmov s0, w8
+-; CHECK-NEXT: mov w8, #2130706431
+ ; CHECK-NEXT: fcmp s8, s0
+ ; CHECK-NEXT: fmov s0, w8
+ ; CHECK-NEXT: mov x8, #-9223372036854775808
+diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+index 9fc4455972dc..55d4abc962fc 100644
+--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
++++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+@@ -827,15 +827,14 @@ define <2 x i100> @test_signed_v2f32_v2i100(<2 x float> %f) {
+ ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+ ; CHECK-NEXT: fmov s0, s8
+ ; CHECK-NEXT: bl __fixsfti
+-; CHECK-NEXT: mov w8, #-251658240
++; CHECK-NEXT: movi v9.2s, #241, lsl #24
++; CHECK-NEXT: mov w8, #1895825407
+ ; CHECK-NEXT: mov x21, #-34359738368
+ ; CHECK-NEXT: mov x22, #34359738367
+ ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
++; CHECK-NEXT: fmov s10, w8
+ ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+-; CHECK-NEXT: fmov s9, w8
+-; CHECK-NEXT: mov w8, #1895825407
+ ; CHECK-NEXT: fcmp s8, s9
+-; CHECK-NEXT: fmov s10, w8
+ ; CHECK-NEXT: csel x8, xzr, x0, lt
+ ; CHECK-NEXT: csel x9, x21, x1, lt
+ ; CHECK-NEXT: fcmp s8, s10
+@@ -894,15 +893,14 @@ define <2 x i128> @test_signed_v2f32_v2i128(<2 x float> %f) {
+ ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+ ; CHECK-NEXT: fmov s0, s8
+ ; CHECK-NEXT: bl __fixsfti
+-; CHECK-NEXT: mov w8, #-16777216
++; CHECK-NEXT: movi v9.2s, #255, lsl #24
++; CHECK-NEXT: mov w8, #2130706431
+ ; CHECK-NEXT: mov x21, #-9223372036854775808
+ ; CHECK-NEXT: mov x22, #9223372036854775807
+ ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
++; CHECK-NEXT: fmov s10, w8
+ ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+-; CHECK-NEXT: fmov s9, w8
+-; CHECK-NEXT: mov w8, #2130706431
+ ; CHECK-NEXT: fcmp s8, s9
+-; CHECK-NEXT: fmov s10, w8
+ ; CHECK-NEXT: csel x8, xzr, x0, lt
+ ; CHECK-NEXT: csel x9, x21, x1, lt
+ ; CHECK-NEXT: fcmp s8, s10
+@@ -1106,20 +1104,19 @@ define <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
+ ; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+ ; CHECK-NEXT: fmov s0, s8
+ ; CHECK-NEXT: bl __fixsfti
+-; CHECK-NEXT: mov w8, #-251658240
++; CHECK-NEXT: movi v9.2s, #241, lsl #24
++; CHECK-NEXT: mov w8, #1895825407
+ ; CHECK-NEXT: mov x25, #-34359738368
+ ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+ ; CHECK-NEXT: mov x26, #34359738367
+-; CHECK-NEXT: fmov s9, w8
+-; CHECK-NEXT: mov w8, #1895825407
+-; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+-; CHECK-NEXT: fcmp s8, s9
+ ; CHECK-NEXT: fmov s10, w8
+-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
++; CHECK-NEXT: fcmp s8, s9
++; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+ ; CHECK-NEXT: csel x8, xzr, x0, lt
+ ; CHECK-NEXT: csel x9, x25, x1, lt
+ ; CHECK-NEXT: fcmp s8, s10
++; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
++; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+ ; CHECK-NEXT: csel x9, x26, x9, gt
+ ; CHECK-NEXT: csinv x8, x8, xzr, le
+ ; CHECK-NEXT: fcmp s8, s8
+@@ -1211,20 +1208,19 @@ define <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) {
+ ; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+ ; CHECK-NEXT: fmov s0, s8
+ ; CHECK-NEXT: bl __fixsfti
+-; CHECK-NEXT: mov w8, #-16777216
++; CHECK-NEXT: movi v9.2s, #255, lsl #24
++; CHECK-NEXT: mov w8, #2130706431
+ ; CHECK-NEXT: mov x25, #-9223372036854775808
+ ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+ ; CHECK-NEXT: mov x26, #9223372036854775807
+-; CHECK-NEXT: fmov s9, w8
+-; CHECK-NEXT: mov w8, #2130706431
+-; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+-; CHECK-NEXT: fcmp s8, s9
+ ; CHECK-NEXT: fmov s10, w8
+-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
++; CHECK-NEXT: fcmp s8, s9
++; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+ ; CHECK-NEXT: csel x8, xzr, x0, lt
+ ; CHECK-NEXT: csel x9, x25, x1, lt
+ ; CHECK-NEXT: fcmp s8, s10
++; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
++; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+ ; CHECK-NEXT: csel x9, x26, x9, gt
+ ; CHECK-NEXT: csinv x8, x8, xzr, le
+ ; CHECK-NEXT: fcmp s8, s8
+@@ -1862,15 +1858,14 @@ define <4 x i100> @test_signed_v4f16_v4i100(<4 x half> %f) {
+ ; CHECK-NEXT: fcvt s8, h1
+ ; CHECK-NEXT: fmov s0, s8
+ ; CHECK-NEXT: bl __fixsfti
+-; CHECK-NEXT: mov w8, #-251658240
++; CHECK-NEXT: movi v9.2s, #241, lsl #24
++; CHECK-NEXT: mov w8, #1895825407
+ ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+ ; CHECK-NEXT: mov x25, #-34359738368
+ ; CHECK-NEXT: mov x26, #34359738367
+-; CHECK-NEXT: fmov s9, w8
+-; CHECK-NEXT: mov w8, #1895825407
+-; CHECK-NEXT: mov h0, v0.h[2]
+-; CHECK-NEXT: fcmp s8, s9
+ ; CHECK-NEXT: fmov s10, w8
++; CHECK-NEXT: fcmp s8, s9
++; CHECK-NEXT: mov h0, v0.h[2]
+ ; CHECK-NEXT: csel x8, xzr, x0, lt
+ ; CHECK-NEXT: csel x9, x25, x1, lt
+ ; CHECK-NEXT: fcmp s8, s10
+@@ -1970,15 +1965,14 @@ define <4 x i128> @test_signed_v4f16_v4i128(<4 x half> %f) {
+ ; CHECK-NEXT: fcvt s8, h1
+ ; CHECK-NEXT: fmov s0, s8
+ ; CHECK-NEXT: bl __fixsfti
+-; CHECK-NEXT: mov w8, #-16777216
++; CHECK-NEXT: movi v9.2s, #255, lsl #24
++; CHECK-NEXT: mov w8, #2130706431
+ ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+ ; CHECK-NEXT: mov x25, #-9223372036854775808
+ ; CHECK-NEXT: mov x26, #9223372036854775807
+-; CHECK-NEXT: fmov s9, w8
+-; CHECK-NEXT: mov w8, #2130706431
+-; CHECK-NEXT: mov h0, v0.h[2]
+-; CHECK-NEXT: fcmp s8, s9
+ ; CHECK-NEXT: fmov s10, w8
++; CHECK-NEXT: fcmp s8, s9
++; CHECK-NEXT: mov h0, v0.h[2]
+ ; CHECK-NEXT: csel x8, xzr, x0, lt
+ ; CHECK-NEXT: csel x9, x25, x1, lt
+ ; CHECK-NEXT: fcmp s8, s10
+@@ -2618,15 +2612,14 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
+ ; CHECK-NEXT: fcvt s8, h0
+ ; CHECK-NEXT: fmov s0, s8
+ ; CHECK-NEXT: bl __fixsfti
+-; CHECK-NEXT: mov w8, #-251658240
++; CHECK-NEXT: movi v10.2s, #241, lsl #24
++; CHECK-NEXT: mov w8, #1895825407
+ ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+ ; CHECK-NEXT: mov x25, #-34359738368
+ ; CHECK-NEXT: mov x23, #34359738367
+-; CHECK-NEXT: fmov s10, w8
+-; CHECK-NEXT: mov w8, #1895825407
+-; CHECK-NEXT: mov h0, v0.h[3]
+-; CHECK-NEXT: fcmp s8, s10
+ ; CHECK-NEXT: fmov s9, w8
++; CHECK-NEXT: fcmp s8, s10
++; CHECK-NEXT: mov h0, v0.h[3]
+ ; CHECK-NEXT: csel x8, xzr, x0, lt
+ ; CHECK-NEXT: csel x9, x25, x1, lt
+ ; CHECK-NEXT: fcmp s8, s9
+@@ -2827,15 +2820,14 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) {
+ ; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
+ ; CHECK-NEXT: fmov s0, s8
+ ; CHECK-NEXT: bl __fixsfti
+-; CHECK-NEXT: mov w8, #-16777216
++; CHECK-NEXT: movi v10.2s, #255, lsl #24
++; CHECK-NEXT: mov w8, #2130706431
+ ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+ ; CHECK-NEXT: mov x21, #-9223372036854775808
+ ; CHECK-NEXT: mov x22, #9223372036854775807
+-; CHECK-NEXT: fmov s10, w8
+-; CHECK-NEXT: mov w8, #2130706431
+-; CHECK-NEXT: mov h0, v0.h[1]
+-; CHECK-NEXT: fcmp s8, s10
+ ; CHECK-NEXT: fmov s9, w8
++; CHECK-NEXT: fcmp s8, s10
++; CHECK-NEXT: mov h0, v0.h[1]
+ ; CHECK-NEXT: csel x8, xzr, x0, lt
+ ; CHECK-NEXT: csel x9, x21, x1, lt
+ ; CHECK-NEXT: fcmp s8, s9
+diff --git a/llvm/test/CodeGen/AArch64/remat-const-float-simd.ll b/llvm/test/CodeGen/AArch64/remat-const-float-simd.ll
+new file mode 100644
+index 000000000000..cdb8b86fc398
+--- /dev/null
++++ b/llvm/test/CodeGen/AArch64/remat-const-float-simd.ll
+@@ -0,0 +1,33 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
++; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -mattr=+neon | FileCheck %s --check-prefixes=CHECK,CHECK-NEON
++; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -mattr=-neon | FileCheck %s --check-prefixes=CHECK,CHECK-SCALAR
++
++; Check that big fp constants can be rematerialized with movi
++target triple = "aarch64-unknown-linux-gnu"
++
++; float foo(void) { return float(2147483648); }
++define float @foo() {
++; CHECK-LABEL: foo:
++; CHECK: // %bb.0: // %entry
++; CHECK-NEON-NEXT: movi v0.2s, #79, lsl #24
++; CHECK-SCALAR-NEXT: mov w8, #1325400064
++; CHECK-SCALAR-NEXT: fmov s0, w8
++; CHECK-NEXT: ret
++entry:
++ ret float 0x41E0000000000000
++}
++
++; float foo2(float p) { return p + float(2147483648); }
++define float @foo2(float %f) {
++; CHECK-LABEL: foo2:
++; CHECK: // %bb.0: // %entry
++; CHECK-NEON-NEXT: movi v1.2s, #79, lsl #24
++; CHECK-NEON-NEXT: fadd s0, s0, s1
++; CHECK-SCALAR-NEXT: mov w8, #1325400064
++; CHECK-SCALAR-NEXT: fmov s1, w8
++; CHECK-SCALAR-NEXT: fadd s0, s0, s1
++; CHECK-NEXT: ret
++entry:
++ %p = fadd float %f, 0x41E0000000000000
++ ret float %p
++}
+diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll
+index 285139c30896..a2e5a8a1b4c4 100644
+--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll
++++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll
+@@ -48,8 +48,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
+ define float @test_v3f32(<3 x float> %a) nounwind {
+ ; CHECK-LABEL: test_v3f32:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov w8, #-2147483648
+-; CHECK-NEXT: fmov s1, w8
++; CHECK-NEXT: movi v1.2s, #128, lsl #24
+ ; CHECK-NEXT: mov v0.s[3], v1.s[0]
+ ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
+ ; CHECK-NEXT: faddp s0, v0.2s
+--
+2.34.1
+
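
One arithmetic note on the patch above (my own check, not part of the commit):
2^31 as an IEEE-754 single has bit pattern 0x4F000000 = 1325400064, i.e. the
8-bit value 0x4F = 79 shifted left by 24, which is exactly the shape the
patch's isAdvSIMDModImmType4 predicate accepts; that is why the mov w8 / fmov
s0 pair collapses to one movi. A minimal repro in the same shape as @foo from
remat-const-float-simd.ll:

; llc -mtriple=aarch64 -mattr=+neon  ->  movi v0.2s, #79, lsl #24
define float @two_pow_31() {
  ret float 0x41E0000000000000   ; 2147483648.0, i.e. 2^31, exact in f32
}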
diff --git a/patches/cherry/84ccd015e7dd3ca57c4a9366ecd2b9a7430f505d.patch b/patches/cherry/84ccd015e7dd3ca57c4a9366ecd2b9a7430f505d.patch
new file mode 100644
index 0000000..8d640a5
--- /dev/null
+++ b/patches/cherry/84ccd015e7dd3ca57c4a9366ecd2b9a7430f505d.patch
@@ -0,0 +1,509 @@
+From 84ccd015e7dd3ca57c4a9366ecd2b9a7430f505d Mon Sep 17 00:00:00 2001
+From: David Green <david.green@arm.com>
+Date: Sat, 5 Mar 2022 18:35:43 +0000
+Subject: [PATCH] [AArch64] Some tests to show reconstructing truncates. NFC
+
+---
+ .../CodeGen/AArch64/neon-extracttruncate.ll | 490 ++++++++++++++++++
+ 1 file changed, 490 insertions(+)
+ create mode 100644 llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
+
+diff --git a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
+new file mode 100644
+index 000000000000..14cc333120c7
+--- /dev/null
++++ b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
+@@ -0,0 +1,490 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
++; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
++
++define <8 x i8> @extract_2_v4i16(<4 x i16> %a, <4 x i16> %b) {
++; CHECK-LABEL: extract_2_v4i16:
++; CHECK: // %bb.0: // %entry
++; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b
++; CHECK-NEXT: ret
++entry:
++ %a0 = extractelement <4 x i16> %a, i32 0
++ %a1 = extractelement <4 x i16> %a, i32 1
++ %a2 = extractelement <4 x i16> %a, i32 2
++ %a3 = extractelement <4 x i16> %a, i32 3
++ %b0 = extractelement <4 x i16> %b, i32 0
++ %b1 = extractelement <4 x i16> %b, i32 1
++ %b2 = extractelement <4 x i16> %b, i32 2
++ %b3 = extractelement <4 x i16> %b, i32 3
++ %t0 = trunc i16 %a0 to i8
++ %t1 = trunc i16 %a1 to i8
++ %t2 = trunc i16 %a2 to i8
++ %t3 = trunc i16 %a3 to i8
++ %t4 = trunc i16 %b0 to i8
++ %t5 = trunc i16 %b1 to i8
++ %t6 = trunc i16 %b2 to i8
++ %t7 = trunc i16 %b3 to i8
++ %i0 = insertelement <8 x i8> undef, i8 %t0, i32 0
++ %i1 = insertelement <8 x i8> %i0, i8 %t1, i32 1
++ %i2 = insertelement <8 x i8> %i1, i8 %t2, i32 2
++ %i3 = insertelement <8 x i8> %i2, i8 %t3, i32 3
++ %i4 = insertelement <8 x i8> %i3, i8 %t4, i32 4
++ %i5 = insertelement <8 x i8> %i4, i8 %t5, i32 5
++ %i6 = insertelement <8 x i8> %i5, i8 %t6, i32 6
++ %i7 = insertelement <8 x i8> %i6, i8 %t7, i32 7
++ ret <8 x i8> %i7
++}
++
++define <8 x i8> @extract_2_v4i32(<4 x i32> %a, <4 x i32> %b) {
++; CHECK-LABEL: extract_2_v4i32:
++; CHECK: // %bb.0: // %entry
++; CHECK-NEXT: mov w8, v0.s[1]
++; CHECK-NEXT: mov w9, v0.s[2]
++; CHECK-NEXT: mov w10, v0.s[3]
++; CHECK-NEXT: mov v0.b[1], w8
++; CHECK-NEXT: fmov w8, s1
++; CHECK-NEXT: mov v0.b[2], w9
++; CHECK-NEXT: mov w9, v1.s[1]
++; CHECK-NEXT: mov v0.b[3], w10
++; CHECK-NEXT: mov v0.b[4], w8
++; CHECK-NEXT: mov w8, v1.s[2]
++; CHECK-NEXT: mov v0.b[5], w9
++; CHECK-NEXT: mov w9, v1.s[3]
++; CHECK-NEXT: mov v0.b[6], w8
++; CHECK-NEXT: mov v0.b[7], w9
++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: ret
++entry:
++ %a0 = extractelement <4 x i32> %a, i32 0
++ %a1 = extractelement <4 x i32> %a, i32 1
++ %a2 = extractelement <4 x i32> %a, i32 2
++ %a3 = extractelement <4 x i32> %a, i32 3
++ %b0 = extractelement <4 x i32> %b, i32 0
++ %b1 = extractelement <4 x i32> %b, i32 1
++ %b2 = extractelement <4 x i32> %b, i32 2
++ %b3 = extractelement <4 x i32> %b, i32 3
++ %t0 = trunc i32 %a0 to i8
++ %t1 = trunc i32 %a1 to i8
++ %t2 = trunc i32 %a2 to i8
++ %t3 = trunc i32 %a3 to i8
++ %t4 = trunc i32 %b0 to i8
++ %t5 = trunc i32 %b1 to i8
++ %t6 = trunc i32 %b2 to i8
++ %t7 = trunc i32 %b3 to i8
++ %i0 = insertelement <8 x i8> undef, i8 %t0, i32 0
++ %i1 = insertelement <8 x i8> %i0, i8 %t1, i32 1
++ %i2 = insertelement <8 x i8> %i1, i8 %t2, i32 2
++ %i3 = insertelement <8 x i8> %i2, i8 %t3, i32 3
++ %i4 = insertelement <8 x i8> %i3, i8 %t4, i32 4
++ %i5 = insertelement <8 x i8> %i4, i8 %t5, i32 5
++ %i6 = insertelement <8 x i8> %i5, i8 %t6, i32 6
++ %i7 = insertelement <8 x i8> %i6, i8 %t7, i32 7
++ ret <8 x i8> %i7
++}
++
++define <16 x i8> @extract_4_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
++; CHECK-LABEL: extract_4_v4i16:
++; CHECK: // %bb.0: // %entry
++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
++; CHECK-NEXT: umov w9, v0.h[0]
++; CHECK-NEXT: umov w10, v0.h[1]
++; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
++; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
++; CHECK-NEXT: umov w8, v2.h[0]
++; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
++; CHECK-NEXT: fmov s4, w9
++; CHECK-NEXT: umov w9, v0.h[2]
++; CHECK-NEXT: mov v4.b[1], w10
++; CHECK-NEXT: umov w10, v0.h[3]
++; CHECK-NEXT: mov v4.b[2], w9
++; CHECK-NEXT: umov w9, v1.h[0]
++; CHECK-NEXT: mov v4.b[3], w10
++; CHECK-NEXT: umov w10, v1.h[1]
++; CHECK-NEXT: mov v4.b[4], w9
++; CHECK-NEXT: umov w9, v1.h[2]
++; CHECK-NEXT: mov v4.b[5], w10
++; CHECK-NEXT: umov w10, v1.h[3]
++; CHECK-NEXT: mov v4.b[6], w9
++; CHECK-NEXT: umov w9, v2.h[1]
++; CHECK-NEXT: mov v4.b[7], w10
++; CHECK-NEXT: mov v4.b[8], w8
++; CHECK-NEXT: umov w8, v2.h[2]
++; CHECK-NEXT: mov v4.b[9], w9
++; CHECK-NEXT: umov w9, v2.h[3]
++; CHECK-NEXT: mov v4.b[10], w8
++; CHECK-NEXT: umov w8, v3.h[0]
++; CHECK-NEXT: mov v4.b[11], w9
++; CHECK-NEXT: umov w9, v3.h[1]
++; CHECK-NEXT: mov v4.b[12], w8
++; CHECK-NEXT: umov w8, v3.h[2]
++; CHECK-NEXT: mov v4.b[13], w9
++; CHECK-NEXT: umov w9, v3.h[3]
++; CHECK-NEXT: mov v4.b[14], w8
++; CHECK-NEXT: mov v4.b[15], w9
++; CHECK-NEXT: mov v0.16b, v4.16b
++; CHECK-NEXT: ret
++entry:
++ %a0 = extractelement <4 x i16> %a, i32 0
++ %a1 = extractelement <4 x i16> %a, i32 1
++ %a2 = extractelement <4 x i16> %a, i32 2
++ %a3 = extractelement <4 x i16> %a, i32 3
++ %b0 = extractelement <4 x i16> %b, i32 0
++ %b1 = extractelement <4 x i16> %b, i32 1
++ %b2 = extractelement <4 x i16> %b, i32 2
++ %b3 = extractelement <4 x i16> %b, i32 3
++ %c0 = extractelement <4 x i16> %c, i32 0
++ %c1 = extractelement <4 x i16> %c, i32 1
++ %c2 = extractelement <4 x i16> %c, i32 2
++ %c3 = extractelement <4 x i16> %c, i32 3
++ %d0 = extractelement <4 x i16> %d, i32 0
++ %d1 = extractelement <4 x i16> %d, i32 1
++ %d2 = extractelement <4 x i16> %d, i32 2
++ %d3 = extractelement <4 x i16> %d, i32 3
++ %t0 = trunc i16 %a0 to i8
++ %t1 = trunc i16 %a1 to i8
++ %t2 = trunc i16 %a2 to i8
++ %t3 = trunc i16 %a3 to i8
++ %t4 = trunc i16 %b0 to i8
++ %t5 = trunc i16 %b1 to i8
++ %t6 = trunc i16 %b2 to i8
++ %t7 = trunc i16 %b3 to i8
++ %t8 = trunc i16 %c0 to i8
++ %t9 = trunc i16 %c1 to i8
++ %t10 = trunc i16 %c2 to i8
++ %t11 = trunc i16 %c3 to i8
++ %t12 = trunc i16 %d0 to i8
++ %t13 = trunc i16 %d1 to i8
++ %t14 = trunc i16 %d2 to i8
++ %t15 = trunc i16 %d3 to i8
++ %i0 = insertelement <16 x i8> undef, i8 %t0, i32 0
++ %i1 = insertelement <16 x i8> %i0, i8 %t1, i32 1
++ %i2 = insertelement <16 x i8> %i1, i8 %t2, i32 2
++ %i3 = insertelement <16 x i8> %i2, i8 %t3, i32 3
++ %i4 = insertelement <16 x i8> %i3, i8 %t4, i32 4
++ %i5 = insertelement <16 x i8> %i4, i8 %t5, i32 5
++ %i6 = insertelement <16 x i8> %i5, i8 %t6, i32 6
++ %i7 = insertelement <16 x i8> %i6, i8 %t7, i32 7
++ %i8 = insertelement <16 x i8> %i7, i8 %t8, i32 8
++ %i9 = insertelement <16 x i8> %i8, i8 %t9, i32 9
++ %i10 = insertelement <16 x i8> %i9, i8 %t10, i32 10
++ %i11 = insertelement <16 x i8> %i10, i8 %t11, i32 11
++ %i12 = insertelement <16 x i8> %i11, i8 %t12, i32 12
++ %i13 = insertelement <16 x i8> %i12, i8 %t13, i32 13
++ %i14 = insertelement <16 x i8> %i13, i8 %t14, i32 14
++ %i15 = insertelement <16 x i8> %i14, i8 %t15, i32 15
++ ret <16 x i8> %i15
++}
++
++define <16 x i8> @extract_4_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
++; CHECK-LABEL: extract_4_v4i32:
++; CHECK: // %bb.0: // %entry
++; CHECK-NEXT: mov w8, v0.s[1]
++; CHECK-NEXT: mov w9, v0.s[2]
++; CHECK-NEXT: mov w10, v0.s[3]
++; CHECK-NEXT: mov v0.b[1], w8
++; CHECK-NEXT: fmov w8, s1
++; CHECK-NEXT: mov v0.b[2], w9
++; CHECK-NEXT: mov w9, v1.s[1]
++; CHECK-NEXT: mov v0.b[3], w10
++; CHECK-NEXT: mov v0.b[4], w8
++; CHECK-NEXT: mov w8, v1.s[2]
++; CHECK-NEXT: mov v0.b[5], w9
++; CHECK-NEXT: mov w9, v1.s[3]
++; CHECK-NEXT: mov v0.b[6], w8
++; CHECK-NEXT: fmov w8, s2
++; CHECK-NEXT: mov v0.b[7], w9
++; CHECK-NEXT: mov w9, v2.s[1]
++; CHECK-NEXT: mov v0.b[8], w8
++; CHECK-NEXT: mov w8, v2.s[2]
++; CHECK-NEXT: mov v0.b[9], w9
++; CHECK-NEXT: mov w9, v2.s[3]
++; CHECK-NEXT: mov v0.b[10], w8
++; CHECK-NEXT: fmov w8, s3
++; CHECK-NEXT: mov v0.b[11], w9
++; CHECK-NEXT: mov w9, v3.s[1]
++; CHECK-NEXT: mov v0.b[12], w8
++; CHECK-NEXT: mov w8, v3.s[2]
++; CHECK-NEXT: mov v0.b[13], w9
++; CHECK-NEXT: mov w9, v3.s[3]
++; CHECK-NEXT: mov v0.b[14], w8
++; CHECK-NEXT: mov v0.b[15], w9
++; CHECK-NEXT: ret
++entry:
++ %a0 = extractelement <4 x i32> %a, i32 0
++ %a1 = extractelement <4 x i32> %a, i32 1
++ %a2 = extractelement <4 x i32> %a, i32 2
++ %a3 = extractelement <4 x i32> %a, i32 3
++ %b0 = extractelement <4 x i32> %b, i32 0
++ %b1 = extractelement <4 x i32> %b, i32 1
++ %b2 = extractelement <4 x i32> %b, i32 2
++ %b3 = extractelement <4 x i32> %b, i32 3
++ %c0 = extractelement <4 x i32> %c, i32 0
++ %c1 = extractelement <4 x i32> %c, i32 1
++ %c2 = extractelement <4 x i32> %c, i32 2
++ %c3 = extractelement <4 x i32> %c, i32 3
++ %d0 = extractelement <4 x i32> %d, i32 0
++ %d1 = extractelement <4 x i32> %d, i32 1
++ %d2 = extractelement <4 x i32> %d, i32 2
++ %d3 = extractelement <4 x i32> %d, i32 3
++ %t0 = trunc i32 %a0 to i8
++ %t1 = trunc i32 %a1 to i8
++ %t2 = trunc i32 %a2 to i8
++ %t3 = trunc i32 %a3 to i8
++ %t4 = trunc i32 %b0 to i8
++ %t5 = trunc i32 %b1 to i8
++ %t6 = trunc i32 %b2 to i8
++ %t7 = trunc i32 %b3 to i8
++ %t8 = trunc i32 %c0 to i8
++ %t9 = trunc i32 %c1 to i8
++ %t10 = trunc i32 %c2 to i8
++ %t11 = trunc i32 %c3 to i8
++ %t12 = trunc i32 %d0 to i8
++ %t13 = trunc i32 %d1 to i8
++ %t14 = trunc i32 %d2 to i8
++ %t15 = trunc i32 %d3 to i8
++ %i0 = insertelement <16 x i8> undef, i8 %t0, i32 0
++ %i1 = insertelement <16 x i8> %i0, i8 %t1, i32 1
++ %i2 = insertelement <16 x i8> %i1, i8 %t2, i32 2
++ %i3 = insertelement <16 x i8> %i2, i8 %t3, i32 3
++ %i4 = insertelement <16 x i8> %i3, i8 %t4, i32 4
++ %i5 = insertelement <16 x i8> %i4, i8 %t5, i32 5
++ %i6 = insertelement <16 x i8> %i5, i8 %t6, i32 6
++ %i7 = insertelement <16 x i8> %i6, i8 %t7, i32 7
++ %i8 = insertelement <16 x i8> %i7, i8 %t8, i32 8
++ %i9 = insertelement <16 x i8> %i8, i8 %t9, i32 9
++ %i10 = insertelement <16 x i8> %i9, i8 %t10, i32 10
++ %i11 = insertelement <16 x i8> %i10, i8 %t11, i32 11
++ %i12 = insertelement <16 x i8> %i11, i8 %t12, i32 12
++ %i13 = insertelement <16 x i8> %i12, i8 %t13, i32 13
++ %i14 = insertelement <16 x i8> %i13, i8 %t14, i32 14
++ %i15 = insertelement <16 x i8> %i14, i8 %t15, i32 15
++ ret <16 x i8> %i15
++}
++
++define <16 x i8> @extract_4_mixed(<4 x i16> %a, <4 x i32> %b, <4 x i32> %c, <4 x i16> %d) {
++; CHECK-LABEL: extract_4_mixed:
++; CHECK: // %bb.0: // %entry
++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
++; CHECK-NEXT: umov w8, v0.h[0]
++; CHECK-NEXT: umov w9, v0.h[1]
++; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
++; CHECK-NEXT: fmov s4, w8
++; CHECK-NEXT: umov w8, v0.h[2]
++; CHECK-NEXT: mov v4.b[1], w9
++; CHECK-NEXT: umov w9, v0.h[3]
++; CHECK-NEXT: mov v4.b[2], w8
++; CHECK-NEXT: fmov w8, s1
++; CHECK-NEXT: mov v4.b[3], w9
++; CHECK-NEXT: mov w9, v1.s[1]
++; CHECK-NEXT: mov v4.b[4], w8
++; CHECK-NEXT: mov w8, v1.s[2]
++; CHECK-NEXT: mov v4.b[5], w9
++; CHECK-NEXT: mov w9, v1.s[3]
++; CHECK-NEXT: mov v4.b[6], w8
++; CHECK-NEXT: fmov w8, s2
++; CHECK-NEXT: mov v4.b[7], w9
++; CHECK-NEXT: mov w9, v2.s[1]
++; CHECK-NEXT: mov v4.b[8], w8
++; CHECK-NEXT: mov w8, v2.s[2]
++; CHECK-NEXT: mov v4.b[9], w9
++; CHECK-NEXT: mov w9, v2.s[3]
++; CHECK-NEXT: mov v4.b[10], w8
++; CHECK-NEXT: umov w8, v3.h[0]
++; CHECK-NEXT: mov v4.b[11], w9
++; CHECK-NEXT: umov w9, v3.h[1]
++; CHECK-NEXT: mov v4.b[12], w8
++; CHECK-NEXT: umov w8, v3.h[2]
++; CHECK-NEXT: mov v4.b[13], w9
++; CHECK-NEXT: umov w9, v3.h[3]
++; CHECK-NEXT: mov v4.b[14], w8
++; CHECK-NEXT: mov v4.b[15], w9
++; CHECK-NEXT: mov v0.16b, v4.16b
++; CHECK-NEXT: ret
++entry:
++ %a0 = extractelement <4 x i16> %a, i32 0
++ %a1 = extractelement <4 x i16> %a, i32 1
++ %a2 = extractelement <4 x i16> %a, i32 2
++ %a3 = extractelement <4 x i16> %a, i32 3
++ %b0 = extractelement <4 x i32> %b, i32 0
++ %b1 = extractelement <4 x i32> %b, i32 1
++ %b2 = extractelement <4 x i32> %b, i32 2
++ %b3 = extractelement <4 x i32> %b, i32 3
++ %c0 = extractelement <4 x i32> %c, i32 0
++ %c1 = extractelement <4 x i32> %c, i32 1
++ %c2 = extractelement <4 x i32> %c, i32 2
++ %c3 = extractelement <4 x i32> %c, i32 3
++ %d0 = extractelement <4 x i16> %d, i32 0
++ %d1 = extractelement <4 x i16> %d, i32 1
++ %d2 = extractelement <4 x i16> %d, i32 2
++ %d3 = extractelement <4 x i16> %d, i32 3
++ %t0 = trunc i16 %a0 to i8
++ %t1 = trunc i16 %a1 to i8
++ %t2 = trunc i16 %a2 to i8
++ %t3 = trunc i16 %a3 to i8
++ %t4 = trunc i32 %b0 to i8
++ %t5 = trunc i32 %b1 to i8
++ %t6 = trunc i32 %b2 to i8
++ %t7 = trunc i32 %b3 to i8
++ %t8 = trunc i32 %c0 to i8
++ %t9 = trunc i32 %c1 to i8
++ %t10 = trunc i32 %c2 to i8
++ %t11 = trunc i32 %c3 to i8
++ %t12 = trunc i16 %d0 to i8
++ %t13 = trunc i16 %d1 to i8
++ %t14 = trunc i16 %d2 to i8
++ %t15 = trunc i16 %d3 to i8
++ %i0 = insertelement <16 x i8> undef, i8 %t0, i32 0
++ %i1 = insertelement <16 x i8> %i0, i8 %t1, i32 1
++ %i2 = insertelement <16 x i8> %i1, i8 %t2, i32 2
++ %i3 = insertelement <16 x i8> %i2, i8 %t3, i32 3
++ %i4 = insertelement <16 x i8> %i3, i8 %t4, i32 4
++ %i5 = insertelement <16 x i8> %i4, i8 %t5, i32 5
++ %i6 = insertelement <16 x i8> %i5, i8 %t6, i32 6
++ %i7 = insertelement <16 x i8> %i6, i8 %t7, i32 7
++ %i8 = insertelement <16 x i8> %i7, i8 %t8, i32 8
++ %i9 = insertelement <16 x i8> %i8, i8 %t9, i32 9
++ %i10 = insertelement <16 x i8> %i9, i8 %t10, i32 10
++ %i11 = insertelement <16 x i8> %i10, i8 %t11, i32 11
++ %i12 = insertelement <16 x i8> %i11, i8 %t12, i32 12
++ %i13 = insertelement <16 x i8> %i12, i8 %t13, i32 13
++ %i14 = insertelement <16 x i8> %i13, i8 %t14, i32 14
++ %i15 = insertelement <16 x i8> %i14, i8 %t15, i32 15
++ ret <16 x i8> %i15
++}
++
++define <16 x i8> @extract_4_v4i32_badindex(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
++; CHECK-LABEL: extract_4_v4i32_badindex:
++; CHECK: // %bb.0: // %entry
++; CHECK-NEXT: mov w8, v0.s[1]
++; CHECK-NEXT: mov w9, v0.s[2]
++; CHECK-NEXT: mov w10, v0.s[3]
++; CHECK-NEXT: mov v0.b[1], w8
++; CHECK-NEXT: fmov w8, s1
++; CHECK-NEXT: mov v0.b[2], w9
++; CHECK-NEXT: mov w9, v1.s[2]
++; CHECK-NEXT: mov v0.b[3], w10
++; CHECK-NEXT: mov v0.b[4], w8
++; CHECK-NEXT: mov w8, v1.s[1]
++; CHECK-NEXT: mov v0.b[5], w9
++; CHECK-NEXT: mov w9, v1.s[3]
++; CHECK-NEXT: mov v0.b[6], w8
++; CHECK-NEXT: fmov w8, s2
++; CHECK-NEXT: mov v0.b[7], w9
++; CHECK-NEXT: mov w9, v2.s[1]
++; CHECK-NEXT: mov v0.b[8], w8
++; CHECK-NEXT: mov w8, v2.s[2]
++; CHECK-NEXT: mov v0.b[9], w9
++; CHECK-NEXT: mov w9, v2.s[3]
++; CHECK-NEXT: mov v0.b[10], w8
++; CHECK-NEXT: fmov w8, s3
++; CHECK-NEXT: mov v0.b[11], w9
++; CHECK-NEXT: mov w9, v3.s[1]
++; CHECK-NEXT: mov v0.b[12], w8
++; CHECK-NEXT: mov w8, v3.s[2]
++; CHECK-NEXT: mov v0.b[13], w9
++; CHECK-NEXT: mov w9, v3.s[3]
++; CHECK-NEXT: mov v0.b[14], w8
++; CHECK-NEXT: mov v0.b[15], w9
++; CHECK-NEXT: ret
++entry:
++ %a0 = extractelement <4 x i32> %a, i32 0
++ %a1 = extractelement <4 x i32> %a, i32 1
++ %a2 = extractelement <4 x i32> %a, i32 2
++ %a3 = extractelement <4 x i32> %a, i32 3
++ %b0 = extractelement <4 x i32> %b, i32 0
++ %b1 = extractelement <4 x i32> %b, i32 2
++ %b2 = extractelement <4 x i32> %b, i32 1
++ %b3 = extractelement <4 x i32> %b, i32 3
++ %c0 = extractelement <4 x i32> %c, i32 0
++ %c1 = extractelement <4 x i32> %c, i32 1
++ %c2 = extractelement <4 x i32> %c, i32 2
++ %c3 = extractelement <4 x i32> %c, i32 3
++ %d0 = extractelement <4 x i32> %d, i32 0
++ %d1 = extractelement <4 x i32> %d, i32 1
++ %d2 = extractelement <4 x i32> %d, i32 2
++ %d3 = extractelement <4 x i32> %d, i32 3
++ %t0 = trunc i32 %a0 to i8
++ %t1 = trunc i32 %a1 to i8
++ %t2 = trunc i32 %a2 to i8
++ %t3 = trunc i32 %a3 to i8
++ %t4 = trunc i32 %b0 to i8
++ %t5 = trunc i32 %b1 to i8
++ %t6 = trunc i32 %b2 to i8
++ %t7 = trunc i32 %b3 to i8
++ %t8 = trunc i32 %c0 to i8
++ %t9 = trunc i32 %c1 to i8
++ %t10 = trunc i32 %c2 to i8
++ %t11 = trunc i32 %c3 to i8
++ %t12 = trunc i32 %d0 to i8
++ %t13 = trunc i32 %d1 to i8
++ %t14 = trunc i32 %d2 to i8
++ %t15 = trunc i32 %d3 to i8
++ %i0 = insertelement <16 x i8> undef, i8 %t0, i32 0
++ %i1 = insertelement <16 x i8> %i0, i8 %t1, i32 1
++ %i2 = insertelement <16 x i8> %i1, i8 %t2, i32 2
++ %i3 = insertelement <16 x i8> %i2, i8 %t3, i32 3
++ %i4 = insertelement <16 x i8> %i3, i8 %t4, i32 4
++ %i5 = insertelement <16 x i8> %i4, i8 %t5, i32 5
++ %i6 = insertelement <16 x i8> %i5, i8 %t6, i32 6
++ %i7 = insertelement <16 x i8> %i6, i8 %t7, i32 7
++ %i8 = insertelement <16 x i8> %i7, i8 %t8, i32 8
++ %i9 = insertelement <16 x i8> %i8, i8 %t9, i32 9
++ %i10 = insertelement <16 x i8> %i9, i8 %t10, i32 10
++ %i11 = insertelement <16 x i8> %i10, i8 %t11, i32 11
++ %i12 = insertelement <16 x i8> %i11, i8 %t12, i32 12
++ %i13 = insertelement <16 x i8> %i12, i8 %t13, i32 13
++ %i14 = insertelement <16 x i8> %i13, i8 %t14, i32 14
++ %i15 = insertelement <16 x i8> %i14, i8 %t15, i32 15
++ ret <16 x i8> %i15
++}
++
++define <16 x i8> @extract_4_v4i32_one(<4 x i32> %a) {
++; CHECK-LABEL: extract_4_v4i32_one:
++; CHECK: // %bb.0: // %entry
++; CHECK-NEXT: mov w8, v0.s[1]
++; CHECK-NEXT: fmov w9, s0
++; CHECK-NEXT: mov w10, v0.s[2]
++; CHECK-NEXT: mov w11, v0.s[3]
++; CHECK-NEXT: mov v0.b[1], w8
++; CHECK-NEXT: mov v0.b[2], w10
++; CHECK-NEXT: mov v0.b[3], w11
++; CHECK-NEXT: mov v0.b[4], w9
++; CHECK-NEXT: mov v0.b[5], w8
++; CHECK-NEXT: mov v0.b[6], w10
++; CHECK-NEXT: mov v0.b[7], w11
++; CHECK-NEXT: mov v0.b[8], w9
++; CHECK-NEXT: mov v0.b[9], w8
++; CHECK-NEXT: mov v0.b[10], w10
++; CHECK-NEXT: mov v0.b[11], w11
++; CHECK-NEXT: mov v0.b[12], w9
++; CHECK-NEXT: mov v0.b[13], w8
++; CHECK-NEXT: mov v0.b[14], w10
++; CHECK-NEXT: mov v0.b[15], w11
++; CHECK-NEXT: ret
++entry:
++ %a0 = extractelement <4 x i32> %a, i32 0
++ %a1 = extractelement <4 x i32> %a, i32 1
++ %a2 = extractelement <4 x i32> %a, i32 2
++ %a3 = extractelement <4 x i32> %a, i32 3
++ %t0 = trunc i32 %a0 to i8
++ %t1 = trunc i32 %a1 to i8
++ %t2 = trunc i32 %a2 to i8
++ %t3 = trunc i32 %a3 to i8
++ %i0 = insertelement <16 x i8> undef, i8 %t0, i32 0
++ %i1 = insertelement <16 x i8> %i0, i8 %t1, i32 1
++ %i2 = insertelement <16 x i8> %i1, i8 %t2, i32 2
++ %i3 = insertelement <16 x i8> %i2, i8 %t3, i32 3
++ %i4 = insertelement <16 x i8> %i3, i8 %t0, i32 4
++ %i5 = insertelement <16 x i8> %i4, i8 %t1, i32 5
++ %i6 = insertelement <16 x i8> %i5, i8 %t2, i32 6
++ %i7 = insertelement <16 x i8> %i6, i8 %t3, i32 7
++ %i8 = insertelement <16 x i8> %i7, i8 %t0, i32 8
++ %i9 = insertelement <16 x i8> %i8, i8 %t1, i32 9
++ %i10 = insertelement <16 x i8> %i9, i8 %t2, i32 10
++ %i11 = insertelement <16 x i8> %i10, i8 %t3, i32 11
++ %i12 = insertelement <16 x i8> %i11, i8 %t0, i32 12
++ %i13 = insertelement <16 x i8> %i12, i8 %t1, i32 13
++ %i14 = insertelement <16 x i8> %i13, i8 %t2, i32 14
++ %i15 = insertelement <16 x i8> %i14, i8 %t3, i32 15
++ ret <16 x i8> %i15
++}
++
+--
+2.34.1
+
diff --git a/patches/cherry/86617256864ebcbda03b6ce843deeb6a41a85800.patch b/patches/cherry/86617256864ebcbda03b6ce843deeb6a41a85800.patch
new file mode 100644
index 0000000..8e8dfb8
--- /dev/null
+++ b/patches/cherry/86617256864ebcbda03b6ce843deeb6a41a85800.patch
@@ -0,0 +1,206 @@
+From 86617256864ebcbda03b6ce843deeb6a41a85800 Mon Sep 17 00:00:00 2001
+From: Florian Hahn <flo@fhahn.com>
+Date: Mon, 23 May 2022 20:27:42 +0100
+Subject: [PATCH] [AArch64] Add tests with free shuffles for indexed fma
+ variants.
+
+The new tests contain examples where shuffles are free, because indexed
+fma instructions can be used.
+---
+ .../AArch64/sink-free-instructions.ll | 183 ++++++++++++++++++
+ 1 file changed, 183 insertions(+)
+
+diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
+index 94164c08a4b3..244d2c35bbac 100644
+--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
++++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
+@@ -494,3 +494,186 @@ if.else:
+ %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %s3, <8 x i8> %s4)
+ ret <8 x i16> %vmull1
+ }
++
++declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
++
++define <8 x half> @sink_shufflevector_fma_v8f16(i1 %c, <8 x half> %a, <8 x half> %b) {
++; CHECK-LABEL: @sink_shufflevector_fma_v8f16(
++; CHECK-NEXT: entry:
++; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer
++; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
++; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
++; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
++; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
++; CHECK-NEXT: [[S5:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
++; CHECK-NEXT: [[S6:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
++; CHECK-NEXT: [[S7:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
++; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
++; CHECK: if.then:
++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[S0]], <8 x half> [[B]])
++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_0]], <8 x half> [[S1]], <8 x half> [[B]])
++; CHECK-NEXT: [[R_2:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_1]], <8 x half> [[S2]], <8 x half> [[B]])
++; CHECK-NEXT: [[R_3:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_2]], <8 x half> [[S3]], <8 x half> [[B]])
++; CHECK-NEXT: ret <8 x half> [[R_3]]
++; CHECK: if.else:
++; CHECK-NEXT: [[R_4:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[S4]], <8 x half> [[B]])
++; CHECK-NEXT: [[R_5:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_4]], <8 x half> [[S5]], <8 x half> [[B]])
++; CHECK-NEXT: [[R_6:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_5]], <8 x half> [[S6]], <8 x half> [[B]])
++; CHECK-NEXT: [[R_7:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_6]], <8 x half> [[S7]], <8 x half> [[B]])
++; CHECK-NEXT: ret <8 x half> [[R_7]]
++;
++entry:
++ %s0 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> zeroinitializer
++ %s1 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
++ %s2 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
++ %s3 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
++ %s4 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
++ %s5 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
++ %s6 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
++ %s7 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
++ br i1 %c, label %if.then, label %if.else
++
++if.then:
++ %r.0 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %s0, <8 x half> %b)
++ %r.1 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %r.0, <8 x half> %s1, <8 x half> %b)
++ %r.2 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %r.1, <8 x half> %s2, <8 x half> %b)
++ %r.3 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %r.2, <8 x half> %s3, <8 x half> %b)
++ ret <8 x half> %r.3
++
++if.else:
++ %r.4 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %s4, <8 x half> %b)
++ %r.5 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %r.4, <8 x half> %s5, <8 x half> %b)
++ %r.6 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %r.5, <8 x half> %s6, <8 x half> %b)
++ %r.7 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %r.6, <8 x half> %s7, <8 x half> %b)
++ ret <8 x half> %r.7
++}
++
++declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
++
++define <4 x float> @sink_shufflevector_fma_v4f32(i1 %c, <8 x float> %a, <4 x float> %b) {
++; CHECK-LABEL: @sink_shufflevector_fma_v4f32(
++; CHECK-NEXT: entry:
++; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> zeroinitializer
++; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
++; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
++; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
++; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
++; CHECK: if.then:
++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[S0]], <4 x float> [[B]])
++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_0]], <4 x float> [[S1]], <4 x float> [[B]])
++; CHECK-NEXT: ret <4 x float> [[R_1]]
++; CHECK: if.else:
++; CHECK-NEXT: [[R_2:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[S2]], <4 x float> [[B]])
++; CHECK-NEXT: [[R_3:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_2]], <4 x float> [[S3]], <4 x float> [[B]])
++; CHECK-NEXT: ret <4 x float> [[R_3]]
++;
++entry:
++ %s0 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> zeroinitializer
++ %s1 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
++ %s2 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
++ %s3 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
++ br i1 %c, label %if.then, label %if.else
++
++if.then:
++ %r.0 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %s0, <4 x float> %b)
++ %r.1 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %r.0, <4 x float> %s1, <4 x float> %b)
++ ret <4 x float> %r.1
++
++if.else:
++ %r.2 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %s2, <4 x float> %b)
++ %r.3 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %r.2, <4 x float> %s3, <4 x float> %b)
++ ret <4 x float> %r.3
++}
++
++declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
++
++define <2 x double> @sink_shufflevector_fma_v2f64(i1 %c, <2 x double> %a, <2 x double> %b) {
++; CHECK-LABEL: @sink_shufflevector_fma_v2f64(
++; CHECK-NEXT: entry:
++; CHECK-NEXT: [[S0:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> zeroinitializer
++; CHECK-NEXT: [[S1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
++; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
++; CHECK: if.then:
++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B:%.*]], <2 x double> [[S0]], <2 x double> [[B]])
++; CHECK-NEXT: ret <2 x double> [[R_0]]
++; CHECK: if.else:
++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[S1]], <2 x double> [[B]])
++; CHECK-NEXT: ret <2 x double> [[R_1]]
++;
++entry:
++ %s0 = shufflevector <2 x double> %a, <2 x double> poison, <2 x i32> zeroinitializer
++ %s1 = shufflevector <2 x double> %a, <2 x double> poison, <2 x i32> <i32 1, i32 1>
++ br i1 %c, label %if.then, label %if.else
++
++if.then:
++ %r.0 = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %s0, <2 x double> %b)
++ ret <2 x double> %r.0
++
++if.else:
++ %r.1 = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %s1, <2 x double> %b)
++ ret <2 x double> %r.1
++}
++
++define <4 x float> @do_not_sink_out_of_range_shufflevector_fma_v4f32(i1 %c, <8 x float> %a, <4 x float> %b) {
++; CHECK-LABEL: @do_not_sink_out_of_range_shufflevector_fma_v4f32(
++; CHECK-NEXT: entry:
++; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
++; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
++; CHECK: if.then:
++; CHECK-NEXT: [[R:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[S4]], <4 x float> [[B]])
++; CHECK-NEXT: ret <4 x float> [[R]]
++; CHECK: if.else:
++; CHECK-NEXT: ret <4 x float> zeroinitializer
++;
++entry:
++ %s4 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
++ br i1 %c, label %if.then, label %if.else
++
++if.then:
++ %r = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %s4, <4 x float> %b)
++ ret <4 x float> %r
++
++if.else:
++ ret <4 x float> zeroinitializer
++}
++
++declare <5 x float> @llvm.fma.v5f32(<5 x float>, <5 x float>, <5 x float>)
++
++define <5 x float> @do_not_sink_shufflevector_fma_v5f32(i1 %c, <8 x float> %a, <5 x float> %b) {
++; CHECK-LABEL: @do_not_sink_shufflevector_fma_v5f32(
++; CHECK-NEXT: entry:
++; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <5 x i32> zeroinitializer
++; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 1, i32 1, i32 1, i32 1, i32 4>
++; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 2, i32 2, i32 2, i32 2, i32 4>
++; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 3, i32 3, i32 3, i32 3, i32 4>
++; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4>
++; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
++; CHECK: if.then:
++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[B:%.*]], <5 x float> [[S0]], <5 x float> [[B]])
++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[R_0]], <5 x float> [[S1]], <5 x float> [[B]])
++; CHECK-NEXT: ret <5 x float> [[R_1]]
++; CHECK: if.else:
++; CHECK-NEXT: [[R_2:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[B]], <5 x float> [[S2]], <5 x float> [[B]])
++; CHECK-NEXT: [[R_3:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[R_2]], <5 x float> [[S3]], <5 x float> [[B]])
++; CHECK-NEXT: [[R_4:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[R_3]], <5 x float> [[S4]], <5 x float> [[B]])
++; CHECK-NEXT: ret <5 x float> [[R_4]]
++;
++entry:
++ %s0 = shufflevector <8 x float> %a, <8 x float> poison, <5 x i32> zeroinitializer
++ %s1 = shufflevector <8 x float> %a, <8 x float> poison, <5 x i32> <i32 1, i32 1, i32 1, i32 1, i32 4>
++ %s2 = shufflevector <8 x float> %a, <8 x float> poison, <5 x i32> <i32 2, i32 2, i32 2, i32 2, i32 4>
++ %s3 = shufflevector <8 x float> %a, <8 x float> poison, <5 x i32> <i32 3, i32 3, i32 3, i32 3, i32 4>
++ %s4 = shufflevector <8 x float> %a, <8 x float> poison, <5 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4>
++ br i1 %c, label %if.then, label %if.else
++
++if.then:
++ %r.0 = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> %b, <5 x float> %s0, <5 x float> %b)
++ %r.1 = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> %r.0, <5 x float> %s1, <5 x float> %b)
++ ret <5 x float> %r.1
++
++if.else:
++ %r.2 = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> %b, <5 x float> %s2, <5 x float> %b)
++ %r.3 = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> %r.2, <5 x float> %s3, <5 x float> %b)
++ %r.4 = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> %r.3, <5 x float> %s4, <5 x float> %b)
++ ret <5 x float> %r.4
++}
+--
+2.34.1
+
diff --git a/patches/cherry/AArch64-Use-Tbl.patch b/patches/cherry/AArch64-Use-Tbl.patch
index f9521be..f9521be 100755..100644
--- a/patches/cherry/AArch64-Use-Tbl.patch
+++ b/patches/cherry/AArch64-Use-Tbl.patch
diff --git a/patches/cherry/Loop-Vectorizer-shouldMaximizeVectorBandwidth.patch b/patches/cherry/Loop-Vectorizer-shouldMaximizeVectorBandwidth.patch
new file mode 100644
index 0000000..b4afeb5
--- /dev/null
+++ b/patches/cherry/Loop-Vectorizer-shouldMaximizeVectorBandwidth.patch
@@ -0,0 +1,527 @@
+commit 7e4b5c2e864ebd1c1a3a0203171143e311dd2a96 (HEAD)
+Author: Peter Waller <peter.waller@arm.com>
+Date: Mon May 16 20:59:17 2022 +0000
+
+ [LV] Improve register pressure estimate at high VFs
+
+commit 4f81e1af2d1de9d902709cbaff727ba198cd5410
+Author: Jingu Kang <jingu.kang@arm.com>
+Date: Tue Apr 5 13:16:10 2022 +0100
+
+ [AArch64] Set maximum VF with shouldMaximizeVectorBandwidth
+---
+diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
+index 7412e050322e..1179971ad13b 100644
+--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
++++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
+@@ -727,7 +727,7 @@ public:
+ bool isTypeLegal(Type *Ty) const;
+
+ /// Returns the estimated number of registers required to represent \p Ty.
+- InstructionCost getRegUsageForType(Type *Ty) const;
++ unsigned getRegUsageForType(Type *Ty) const;
+
+ /// Return true if switches should be turned into lookup tables for the
+ /// target.
+@@ -934,7 +934,8 @@ public:
+ /// creating vectors that span multiple vector registers.
+ /// If false, the vectorization factor will be chosen based on the
+ /// size of the widest element type.
+- bool shouldMaximizeVectorBandwidth() const;
++ /// \p K Register Kind for vectorization.
++ bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;
+
+ /// \return The minimum vectorization factor for types of given element
+ /// bit width, or 0 if there is no minimum VF. The returned value only
+@@ -1571,7 +1572,7 @@ public:
+ virtual bool isProfitableToHoist(Instruction *I) = 0;
+ virtual bool useAA() = 0;
+ virtual bool isTypeLegal(Type *Ty) = 0;
+- virtual InstructionCost getRegUsageForType(Type *Ty) = 0;
++ virtual unsigned getRegUsageForType(Type *Ty) = 0;
+ virtual bool shouldBuildLookupTables() = 0;
+ virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
+ virtual bool shouldBuildRelLookupTables() = 0;
+@@ -1618,7 +1619,8 @@ public:
+ virtual unsigned getMinVectorRegisterBitWidth() const = 0;
+ virtual Optional<unsigned> getMaxVScale() const = 0;
+ virtual Optional<unsigned> getVScaleForTuning() const = 0;
+- virtual bool shouldMaximizeVectorBandwidth() const = 0;
++ virtual bool
++ shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const = 0;
+ virtual ElementCount getMinimumVF(unsigned ElemWidth,
+ bool IsScalable) const = 0;
+ virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0;
+@@ -2001,7 +2003,7 @@ public:
+ }
+ bool useAA() override { return Impl.useAA(); }
+ bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); }
+- InstructionCost getRegUsageForType(Type *Ty) override {
++ unsigned getRegUsageForType(Type *Ty) override {
+ return Impl.getRegUsageForType(Ty);
+ }
+ bool shouldBuildLookupTables() override {
+@@ -2108,8 +2110,9 @@ public:
+ Optional<unsigned> getVScaleForTuning() const override {
+ return Impl.getVScaleForTuning();
+ }
+- bool shouldMaximizeVectorBandwidth() const override {
+- return Impl.shouldMaximizeVectorBandwidth();
++ bool shouldMaximizeVectorBandwidth(
++ TargetTransformInfo::RegisterKind K) const override {
++ return Impl.shouldMaximizeVectorBandwidth(K);
+ }
+ ElementCount getMinimumVF(unsigned ElemWidth,
+ bool IsScalable) const override {
+diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+index a32744f8d58b..28ce1690202d 100644
+--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
++++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+@@ -310,7 +310,7 @@ public:
+
+ bool isTypeLegal(Type *Ty) const { return false; }
+
+- InstructionCost getRegUsageForType(Type *Ty) const { return 1; }
++ unsigned getRegUsageForType(Type *Ty) const { return 1; }
+
+ bool shouldBuildLookupTables() const { return true; }
+
+@@ -415,7 +415,10 @@ public:
+ Optional<unsigned> getMaxVScale() const { return None; }
+ Optional<unsigned> getVScaleForTuning() const { return None; }
+
+- bool shouldMaximizeVectorBandwidth() const { return false; }
++ bool
++ shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const {
++ return false;
++ }
+
+ ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const {
+ return ElementCount::get(0, IsScalable);
+diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+index 0b2737628923..39c8eaf6206b 100644
+--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
++++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+@@ -362,10 +362,9 @@ public:
+ return getTLI()->isTypeLegal(VT);
+ }
+
+- InstructionCost getRegUsageForType(Type *Ty) {
+- InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
+- assert(Val >= 0 && "Negative cost!");
+- return Val;
++ unsigned getRegUsageForType(Type *Ty) {
++ EVT ETy = getTLI()->getValueType(DL, Ty);
++ return getTLI()->getNumRegisters(Ty->getContext(), ETy);
+ }
+
+ InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
+diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
+index 25e9dee98e13..7ec752990620 100644
+--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
++++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
+@@ -470,7 +470,7 @@ bool TargetTransformInfo::isTypeLegal(Type *Ty) const {
+ return TTIImpl->isTypeLegal(Ty);
+ }
+
+-InstructionCost TargetTransformInfo::getRegUsageForType(Type *Ty) const {
++unsigned TargetTransformInfo::getRegUsageForType(Type *Ty) const {
+ return TTIImpl->getRegUsageForType(Ty);
+ }
+
+@@ -623,8 +623,9 @@ Optional<unsigned> TargetTransformInfo::getVScaleForTuning() const {
+ return TTIImpl->getVScaleForTuning();
+ }
+
+-bool TargetTransformInfo::shouldMaximizeVectorBandwidth() const {
+- return TTIImpl->shouldMaximizeVectorBandwidth();
++bool TargetTransformInfo::shouldMaximizeVectorBandwidth(
++ TargetTransformInfo::RegisterKind K) const {
++ return TTIImpl->shouldMaximizeVectorBandwidth(K);
+ }
+
+ ElementCount TargetTransformInfo::getMinimumVF(unsigned ElemWidth,
+diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+index b2ffdf949d8b..c245b29b6d8a 100644
+--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
++++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+@@ -50,6 +50,12 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
+ return (CallerBits & CalleeBits) == CalleeBits;
+ }
+
++bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
++ TargetTransformInfo::RegisterKind K) const {
++ assert(K != TargetTransformInfo::RGK_Scalar);
++ return K == TargetTransformInfo::RGK_FixedWidthVector;
++}
++
+ /// Calculate the cost of materializing a 64-bit value. This helper
+ /// method might only calculate a fraction of a larger immediate. Therefore it
+ /// is valid to return a cost of ZERO.
+diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+index a6029b9f2445..b7b11d196f1c 100644
+--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
++++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+@@ -135,6 +135,8 @@ public:
+ return ST->getVScaleForTuning();
+ }
+
++ bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;
++
+ /// Try to return an estimate cost factor that can be used as a multiplier
+ /// when scalarizing an operation for a vector with ElementCount \p VF.
+ /// For scalable vectors this currently takes the most pessimistic view based
+diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+index 9e637dfc3e16..7bc7bbf10614 100644
+--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
++++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+@@ -86,12 +86,11 @@ public:
+ unsigned getMinVectorRegisterBitWidth() const;
+ ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const;
+
+- bool shouldMaximizeVectorBandwidth() const {
++ bool
++ shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const {
+ return true;
+ }
+- bool supportsEfficientVectorElementLoadStore() {
+- return false;
+- }
++ bool supportsEfficientVectorElementLoadStore() { return false; }
+ bool hasBranchDivergence() {
+ return false;
+ }
+diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+index 99e6774a02e4..26ac8d872800 100644
+--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
++++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+@@ -276,7 +276,7 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ BaseT::getPeelingPreferences(L, SE, PP);
+ }
+
+-InstructionCost RISCVTTIImpl::getRegUsageForType(Type *Ty) {
++unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
+ TypeSize Size = Ty->getPrimitiveSizeInBits();
+ if (Ty->isVectorTy()) {
+ if (Size.isScalable() && ST->hasVInstructions())
+diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+index e79c4f75712b..959a1433e689 100644
+--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
++++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+@@ -60,7 +60,7 @@ public:
+
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
+
+- InstructionCost getRegUsageForType(Type *Ty);
++ unsigned getRegUsageForType(Type *Ty);
+
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP,
+diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+index 46ff0994e04e..c41726b11aca 100644
+--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
++++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+@@ -5560,9 +5560,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
+ return ElementCount::getFixed(ClampedConstTripCount);
+ }
+
++ TargetTransformInfo::RegisterKind RegKind =
++ ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
++ : TargetTransformInfo::RGK_FixedWidthVector;
+ ElementCount MaxVF = MaxVectorElementCount;
+- if (TTI.shouldMaximizeVectorBandwidth() ||
+- (MaximizeBandwidth && isScalarEpilogueAllowed())) {
++ if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
++ TTI.shouldMaximizeVectorBandwidth(RegKind))) {
+ auto MaxVectorElementCountMaxBW = ElementCount::get(
+ PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
+ ComputeScalableMaxVF);
+@@ -6319,16 +6322,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
+
+ LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+
+- // A lambda that gets the register usage for the given type and VF.
+- const auto &TTICapture = TTI;
+- auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
++ auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
+ if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
+ return 0;
+- InstructionCost::CostType RegUsage =
+- *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
+- assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
+- "Nonsensical values for register usage.");
+- return RegUsage;
++ return TTI.getRegUsageForType(VectorType::get(Ty, VF));
+ };
+
+ for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
+diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
+index 371d209bafff..a1ca0fea7972 100644
+--- a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
++++ b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
+@@ -4,11 +4,12 @@
+ ; are not profitable.
+
+ ; Test with a loop that contains memory accesses of i8 and i32 types. The
+-; default maximum VF for NEON is 4. And while we don't have an instruction to
+-; load 4 x i8, vectorization might still be profitable.
++; maximum VF for NEON is calculated by 128/size of smallest type in loop.
++; And while we don't have an instruction to load 4 x i8, vectorization
++; might still be profitable.
+ define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
+ ; CHECK-LABEL: @test_load_i8_store_i32(
+-; CHECK: <4 x i8>
++; CHECK: <16 x i8>
+ ;
+ entry:
+ br label %loop
+@@ -32,7 +33,7 @@ exit:
+ ; Same as test_load_i8_store_i32, but with types flipped for load and store.
+ define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
+ ; CHECK-LABEL: @test_load_i32_store_i8(
+-; CHECK: <4 x i8>
++; CHECK: <16 x i8>
+ ;
+ entry:
+ br label %loop
+@@ -84,7 +85,7 @@ exit:
+ ; vectorization factor.
+ define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) {
+ ; CHECK-LABEL: @test_load_i8_store_i64_large
+-; CHECK: <2 x i64>
++; CHECK: <8 x i64>
+ ;
+ entry:
+ br label %loop
+diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
+new file mode 100644
+index 000000000000..f0dc8e502769
+--- /dev/null
++++ b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
+@@ -0,0 +1,57 @@
++; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
++; REQUIRES: asserts
++
++target triple = "aarch64"
++
++; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
++
++; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from <stdin>
++; CHECK: LV(REG): VF = 32
++; CHECK-NEXT: LV(REG): Found max usage: 2 item
++; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers
++; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
++
++define i1 @or_reduction_neon(i32 %arg, ptr %ptr) {
++entry:
++ br label %loop
++exit:
++ ret i1 %reduction_next
++loop:
++ %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
++ %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
++ %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
++ %loaded = load i32, ptr %gep
++ %i1 = icmp eq i32 %loaded, %induction
++ %reduction_next = or i1 %i1, %reduction
++ %induction_next = add nuw i32 %induction, 1
++ %cond = icmp eq i32 %induction_next, %arg
++ br i1 %cond, label %exit, label %loop, !llvm.loop !32
++}
++
++; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve'
++; CHECK: LV(REG): VF = 64
++; CHECK-NEXT: LV(REG): Found max usage: 2 item
++; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
++; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
++
++define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" {
++entry:
++ br label %loop
++exit:
++ ret i1 %reduction_next
++loop:
++ %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
++ %reduction = phi i1 [ true, %entry ], [ %reduction_next, %loop ]
++ %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
++ %loaded = load i32, ptr %gep
++ %i1 = icmp eq i32 %loaded, %induction
++ %reduction_next = or i1 %i1, %reduction
++ %induction_next = add nuw i32 %induction, 1
++ %cond = icmp eq i32 %induction_next, %arg
++ br i1 %cond, label %exit, label %loop, !llvm.loop !64
++}
++
++!32 = distinct !{!32, !33}
++!33 = !{!"llvm.loop.vectorize.width", i32 32}
++!64 = distinct !{!64, !65}
++!65 = !{!"llvm.loop.vectorize.width", i32 64}
+diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+index e6e43375204d..28eabe382dfb 100644
+--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
++++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+@@ -116,9 +116,9 @@ for.body: ; preds = %entry, %for.body
+ }
+
+ ; CHECK-LABEL: @add_d(
+-; CHECK: load <4 x i16>
+-; CHECK: add nsw <4 x i32>
+-; CHECK: store <4 x i32>
++; CHECK: load <8 x i16>
++; CHECK: add nsw <8 x i32>
++; CHECK: store <8 x i32>
+ define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
+ entry:
+ %cmp7 = icmp sgt i32 %len, 0
+diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
+index a95c0aa6f375..071255c4f4f0 100644
+--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
++++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
+@@ -123,16 +123,16 @@ for.body:
+ ; }
+ ;
+ ; CHECK: vector.body:
+-; CHECK: phi <8 x i16>
+-; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
+-; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16>
+-; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
+-; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16>
+-; CHECK: add <8 x i16>
+-; CHECK: add <8 x i16>
++; CHECK: phi <16 x i16>
++; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8>
++; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16>
++; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8>
++; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16>
++; CHECK: add <16 x i16>
++; CHECK: add <16 x i16>
+ ;
+ ; CHECK: middle.block:
+-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
++; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16>
+ ; CHECK: zext i16 [[Rdx]] to i32
+ ;
+ define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
+diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
+index 27868480c23b..262236075f7c 100644
+--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
++++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
+@@ -29,7 +29,7 @@
+ ; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
+
+ ; VF-4: <4 x i32>
+-; VF-VSCALE4: <vscale x 4 x i32>
++; VF-VSCALE4: <16 x i32>
+ define void @test0(i32* %a, i8* %b, i32* %c) #0 {
+ entry:
+ br label %loop
+diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
+index 9bd9c31d32d3..1d2c70db11cf 100644
+--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
++++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
+@@ -9,9 +9,9 @@
+ define void @test0(i32* %a, i8* %b, i32* %c) #0 {
+ ; CHECK: LV: Checking a loop in "test0"
+ ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
+-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
++; CHECK_SCALABLE_ON: LV: Selecting VF: 16
+ ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
++; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
+ ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16
+ ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: vscale x 16
+ entry:
+@@ -40,9 +40,9 @@ exit:
+ define void @test1(i32* %a, i8* %b) #0 {
+ ; CHECK: LV: Checking a loop in "test1"
+ ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
+-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
++; CHECK_SCALABLE_ON: LV: Selecting VF: 16
+ ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
++; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
+ ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4
+ ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
+ entry:
+@@ -72,9 +72,9 @@ exit:
+ define void @test2(i32* %a, i8* %b) #0 {
+ ; CHECK: LV: Checking a loop in "test2"
+ ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
+-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 2
++; CHECK_SCALABLE_ON: LV: Selecting VF: 16
+ ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
++; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
+ ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2
+ ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
+ entry:
+@@ -104,9 +104,9 @@ exit:
+ define void @test3(i32* %a, i8* %b) #0 {
+ ; CHECK: LV: Checking a loop in "test3"
+ ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1
+-; CHECK_SCALABLE_ON: LV: Selecting VF: 4
++; CHECK_SCALABLE_ON: LV: Selecting VF: 16
+ ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
++; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
+ ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1
+ ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
+ entry:
+diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
+index 4d0886f4d953..43ef43c11507 100644
+--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
++++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
+@@ -83,11 +83,11 @@ for.end:
+ define void @uniform_store_i1(i1* noalias %dst, i64* noalias %start, i64 %N) {
+ ; CHECK-LABEL: @uniform_store_i1
+ ; CHECK: vector.body
+-; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <2 x i64*> {{.*}}, i64 1
+-; CHECK: %[[ICMP:.*]] = icmp eq <2 x i64*> %[[GEP]], %[[SPLAT:.*]]
+-; CHECK: %[[EXTRACT1:.*]] = extractelement <2 x i1> %[[ICMP]], i32 0
++; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <64 x i64*> {{.*}}, i64 1
++; CHECK: %[[ICMP:.*]] = icmp eq <64 x i64*> %[[GEP]], %[[SPLAT:.*]]
++; CHECK: %[[EXTRACT1:.*]] = extractelement <64 x i1> %[[ICMP]], i32 0
+ ; CHECK: store i1 %[[EXTRACT1]], i1* %dst
+-; CHECK: %[[EXTRACT2:.*]] = extractelement <2 x i1> %[[ICMP]], i32 1
++; CHECK: %[[EXTRACT2:.*]] = extractelement <64 x i1> %[[ICMP]], i32 1
+ ; CHECK: store i1 %[[EXTRACT2]], i1* %dst
+ ; CHECK-NOT: vscale
+ entry:
+diff --git a/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
+new file mode 100644
+index 000000000000..4cab716c7544
+--- /dev/null
++++ b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
+@@ -0,0 +1,32 @@
++; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
++; REQUIRES: asserts
++
++target triple = "x86_64"
++
++; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
++
++; CHECK-LABEL: LV: Checking a loop in 'or_reduction_avx' from <stdin>
++; CHECK: LV(REG): VF = 64
++; CHECK-NEXT: LV(REG): Found max usage: 2 item
++; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
++; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
++
++define i1 @or_reduction_avx(i32 %arg, ptr %ptr) "target-features"="+avx" {
++entry:
++ br label %loop
++exit:
++ ret i1 %reduction_next
++loop:
++ %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
++ %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
++ %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
++ %loaded = load i32, ptr %gep
++ %i1 = icmp eq i32 %loaded, %induction
++ %reduction_next = or i1 %i1, %reduction
++ %induction_next = add nuw i32 %induction, 1
++ %cond = icmp eq i32 %induction_next, %arg
++ br i1 %cond, label %exit, label %loop, !llvm.loop !64
++}
++
++!64 = distinct !{!64, !65}
++!65 = !{!"llvm.loop.vectorize.width", i32 64}
diff --git a/patches/cherry/a8de8cab7006bc885804e8a2c0a6902702521cfe.patch b/patches/cherry/a8de8cab7006bc885804e8a2c0a6902702521cfe.patch
new file mode 100644
index 0000000..99b9594
--- /dev/null
+++ b/patches/cherry/a8de8cab7006bc885804e8a2c0a6902702521cfe.patch
@@ -0,0 +1,1910 @@
+From a8de8cab7006bc885804e8a2c0a6902702521cfe Mon Sep 17 00:00:00 2001
+From: Cullen Rhodes <cullen.rhodes@arm.com>
+Date: Fri, 22 Jul 2022 07:26:54 +0000
+Subject: [PATCH] [AArch64] Add fcmp fast math tests
+
+Reviewed By: paulwalker-arm
+
+Differential Revision: https://reviews.llvm.org/D130094
+---
+ .../AArch64/neon-compare-instructions.ll | 1887 +++++++++++++++++
+ 1 file changed, 1887 insertions(+)
+
+diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
+index bd665955eb99..dcb0ca631c5b 100644
+--- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
++++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
+@@ -4494,3 +4494,1890 @@ define <2 x i64> @fcmunoz2xdouble(<2 x double> %A) {
+ ret <2 x i64> %tmp4
+
+ }
++
++define <2 x i32> @fcmoeq2xfloat_fast(<2 x float> %A, <2 x float> %B) {
++; CHECK-LABEL: fcmoeq2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmoeq2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmeq v0.2s, v0.2s, v1.2s
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast oeq <2 x float> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmoeq4xfloat_fast(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmoeq4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmoeq4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmeq v0.4s, v0.4s, v1.4s
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast oeq <4 x float> %A, %B
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++define <2 x i64> @fcmoeq2xdouble_fast(<2 x double> %A, <2 x double> %B) {
++; CHECK-LABEL: fcmoeq2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2d, v0.2d, v1.2d
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmoeq2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmeq v0.2d, v0.2d, v1.2d
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast oeq <2 x double> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmoge2xfloat_fast(<2 x float> %A, <2 x float> %B) {
++; CHECK-LABEL: fcmoge2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v0.2s, v0.2s, v1.2s
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmoge2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.2s, v0.2s, v1.2s
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast oge <2 x float> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmoge4xfloat_fast(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmoge4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v0.4s, v0.4s, v1.4s
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmoge4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.4s, v0.4s, v1.4s
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast oge <4 x float> %A, %B
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++define <2 x i64> @fcmoge2xdouble_fast(<2 x double> %A, <2 x double> %B) {
++; CHECK-LABEL: fcmoge2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v0.2d, v0.2d, v1.2d
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmoge2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.2d, v0.2d, v1.2d
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast oge <2 x double> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmogt2xfloat_fast(<2 x float> %A, <2 x float> %B) {
++; CHECK-LABEL: fcmogt2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmgt v0.2s, v0.2s, v1.2s
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmogt2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.2s, v0.2s, v1.2s
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ogt <2 x float> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmogt4xfloat_fast(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmogt4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmgt v0.4s, v0.4s, v1.4s
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmogt4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.4s, v0.4s, v1.4s
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ogt <4 x float> %A, %B
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++define <2 x i64> @fcmogt2xdouble_fast(<2 x double> %A, <2 x double> %B) {
++; CHECK-LABEL: fcmogt2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmgt v0.2d, v0.2d, v1.2d
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmogt2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.2d, v0.2d, v1.2d
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ogt <2 x double> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmole2xfloat_fast(<2 x float> %A, <2 x float> %B) {
++; CHECK-LABEL: fcmole2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
++; CHECK-NEXT: mov s2, v1.s[1]
++; CHECK-NEXT: mov s3, v0.s[1]
++; CHECK-NEXT: fcmp s3, s2
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: csetm w9, le
++; CHECK-NEXT: fmov s0, w9
++; CHECK-NEXT: mov v0.s[1], w8
++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmole2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.2s, v1.2s, v0.2s
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ole <2 x float> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmole4xfloat_fast(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmole4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov s2, v1.s[1]
++; CHECK-NEXT: mov s3, v0.s[1]
++; CHECK-NEXT: mov s4, v0.s[2]
++; CHECK-NEXT: fcmp s3, s2
++; CHECK-NEXT: mov s3, v1.s[2]
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: mov s1, v1.s[3]
++; CHECK-NEXT: mov s0, v0.s[3]
++; CHECK-NEXT: csetm w9, le
++; CHECK-NEXT: fcmp s4, s3
++; CHECK-NEXT: fmov s2, w9
++; CHECK-NEXT: mov v2.s[1], w8
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: mov v2.s[2], w8
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: mov v2.s[3], w8
++; CHECK-NEXT: mov v0.16b, v2.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmole4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.4s, v1.4s, v0.4s
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ole <4 x float> %A, %B
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmole2xdouble_fast(<2 x double> %A, <2 x double> %B) {
++; CHECK-LABEL: fcmole2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d2, v1.d[1]
++; CHECK-NEXT: mov d3, v0.d[1]
++; CHECK-NEXT: fcmp d3, d2
++; CHECK-NEXT: csetm x8, le
++; CHECK-NEXT: fcmp d0, d1
++; CHECK-NEXT: csetm x9, le
++; CHECK-NEXT: fmov d0, x9
++; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmole2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.2d, v1.2d, v0.2d
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ole <2 x double> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmolt2xfloat_fast(<2 x float> %A, <2 x float> %B) {
++; CHECK-LABEL: fcmolt2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
++; CHECK-NEXT: mov s2, v1.s[1]
++; CHECK-NEXT: mov s3, v0.s[1]
++; CHECK-NEXT: fcmp s3, s2
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: csetm w9, lt
++; CHECK-NEXT: fmov s0, w9
++; CHECK-NEXT: mov v0.s[1], w8
++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmolt2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.2s, v1.2s, v0.2s
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast olt <2 x float> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmolt4xfloat_fast(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmolt4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov s2, v1.s[1]
++; CHECK-NEXT: mov s3, v0.s[1]
++; CHECK-NEXT: mov s4, v0.s[2]
++; CHECK-NEXT: fcmp s3, s2
++; CHECK-NEXT: mov s3, v1.s[2]
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: mov s1, v1.s[3]
++; CHECK-NEXT: mov s0, v0.s[3]
++; CHECK-NEXT: csetm w9, lt
++; CHECK-NEXT: fcmp s4, s3
++; CHECK-NEXT: fmov s2, w9
++; CHECK-NEXT: mov v2.s[1], w8
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: mov v2.s[2], w8
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: mov v2.s[3], w8
++; CHECK-NEXT: mov v0.16b, v2.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmolt4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.4s, v1.4s, v0.4s
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast olt <4 x float> %A, %B
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmolt2xdouble_fast(<2 x double> %A, <2 x double> %B) {
++; CHECK-LABEL: fcmolt2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d2, v1.d[1]
++; CHECK-NEXT: mov d3, v0.d[1]
++; CHECK-NEXT: fcmp d3, d2
++; CHECK-NEXT: csetm x8, lt
++; CHECK-NEXT: fcmp d0, d1
++; CHECK-NEXT: csetm x9, lt
++; CHECK-NEXT: fmov d0, x9
++; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmolt2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.2d, v1.2d, v0.2d
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast olt <2 x double> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmone2xfloat_fast(<2 x float> %A, <2 x float> %B) {
++; CHECK-LABEL: fcmone2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s
++; CHECK-NEXT: mvn v0.8b, v0.8b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmone2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v2.2s, v0.2s, v1.2s
++; GISEL-NEXT: fcmgt v0.2s, v1.2s, v0.2s
++; GISEL-NEXT: orr v0.8b, v0.8b, v2.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast one <2 x float> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmone4xfloat_fast(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmone4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s
++; CHECK-NEXT: mvn v0.16b, v0.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmone4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v2.4s, v0.4s, v1.4s
++; GISEL-NEXT: fcmgt v0.4s, v1.4s, v0.4s
++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast one <4 x float> %A, %B
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmone2xdouble_fast(<2 x double> %A, <2 x double> %B) {
++; CHECK-LABEL: fcmone2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2d, v0.2d, v1.2d
++; CHECK-NEXT: mvn v0.16b, v0.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmone2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v2.2d, v0.2d, v1.2d
++; GISEL-NEXT: fcmgt v0.2d, v1.2d, v0.2d
++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast one <2 x double> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmord2xfloat_fast(<2 x float> %A, <2 x float> %B) {
++; CHECK-LABEL: fcmord2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v2.2s, v0.2s, v1.2s
++; CHECK-NEXT: fcmgt v0.2s, v1.2s, v0.2s
++; CHECK-NEXT: orr v0.8b, v0.8b, v2.8b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmord2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v2.2s, v0.2s, v1.2s
++; GISEL-NEXT: fcmgt v0.2s, v1.2s, v0.2s
++; GISEL-NEXT: orr v0.8b, v0.8b, v2.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ord <2 x float> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmord4xfloat_fast(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmord4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v2.4s, v0.4s, v1.4s
++; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
++; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmord4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v2.4s, v0.4s, v1.4s
++; GISEL-NEXT: fcmgt v0.4s, v1.4s, v0.4s
++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ord <4 x float> %A, %B
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmord2xdouble_fast(<2 x double> %A, <2 x double> %B) {
++; CHECK-LABEL: fcmord2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v2.2d, v0.2d, v1.2d
++; CHECK-NEXT: fcmgt v0.2d, v1.2d, v0.2d
++; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmord2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v2.2d, v0.2d, v1.2d
++; GISEL-NEXT: fcmgt v0.2d, v1.2d, v0.2d
++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ord <2 x double> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++
++define <2 x i32> @fcmuno2xfloat_fast(<2 x float> %A, <2 x float> %B) {
++; CHECK-LABEL: fcmuno2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v2.2s, v0.2s, v1.2s
++; CHECK-NEXT: fcmgt v0.2s, v1.2s, v0.2s
++; CHECK-NEXT: orr v0.8b, v0.8b, v2.8b
++; CHECK-NEXT: mvn v0.8b, v0.8b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmuno2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v2.2s, v0.2s, v1.2s
++; GISEL-NEXT: fcmgt v0.2s, v1.2s, v0.2s
++; GISEL-NEXT: orr v0.8b, v0.8b, v2.8b
++; GISEL-NEXT: mvn v0.8b, v0.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast uno <2 x float> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmuno4xfloat_fast(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmuno4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v2.4s, v0.4s, v1.4s
++; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
++; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
++; CHECK-NEXT: mvn v0.16b, v0.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmuno4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v2.4s, v0.4s, v1.4s
++; GISEL-NEXT: fcmgt v0.4s, v1.4s, v0.4s
++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast uno <4 x float> %A, %B
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmuno2xdouble_fast(<2 x double> %A, <2 x double> %B) {
++; CHECK-LABEL: fcmuno2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v2.2d, v0.2d, v1.2d
++; CHECK-NEXT: fcmgt v0.2d, v1.2d, v0.2d
++; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
++; CHECK-NEXT: mvn v0.16b, v0.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmuno2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v2.2d, v0.2d, v1.2d
++; GISEL-NEXT: fcmgt v0.2d, v1.2d, v0.2d
++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast uno <2 x double> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmueq2xfloat_fast(<2 x float> %A, <2 x float> %B) {
++; CHECK-LABEL: fcmueq2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmueq2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v2.2s, v0.2s, v1.2s
++; GISEL-NEXT: fcmgt v0.2s, v1.2s, v0.2s
++; GISEL-NEXT: orr v0.8b, v0.8b, v2.8b
++; GISEL-NEXT: mvn v0.8b, v0.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ueq <2 x float> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmueq4xfloat_fast(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmueq4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmueq4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v2.4s, v0.4s, v1.4s
++; GISEL-NEXT: fcmgt v0.4s, v1.4s, v0.4s
++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ueq <4 x float> %A, %B
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmueq2xdouble_fast(<2 x double> %A, <2 x double> %B) {
++; CHECK-LABEL: fcmueq2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2d, v0.2d, v1.2d
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmueq2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v2.2d, v0.2d, v1.2d
++; GISEL-NEXT: fcmgt v0.2d, v1.2d, v0.2d
++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ueq <2 x double> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmuge2xfloat_fast(<2 x float> %A, <2 x float> %B) {
++; CHECK-LABEL: fcmuge2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v0.2s, v0.2s, v1.2s
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmuge2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.2s, v1.2s, v0.2s
++; GISEL-NEXT: mvn v0.8b, v0.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast uge <2 x float> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmuge4xfloat_fast(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmuge4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v0.4s, v0.4s, v1.4s
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmuge4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.4s, v1.4s, v0.4s
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast uge <4 x float> %A, %B
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmuge2xdouble_fast(<2 x double> %A, <2 x double> %B) {
++; CHECK-LABEL: fcmuge2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v0.2d, v0.2d, v1.2d
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmuge2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.2d, v1.2d, v0.2d
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast uge <2 x double> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmugt2xfloat_fast(<2 x float> %A, <2 x float> %B) {
++; CHECK-LABEL: fcmugt2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmgt v0.2s, v0.2s, v1.2s
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmugt2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.2s, v1.2s, v0.2s
++; GISEL-NEXT: mvn v0.8b, v0.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ugt <2 x float> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmugt4xfloat_fast(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmugt4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmgt v0.4s, v0.4s, v1.4s
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmugt4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.4s, v1.4s, v0.4s
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ugt <4 x float> %A, %B
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmugt2xdouble_fast(<2 x double> %A, <2 x double> %B) {
++; CHECK-LABEL: fcmugt2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmgt v0.2d, v0.2d, v1.2d
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmugt2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.2d, v1.2d, v0.2d
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ugt <2 x double> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmule2xfloat_fast(<2 x float> %A, <2 x float> %B) {
++; CHECK-LABEL: fcmule2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
++; CHECK-NEXT: mov s2, v1.s[1]
++; CHECK-NEXT: mov s3, v0.s[1]
++; CHECK-NEXT: fcmp s3, s2
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: csetm w9, le
++; CHECK-NEXT: fmov s0, w9
++; CHECK-NEXT: mov v0.s[1], w8
++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmule2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.2s, v0.2s, v1.2s
++; GISEL-NEXT: mvn v0.8b, v0.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ule <2 x float> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmule4xfloat_fast(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmule4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov s2, v1.s[1]
++; CHECK-NEXT: mov s3, v0.s[1]
++; CHECK-NEXT: mov s4, v0.s[2]
++; CHECK-NEXT: fcmp s3, s2
++; CHECK-NEXT: mov s3, v1.s[2]
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: mov s1, v1.s[3]
++; CHECK-NEXT: mov s0, v0.s[3]
++; CHECK-NEXT: csetm w9, le
++; CHECK-NEXT: fcmp s4, s3
++; CHECK-NEXT: fmov s2, w9
++; CHECK-NEXT: mov v2.s[1], w8
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: mov v2.s[2], w8
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: mov v2.s[3], w8
++; CHECK-NEXT: mov v0.16b, v2.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmule4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.4s, v0.4s, v1.4s
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ule <4 x float> %A, %B
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmule2xdouble_fast(<2 x double> %A, <2 x double> %B) {
++; CHECK-LABEL: fcmule2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d2, v1.d[1]
++; CHECK-NEXT: mov d3, v0.d[1]
++; CHECK-NEXT: fcmp d3, d2
++; CHECK-NEXT: csetm x8, le
++; CHECK-NEXT: fcmp d0, d1
++; CHECK-NEXT: csetm x9, le
++; CHECK-NEXT: fmov d0, x9
++; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmule2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.2d, v0.2d, v1.2d
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ule <2 x double> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmult2xfloat_fast(<2 x float> %A, <2 x float> %B) {
++; CHECK-LABEL: fcmult2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
++; CHECK-NEXT: mov s2, v1.s[1]
++; CHECK-NEXT: mov s3, v0.s[1]
++; CHECK-NEXT: fcmp s3, s2
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: csetm w9, lt
++; CHECK-NEXT: fmov s0, w9
++; CHECK-NEXT: mov v0.s[1], w8
++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmult2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.2s, v0.2s, v1.2s
++; GISEL-NEXT: mvn v0.8b, v0.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ult <2 x float> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmult4xfloat_fast(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmult4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov s2, v1.s[1]
++; CHECK-NEXT: mov s3, v0.s[1]
++; CHECK-NEXT: mov s4, v0.s[2]
++; CHECK-NEXT: fcmp s3, s2
++; CHECK-NEXT: mov s3, v1.s[2]
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: mov s1, v1.s[3]
++; CHECK-NEXT: mov s0, v0.s[3]
++; CHECK-NEXT: csetm w9, lt
++; CHECK-NEXT: fcmp s4, s3
++; CHECK-NEXT: fmov s2, w9
++; CHECK-NEXT: mov v2.s[1], w8
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: mov v2.s[2], w8
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: mov v2.s[3], w8
++; CHECK-NEXT: mov v0.16b, v2.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmult4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.4s, v0.4s, v1.4s
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ult <4 x float> %A, %B
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmult2xdouble_fast(<2 x double> %A, <2 x double> %B) {
++; CHECK-LABEL: fcmult2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d2, v1.d[1]
++; CHECK-NEXT: mov d3, v0.d[1]
++; CHECK-NEXT: fcmp d3, d2
++; CHECK-NEXT: csetm x8, lt
++; CHECK-NEXT: fcmp d0, d1
++; CHECK-NEXT: csetm x9, lt
++; CHECK-NEXT: fmov d0, x9
++; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmult2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.2d, v0.2d, v1.2d
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ult <2 x double> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmune2xfloat_fast(<2 x float> %A, <2 x float> %B) {
++; CHECK-LABEL: fcmune2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s
++; CHECK-NEXT: mvn v0.8b, v0.8b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmune2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmeq v0.2s, v0.2s, v1.2s
++; GISEL-NEXT: mvn v0.8b, v0.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast une <2 x float> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmune4xfloat_fast(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmune4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s
++; CHECK-NEXT: mvn v0.16b, v0.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmune4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmeq v0.4s, v0.4s, v1.4s
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast une <4 x float> %A, %B
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmune2xdouble_fast(<2 x double> %A, <2 x double> %B) {
++; CHECK-LABEL: fcmune2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2d, v0.2d, v1.2d
++; CHECK-NEXT: mvn v0.16b, v0.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmune2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmeq v0.2d, v0.2d, v1.2d
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast une <2 x double> %A, %B
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmoeqz2xfloat_fast(<2 x float> %A) {
++; CHECK-LABEL: fcmoeqz2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2s, v0.2s, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmoeqz2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmeq v0.2s, v0.2s, #0.0
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast oeq <2 x float> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmoeqz4xfloat_fast(<4 x float> %A) {
++; CHECK-LABEL: fcmoeqz4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmoeqz4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmeq v0.4s, v0.4s, #0.0
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast oeq <4 x float> %A, zeroinitializer
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++define <2 x i64> @fcmoeqz2xdouble_fast(<2 x double> %A) {
++; CHECK-LABEL: fcmoeqz2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmoeqz2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmeq v0.2d, v0.2d, #0.0
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast oeq <2 x double> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++
++define <2 x i32> @fcmogez2xfloat_fast(<2 x float> %A) {
++; CHECK-LABEL: fcmogez2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v0.2s, v0.2s, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmogez2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.2s, v0.2s, #0.0
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast oge <2 x float> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmogez4xfloat_fast(<4 x float> %A) {
++; CHECK-LABEL: fcmogez4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v0.4s, v0.4s, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmogez4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.4s, v0.4s, #0.0
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast oge <4 x float> %A, zeroinitializer
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++define <2 x i64> @fcmogez2xdouble_fast(<2 x double> %A) {
++; CHECK-LABEL: fcmogez2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v0.2d, v0.2d, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmogez2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.2d, v0.2d, #0.0
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast oge <2 x double> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmogtz2xfloat_fast(<2 x float> %A) {
++; CHECK-LABEL: fcmogtz2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmgt v0.2s, v0.2s, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmogtz2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.2s, v0.2s, #0.0
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ogt <2 x float> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmogtz4xfloat_fast(<4 x float> %A) {
++; CHECK-LABEL: fcmogtz4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmgt v0.4s, v0.4s, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmogtz4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.4s, v0.4s, #0.0
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ogt <4 x float> %A, zeroinitializer
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++define <2 x i64> @fcmogtz2xdouble_fast(<2 x double> %A) {
++; CHECK-LABEL: fcmogtz2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmgt v0.2d, v0.2d, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmogtz2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.2d, v0.2d, #0.0
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ogt <2 x double> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmoltz2xfloat_fast(<2 x float> %A) {
++; CHECK-LABEL: fcmoltz2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
++; CHECK-NEXT: mov s1, v0.s[1]
++; CHECK-NEXT: fcmp s1, #0.0
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: fcmp s0, #0.0
++; CHECK-NEXT: csetm w9, lt
++; CHECK-NEXT: fmov s0, w9
++; CHECK-NEXT: mov v0.s[1], w8
++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmoltz2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmlt v0.2s, v0.2s, #0.0
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast olt <2 x float> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmoltz4xfloat_fast(<4 x float> %A) {
++; CHECK-LABEL: fcmoltz4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov s1, v0.s[1]
++; CHECK-NEXT: mov s2, v0.s[2]
++; CHECK-NEXT: fcmp s1, #0.0
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: fcmp s0, #0.0
++; CHECK-NEXT: mov s0, v0.s[3]
++; CHECK-NEXT: csetm w9, lt
++; CHECK-NEXT: fcmp s2, #0.0
++; CHECK-NEXT: fmov s1, w9
++; CHECK-NEXT: mov v1.s[1], w8
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: fcmp s0, #0.0
++; CHECK-NEXT: mov v1.s[2], w8
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: mov v1.s[3], w8
++; CHECK-NEXT: mov v0.16b, v1.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmoltz4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmlt v0.4s, v0.4s, #0.0
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast olt <4 x float> %A, zeroinitializer
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmoltz2xdouble_fast(<2 x double> %A) {
++; CHECK-LABEL: fcmoltz2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d1, v0.d[1]
++; CHECK-NEXT: fcmp d1, #0.0
++; CHECK-NEXT: csetm x8, lt
++; CHECK-NEXT: fcmp d0, #0.0
++; CHECK-NEXT: csetm x9, lt
++; CHECK-NEXT: fmov d0, x9
++; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmoltz2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmlt v0.2d, v0.2d, #0.0
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast olt <2 x double> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmolez2xfloat_fast(<2 x float> %A) {
++; CHECK-LABEL: fcmolez2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
++; CHECK-NEXT: mov s1, v0.s[1]
++; CHECK-NEXT: fcmp s1, #0.0
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, #0.0
++; CHECK-NEXT: csetm w9, le
++; CHECK-NEXT: fmov s0, w9
++; CHECK-NEXT: mov v0.s[1], w8
++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmolez2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmle v0.2s, v0.2s, #0.0
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ole <2 x float> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmolez4xfloat_fast(<4 x float> %A) {
++; CHECK-LABEL: fcmolez4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov s1, v0.s[1]
++; CHECK-NEXT: mov s2, v0.s[2]
++; CHECK-NEXT: fcmp s1, #0.0
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, #0.0
++; CHECK-NEXT: mov s0, v0.s[3]
++; CHECK-NEXT: csetm w9, le
++; CHECK-NEXT: fcmp s2, #0.0
++; CHECK-NEXT: fmov s1, w9
++; CHECK-NEXT: mov v1.s[1], w8
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, #0.0
++; CHECK-NEXT: mov v1.s[2], w8
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: mov v1.s[3], w8
++; CHECK-NEXT: mov v0.16b, v1.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmolez4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmle v0.4s, v0.4s, #0.0
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ole <4 x float> %A, zeroinitializer
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmolez2xdouble_fast(<2 x double> %A) {
++; CHECK-LABEL: fcmolez2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d1, v0.d[1]
++; CHECK-NEXT: fcmp d1, #0.0
++; CHECK-NEXT: csetm x8, le
++; CHECK-NEXT: fcmp d0, #0.0
++; CHECK-NEXT: csetm x9, le
++; CHECK-NEXT: fmov d0, x9
++; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmolez2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmle v0.2d, v0.2d, #0.0
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ole <2 x double> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmonez2xfloat_fast(<2 x float> %A) {
++; CHECK-LABEL: fcmonez2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2s, v0.2s, #0.0
++; CHECK-NEXT: mvn v0.8b, v0.8b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmonez2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v1.2s, v0.2s, #0.0
++; GISEL-NEXT: fcmlt v0.2s, v0.2s, #0.0
++; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast one <2 x float> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmonez4xfloat_fast(<4 x float> %A) {
++; CHECK-LABEL: fcmonez4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0
++; CHECK-NEXT: mvn v0.16b, v0.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmonez4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v1.4s, v0.4s, #0.0
++; GISEL-NEXT: fcmlt v0.4s, v0.4s, #0.0
++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast one <4 x float> %A, zeroinitializer
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmonez2xdouble_fast(<2 x double> %A) {
++; CHECK-LABEL: fcmonez2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0
++; CHECK-NEXT: mvn v0.16b, v0.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmonez2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v1.2d, v0.2d, #0.0
++; GISEL-NEXT: fcmlt v0.2d, v0.2d, #0.0
++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast one <2 x double> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmordz2xfloat_fast(<2 x float> %A) {
++; CHECK-LABEL: fcmordz2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v1.2s, v0.2s, #0.0
++; CHECK-NEXT: fcmlt v0.2s, v0.2s, #0.0
++; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmordz2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v1.2s, v0.2s, #0.0
++; GISEL-NEXT: fcmlt v0.2s, v0.2s, #0.0
++; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ord <2 x float> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmordz4xfloat_fast(<4 x float> %A) {
++; CHECK-LABEL: fcmordz4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v1.4s, v0.4s, #0.0
++; CHECK-NEXT: fcmlt v0.4s, v0.4s, #0.0
++; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmordz4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v1.4s, v0.4s, #0.0
++; GISEL-NEXT: fcmlt v0.4s, v0.4s, #0.0
++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ord <4 x float> %A, zeroinitializer
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmordz2xdouble_fast(<2 x double> %A) {
++; CHECK-LABEL: fcmordz2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v1.2d, v0.2d, #0.0
++; CHECK-NEXT: fcmlt v0.2d, v0.2d, #0.0
++; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmordz2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v1.2d, v0.2d, #0.0
++; GISEL-NEXT: fcmlt v0.2d, v0.2d, #0.0
++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ord <2 x double> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmueqz2xfloat_fast(<2 x float> %A) {
++; CHECK-LABEL: fcmueqz2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2s, v0.2s, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmueqz2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v1.2s, v0.2s, #0.0
++; GISEL-NEXT: fcmlt v0.2s, v0.2s, #0.0
++; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b
++; GISEL-NEXT: mvn v0.8b, v0.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ueq <2 x float> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmueqz4xfloat_fast(<4 x float> %A) {
++; CHECK-LABEL: fcmueqz4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmueqz4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v1.4s, v0.4s, #0.0
++; GISEL-NEXT: fcmlt v0.4s, v0.4s, #0.0
++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ueq <4 x float> %A, zeroinitializer
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmueqz2xdouble_fast(<2 x double> %A) {
++; CHECK-LABEL: fcmueqz2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmueqz2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v1.2d, v0.2d, #0.0
++; GISEL-NEXT: fcmlt v0.2d, v0.2d, #0.0
++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ueq <2 x double> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmugez2xfloat_fast(<2 x float> %A) {
++; CHECK-LABEL: fcmugez2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v0.2s, v0.2s, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmugez2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmlt v0.2s, v0.2s, #0.0
++; GISEL-NEXT: mvn v0.8b, v0.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast uge <2 x float> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmugez4xfloat_fast(<4 x float> %A) {
++; CHECK-LABEL: fcmugez4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v0.4s, v0.4s, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmugez4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmlt v0.4s, v0.4s, #0.0
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast uge <4 x float> %A, zeroinitializer
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmugez2xdouble_fast(<2 x double> %A) {
++; CHECK-LABEL: fcmugez2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v0.2d, v0.2d, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmugez2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmlt v0.2d, v0.2d, #0.0
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast uge <2 x double> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmugtz2xfloat_fast(<2 x float> %A) {
++; CHECK-LABEL: fcmugtz2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmgt v0.2s, v0.2s, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmugtz2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmle v0.2s, v0.2s, #0.0
++; GISEL-NEXT: mvn v0.8b, v0.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ugt <2 x float> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmugtz4xfloat_fast(<4 x float> %A) {
++; CHECK-LABEL: fcmugtz4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmgt v0.4s, v0.4s, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmugtz4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmle v0.4s, v0.4s, #0.0
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ugt <4 x float> %A, zeroinitializer
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmugtz2xdouble_fast(<2 x double> %A) {
++; CHECK-LABEL: fcmugtz2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmgt v0.2d, v0.2d, #0.0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmugtz2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmle v0.2d, v0.2d, #0.0
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ugt <2 x double> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmultz2xfloat_fast(<2 x float> %A) {
++; CHECK-LABEL: fcmultz2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
++; CHECK-NEXT: mov s1, v0.s[1]
++; CHECK-NEXT: fcmp s1, #0.0
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: fcmp s0, #0.0
++; CHECK-NEXT: csetm w9, lt
++; CHECK-NEXT: fmov s0, w9
++; CHECK-NEXT: mov v0.s[1], w8
++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmultz2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.2s, v0.2s, #0.0
++; GISEL-NEXT: mvn v0.8b, v0.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ult <2 x float> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmultz4xfloat_fast(<4 x float> %A) {
++; CHECK-LABEL: fcmultz4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov s1, v0.s[1]
++; CHECK-NEXT: mov s2, v0.s[2]
++; CHECK-NEXT: fcmp s1, #0.0
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: fcmp s0, #0.0
++; CHECK-NEXT: mov s0, v0.s[3]
++; CHECK-NEXT: csetm w9, lt
++; CHECK-NEXT: fcmp s2, #0.0
++; CHECK-NEXT: fmov s1, w9
++; CHECK-NEXT: mov v1.s[1], w8
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: fcmp s0, #0.0
++; CHECK-NEXT: mov v1.s[2], w8
++; CHECK-NEXT: csetm w8, lt
++; CHECK-NEXT: mov v1.s[3], w8
++; CHECK-NEXT: mov v0.16b, v1.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmultz4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.4s, v0.4s, #0.0
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ult <4 x float> %A, zeroinitializer
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmultz2xdouble_fast(<2 x double> %A) {
++; CHECK-LABEL: fcmultz2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d1, v0.d[1]
++; CHECK-NEXT: fcmp d1, #0.0
++; CHECK-NEXT: csetm x8, lt
++; CHECK-NEXT: fcmp d0, #0.0
++; CHECK-NEXT: csetm x9, lt
++; CHECK-NEXT: fmov d0, x9
++; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmultz2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v0.2d, v0.2d, #0.0
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ult <2 x double> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++; ULE with zero = !OGT
++define <2 x i32> @fcmulez2xfloat_fast(<2 x float> %A) {
++; CHECK-LABEL: fcmulez2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
++; CHECK-NEXT: mov s1, v0.s[1]
++; CHECK-NEXT: fcmp s1, #0.0
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, #0.0
++; CHECK-NEXT: csetm w9, le
++; CHECK-NEXT: fmov s0, w9
++; CHECK-NEXT: mov v0.s[1], w8
++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmulez2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.2s, v0.2s, #0.0
++; GISEL-NEXT: mvn v0.8b, v0.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ule <2 x float> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmulez4xfloat_fast(<4 x float> %A) {
++; CHECK-LABEL: fcmulez4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov s1, v0.s[1]
++; CHECK-NEXT: mov s2, v0.s[2]
++; CHECK-NEXT: fcmp s1, #0.0
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, #0.0
++; CHECK-NEXT: mov s0, v0.s[3]
++; CHECK-NEXT: csetm w9, le
++; CHECK-NEXT: fcmp s2, #0.0
++; CHECK-NEXT: fmov s1, w9
++; CHECK-NEXT: mov v1.s[1], w8
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, #0.0
++; CHECK-NEXT: mov v1.s[2], w8
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: mov v1.s[3], w8
++; CHECK-NEXT: mov v0.16b, v1.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmulez4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.4s, v0.4s, #0.0
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ule <4 x float> %A, zeroinitializer
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmulez2xdouble_fast(<2 x double> %A) {
++; CHECK-LABEL: fcmulez2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d1, v0.d[1]
++; CHECK-NEXT: fcmp d1, #0.0
++; CHECK-NEXT: csetm x8, le
++; CHECK-NEXT: fcmp d0, #0.0
++; CHECK-NEXT: csetm x9, le
++; CHECK-NEXT: fmov d0, x9
++; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmulez2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.2d, v0.2d, #0.0
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ule <2 x double> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmunez2xfloat_fast(<2 x float> %A) {
++; CHECK-LABEL: fcmunez2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2s, v0.2s, #0.0
++; CHECK-NEXT: mvn v0.8b, v0.8b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmunez2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmeq v0.2s, v0.2s, #0.0
++; GISEL-NEXT: mvn v0.8b, v0.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast une <2 x float> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmunez4xfloat_fast(<4 x float> %A) {
++; CHECK-LABEL: fcmunez4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0
++; CHECK-NEXT: mvn v0.16b, v0.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmunez4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmeq v0.4s, v0.4s, #0.0
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast une <4 x float> %A, zeroinitializer
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmunez2xdouble_fast(<2 x double> %A) {
++; CHECK-LABEL: fcmunez2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0
++; CHECK-NEXT: mvn v0.16b, v0.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmunez2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmeq v0.2d, v0.2d, #0.0
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast une <2 x double> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++}
++
++define <2 x i32> @fcmunoz2xfloat_fast(<2 x float> %A) {
++; CHECK-LABEL: fcmunoz2xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v1.2s, v0.2s, #0.0
++; CHECK-NEXT: fcmlt v0.2s, v0.2s, #0.0
++; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
++; CHECK-NEXT: mvn v0.8b, v0.8b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmunoz2xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v1.2s, v0.2s, #0.0
++; GISEL-NEXT: fcmlt v0.2s, v0.2s, #0.0
++; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b
++; GISEL-NEXT: mvn v0.8b, v0.8b
++; GISEL-NEXT: shl v0.2s, v0.2s, #31
++; GISEL-NEXT: sshr v0.2s, v0.2s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast uno <2 x float> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
++ ret <2 x i32> %tmp4
++}
++
++define <4 x i32> @fcmunoz4xfloat_fast(<4 x float> %A) {
++; CHECK-LABEL: fcmunoz4xfloat_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v1.4s, v0.4s, #0.0
++; CHECK-NEXT: fcmlt v0.4s, v0.4s, #0.0
++; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
++; CHECK-NEXT: mvn v0.16b, v0.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmunoz4xfloat_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v1.4s, v0.4s, #0.0
++; GISEL-NEXT: fcmlt v0.4s, v0.4s, #0.0
++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.4s, v0.4s, #31
++; GISEL-NEXT: sshr v0.4s, v0.4s, #31
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast uno <4 x float> %A, zeroinitializer
++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++define <2 x i64> @fcmunoz2xdouble_fast(<2 x double> %A) {
++; CHECK-LABEL: fcmunoz2xdouble_fast:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcmge v1.2d, v0.2d, #0.0
++; CHECK-NEXT: fcmlt v0.2d, v0.2d, #0.0
++; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
++; CHECK-NEXT: mvn v0.16b, v0.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmunoz2xdouble_fast:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmge v1.2d, v0.2d, #0.0
++; GISEL-NEXT: fcmlt v0.2d, v0.2d, #0.0
++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: shl v0.2d, v0.2d, #63
++; GISEL-NEXT: sshr v0.2d, v0.2d, #63
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast uno <2 x double> %A, zeroinitializer
++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
++ ret <2 x i64> %tmp4
++
++}
++
++; Test SETCC fast-math flags are propagated when combining zext(setcc).
++define <4 x i32> @fcmule4xfloat_fast_zext(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmule4xfloat_fast_zext:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov s3, v1.s[1]
++; CHECK-NEXT: mov s4, v0.s[1]
++; CHECK-NEXT: movi v2.4s, #1
++; CHECK-NEXT: fcmp s4, s3
++; CHECK-NEXT: mov s3, v1.s[2]
++; CHECK-NEXT: mov s4, v0.s[2]
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: mov s1, v1.s[3]
++; CHECK-NEXT: mov s0, v0.s[3]
++; CHECK-NEXT: csetm w9, le
++; CHECK-NEXT: fcmp s4, s3
++; CHECK-NEXT: fmov s3, w9
++; CHECK-NEXT: mov v3.s[1], w8
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: mov v3.s[2], w8
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: mov v3.s[3], w8
++; CHECK-NEXT: and v0.16b, v3.16b, v2.16b
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmule4xfloat_fast_zext:
++; GISEL: // %bb.0:
++; GISEL-NEXT: adrp x8, .LCPI322_0
++; GISEL-NEXT: fcmgt v0.4s, v0.4s, v1.4s
++; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI322_0]
++; GISEL-NEXT: bic v0.16b, v1.16b, v0.16b
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ule <4 x float> %A, %B
++ %tmp4 = zext <4 x i1> %tmp3 to <4 x i32>
++ ret <4 x i32> %tmp4
++}
++
++; Test SETCC fast-math flags are propagated when combining aext(setcc).
++define <4 x i1> @fcmule4xfloat_fast_aext(<4 x float> %A, <4 x float> %B) {
++; CHECK-LABEL: fcmule4xfloat_fast_aext:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov s2, v1.s[1]
++; CHECK-NEXT: mov s3, v0.s[1]
++; CHECK-NEXT: fcmp s3, s2
++; CHECK-NEXT: mov s2, v1.s[2]
++; CHECK-NEXT: mov s3, v0.s[2]
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: mov s1, v1.s[3]
++; CHECK-NEXT: mov s0, v0.s[3]
++; CHECK-NEXT: csetm w9, le
++; CHECK-NEXT: fcmp s3, s2
++; CHECK-NEXT: fmov s4, w9
++; CHECK-NEXT: mov v4.s[1], w8
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: fcmp s0, s1
++; CHECK-NEXT: mov v4.s[2], w8
++; CHECK-NEXT: csetm w8, le
++; CHECK-NEXT: mov v4.s[3], w8
++; CHECK-NEXT: xtn v0.4h, v4.4s
++; CHECK-NEXT: ret
++;
++; GISEL-LABEL: fcmule4xfloat_fast_aext:
++; GISEL: // %bb.0:
++; GISEL-NEXT: fcmgt v0.4s, v0.4s, v1.4s
++; GISEL-NEXT: mvn v0.16b, v0.16b
++; GISEL-NEXT: xtn v0.4h, v0.4s
++; GISEL-NEXT: ret
++ %tmp3 = fcmp fast ule <4 x float> %A, %B
++ ret <4 x i1> %tmp3
++}
+--
+2.34.1
+
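The tests in the patch above all exercise one identity: with fast math (nnan), an unordered predicate such as ule is equivalent to its ordered counterpart ole, which is why the GISEL output lowers it as the complement of ogt (fcmgt followed by mvn). A minimal standalone sketch of that pattern — the function name is illustrative and the llc triple is assumed from similar AArch64 tests, so treat it as a reproduction aid rather than part of the upstream change:

; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
define <2 x i32> @ule_as_not_ogt(<2 x float> %a, <2 x float> %b) {
  ; nnan makes ule equivalent to ole, so this can lower as NOT(fcmgt a, b)
  %c = fcmp fast ule <2 x float> %a, %b
  %s = sext <2 x i1> %c to <2 x i32>
  ret <2 x i32> %s
}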
diff --git a/patches/cherry/a9a012086a917dff367bb63de2d63782b23111fc.patch b/patches/cherry/a9a012086a917dff367bb63de2d63782b23111fc.patch
new file mode 100644
index 0000000..c6abbb2
--- /dev/null
+++ b/patches/cherry/a9a012086a917dff367bb63de2d63782b23111fc.patch
@@ -0,0 +1,72 @@
+From a9a012086a917dff367bb63de2d63782b23111fc Mon Sep 17 00:00:00 2001
+From: Florian Hahn <flo@fhahn.com>
+Date: Thu, 26 May 2022 10:35:38 +0100
+Subject: [PATCH] [AArch64] Add additional tests for sinking free shuffles for
+ FMAs.
+
+---
+ .../AArch64/sink-free-instructions.ll | 41 ++++++++++++++++++-
+ 1 file changed, 39 insertions(+), 2 deletions(-)
+
+diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
+index 244d2c35bbac..5d7a26f65784 100644
+--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
++++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
+@@ -585,6 +585,43 @@ if.else:
+ ret <4 x float> %r.3
+ }
+
++define <4 x float> @sink_shufflevector_first_arg_fma_v4f32(i1 %c, <8 x float> %a, <4 x float> %b) {
++; CHECK-LABEL: @sink_shufflevector_first_arg_fma_v4f32(
++; CHECK-NEXT: entry:
++; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> zeroinitializer
++; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
++; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
++; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
++; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
++; CHECK: if.then:
++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S0]], <4 x float> [[B:%.*]], <4 x float> [[B]])
++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S1]], <4 x float> [[R_0]], <4 x float> [[B]])
++; CHECK-NEXT: ret <4 x float> [[R_1]]
++; CHECK: if.else:
++; CHECK-NEXT: [[R_2:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S2]], <4 x float> [[B]], <4 x float> [[B]])
++; CHECK-NEXT: [[R_3:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S3]], <4 x float> [[R_2]], <4 x float> [[B]])
++; CHECK-NEXT: ret <4 x float> [[R_3]]
++;
++entry:
++ %s0 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> zeroinitializer
++ %s1 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
++ %s2 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
++ %s3 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
++ br i1 %c, label %if.then, label %if.else
++
++if.then:
++ %r.0 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %s0, <4 x float> %b, <4 x float> %b)
++ %r.1 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %s1, <4 x float> %r.0, <4 x float> %b)
++ ret <4 x float> %r.1
++
++if.else:
++ %r.2 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %s2, <4 x float> %b, <4 x float> %b)
++ %r.3 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %s3, <4 x float> %r.2, <4 x float> %b)
++ ret <4 x float> %r.3
++}
++
++
++
+ declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+
+ define <2 x double> @sink_shufflevector_fma_v2f64(i1 %c, <2 x double> %a, <2 x double> %b) {
+@@ -639,8 +676,8 @@ if.else:
+
+ declare <5 x float> @llvm.fma.v5f32(<5 x float>, <5 x float>, <5 x float>)
+
+-define <5 x float> @do_not_sink_shufflevector_fma_v5f32(i1 %c, <8 x float> %a, <5 x float> %b) {
+-; CHECK-LABEL: @do_not_sink_shufflevector_fma_v5f32(
++define <5 x float> @sink_shufflevector_fma_v5f32(i1 %c, <8 x float> %a, <5 x float> %b) {
++; CHECK-LABEL: @sink_shufflevector_fma_v5f32(
+ ; CHECK-NEXT: entry:
+ ; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <5 x i32> zeroinitializer
+ ; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 1, i32 1, i32 1, i32 1, i32 4>
+--
+2.34.1
+
diff --git a/patches/cherry/bb362d890f0d51c250818711d4a9b0b51cea7bc6.patch b/patches/cherry/bb362d890f0d51c250818711d4a9b0b51cea7bc6.patch
new file mode 100644
index 0000000..e7d8543
--- /dev/null
+++ b/patches/cherry/bb362d890f0d51c250818711d4a9b0b51cea7bc6.patch
@@ -0,0 +1,1507 @@
+From bb362d890f0d51c250818711d4a9b0b51cea7bc6 Mon Sep 17 00:00:00 2001
+From: David Green <david.green@arm.com>
+Date: Thu, 10 Feb 2022 21:04:41 +0000
+Subject: [PATCH] [AArch64] Add extra fptoint_sat tests for larger than legal
+ types. NFC
+
+---
+ .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 821 ++++++++++++++++++
+ .../test/CodeGen/AArch64/fptoui-sat-vector.ll | 656 ++++++++++++++
+ 2 files changed, 1477 insertions(+)
+
+diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+index 55e018783f04..3625bd6011fb 100644
+--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
++++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+@@ -2976,3 +2976,824 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) {
+ }
+
+
++declare <8 x i8> @llvm.fptosi.sat.v8f32.v8i8(<8 x float> %f)
++declare <8 x i16> @llvm.fptosi.sat.v8f32.v8i16(<8 x float> %f)
++declare <16 x i8> @llvm.fptosi.sat.v16f32.v16i8(<16 x float> %f)
++declare <16 x i16> @llvm.fptosi.sat.v16f32.v16i16(<16 x float> %f)
++
++declare <16 x i8> @llvm.fptosi.sat.v16f16.v16i8(<16 x half> %f)
++declare <16 x i16> @llvm.fptosi.sat.v16f16.v16i16(<16 x half> %f)
++
++declare <8 x i8> @llvm.fptosi.sat.v8f64.v8i8(<8 x double> %f)
++declare <8 x i16> @llvm.fptosi.sat.v8f64.v8i16(<8 x double> %f)
++declare <16 x i8> @llvm.fptosi.sat.v16f64.v16i8(<16 x double> %f)
++declare <16 x i16> @llvm.fptosi.sat.v16f64.v16i16(<16 x double> %f)
++
++define <8 x i8> @test_signed_v8f32_v8i8(<8 x float> %f) {
++; CHECK-LABEL: test_signed_v8f32_v8i8:
++; CHECK: // %bb.0:
++; CHECK-NEXT: movi v2.4s, #127
++; CHECK-NEXT: fcvtzs v1.4s, v1.4s
++; CHECK-NEXT: fcvtzs v0.4s, v0.4s
++; CHECK-NEXT: mvni v3.4s, #127
++; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s
++; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s
++; CHECK-NEXT: smax v1.4s, v1.4s, v3.4s
++; CHECK-NEXT: smax v0.4s, v0.4s, v3.4s
++; CHECK-NEXT: xtn v1.4h, v1.4s
++; CHECK-NEXT: xtn v0.4h, v0.4s
++; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b
++; CHECK-NEXT: ret
++ %x = call <8 x i8> @llvm.fptosi.sat.v8f32.v8i8(<8 x float> %f)
++ ret <8 x i8> %x
++}
++
++define <16 x i8> @test_signed_v16f32_v16i8(<16 x float> %f) {
++; CHECK-LABEL: test_signed_v16f32_v16i8:
++; CHECK: // %bb.0:
++; CHECK-NEXT: movi v4.4s, #127
++; CHECK-NEXT: fcvtzs v0.4s, v0.4s
++; CHECK-NEXT: mvni v5.4s, #127
++; CHECK-NEXT: fcvtzs v1.4s, v1.4s
++; CHECK-NEXT: fcvtzs v2.4s, v2.4s
++; CHECK-NEXT: smin v0.4s, v0.4s, v4.4s
++; CHECK-NEXT: smin v1.4s, v1.4s, v4.4s
++; CHECK-NEXT: smin v2.4s, v2.4s, v4.4s
++; CHECK-NEXT: smax v0.4s, v0.4s, v5.4s
++; CHECK-NEXT: smax v1.4s, v1.4s, v5.4s
++; CHECK-NEXT: smax v2.4s, v2.4s, v5.4s
++; CHECK-NEXT: xtn v6.4h, v0.4s
++; CHECK-NEXT: umov w8, v6.h[0]
++; CHECK-NEXT: umov w9, v6.h[1]
++; CHECK-NEXT: xtn v1.4h, v1.4s
++; CHECK-NEXT: fmov s0, w8
++; CHECK-NEXT: umov w8, v6.h[2]
++; CHECK-NEXT: mov v0.b[1], w9
++; CHECK-NEXT: mov v0.b[2], w8
++; CHECK-NEXT: umov w8, v6.h[3]
++; CHECK-NEXT: mov v0.b[3], w8
++; CHECK-NEXT: umov w8, v1.h[0]
++; CHECK-NEXT: mov v0.b[4], w8
++; CHECK-NEXT: umov w8, v1.h[1]
++; CHECK-NEXT: mov v0.b[5], w8
++; CHECK-NEXT: umov w8, v1.h[2]
++; CHECK-NEXT: mov v0.b[6], w8
++; CHECK-NEXT: umov w8, v1.h[3]
++; CHECK-NEXT: xtn v1.4h, v2.4s
++; CHECK-NEXT: fcvtzs v2.4s, v3.4s
++; CHECK-NEXT: mov v0.b[7], w8
++; CHECK-NEXT: umov w8, v1.h[0]
++; CHECK-NEXT: smin v2.4s, v2.4s, v4.4s
++; CHECK-NEXT: mov v0.b[8], w8
++; CHECK-NEXT: umov w8, v1.h[1]
++; CHECK-NEXT: smax v2.4s, v2.4s, v5.4s
++; CHECK-NEXT: mov v0.b[9], w8
++; CHECK-NEXT: umov w8, v1.h[2]
++; CHECK-NEXT: mov v0.b[10], w8
++; CHECK-NEXT: umov w8, v1.h[3]
++; CHECK-NEXT: xtn v1.4h, v2.4s
++; CHECK-NEXT: mov v0.b[11], w8
++; CHECK-NEXT: umov w8, v1.h[0]
++; CHECK-NEXT: mov v0.b[12], w8
++; CHECK-NEXT: umov w8, v1.h[1]
++; CHECK-NEXT: mov v0.b[13], w8
++; CHECK-NEXT: umov w8, v1.h[2]
++; CHECK-NEXT: mov v0.b[14], w8
++; CHECK-NEXT: umov w8, v1.h[3]
++; CHECK-NEXT: mov v0.b[15], w8
++; CHECK-NEXT: ret
++ %x = call <16 x i8> @llvm.fptosi.sat.v16f32.v16i8(<16 x float> %f)
++ ret <16 x i8> %x
++}
++
++define <8 x i16> @test_signed_v8f32_v8i16(<8 x float> %f) {
++; CHECK-LABEL: test_signed_v8f32_v8i16:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcvtzs v0.4s, v0.4s
++; CHECK-NEXT: fcvtzs v1.4s, v1.4s
++; CHECK-NEXT: sqxtn v0.4h, v0.4s
++; CHECK-NEXT: sqxtn2 v0.8h, v1.4s
++; CHECK-NEXT: ret
++ %x = call <8 x i16> @llvm.fptosi.sat.v8f32.v8i16(<8 x float> %f)
++ ret <8 x i16> %x
++}
++
++define <16 x i16> @test_signed_v16f32_v16i16(<16 x float> %f) {
++; CHECK-LABEL: test_signed_v16f32_v16i16:
++; CHECK: // %bb.0:
++; CHECK-NEXT: fcvtzs v0.4s, v0.4s
++; CHECK-NEXT: fcvtzs v2.4s, v2.4s
++; CHECK-NEXT: fcvtzs v4.4s, v1.4s
++; CHECK-NEXT: fcvtzs v3.4s, v3.4s
++; CHECK-NEXT: sqxtn v0.4h, v0.4s
++; CHECK-NEXT: sqxtn v1.4h, v2.4s
++; CHECK-NEXT: sqxtn2 v0.8h, v4.4s
++; CHECK-NEXT: sqxtn2 v1.8h, v3.4s
++; CHECK-NEXT: ret
++ %x = call <16 x i16> @llvm.fptosi.sat.v16f32.v16i16(<16 x float> %f)
++ ret <16 x i16> %x
++}
++
++
++
++define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) {
++; CHECK-CVT-LABEL: test_signed_v16f16_v16i8:
++; CHECK-CVT: // %bb.0:
++; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
++; CHECK-CVT-NEXT: mov w8, #127
++; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
++; CHECK-CVT-NEXT: mov w9, #-128
++; CHECK-CVT-NEXT: mov s3, v2.s[1]
++; CHECK-CVT-NEXT: fcvtzs w11, s2
++; CHECK-CVT-NEXT: fcvtzs w10, s3
++; CHECK-CVT-NEXT: mov s3, v2.s[2]
++; CHECK-CVT-NEXT: mov s2, v2.s[3]
++; CHECK-CVT-NEXT: cmp w10, #127
++; CHECK-CVT-NEXT: csel w10, w10, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w12, s3
++; CHECK-CVT-NEXT: cmn w10, #128
++; CHECK-CVT-NEXT: mov s3, v1.s[1]
++; CHECK-CVT-NEXT: csel w10, w10, w9, gt
++; CHECK-CVT-NEXT: cmp w11, #127
++; CHECK-CVT-NEXT: csel w11, w11, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w14, s2
++; CHECK-CVT-NEXT: cmn w11, #128
++; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
++; CHECK-CVT-NEXT: csel w11, w11, w9, gt
++; CHECK-CVT-NEXT: cmp w12, #127
++; CHECK-CVT-NEXT: csel w12, w12, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w15, s3
++; CHECK-CVT-NEXT: cmn w12, #128
++; CHECK-CVT-NEXT: mov s3, v1.s[2]
++; CHECK-CVT-NEXT: csel w13, w12, w9, gt
++; CHECK-CVT-NEXT: cmp w14, #127
++; CHECK-CVT-NEXT: csel w12, w14, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w14, s1
++; CHECK-CVT-NEXT: cmn w12, #128
++; CHECK-CVT-NEXT: mov s1, v1.s[3]
++; CHECK-CVT-NEXT: csel w12, w12, w9, gt
++; CHECK-CVT-NEXT: cmp w15, #127
++; CHECK-CVT-NEXT: csel w15, w15, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w16, s3
++; CHECK-CVT-NEXT: cmn w15, #128
++; CHECK-CVT-NEXT: mov s3, v2.s[1]
++; CHECK-CVT-NEXT: csel w15, w15, w9, gt
++; CHECK-CVT-NEXT: cmp w14, #127
++; CHECK-CVT-NEXT: csel w14, w14, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w17, s1
++; CHECK-CVT-NEXT: cmn w14, #128
++; CHECK-CVT-NEXT: mov s1, v2.s[2]
++; CHECK-CVT-NEXT: csel w14, w14, w9, gt
++; CHECK-CVT-NEXT: cmp w16, #127
++; CHECK-CVT-NEXT: csel w16, w16, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w18, s3
++; CHECK-CVT-NEXT: cmn w16, #128
++; CHECK-CVT-NEXT: fcvtzs w0, s2
++; CHECK-CVT-NEXT: csel w16, w16, w9, gt
++; CHECK-CVT-NEXT: cmp w17, #127
++; CHECK-CVT-NEXT: csel w17, w17, w8, lt
++; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
++; CHECK-CVT-NEXT: cmn w17, #128
++; CHECK-CVT-NEXT: mov s2, v2.s[3]
++; CHECK-CVT-NEXT: csel w17, w17, w9, gt
++; CHECK-CVT-NEXT: cmp w18, #127
++; CHECK-CVT-NEXT: csel w18, w18, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w1, s1
++; CHECK-CVT-NEXT: cmn w18, #128
++; CHECK-CVT-NEXT: mov s1, v0.s[1]
++; CHECK-CVT-NEXT: csel w18, w18, w9, gt
++; CHECK-CVT-NEXT: cmp w0, #127
++; CHECK-CVT-NEXT: csel w0, w0, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w2, s2
++; CHECK-CVT-NEXT: cmn w0, #128
++; CHECK-CVT-NEXT: fcvtzs w4, s0
++; CHECK-CVT-NEXT: csel w0, w0, w9, gt
++; CHECK-CVT-NEXT: cmp w1, #127
++; CHECK-CVT-NEXT: csel w1, w1, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w3, s1
++; CHECK-CVT-NEXT: cmn w1, #128
++; CHECK-CVT-NEXT: mov s1, v0.s[2]
++; CHECK-CVT-NEXT: csel w1, w1, w9, gt
++; CHECK-CVT-NEXT: cmp w2, #127
++; CHECK-CVT-NEXT: csel w2, w2, w8, lt
++; CHECK-CVT-NEXT: fmov s2, w11
++; CHECK-CVT-NEXT: cmn w2, #128
++; CHECK-CVT-NEXT: fmov s3, w14
++; CHECK-CVT-NEXT: csel w2, w2, w9, gt
++; CHECK-CVT-NEXT: cmp w3, #127
++; CHECK-CVT-NEXT: csel w3, w3, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w14, s1
++; CHECK-CVT-NEXT: cmn w3, #128
++; CHECK-CVT-NEXT: mov s0, v0.s[3]
++; CHECK-CVT-NEXT: csel w3, w3, w9, gt
++; CHECK-CVT-NEXT: cmp w4, #127
++; CHECK-CVT-NEXT: csel w11, w4, w8, lt
++; CHECK-CVT-NEXT: fmov s4, w0
++; CHECK-CVT-NEXT: cmn w11, #128
++; CHECK-CVT-NEXT: csel w11, w11, w9, gt
++; CHECK-CVT-NEXT: cmp w14, #127
++; CHECK-CVT-NEXT: mov v2.s[1], w10
++; CHECK-CVT-NEXT: csel w10, w14, w8, lt
++; CHECK-CVT-NEXT: mov v3.s[1], w15
++; CHECK-CVT-NEXT: cmn w10, #128
++; CHECK-CVT-NEXT: fmov s1, w11
++; CHECK-CVT-NEXT: csel w10, w10, w9, gt
++; CHECK-CVT-NEXT: fcvtzs w11, s0
++; CHECK-CVT-NEXT: mov v4.s[1], w18
++; CHECK-CVT-NEXT: mov v1.s[1], w3
++; CHECK-CVT-NEXT: cmp w11, #127
++; CHECK-CVT-NEXT: csel w8, w11, w8, lt
++; CHECK-CVT-NEXT: mov v2.s[2], w13
++; CHECK-CVT-NEXT: cmn w8, #128
++; CHECK-CVT-NEXT: mov v3.s[2], w16
++; CHECK-CVT-NEXT: csel w8, w8, w9, gt
++; CHECK-CVT-NEXT: mov v4.s[2], w1
++; CHECK-CVT-NEXT: mov v1.s[2], w10
++; CHECK-CVT-NEXT: mov v2.s[3], w12
++; CHECK-CVT-NEXT: mov v3.s[3], w17
++; CHECK-CVT-NEXT: mov v4.s[3], w2
++; CHECK-CVT-NEXT: mov v1.s[3], w8
++; CHECK-CVT-NEXT: uzp1 v0.8h, v3.8h, v2.8h
++; CHECK-CVT-NEXT: uzp1 v1.8h, v1.8h, v4.8h
++; CHECK-CVT-NEXT: uzp1 v0.16b, v1.16b, v0.16b
++; CHECK-CVT-NEXT: ret
++;
++; CHECK-FP16-LABEL: test_signed_v16f16_v16i8:
++; CHECK-FP16: // %bb.0:
++; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h
++; CHECK-FP16-NEXT: fcvtzs v1.8h, v1.8h
++; CHECK-FP16-NEXT: sqxtn v0.8b, v0.8h
++; CHECK-FP16-NEXT: sqxtn2 v0.16b, v1.8h
++; CHECK-FP16-NEXT: ret
++ %x = call <16 x i8> @llvm.fptosi.sat.v16f16.v16i8(<16 x half> %f)
++ ret <16 x i8> %x
++}
++
++define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) {
++; CHECK-CVT-LABEL: test_signed_v16f16_v16i16:
++; CHECK-CVT: // %bb.0:
++; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
++; CHECK-CVT-NEXT: mov w8, #32767
++; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
++; CHECK-CVT-NEXT: mov w9, #-32768
++; CHECK-CVT-NEXT: mov s3, v2.s[1]
++; CHECK-CVT-NEXT: fcvtzs w11, s2
++; CHECK-CVT-NEXT: fcvtzs w10, s3
++; CHECK-CVT-NEXT: mov s3, v2.s[2]
++; CHECK-CVT-NEXT: mov s2, v2.s[3]
++; CHECK-CVT-NEXT: cmp w10, w8
++; CHECK-CVT-NEXT: csel w10, w10, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w12, s3
++; CHECK-CVT-NEXT: cmn w10, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: mov s3, v0.s[1]
++; CHECK-CVT-NEXT: csel w10, w10, w9, gt
++; CHECK-CVT-NEXT: cmp w11, w8
++; CHECK-CVT-NEXT: csel w11, w11, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w14, s2
++; CHECK-CVT-NEXT: cmn w11, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
++; CHECK-CVT-NEXT: csel w11, w11, w9, gt
++; CHECK-CVT-NEXT: cmp w12, w8
++; CHECK-CVT-NEXT: csel w12, w12, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w15, s3
++; CHECK-CVT-NEXT: cmn w12, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: mov s3, v0.s[2]
++; CHECK-CVT-NEXT: csel w13, w12, w9, gt
++; CHECK-CVT-NEXT: cmp w14, w8
++; CHECK-CVT-NEXT: csel w12, w14, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w14, s0
++; CHECK-CVT-NEXT: cmn w12, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: mov s0, v0.s[3]
++; CHECK-CVT-NEXT: csel w12, w12, w9, gt
++; CHECK-CVT-NEXT: cmp w15, w8
++; CHECK-CVT-NEXT: csel w15, w15, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w16, s3
++; CHECK-CVT-NEXT: cmn w15, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: mov s3, v2.s[1]
++; CHECK-CVT-NEXT: csel w15, w15, w9, gt
++; CHECK-CVT-NEXT: cmp w14, w8
++; CHECK-CVT-NEXT: csel w14, w14, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w17, s0
++; CHECK-CVT-NEXT: cmn w14, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: fcvtl v0.4s, v1.4h
++; CHECK-CVT-NEXT: csel w14, w14, w9, gt
++; CHECK-CVT-NEXT: cmp w16, w8
++; CHECK-CVT-NEXT: csel w16, w16, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w18, s3
++; CHECK-CVT-NEXT: cmn w16, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: mov s1, v2.s[2]
++; CHECK-CVT-NEXT: csel w16, w16, w9, gt
++; CHECK-CVT-NEXT: cmp w17, w8
++; CHECK-CVT-NEXT: csel w17, w17, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w0, s2
++; CHECK-CVT-NEXT: cmn w17, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: mov s2, v2.s[3]
++; CHECK-CVT-NEXT: csel w17, w17, w9, gt
++; CHECK-CVT-NEXT: cmp w18, w8
++; CHECK-CVT-NEXT: csel w18, w18, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w1, s1
++; CHECK-CVT-NEXT: cmn w18, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: mov s1, v0.s[1]
++; CHECK-CVT-NEXT: csel w18, w18, w9, gt
++; CHECK-CVT-NEXT: cmp w0, w8
++; CHECK-CVT-NEXT: csel w0, w0, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w2, s2
++; CHECK-CVT-NEXT: cmn w0, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: fcvtzs w4, s0
++; CHECK-CVT-NEXT: csel w0, w0, w9, gt
++; CHECK-CVT-NEXT: cmp w1, w8
++; CHECK-CVT-NEXT: csel w1, w1, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w3, s1
++; CHECK-CVT-NEXT: cmn w1, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: mov s1, v0.s[2]
++; CHECK-CVT-NEXT: csel w1, w1, w9, gt
++; CHECK-CVT-NEXT: cmp w2, w8
++; CHECK-CVT-NEXT: csel w2, w2, w8, lt
++; CHECK-CVT-NEXT: fmov s2, w11
++; CHECK-CVT-NEXT: cmn w2, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: fmov s3, w14
++; CHECK-CVT-NEXT: csel w2, w2, w9, gt
++; CHECK-CVT-NEXT: cmp w3, w8
++; CHECK-CVT-NEXT: csel w3, w3, w8, lt
++; CHECK-CVT-NEXT: fcvtzs w14, s1
++; CHECK-CVT-NEXT: cmn w3, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: mov s0, v0.s[3]
++; CHECK-CVT-NEXT: csel w3, w3, w9, gt
++; CHECK-CVT-NEXT: cmp w4, w8
++; CHECK-CVT-NEXT: csel w11, w4, w8, lt
++; CHECK-CVT-NEXT: fmov s4, w0
++; CHECK-CVT-NEXT: cmn w11, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: csel w11, w11, w9, gt
++; CHECK-CVT-NEXT: cmp w14, w8
++; CHECK-CVT-NEXT: mov v2.s[1], w10
++; CHECK-CVT-NEXT: csel w10, w14, w8, lt
++; CHECK-CVT-NEXT: mov v3.s[1], w15
++; CHECK-CVT-NEXT: cmn w10, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: fmov s1, w11
++; CHECK-CVT-NEXT: csel w10, w10, w9, gt
++; CHECK-CVT-NEXT: fcvtzs w11, s0
++; CHECK-CVT-NEXT: mov v4.s[1], w18
++; CHECK-CVT-NEXT: mov v1.s[1], w3
++; CHECK-CVT-NEXT: cmp w11, w8
++; CHECK-CVT-NEXT: csel w8, w11, w8, lt
++; CHECK-CVT-NEXT: mov v2.s[2], w13
++; CHECK-CVT-NEXT: cmn w8, #8, lsl #12 // =32768
++; CHECK-CVT-NEXT: mov v3.s[2], w16
++; CHECK-CVT-NEXT: csel w8, w8, w9, gt
++; CHECK-CVT-NEXT: mov v4.s[2], w1
++; CHECK-CVT-NEXT: mov v1.s[2], w10
++; CHECK-CVT-NEXT: mov v2.s[3], w12
++; CHECK-CVT-NEXT: mov v3.s[3], w17
++; CHECK-CVT-NEXT: mov v4.s[3], w2
++; CHECK-CVT-NEXT: mov v1.s[3], w8
++; CHECK-CVT-NEXT: uzp1 v0.8h, v3.8h, v2.8h
++; CHECK-CVT-NEXT: uzp1 v1.8h, v1.8h, v4.8h
++; CHECK-CVT-NEXT: ret
++;
++; CHECK-FP16-LABEL: test_signed_v16f16_v16i16:
++; CHECK-FP16: // %bb.0:
++; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h
++; CHECK-FP16-NEXT: fcvtzs v1.8h, v1.8h
++; CHECK-FP16-NEXT: ret
++ %x = call <16 x i16> @llvm.fptosi.sat.v16f16.v16i16(<16 x half> %f)
++ ret <16 x i16> %x
++}
++
++define <8 x i8> @test_signed_v8f64_v8i8(<8 x double> %f) {
++; CHECK-LABEL: test_signed_v8f64_v8i8:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d4, v0.d[1]
++; CHECK-NEXT: mov w8, #127
++; CHECK-NEXT: fcvtzs w11, d0
++; CHECK-NEXT: mov w9, #-128
++; CHECK-NEXT: mov d0, v2.d[1]
++; CHECK-NEXT: fcvtzs w13, d1
++; CHECK-NEXT: fcvtzs w15, d3
++; CHECK-NEXT: fcvtzs w10, d4
++; CHECK-NEXT: mov d4, v1.d[1]
++; CHECK-NEXT: mov d1, v3.d[1]
++; CHECK-NEXT: fcvtzs w14, d0
++; CHECK-NEXT: cmp w10, #127
++; CHECK-NEXT: csel w10, w10, w8, lt
++; CHECK-NEXT: fcvtzs w12, d4
++; CHECK-NEXT: cmn w10, #128
++; CHECK-NEXT: csel w10, w10, w9, gt
++; CHECK-NEXT: cmp w11, #127
++; CHECK-NEXT: csel w11, w11, w8, lt
++; CHECK-NEXT: cmn w11, #128
++; CHECK-NEXT: csel w11, w11, w9, gt
++; CHECK-NEXT: cmp w12, #127
++; CHECK-NEXT: csel w12, w12, w8, lt
++; CHECK-NEXT: cmn w12, #128
++; CHECK-NEXT: csel w12, w12, w9, gt
++; CHECK-NEXT: cmp w13, #127
++; CHECK-NEXT: fmov s0, w11
++; CHECK-NEXT: csel w11, w13, w8, lt
++; CHECK-NEXT: cmn w11, #128
++; CHECK-NEXT: fcvtzs w13, d2
++; CHECK-NEXT: csel w11, w11, w9, gt
++; CHECK-NEXT: cmp w14, #127
++; CHECK-NEXT: mov v0.s[1], w10
++; CHECK-NEXT: csel w10, w14, w8, lt
++; CHECK-NEXT: cmn w10, #128
++; CHECK-NEXT: fmov s2, w11
++; CHECK-NEXT: csel w10, w10, w9, gt
++; CHECK-NEXT: cmp w13, #127
++; CHECK-NEXT: mov w11, v0.s[1]
++; CHECK-NEXT: csel w13, w13, w8, lt
++; CHECK-NEXT: mov v2.s[1], w12
++; CHECK-NEXT: cmn w13, #128
++; CHECK-NEXT: fcvtzs w12, d1
++; CHECK-NEXT: csel w13, w13, w9, gt
++; CHECK-NEXT: mov v0.b[1], w11
++; CHECK-NEXT: fmov w14, s2
++; CHECK-NEXT: cmp w12, #127
++; CHECK-NEXT: fmov s1, w13
++; CHECK-NEXT: csel w12, w12, w8, lt
++; CHECK-NEXT: cmn w12, #128
++; CHECK-NEXT: mov w11, v2.s[1]
++; CHECK-NEXT: mov v0.b[2], w14
++; CHECK-NEXT: csel w12, w12, w9, gt
++; CHECK-NEXT: cmp w15, #127
++; CHECK-NEXT: mov v1.s[1], w10
++; CHECK-NEXT: csel w8, w15, w8, lt
++; CHECK-NEXT: cmn w8, #128
++; CHECK-NEXT: csel w8, w8, w9, gt
++; CHECK-NEXT: mov v0.b[3], w11
++; CHECK-NEXT: fmov w9, s1
++; CHECK-NEXT: fmov s2, w8
++; CHECK-NEXT: mov w8, v1.s[1]
++; CHECK-NEXT: mov v0.b[4], w9
++; CHECK-NEXT: mov v2.s[1], w12
++; CHECK-NEXT: mov v0.b[5], w8
++; CHECK-NEXT: fmov w8, s2
++; CHECK-NEXT: mov w9, v2.s[1]
++; CHECK-NEXT: mov v0.b[6], w8
++; CHECK-NEXT: mov v0.b[7], w9
++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: ret
++ %x = call <8 x i8> @llvm.fptosi.sat.v8f64.v8i8(<8 x double> %f)
++ ret <8 x i8> %x
++}
++
++define <16 x i8> @test_signed_v16f64_v16i8(<16 x double> %f) {
++; CHECK-LABEL: test_signed_v16f64_v16i8:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d16, v0.d[1]
++; CHECK-NEXT: mov w8, #127
++; CHECK-NEXT: fcvtzs w11, d0
++; CHECK-NEXT: mov w9, #-128
++; CHECK-NEXT: fcvtzs w13, d1
++; CHECK-NEXT: mov d0, v2.d[1]
++; CHECK-NEXT: fcvtzs w14, d2
++; CHECK-NEXT: fcvtzs w10, d16
++; CHECK-NEXT: mov d16, v1.d[1]
++; CHECK-NEXT: mov d1, v3.d[1]
++; CHECK-NEXT: fcvtzs w15, d0
++; CHECK-NEXT: cmp w10, #127
++; CHECK-NEXT: csel w10, w10, w8, lt
++; CHECK-NEXT: fcvtzs w12, d16
++; CHECK-NEXT: cmn w10, #128
++; CHECK-NEXT: csel w10, w10, w9, gt
++; CHECK-NEXT: cmp w11, #127
++; CHECK-NEXT: csel w11, w11, w8, lt
++; CHECK-NEXT: cmn w11, #128
++; CHECK-NEXT: csel w11, w11, w9, gt
++; CHECK-NEXT: cmp w12, #127
++; CHECK-NEXT: csel w12, w12, w8, lt
++; CHECK-NEXT: cmn w12, #128
++; CHECK-NEXT: csel w12, w12, w9, gt
++; CHECK-NEXT: cmp w13, #127
++; CHECK-NEXT: csel w13, w13, w8, lt
++; CHECK-NEXT: fmov s0, w11
++; CHECK-NEXT: cmn w13, #128
++; CHECK-NEXT: csel w11, w13, w9, gt
++; CHECK-NEXT: cmp w15, #127
++; CHECK-NEXT: mov v0.s[1], w10
++; CHECK-NEXT: csel w10, w15, w8, lt
++; CHECK-NEXT: cmn w10, #128
++; CHECK-NEXT: fcvtzs w13, d3
++; CHECK-NEXT: fmov s2, w11
++; CHECK-NEXT: csel w10, w10, w9, gt
++; CHECK-NEXT: cmp w14, #127
++; CHECK-NEXT: fcvtzs w11, d1
++; CHECK-NEXT: mov w15, v0.s[1]
++; CHECK-NEXT: csel w14, w14, w8, lt
++; CHECK-NEXT: mov v2.s[1], w12
++; CHECK-NEXT: cmn w14, #128
++; CHECK-NEXT: csel w12, w14, w9, gt
++; CHECK-NEXT: cmp w11, #127
++; CHECK-NEXT: csel w11, w11, w8, lt
++; CHECK-NEXT: mov d1, v4.d[1]
++; CHECK-NEXT: mov v0.b[1], w15
++; CHECK-NEXT: cmn w11, #128
++; CHECK-NEXT: fmov w14, s2
++; CHECK-NEXT: csel w11, w11, w9, gt
++; CHECK-NEXT: fmov s3, w12
++; CHECK-NEXT: cmp w13, #127
++; CHECK-NEXT: mov w12, v2.s[1]
++; CHECK-NEXT: csel w13, w13, w8, lt
++; CHECK-NEXT: mov v0.b[2], w14
++; CHECK-NEXT: cmn w13, #128
++; CHECK-NEXT: mov v3.s[1], w10
++; CHECK-NEXT: csel w13, w13, w9, gt
++; CHECK-NEXT: fcvtzs w15, d1
++; CHECK-NEXT: fcvtzs w14, d4
++; CHECK-NEXT: mov d1, v5.d[1]
++; CHECK-NEXT: mov v0.b[3], w12
++; CHECK-NEXT: fmov s4, w13
++; CHECK-NEXT: cmp w15, #127
++; CHECK-NEXT: fmov w13, s3
++; CHECK-NEXT: csel w10, w15, w8, lt
++; CHECK-NEXT: mov w12, v3.s[1]
++; CHECK-NEXT: cmn w10, #128
++; CHECK-NEXT: fcvtzs w15, d1
++; CHECK-NEXT: csel w10, w10, w9, gt
++; CHECK-NEXT: cmp w14, #127
++; CHECK-NEXT: mov v0.b[4], w13
++; CHECK-NEXT: csel w14, w14, w8, lt
++; CHECK-NEXT: mov v4.s[1], w11
++; CHECK-NEXT: cmn w14, #128
++; CHECK-NEXT: csel w14, w14, w9, gt
++; CHECK-NEXT: fcvtzs w13, d5
++; CHECK-NEXT: cmp w15, #127
++; CHECK-NEXT: mov d2, v6.d[1]
++; CHECK-NEXT: mov v0.b[5], w12
++; CHECK-NEXT: csel w11, w15, w8, lt
++; CHECK-NEXT: fmov w12, s4
++; CHECK-NEXT: cmn w11, #128
++; CHECK-NEXT: fmov s1, w14
++; CHECK-NEXT: csel w11, w11, w9, gt
++; CHECK-NEXT: cmp w13, #127
++; CHECK-NEXT: mov w14, v4.s[1]
++; CHECK-NEXT: mov v0.b[6], w12
++; CHECK-NEXT: csel w13, w13, w8, lt
++; CHECK-NEXT: mov v1.s[1], w10
++; CHECK-NEXT: cmn w13, #128
++; CHECK-NEXT: fcvtzs w15, d2
++; CHECK-NEXT: csel w13, w13, w9, gt
++; CHECK-NEXT: fcvtzs w10, d6
++; CHECK-NEXT: mov v0.b[7], w14
++; CHECK-NEXT: cmp w15, #127
++; CHECK-NEXT: fmov w14, s1
++; CHECK-NEXT: csel w12, w15, w8, lt
++; CHECK-NEXT: fmov s2, w13
++; CHECK-NEXT: mov w13, v1.s[1]
++; CHECK-NEXT: mov d1, v7.d[1]
++; CHECK-NEXT: cmn w12, #128
++; CHECK-NEXT: fcvtzs w15, d7
++; CHECK-NEXT: csel w12, w12, w9, gt
++; CHECK-NEXT: cmp w10, #127
++; CHECK-NEXT: mov v0.b[8], w14
++; CHECK-NEXT: csel w10, w10, w8, lt
++; CHECK-NEXT: mov v2.s[1], w11
++; CHECK-NEXT: cmn w10, #128
++; CHECK-NEXT: fcvtzs w11, d1
++; CHECK-NEXT: csel w10, w10, w9, gt
++; CHECK-NEXT: mov v0.b[9], w13
++; CHECK-NEXT: fmov w14, s2
++; CHECK-NEXT: cmp w11, #127
++; CHECK-NEXT: fmov s1, w10
++; CHECK-NEXT: csel w10, w11, w8, lt
++; CHECK-NEXT: cmn w10, #128
++; CHECK-NEXT: mov w13, v2.s[1]
++; CHECK-NEXT: mov v0.b[10], w14
++; CHECK-NEXT: csel w10, w10, w9, gt
++; CHECK-NEXT: cmp w15, #127
++; CHECK-NEXT: mov v1.s[1], w12
++; CHECK-NEXT: csel w8, w15, w8, lt
++; CHECK-NEXT: cmn w8, #128
++; CHECK-NEXT: csel w8, w8, w9, gt
++; CHECK-NEXT: mov v0.b[11], w13
++; CHECK-NEXT: fmov w9, s1
++; CHECK-NEXT: fmov s2, w8
++; CHECK-NEXT: mov w8, v1.s[1]
++; CHECK-NEXT: mov v0.b[12], w9
++; CHECK-NEXT: mov v2.s[1], w10
++; CHECK-NEXT: mov v0.b[13], w8
++; CHECK-NEXT: fmov w8, s2
++; CHECK-NEXT: mov w9, v2.s[1]
++; CHECK-NEXT: mov v0.b[14], w8
++; CHECK-NEXT: mov v0.b[15], w9
++; CHECK-NEXT: ret
++ %x = call <16 x i8> @llvm.fptosi.sat.v16f64.v16i8(<16 x double> %f)
++ ret <16 x i8> %x
++}
++
++define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) {
++; CHECK-LABEL: test_signed_v8f64_v8i16:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d4, v0.d[1]
++; CHECK-NEXT: mov w8, #32767
++; CHECK-NEXT: fcvtzs w10, d0
++; CHECK-NEXT: mov w11, #-32768
++; CHECK-NEXT: mov d0, v2.d[1]
++; CHECK-NEXT: fcvtzs w13, d1
++; CHECK-NEXT: fcvtzs w15, d3
++; CHECK-NEXT: fcvtzs w9, d4
++; CHECK-NEXT: mov d4, v1.d[1]
++; CHECK-NEXT: mov d1, v3.d[1]
++; CHECK-NEXT: fcvtzs w14, d0
++; CHECK-NEXT: cmp w9, w8
++; CHECK-NEXT: csel w9, w9, w8, lt
++; CHECK-NEXT: fcvtzs w12, d4
++; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768
++; CHECK-NEXT: csel w9, w9, w11, gt
++; CHECK-NEXT: cmp w10, w8
++; CHECK-NEXT: csel w10, w10, w8, lt
++; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768
++; CHECK-NEXT: csel w10, w10, w11, gt
++; CHECK-NEXT: cmp w12, w8
++; CHECK-NEXT: csel w12, w12, w8, lt
++; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768
++; CHECK-NEXT: csel w12, w12, w11, gt
++; CHECK-NEXT: cmp w13, w8
++; CHECK-NEXT: fmov s0, w10
++; CHECK-NEXT: csel w10, w13, w8, lt
++; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768
++; CHECK-NEXT: fcvtzs w13, d2
++; CHECK-NEXT: csel w10, w10, w11, gt
++; CHECK-NEXT: cmp w14, w8
++; CHECK-NEXT: mov v0.s[1], w9
++; CHECK-NEXT: csel w9, w14, w8, lt
++; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768
++; CHECK-NEXT: fmov s2, w10
++; CHECK-NEXT: csel w9, w9, w11, gt
++; CHECK-NEXT: cmp w13, w8
++; CHECK-NEXT: mov w10, v0.s[1]
++; CHECK-NEXT: csel w13, w13, w8, lt
++; CHECK-NEXT: mov v2.s[1], w12
++; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768
++; CHECK-NEXT: fcvtzs w12, d1
++; CHECK-NEXT: csel w13, w13, w11, gt
++; CHECK-NEXT: mov v0.h[1], w10
++; CHECK-NEXT: fmov w14, s2
++; CHECK-NEXT: cmp w12, w8
++; CHECK-NEXT: fmov s1, w13
++; CHECK-NEXT: csel w12, w12, w8, lt
++; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768
++; CHECK-NEXT: mov w10, v2.s[1]
++; CHECK-NEXT: mov v0.h[2], w14
++; CHECK-NEXT: csel w12, w12, w11, gt
++; CHECK-NEXT: cmp w15, w8
++; CHECK-NEXT: mov v1.s[1], w9
++; CHECK-NEXT: csel w8, w15, w8, lt
++; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768
++; CHECK-NEXT: csel w8, w8, w11, gt
++; CHECK-NEXT: mov v0.h[3], w10
++; CHECK-NEXT: fmov w9, s1
++; CHECK-NEXT: fmov s2, w8
++; CHECK-NEXT: mov w8, v1.s[1]
++; CHECK-NEXT: mov v0.h[4], w9
++; CHECK-NEXT: mov v2.s[1], w12
++; CHECK-NEXT: mov v0.h[5], w8
++; CHECK-NEXT: fmov w8, s2
++; CHECK-NEXT: mov w9, v2.s[1]
++; CHECK-NEXT: mov v0.h[6], w8
++; CHECK-NEXT: mov v0.h[7], w9
++; CHECK-NEXT: ret
++ %x = call <8 x i16> @llvm.fptosi.sat.v8f64.v8i16(<8 x double> %f)
++ ret <8 x i16> %x
++}
++
++define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) {
++; CHECK-LABEL: test_signed_v16f64_v16i16:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d16, v0.d[1]
++; CHECK-NEXT: mov w9, #32767
++; CHECK-NEXT: fcvtzs w11, d0
++; CHECK-NEXT: mov w8, #-32768
++; CHECK-NEXT: mov d0, v2.d[1]
++; CHECK-NEXT: fcvtzs w12, d1
++; CHECK-NEXT: fcvtzs w14, d2
++; CHECK-NEXT: mov d2, v4.d[1]
++; CHECK-NEXT: fcvtzs w10, d16
++; CHECK-NEXT: mov d16, v1.d[1]
++; CHECK-NEXT: mov d1, v3.d[1]
++; CHECK-NEXT: fcvtzs w16, d3
++; CHECK-NEXT: fcvtzs w15, d0
++; CHECK-NEXT: mov d3, v6.d[1]
++; CHECK-NEXT: cmp w10, w9
++; CHECK-NEXT: csel w10, w10, w9, lt
++; CHECK-NEXT: fcvtzs w13, d16
++; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768
++; CHECK-NEXT: fcvtzs w17, d1
++; CHECK-NEXT: csel w10, w10, w8, gt
++; CHECK-NEXT: cmp w11, w9
++; CHECK-NEXT: csel w11, w11, w9, lt
++; CHECK-NEXT: mov d1, v5.d[1]
++; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768
++; CHECK-NEXT: csel w11, w11, w8, gt
++; CHECK-NEXT: cmp w13, w9
++; CHECK-NEXT: csel w13, w13, w9, lt
++; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768
++; CHECK-NEXT: csel w13, w13, w8, gt
++; CHECK-NEXT: cmp w12, w9
++; CHECK-NEXT: csel w12, w12, w9, lt
++; CHECK-NEXT: fmov s0, w11
++; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768
++; CHECK-NEXT: csel w12, w12, w8, gt
++; CHECK-NEXT: cmp w15, w9
++; CHECK-NEXT: csel w15, w15, w9, lt
++; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768
++; CHECK-NEXT: csel w11, w15, w8, gt
++; CHECK-NEXT: cmp w14, w9
++; CHECK-NEXT: csel w14, w14, w9, lt
++; CHECK-NEXT: fcvtzs w15, d4
++; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768
++; CHECK-NEXT: csel w14, w14, w8, gt
++; CHECK-NEXT: cmp w17, w9
++; CHECK-NEXT: mov v0.s[1], w10
++; CHECK-NEXT: csel w10, w17, w9, lt
++; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768
++; CHECK-NEXT: fcvtzs w17, d2
++; CHECK-NEXT: csel w10, w10, w8, gt
++; CHECK-NEXT: cmp w16, w9
++; CHECK-NEXT: fmov s2, w12
++; CHECK-NEXT: csel w12, w16, w9, lt
++; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768
++; CHECK-NEXT: mov w16, v0.s[1]
++; CHECK-NEXT: csel w12, w12, w8, gt
++; CHECK-NEXT: cmp w17, w9
++; CHECK-NEXT: mov v2.s[1], w13
++; CHECK-NEXT: csel w13, w17, w9, lt
++; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768
++; CHECK-NEXT: fcvtzs w17, d1
++; CHECK-NEXT: csel w13, w13, w8, gt
++; CHECK-NEXT: cmp w15, w9
++; CHECK-NEXT: csel w15, w15, w9, lt
++; CHECK-NEXT: fmov s4, w14
++; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768
++; CHECK-NEXT: mov v0.h[1], w16
++; CHECK-NEXT: fcvtzs w16, d5
++; CHECK-NEXT: csel w15, w15, w8, gt
++; CHECK-NEXT: cmp w17, w9
++; CHECK-NEXT: csel w17, w17, w9, lt
++; CHECK-NEXT: cmn w17, #8, lsl #12 // =32768
++; CHECK-NEXT: csel w14, w17, w8, gt
++; CHECK-NEXT: cmp w16, w9
++; CHECK-NEXT: fmov s1, w15
++; CHECK-NEXT: csel w15, w16, w9, lt
++; CHECK-NEXT: fcvtzs w16, d3
++; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768
++; CHECK-NEXT: mov v4.s[1], w11
++; CHECK-NEXT: csel w11, w15, w8, gt
++; CHECK-NEXT: fcvtzs w15, d6
++; CHECK-NEXT: mov v1.s[1], w13
++; CHECK-NEXT: cmp w16, w9
++; CHECK-NEXT: fmov s3, w11
++; CHECK-NEXT: csel w16, w16, w9, lt
++; CHECK-NEXT: fmov w11, s2
++; CHECK-NEXT: mov w13, v2.s[1]
++; CHECK-NEXT: mov d2, v7.d[1]
++; CHECK-NEXT: cmn w16, #8, lsl #12 // =32768
++; CHECK-NEXT: csel w16, w16, w8, gt
++; CHECK-NEXT: cmp w15, w9
++; CHECK-NEXT: mov v0.h[2], w11
++; CHECK-NEXT: csel w11, w15, w9, lt
++; CHECK-NEXT: mov w15, v1.s[1]
++; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768
++; CHECK-NEXT: mov v3.s[1], w14
++; CHECK-NEXT: fcvtzs w14, d2
++; CHECK-NEXT: csel w11, w11, w8, gt
++; CHECK-NEXT: mov v0.h[3], w13
++; CHECK-NEXT: mov v1.h[1], w15
++; CHECK-NEXT: cmp w14, w9
++; CHECK-NEXT: fmov w13, s3
++; CHECK-NEXT: csel w14, w14, w9, lt
++; CHECK-NEXT: fcvtzs w15, d7
++; CHECK-NEXT: fmov s2, w11
++; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768
++; CHECK-NEXT: mov w11, v3.s[1]
++; CHECK-NEXT: mov v1.h[2], w13
++; CHECK-NEXT: csel w13, w14, w8, gt
++; CHECK-NEXT: cmp w15, w9
++; CHECK-NEXT: fmov s3, w12
++; CHECK-NEXT: mov v2.s[1], w16
++; CHECK-NEXT: csel w9, w15, w9, lt
++; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768
++; CHECK-NEXT: fmov w12, s4
++; CHECK-NEXT: csel w8, w9, w8, gt
++; CHECK-NEXT: mov w14, v4.s[1]
++; CHECK-NEXT: mov v1.h[3], w11
++; CHECK-NEXT: fmov w11, s2
++; CHECK-NEXT: mov w9, v2.s[1]
++; CHECK-NEXT: fmov s2, w8
++; CHECK-NEXT: mov v0.h[4], w12
++; CHECK-NEXT: mov v1.h[4], w11
++; CHECK-NEXT: mov v3.s[1], w10
++; CHECK-NEXT: mov v2.s[1], w13
++; CHECK-NEXT: mov v0.h[5], w14
++; CHECK-NEXT: mov v1.h[5], w9
++; CHECK-NEXT: fmov w8, s3
++; CHECK-NEXT: fmov w9, s2
++; CHECK-NEXT: mov w10, v3.s[1]
++; CHECK-NEXT: mov w11, v2.s[1]
++; CHECK-NEXT: mov v0.h[6], w8
++; CHECK-NEXT: mov v1.h[6], w9
++; CHECK-NEXT: mov v0.h[7], w10
++; CHECK-NEXT: mov v1.h[7], w11
++; CHECK-NEXT: ret
++ %x = call <16 x i16> @llvm.fptosi.sat.v16f64.v16i16(<16 x double> %f)
++ ret <16 x i16> %x
++}
+diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+index 017845d3624a..ace519684215 100644
+--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
++++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+@@ -2480,3 +2480,659 @@ define <8 x i128> @test_unsigned_v8f16_v8i128(<8 x half> %f) {
+ %x = call <8 x i128> @llvm.fptoui.sat.v8f16.v8i128(<8 x half> %f)
+ ret <8 x i128> %x
+ }
++
++
++declare <8 x i8> @llvm.fptoui.sat.v8f32.v8i8(<8 x float> %f)
++declare <8 x i16> @llvm.fptoui.sat.v8f32.v8i16(<8 x float> %f)
++declare <16 x i8> @llvm.fptoui.sat.v16f32.v16i8(<16 x float> %f)
++declare <16 x i16> @llvm.fptoui.sat.v16f32.v16i16(<16 x float> %f)
++
++declare <16 x i8> @llvm.fptoui.sat.v16f16.v16i8(<16 x half> %f)
++declare <16 x i16> @llvm.fptoui.sat.v16f16.v16i16(<16 x half> %f)
++
++declare <8 x i8> @llvm.fptoui.sat.v8f64.v8i8(<8 x double> %f)
++declare <8 x i16> @llvm.fptoui.sat.v8f64.v8i16(<8 x double> %f)
++declare <16 x i8> @llvm.fptoui.sat.v16f64.v16i8(<16 x double> %f)
++declare <16 x i16> @llvm.fptoui.sat.v16f64.v16i16(<16 x double> %f)
++
++define <8 x i8> @test_unsigned_v8f32_v8i8(<8 x float> %f) {
++; CHECK-LABEL: test_unsigned_v8f32_v8i8:
++; CHECK: // %bb.0:
++; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff
++; CHECK-NEXT: fcvtzu v1.4s, v1.4s
++; CHECK-NEXT: fcvtzu v0.4s, v0.4s
++; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s
++; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s
++; CHECK-NEXT: xtn v1.4h, v1.4s
++; CHECK-NEXT: xtn v0.4h, v0.4s
++; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b
++; CHECK-NEXT: ret
++ %x = call <8 x i8> @llvm.fptoui.sat.v8f32.v8i8(<8 x float> %f)
++ ret <8 x i8> %x
++}
++
++define <16 x i8> @test_unsigned_v16f32_v16i8(<16 x float> %f) {
++; CHECK-LABEL: test_unsigned_v16f32_v16i8:
++; CHECK: // %bb.0:
++; CHECK-NEXT: movi v4.2d, #0x0000ff000000ff
++; CHECK-NEXT: fcvtzu v0.4s, v0.4s
++; CHECK-NEXT: fcvtzu v1.4s, v1.4s
++; CHECK-NEXT: fcvtzu v2.4s, v2.4s
++; CHECK-NEXT: umin v0.4s, v0.4s, v4.4s
++; CHECK-NEXT: umin v1.4s, v1.4s, v4.4s
++; CHECK-NEXT: umin v2.4s, v2.4s, v4.4s
++; CHECK-NEXT: xtn v5.4h, v0.4s
++; CHECK-NEXT: xtn v1.4h, v1.4s
++; CHECK-NEXT: umov w8, v5.h[0]
++; CHECK-NEXT: umov w9, v5.h[1]
++; CHECK-NEXT: fmov s0, w8
++; CHECK-NEXT: umov w8, v5.h[2]
++; CHECK-NEXT: mov v0.b[1], w9
++; CHECK-NEXT: mov v0.b[2], w8
++; CHECK-NEXT: umov w8, v5.h[3]
++; CHECK-NEXT: mov v0.b[3], w8
++; CHECK-NEXT: umov w8, v1.h[0]
++; CHECK-NEXT: mov v0.b[4], w8
++; CHECK-NEXT: umov w8, v1.h[1]
++; CHECK-NEXT: mov v0.b[5], w8
++; CHECK-NEXT: umov w8, v1.h[2]
++; CHECK-NEXT: mov v0.b[6], w8
++; CHECK-NEXT: umov w8, v1.h[3]
++; CHECK-NEXT: xtn v1.4h, v2.4s
++; CHECK-NEXT: fcvtzu v2.4s, v3.4s
++; CHECK-NEXT: mov v0.b[7], w8
++; CHECK-NEXT: umov w8, v1.h[0]
++; CHECK-NEXT: umin v2.4s, v2.4s, v4.4s
++; CHECK-NEXT: mov v0.b[8], w8
++; CHECK-NEXT: umov w8, v1.h[1]
++; CHECK-NEXT: mov v0.b[9], w8
++; CHECK-NEXT: umov w8, v1.h[2]
++; CHECK-NEXT: mov v0.b[10], w8
++; CHECK-NEXT: umov w8, v1.h[3]
++; CHECK-NEXT: xtn v1.4h, v2.4s
++; CHECK-NEXT: mov v0.b[11], w8
++; CHECK-NEXT: umov w8, v1.h[0]
++; CHECK-NEXT: mov v0.b[12], w8
++; CHECK-NEXT: umov w8, v1.h[1]
++; CHECK-NEXT: mov v0.b[13], w8
++; CHECK-NEXT: umov w8, v1.h[2]
++; CHECK-NEXT: mov v0.b[14], w8
++; CHECK-NEXT: umov w8, v1.h[3]
++; CHECK-NEXT: mov v0.b[15], w8
++; CHECK-NEXT: ret
++ %x = call <16 x i8> @llvm.fptoui.sat.v16f32.v16i8(<16 x float> %f)
++ ret <16 x i8> %x
++}
++
++define <8 x i16> @test_unsigned_v8f32_v8i16(<8 x float> %f) {
++; CHECK-LABEL: test_unsigned_v8f32_v8i16:
++; CHECK: // %bb.0:
++; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
++; CHECK-NEXT: fcvtzu v1.4s, v1.4s
++; CHECK-NEXT: fcvtzu v0.4s, v0.4s
++; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s
++; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s
++; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
++; CHECK-NEXT: ret
++ %x = call <8 x i16> @llvm.fptoui.sat.v8f32.v8i16(<8 x float> %f)
++ ret <8 x i16> %x
++}
++
++define <16 x i16> @test_unsigned_v16f32_v16i16(<16 x float> %f) {
++; CHECK-LABEL: test_unsigned_v16f32_v16i16:
++; CHECK: // %bb.0:
++; CHECK-NEXT: movi v4.2d, #0x00ffff0000ffff
++; CHECK-NEXT: fcvtzu v1.4s, v1.4s
++; CHECK-NEXT: fcvtzu v0.4s, v0.4s
++; CHECK-NEXT: fcvtzu v3.4s, v3.4s
++; CHECK-NEXT: fcvtzu v2.4s, v2.4s
++; CHECK-NEXT: umin v1.4s, v1.4s, v4.4s
++; CHECK-NEXT: umin v0.4s, v0.4s, v4.4s
++; CHECK-NEXT: umin v3.4s, v3.4s, v4.4s
++; CHECK-NEXT: umin v2.4s, v2.4s, v4.4s
++; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
++; CHECK-NEXT: uzp1 v1.8h, v2.8h, v3.8h
++; CHECK-NEXT: ret
++ %x = call <16 x i16> @llvm.fptoui.sat.v16f32.v16i16(<16 x float> %f)
++ ret <16 x i16> %x
++}
++
++
++
++define <16 x i8> @test_unsigned_v16f16_v16i8(<16 x half> %f) {
++; CHECK-CVT-LABEL: test_unsigned_v16f16_v16i8:
++; CHECK-CVT: // %bb.0:
++; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
++; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
++; CHECK-CVT-NEXT: fcvtl2 v5.4s, v0.8h
++; CHECK-CVT-NEXT: mov w8, #255
++; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
++; CHECK-CVT-NEXT: mov s3, v2.s[1]
++; CHECK-CVT-NEXT: mov s4, v2.s[2]
++; CHECK-CVT-NEXT: fcvtzu w9, s2
++; CHECK-CVT-NEXT: mov s2, v2.s[3]
++; CHECK-CVT-NEXT: fcvtzu w12, s1
++; CHECK-CVT-NEXT: fcvtzu w16, s5
++; CHECK-CVT-NEXT: fcvtzu w2, s0
++; CHECK-CVT-NEXT: fcvtzu w10, s3
++; CHECK-CVT-NEXT: mov s3, v1.s[1]
++; CHECK-CVT-NEXT: fcvtzu w11, s4
++; CHECK-CVT-NEXT: mov s4, v1.s[2]
++; CHECK-CVT-NEXT: mov s1, v1.s[3]
++; CHECK-CVT-NEXT: fcvtzu w13, s2
++; CHECK-CVT-NEXT: cmp w10, #255
++; CHECK-CVT-NEXT: mov s2, v5.s[1]
++; CHECK-CVT-NEXT: fcvtzu w14, s3
++; CHECK-CVT-NEXT: csel w10, w10, w8, lo
++; CHECK-CVT-NEXT: cmp w9, #255
++; CHECK-CVT-NEXT: fcvtzu w15, s4
++; CHECK-CVT-NEXT: csel w9, w9, w8, lo
++; CHECK-CVT-NEXT: cmp w11, #255
++; CHECK-CVT-NEXT: csel w11, w11, w8, lo
++; CHECK-CVT-NEXT: cmp w13, #255
++; CHECK-CVT-NEXT: mov s3, v5.s[2]
++; CHECK-CVT-NEXT: fcvtzu w17, s1
++; CHECK-CVT-NEXT: csel w13, w13, w8, lo
++; CHECK-CVT-NEXT: cmp w14, #255
++; CHECK-CVT-NEXT: mov s4, v5.s[3]
++; CHECK-CVT-NEXT: fcvtzu w18, s2
++; CHECK-CVT-NEXT: csel w14, w14, w8, lo
++; CHECK-CVT-NEXT: cmp w12, #255
++; CHECK-CVT-NEXT: mov s1, v0.s[1]
++; CHECK-CVT-NEXT: csel w12, w12, w8, lo
++; CHECK-CVT-NEXT: cmp w15, #255
++; CHECK-CVT-NEXT: fcvtzu w0, s3
++; CHECK-CVT-NEXT: csel w15, w15, w8, lo
++; CHECK-CVT-NEXT: cmp w17, #255
++; CHECK-CVT-NEXT: csel w17, w17, w8, lo
++; CHECK-CVT-NEXT: cmp w18, #255
++; CHECK-CVT-NEXT: fmov s2, w9
++; CHECK-CVT-NEXT: csel w9, w18, w8, lo
++; CHECK-CVT-NEXT: fcvtzu w18, s4
++; CHECK-CVT-NEXT: cmp w16, #255
++; CHECK-CVT-NEXT: fcvtzu w1, s1
++; CHECK-CVT-NEXT: csel w16, w16, w8, lo
++; CHECK-CVT-NEXT: cmp w0, #255
++; CHECK-CVT-NEXT: mov s1, v0.s[2]
++; CHECK-CVT-NEXT: csel w0, w0, w8, lo
++; CHECK-CVT-NEXT: cmp w18, #255
++; CHECK-CVT-NEXT: mov v2.s[1], w10
++; CHECK-CVT-NEXT: csel w10, w18, w8, lo
++; CHECK-CVT-NEXT: cmp w1, #255
++; CHECK-CVT-NEXT: fmov s3, w12
++; CHECK-CVT-NEXT: csel w18, w1, w8, lo
++; CHECK-CVT-NEXT: cmp w2, #255
++; CHECK-CVT-NEXT: csel w1, w2, w8, lo
++; CHECK-CVT-NEXT: fmov s4, w16
++; CHECK-CVT-NEXT: mov v2.s[2], w11
++; CHECK-CVT-NEXT: fcvtzu w11, s1
++; CHECK-CVT-NEXT: mov s0, v0.s[3]
++; CHECK-CVT-NEXT: fmov s1, w1
++; CHECK-CVT-NEXT: mov v3.s[1], w14
++; CHECK-CVT-NEXT: cmp w11, #255
++; CHECK-CVT-NEXT: mov v4.s[1], w9
++; CHECK-CVT-NEXT: csel w9, w11, w8, lo
++; CHECK-CVT-NEXT: mov v1.s[1], w18
++; CHECK-CVT-NEXT: fcvtzu w11, s0
++; CHECK-CVT-NEXT: mov v3.s[2], w15
++; CHECK-CVT-NEXT: mov v4.s[2], w0
++; CHECK-CVT-NEXT: mov v1.s[2], w9
++; CHECK-CVT-NEXT: cmp w11, #255
++; CHECK-CVT-NEXT: csel w8, w11, w8, lo
++; CHECK-CVT-NEXT: mov v2.s[3], w13
++; CHECK-CVT-NEXT: mov v3.s[3], w17
++; CHECK-CVT-NEXT: mov v4.s[3], w10
++; CHECK-CVT-NEXT: mov v1.s[3], w8
++; CHECK-CVT-NEXT: uzp1 v0.8h, v3.8h, v2.8h
++; CHECK-CVT-NEXT: uzp1 v1.8h, v1.8h, v4.8h
++; CHECK-CVT-NEXT: uzp1 v0.16b, v1.16b, v0.16b
++; CHECK-CVT-NEXT: ret
++;
++; CHECK-FP16-LABEL: test_unsigned_v16f16_v16i8:
++; CHECK-FP16: // %bb.0:
++; CHECK-FP16-NEXT: movi v2.2d, #0xff00ff00ff00ff
++; CHECK-FP16-NEXT: fcvtzu v1.8h, v1.8h
++; CHECK-FP16-NEXT: fcvtzu v0.8h, v0.8h
++; CHECK-FP16-NEXT: umin v1.8h, v1.8h, v2.8h
++; CHECK-FP16-NEXT: umin v0.8h, v0.8h, v2.8h
++; CHECK-FP16-NEXT: uzp1 v0.16b, v0.16b, v1.16b
++; CHECK-FP16-NEXT: ret
++ %x = call <16 x i8> @llvm.fptoui.sat.v16f16.v16i8(<16 x half> %f)
++ ret <16 x i8> %x
++}
++
++define <16 x i16> @test_unsigned_v16f16_v16i16(<16 x half> %f) {
++; CHECK-CVT-LABEL: test_unsigned_v16f16_v16i16:
++; CHECK-CVT: // %bb.0:
++; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
++; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
++; CHECK-CVT-NEXT: fcvtl2 v5.4s, v1.8h
++; CHECK-CVT-NEXT: mov w8, #65535
++; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
++; CHECK-CVT-NEXT: mov s3, v2.s[1]
++; CHECK-CVT-NEXT: mov s4, v2.s[2]
++; CHECK-CVT-NEXT: fcvtzu w9, s2
++; CHECK-CVT-NEXT: mov s2, v2.s[3]
++; CHECK-CVT-NEXT: fcvtzu w12, s0
++; CHECK-CVT-NEXT: fcvtzu w16, s5
++; CHECK-CVT-NEXT: fcvtzu w2, s1
++; CHECK-CVT-NEXT: fcvtzu w10, s3
++; CHECK-CVT-NEXT: mov s3, v0.s[1]
++; CHECK-CVT-NEXT: fcvtzu w11, s4
++; CHECK-CVT-NEXT: mov s4, v0.s[2]
++; CHECK-CVT-NEXT: mov s0, v0.s[3]
++; CHECK-CVT-NEXT: fcvtzu w13, s2
++; CHECK-CVT-NEXT: cmp w10, w8
++; CHECK-CVT-NEXT: mov s2, v5.s[1]
++; CHECK-CVT-NEXT: fcvtzu w14, s3
++; CHECK-CVT-NEXT: csel w10, w10, w8, lo
++; CHECK-CVT-NEXT: cmp w9, w8
++; CHECK-CVT-NEXT: fcvtzu w15, s4
++; CHECK-CVT-NEXT: csel w9, w9, w8, lo
++; CHECK-CVT-NEXT: cmp w11, w8
++; CHECK-CVT-NEXT: csel w11, w11, w8, lo
++; CHECK-CVT-NEXT: cmp w13, w8
++; CHECK-CVT-NEXT: mov s3, v5.s[2]
++; CHECK-CVT-NEXT: fcvtzu w17, s0
++; CHECK-CVT-NEXT: csel w13, w13, w8, lo
++; CHECK-CVT-NEXT: cmp w14, w8
++; CHECK-CVT-NEXT: mov s4, v5.s[3]
++; CHECK-CVT-NEXT: fcvtzu w18, s2
++; CHECK-CVT-NEXT: csel w14, w14, w8, lo
++; CHECK-CVT-NEXT: cmp w12, w8
++; CHECK-CVT-NEXT: mov s0, v1.s[1]
++; CHECK-CVT-NEXT: csel w12, w12, w8, lo
++; CHECK-CVT-NEXT: cmp w15, w8
++; CHECK-CVT-NEXT: fcvtzu w0, s3
++; CHECK-CVT-NEXT: csel w15, w15, w8, lo
++; CHECK-CVT-NEXT: cmp w17, w8
++; CHECK-CVT-NEXT: csel w17, w17, w8, lo
++; CHECK-CVT-NEXT: cmp w18, w8
++; CHECK-CVT-NEXT: fmov s2, w9
++; CHECK-CVT-NEXT: csel w9, w18, w8, lo
++; CHECK-CVT-NEXT: fcvtzu w18, s4
++; CHECK-CVT-NEXT: cmp w16, w8
++; CHECK-CVT-NEXT: fcvtzu w1, s0
++; CHECK-CVT-NEXT: csel w16, w16, w8, lo
++; CHECK-CVT-NEXT: cmp w0, w8
++; CHECK-CVT-NEXT: mov s0, v1.s[2]
++; CHECK-CVT-NEXT: csel w0, w0, w8, lo
++; CHECK-CVT-NEXT: cmp w18, w8
++; CHECK-CVT-NEXT: mov v2.s[1], w10
++; CHECK-CVT-NEXT: csel w10, w18, w8, lo
++; CHECK-CVT-NEXT: cmp w1, w8
++; CHECK-CVT-NEXT: fmov s3, w12
++; CHECK-CVT-NEXT: csel w18, w1, w8, lo
++; CHECK-CVT-NEXT: cmp w2, w8
++; CHECK-CVT-NEXT: csel w1, w2, w8, lo
++; CHECK-CVT-NEXT: fmov s4, w16
++; CHECK-CVT-NEXT: mov v2.s[2], w11
++; CHECK-CVT-NEXT: fcvtzu w11, s0
++; CHECK-CVT-NEXT: mov s0, v1.s[3]
++; CHECK-CVT-NEXT: fmov s5, w1
++; CHECK-CVT-NEXT: mov v3.s[1], w14
++; CHECK-CVT-NEXT: cmp w11, w8
++; CHECK-CVT-NEXT: mov v4.s[1], w9
++; CHECK-CVT-NEXT: csel w9, w11, w8, lo
++; CHECK-CVT-NEXT: mov v5.s[1], w18
++; CHECK-CVT-NEXT: fcvtzu w11, s0
++; CHECK-CVT-NEXT: mov v3.s[2], w15
++; CHECK-CVT-NEXT: mov v4.s[2], w0
++; CHECK-CVT-NEXT: mov v5.s[2], w9
++; CHECK-CVT-NEXT: cmp w11, w8
++; CHECK-CVT-NEXT: csel w8, w11, w8, lo
++; CHECK-CVT-NEXT: mov v2.s[3], w13
++; CHECK-CVT-NEXT: mov v3.s[3], w17
++; CHECK-CVT-NEXT: mov v4.s[3], w10
++; CHECK-CVT-NEXT: mov v5.s[3], w8
++; CHECK-CVT-NEXT: uzp1 v0.8h, v3.8h, v2.8h
++; CHECK-CVT-NEXT: uzp1 v1.8h, v5.8h, v4.8h
++; CHECK-CVT-NEXT: ret
++;
++; CHECK-FP16-LABEL: test_unsigned_v16f16_v16i16:
++; CHECK-FP16: // %bb.0:
++; CHECK-FP16-NEXT: fcvtzu v0.8h, v0.8h
++; CHECK-FP16-NEXT: fcvtzu v1.8h, v1.8h
++; CHECK-FP16-NEXT: ret
++ %x = call <16 x i16> @llvm.fptoui.sat.v16f16.v16i16(<16 x half> %f)
++ ret <16 x i16> %x
++}
++
++define <8 x i8> @test_unsigned_v8f64_v8i8(<8 x double> %f) {
++; CHECK-LABEL: test_unsigned_v8f64_v8i8:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d5, v0.d[1]
++; CHECK-NEXT: fcvtzu w10, d0
++; CHECK-NEXT: mov d0, v1.d[1]
++; CHECK-NEXT: mov w8, #255
++; CHECK-NEXT: fcvtzu w12, d1
++; CHECK-NEXT: mov d4, v2.d[1]
++; CHECK-NEXT: fcvtzu w13, d3
++; CHECK-NEXT: fcvtzu w9, d5
++; CHECK-NEXT: fcvtzu w11, d0
++; CHECK-NEXT: cmp w9, #255
++; CHECK-NEXT: csel w9, w9, w8, lo
++; CHECK-NEXT: cmp w10, #255
++; CHECK-NEXT: csel w10, w10, w8, lo
++; CHECK-NEXT: cmp w11, #255
++; CHECK-NEXT: fmov s0, w10
++; CHECK-NEXT: csel w10, w11, w8, lo
++; CHECK-NEXT: cmp w12, #255
++; CHECK-NEXT: csel w11, w12, w8, lo
++; CHECK-NEXT: mov v0.s[1], w9
++; CHECK-NEXT: fcvtzu w9, d4
++; CHECK-NEXT: fmov s1, w11
++; CHECK-NEXT: fcvtzu w11, d2
++; CHECK-NEXT: cmp w9, #255
++; CHECK-NEXT: mov d2, v3.d[1]
++; CHECK-NEXT: mov w12, v0.s[1]
++; CHECK-NEXT: csel w9, w9, w8, lo
++; CHECK-NEXT: mov v1.s[1], w10
++; CHECK-NEXT: cmp w11, #255
++; CHECK-NEXT: csel w10, w11, w8, lo
++; CHECK-NEXT: mov v0.b[1], w12
++; CHECK-NEXT: fmov w11, s1
++; CHECK-NEXT: fmov s4, w10
++; CHECK-NEXT: fcvtzu w10, d2
++; CHECK-NEXT: mov w12, v1.s[1]
++; CHECK-NEXT: mov v0.b[2], w11
++; CHECK-NEXT: mov v4.s[1], w9
++; CHECK-NEXT: cmp w10, #255
++; CHECK-NEXT: csel w9, w10, w8, lo
++; CHECK-NEXT: cmp w13, #255
++; CHECK-NEXT: csel w8, w13, w8, lo
++; CHECK-NEXT: mov v0.b[3], w12
++; CHECK-NEXT: fmov w10, s4
++; CHECK-NEXT: fmov s1, w8
++; CHECK-NEXT: mov w8, v4.s[1]
++; CHECK-NEXT: mov v0.b[4], w10
++; CHECK-NEXT: mov v1.s[1], w9
++; CHECK-NEXT: mov v0.b[5], w8
++; CHECK-NEXT: fmov w8, s1
++; CHECK-NEXT: mov w9, v1.s[1]
++; CHECK-NEXT: mov v0.b[6], w8
++; CHECK-NEXT: mov v0.b[7], w9
++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: ret
++ %x = call <8 x i8> @llvm.fptoui.sat.v8f64.v8i8(<8 x double> %f)
++ ret <8 x i8> %x
++}
++
++define <16 x i8> @test_unsigned_v16f64_v16i8(<16 x double> %f) {
++; CHECK-LABEL: test_unsigned_v16f64_v16i8:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d16, v0.d[1]
++; CHECK-NEXT: fcvtzu w10, d0
++; CHECK-NEXT: mov d0, v1.d[1]
++; CHECK-NEXT: mov w8, #255
++; CHECK-NEXT: fcvtzu w12, d1
++; CHECK-NEXT: mov d1, v2.d[1]
++; CHECK-NEXT: fcvtzu w9, d16
++; CHECK-NEXT: fcvtzu w11, d0
++; CHECK-NEXT: cmp w9, #255
++; CHECK-NEXT: csel w9, w9, w8, lo
++; CHECK-NEXT: cmp w10, #255
++; CHECK-NEXT: csel w10, w10, w8, lo
++; CHECK-NEXT: cmp w11, #255
++; CHECK-NEXT: fmov s0, w10
++; CHECK-NEXT: csel w10, w11, w8, lo
++; CHECK-NEXT: cmp w12, #255
++; CHECK-NEXT: csel w11, w12, w8, lo
++; CHECK-NEXT: mov v0.s[1], w9
++; CHECK-NEXT: fcvtzu w9, d1
++; CHECK-NEXT: fmov s1, w11
++; CHECK-NEXT: fcvtzu w11, d2
++; CHECK-NEXT: cmp w9, #255
++; CHECK-NEXT: mov d2, v3.d[1]
++; CHECK-NEXT: mov w12, v0.s[1]
++; CHECK-NEXT: csel w9, w9, w8, lo
++; CHECK-NEXT: mov v1.s[1], w10
++; CHECK-NEXT: cmp w11, #255
++; CHECK-NEXT: csel w11, w11, w8, lo
++; CHECK-NEXT: fcvtzu w10, d2
++; CHECK-NEXT: mov d2, v4.d[1]
++; CHECK-NEXT: mov v0.b[1], w12
++; CHECK-NEXT: fmov w13, s1
++; CHECK-NEXT: mov w12, v1.s[1]
++; CHECK-NEXT: fmov s1, w11
++; CHECK-NEXT: fcvtzu w11, d3
++; CHECK-NEXT: cmp w10, #255
++; CHECK-NEXT: mov v0.b[2], w13
++; CHECK-NEXT: mov v1.s[1], w9
++; CHECK-NEXT: csel w9, w10, w8, lo
++; CHECK-NEXT: cmp w11, #255
++; CHECK-NEXT: fcvtzu w10, d2
++; CHECK-NEXT: csel w11, w11, w8, lo
++; CHECK-NEXT: mov d2, v5.d[1]
++; CHECK-NEXT: mov v0.b[3], w12
++; CHECK-NEXT: fmov w12, s1
++; CHECK-NEXT: cmp w10, #255
++; CHECK-NEXT: mov w13, v1.s[1]
++; CHECK-NEXT: fmov s1, w11
++; CHECK-NEXT: fcvtzu w11, d4
++; CHECK-NEXT: mov v0.b[4], w12
++; CHECK-NEXT: mov v1.s[1], w9
++; CHECK-NEXT: csel w9, w10, w8, lo
++; CHECK-NEXT: cmp w11, #255
++; CHECK-NEXT: csel w10, w11, w8, lo
++; CHECK-NEXT: mov v0.b[5], w13
++; CHECK-NEXT: fcvtzu w13, d2
++; CHECK-NEXT: fmov w11, s1
++; CHECK-NEXT: mov w12, v1.s[1]
++; CHECK-NEXT: fmov s1, w10
++; CHECK-NEXT: fcvtzu w10, d5
++; CHECK-NEXT: cmp w13, #255
++; CHECK-NEXT: mov v0.b[6], w11
++; CHECK-NEXT: mov d2, v6.d[1]
++; CHECK-NEXT: mov v1.s[1], w9
++; CHECK-NEXT: csel w9, w13, w8, lo
++; CHECK-NEXT: cmp w10, #255
++; CHECK-NEXT: fcvtzu w13, d6
++; CHECK-NEXT: csel w10, w10, w8, lo
++; CHECK-NEXT: mov v0.b[7], w12
++; CHECK-NEXT: fcvtzu w12, d2
++; CHECK-NEXT: fmov w11, s1
++; CHECK-NEXT: fmov s2, w10
++; CHECK-NEXT: mov w10, v1.s[1]
++; CHECK-NEXT: cmp w12, #255
++; CHECK-NEXT: mov d1, v7.d[1]
++; CHECK-NEXT: mov v0.b[8], w11
++; CHECK-NEXT: mov v2.s[1], w9
++; CHECK-NEXT: csel w9, w12, w8, lo
++; CHECK-NEXT: cmp w13, #255
++; CHECK-NEXT: csel w11, w13, w8, lo
++; CHECK-NEXT: fcvtzu w13, d7
++; CHECK-NEXT: mov v0.b[9], w10
++; CHECK-NEXT: fmov w10, s2
++; CHECK-NEXT: fmov s3, w11
++; CHECK-NEXT: fcvtzu w11, d1
++; CHECK-NEXT: mov w12, v2.s[1]
++; CHECK-NEXT: mov v0.b[10], w10
++; CHECK-NEXT: mov v3.s[1], w9
++; CHECK-NEXT: cmp w11, #255
++; CHECK-NEXT: csel w9, w11, w8, lo
++; CHECK-NEXT: cmp w13, #255
++; CHECK-NEXT: csel w8, w13, w8, lo
++; CHECK-NEXT: mov v0.b[11], w12
++; CHECK-NEXT: fmov w10, s3
++; CHECK-NEXT: fmov s1, w8
++; CHECK-NEXT: mov w8, v3.s[1]
++; CHECK-NEXT: mov v0.b[12], w10
++; CHECK-NEXT: mov v1.s[1], w9
++; CHECK-NEXT: mov v0.b[13], w8
++; CHECK-NEXT: fmov w8, s1
++; CHECK-NEXT: mov w9, v1.s[1]
++; CHECK-NEXT: mov v0.b[14], w8
++; CHECK-NEXT: mov v0.b[15], w9
++; CHECK-NEXT: ret
++ %x = call <16 x i8> @llvm.fptoui.sat.v16f64.v16i8(<16 x double> %f)
++ ret <16 x i8> %x
++}
++
++define <8 x i16> @test_unsigned_v8f64_v8i16(<8 x double> %f) {
++; CHECK-LABEL: test_unsigned_v8f64_v8i16:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d5, v0.d[1]
++; CHECK-NEXT: fcvtzu w10, d0
++; CHECK-NEXT: mov d0, v1.d[1]
++; CHECK-NEXT: mov w8, #65535
++; CHECK-NEXT: fcvtzu w12, d1
++; CHECK-NEXT: mov d4, v2.d[1]
++; CHECK-NEXT: fcvtzu w13, d3
++; CHECK-NEXT: fcvtzu w9, d5
++; CHECK-NEXT: fcvtzu w11, d0
++; CHECK-NEXT: cmp w9, w8
++; CHECK-NEXT: csel w9, w9, w8, lo
++; CHECK-NEXT: cmp w10, w8
++; CHECK-NEXT: csel w10, w10, w8, lo
++; CHECK-NEXT: cmp w11, w8
++; CHECK-NEXT: fmov s0, w10
++; CHECK-NEXT: csel w10, w11, w8, lo
++; CHECK-NEXT: cmp w12, w8
++; CHECK-NEXT: csel w11, w12, w8, lo
++; CHECK-NEXT: mov v0.s[1], w9
++; CHECK-NEXT: fcvtzu w9, d4
++; CHECK-NEXT: fmov s1, w11
++; CHECK-NEXT: fcvtzu w11, d2
++; CHECK-NEXT: cmp w9, w8
++; CHECK-NEXT: mov d2, v3.d[1]
++; CHECK-NEXT: mov w12, v0.s[1]
++; CHECK-NEXT: csel w9, w9, w8, lo
++; CHECK-NEXT: mov v1.s[1], w10
++; CHECK-NEXT: cmp w11, w8
++; CHECK-NEXT: csel w10, w11, w8, lo
++; CHECK-NEXT: mov v0.h[1], w12
++; CHECK-NEXT: fmov w11, s1
++; CHECK-NEXT: fmov s4, w10
++; CHECK-NEXT: fcvtzu w10, d2
++; CHECK-NEXT: mov w12, v1.s[1]
++; CHECK-NEXT: mov v0.h[2], w11
++; CHECK-NEXT: mov v4.s[1], w9
++; CHECK-NEXT: cmp w10, w8
++; CHECK-NEXT: csel w9, w10, w8, lo
++; CHECK-NEXT: cmp w13, w8
++; CHECK-NEXT: csel w8, w13, w8, lo
++; CHECK-NEXT: mov v0.h[3], w12
++; CHECK-NEXT: fmov w10, s4
++; CHECK-NEXT: fmov s1, w8
++; CHECK-NEXT: mov w8, v4.s[1]
++; CHECK-NEXT: mov v0.h[4], w10
++; CHECK-NEXT: mov v1.s[1], w9
++; CHECK-NEXT: mov v0.h[5], w8
++; CHECK-NEXT: fmov w8, s1
++; CHECK-NEXT: mov w9, v1.s[1]
++; CHECK-NEXT: mov v0.h[6], w8
++; CHECK-NEXT: mov v0.h[7], w9
++; CHECK-NEXT: ret
++ %x = call <8 x i16> @llvm.fptoui.sat.v8f64.v8i16(<8 x double> %f)
++ ret <8 x i16> %x
++}
++
++define <16 x i16> @test_unsigned_v16f64_v16i16(<16 x double> %f) {
++; CHECK-LABEL: test_unsigned_v16f64_v16i16:
++; CHECK: // %bb.0:
++; CHECK-NEXT: mov d16, v0.d[1]
++; CHECK-NEXT: fcvtzu w9, d0
++; CHECK-NEXT: mov d0, v1.d[1]
++; CHECK-NEXT: mov d17, v2.d[1]
++; CHECK-NEXT: fcvtzu w10, d1
++; CHECK-NEXT: mov d1, v3.d[1]
++; CHECK-NEXT: mov w8, #65535
++; CHECK-NEXT: fcvtzu w12, d2
++; CHECK-NEXT: fcvtzu w11, d16
++; CHECK-NEXT: mov d2, v4.d[1]
++; CHECK-NEXT: fcvtzu w13, d0
++; CHECK-NEXT: fcvtzu w14, d17
++; CHECK-NEXT: fcvtzu w15, d1
++; CHECK-NEXT: fcvtzu w16, d3
++; CHECK-NEXT: cmp w11, w8
++; CHECK-NEXT: mov d1, v5.d[1]
++; CHECK-NEXT: csel w11, w11, w8, lo
++; CHECK-NEXT: cmp w9, w8
++; CHECK-NEXT: csel w9, w9, w8, lo
++; CHECK-NEXT: cmp w13, w8
++; CHECK-NEXT: csel w13, w13, w8, lo
++; CHECK-NEXT: cmp w10, w8
++; CHECK-NEXT: csel w10, w10, w8, lo
++; CHECK-NEXT: cmp w14, w8
++; CHECK-NEXT: csel w14, w14, w8, lo
++; CHECK-NEXT: cmp w12, w8
++; CHECK-NEXT: csel w12, w12, w8, lo
++; CHECK-NEXT: cmp w15, w8
++; CHECK-NEXT: fcvtzu w17, d2
++; CHECK-NEXT: fmov s0, w9
++; CHECK-NEXT: csel w9, w15, w8, lo
++; CHECK-NEXT: fcvtzu w15, d4
++; CHECK-NEXT: cmp w16, w8
++; CHECK-NEXT: fcvtzu w18, d1
++; CHECK-NEXT: csel w16, w16, w8, lo
++; CHECK-NEXT: cmp w17, w8
++; CHECK-NEXT: csel w17, w17, w8, lo
++; CHECK-NEXT: cmp w15, w8
++; CHECK-NEXT: mov v0.s[1], w11
++; CHECK-NEXT: fcvtzu w0, d5
++; CHECK-NEXT: csel w11, w15, w8, lo
++; CHECK-NEXT: fmov s2, w10
++; CHECK-NEXT: cmp w18, w8
++; CHECK-NEXT: mov d4, v6.d[1]
++; CHECK-NEXT: csel w10, w18, w8, lo
++; CHECK-NEXT: cmp w0, w8
++; CHECK-NEXT: fmov s1, w11
++; CHECK-NEXT: csel w11, w0, w8, lo
++; CHECK-NEXT: mov v2.s[1], w13
++; CHECK-NEXT: mov w13, v0.s[1]
++; CHECK-NEXT: fcvtzu w15, d4
++; CHECK-NEXT: mov v1.s[1], w17
++; CHECK-NEXT: fmov s3, w11
++; CHECK-NEXT: mov d4, v7.d[1]
++; CHECK-NEXT: mov v0.h[1], w13
++; CHECK-NEXT: fmov w11, s2
++; CHECK-NEXT: mov v3.s[1], w10
++; CHECK-NEXT: cmp w15, w8
++; CHECK-NEXT: mov w10, v1.s[1]
++; CHECK-NEXT: mov w13, v2.s[1]
++; CHECK-NEXT: fmov s2, w12
++; CHECK-NEXT: mov v0.h[2], w11
++; CHECK-NEXT: fcvtzu w11, d6
++; CHECK-NEXT: csel w12, w15, w8, lo
++; CHECK-NEXT: mov v1.h[1], w10
++; CHECK-NEXT: fmov w10, s3
++; CHECK-NEXT: cmp w11, w8
++; CHECK-NEXT: csel w11, w11, w8, lo
++; CHECK-NEXT: mov v0.h[3], w13
++; CHECK-NEXT: fcvtzu w13, d7
++; CHECK-NEXT: mov v1.h[2], w10
++; CHECK-NEXT: fmov s5, w11
++; CHECK-NEXT: fcvtzu w10, d4
++; CHECK-NEXT: mov w11, v3.s[1]
++; CHECK-NEXT: mov v2.s[1], w14
++; CHECK-NEXT: fmov s3, w16
++; CHECK-NEXT: mov v5.s[1], w12
++; CHECK-NEXT: cmp w10, w8
++; CHECK-NEXT: csel w10, w10, w8, lo
++; CHECK-NEXT: cmp w13, w8
++; CHECK-NEXT: csel w8, w13, w8, lo
++; CHECK-NEXT: fmov w12, s2
++; CHECK-NEXT: mov v1.h[3], w11
++; CHECK-NEXT: fmov w13, s5
++; CHECK-NEXT: mov w14, v2.s[1]
++; CHECK-NEXT: fmov s2, w8
++; CHECK-NEXT: mov w11, v5.s[1]
++; CHECK-NEXT: mov v0.h[4], w12
++; CHECK-NEXT: mov v1.h[4], w13
++; CHECK-NEXT: mov v3.s[1], w9
++; CHECK-NEXT: mov v2.s[1], w10
++; CHECK-NEXT: mov v0.h[5], w14
++; CHECK-NEXT: mov v1.h[5], w11
++; CHECK-NEXT: fmov w8, s3
++; CHECK-NEXT: fmov w9, s2
++; CHECK-NEXT: mov w10, v3.s[1]
++; CHECK-NEXT: mov w11, v2.s[1]
++; CHECK-NEXT: mov v0.h[6], w8
++; CHECK-NEXT: mov v1.h[6], w9
++; CHECK-NEXT: mov v0.h[7], w10
++; CHECK-NEXT: mov v1.h[7], w11
++; CHECK-NEXT: ret
++ %x = call <16 x i16> @llvm.fptoui.sat.v16f64.v16i16(<16 x double> %f)
++ ret <16 x i16> %x
++}
+--
+2.34.1
+
diff --git a/patches/cherry/bf268a05cd9294854ffccc3158c0e673069bed4a.patch b/patches/cherry/bf268a05cd9294854ffccc3158c0e673069bed4a.patch
new file mode 100644
index 0000000..9e87587
--- /dev/null
+++ b/patches/cherry/bf268a05cd9294854ffccc3158c0e673069bed4a.patch
@@ -0,0 +1,609 @@
+From bf268a05cd9294854ffccc3158c0e673069bed4a Mon Sep 17 00:00:00 2001
+From: Cullen Rhodes <cullen.rhodes@arm.com>
+Date: Fri, 22 Jul 2022 07:27:12 +0000
+Subject: [PATCH] [AArch64] Emit vector FP cmp when LE is used with fast-math
+
+Reviewed By: paulwalker-arm
+
+Differential Revision: https://reviews.llvm.org/D130093
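+
+As a hedged aside (this example is ours, not part of the upstream
+commit; the function name is illustrative): with no-NaNs fast-math, a
+lane-wise a <= b on float vectors can now select a single fcmge with
+swapped operands instead of the scalarised fcmp/csetm sequence removed
+in the CHECK lines below. Using clang vector extensions, e.g.
+clang --target=aarch64-linux-gnu -O2 -ffast-math:
+
+  typedef float v4f __attribute__((vector_size(16)));
+  typedef int v4i __attribute__((vector_size(16)));
+
+  // Lane-wise a <= b; each true lane is all-ones. With nnan flags this
+  // should lower to: fcmge v0.4s, v1.4s, v0.4s
+  v4i le_fast(v4f a, v4f b) { return a <= b; }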
+---
+ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9 +
+ .../Target/AArch64/AArch64ISelLowering.cpp | 7 +-
+ .../AArch64/neon-compare-instructions.ll | 346 ++----------------
+ 3 files changed, 43 insertions(+), 319 deletions(-)
+
+diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+index 06c633e45ccd..803278e34db8 100644
+--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
++++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+@@ -11843,6 +11843,9 @@ SDValue DAGCombiner::foldSextSetcc(SDNode *N) {
+ EVT N00VT = N00.getValueType();
+ SDLoc DL(N);
+
++ // Propagate fast-math-flags.
++ SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
++
+ // On some architectures (such as SSE/NEON/etc) the SETCC result type is
+ // the same size as the compared operands. Try to optimize sext(setcc())
+ // if this is the case.
+@@ -12384,6 +12387,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
+ return V;
+
+ if (N0.getOpcode() == ISD::SETCC) {
++ // Propagate fast-math-flags.
++ SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
++
+ // Only do this before legalize for now.
+ if (!LegalOperations && VT.isVector() &&
+ N0.getValueType().getVectorElementType() == MVT::i1) {
+@@ -12575,6 +12581,9 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
+ }
+
+ if (N0.getOpcode() == ISD::SETCC) {
++ // Propagate fast-math-flags.
++ SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
++
+ // For vectors:
+ // aext(setcc) -> vsetcc
+ // aext(setcc) -> truncate(vsetcc)
+diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+index 52f026456f02..1f6ce2d381ae 100644
+--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+@@ -11975,6 +11975,11 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
+ if (IsZero)
+ return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
+ return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
++ case AArch64CC::LE:
++ if (!NoNans)
++ return SDValue();
++ // If we ignore NaNs then we can use the LS implementation.
++ LLVM_FALLTHROUGH;
+ case AArch64CC::LS:
+ if (IsZero)
+ return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
+@@ -12079,7 +12084,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
+ bool ShouldInvert;
+ changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
+
+- bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
++ bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
+ SDValue Cmp =
+ EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
+ if (!Cmp.getNode())
+diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
+index dcb0ca631c5b..ec210b4efc7b 100644
+--- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
++++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
+@@ -4648,17 +4648,7 @@ define <2 x i64> @fcmogt2xdouble_fast(<2 x double> %A, <2 x double> %B) {
+ define <2 x i32> @fcmole2xfloat_fast(<2 x float> %A, <2 x float> %B) {
+ ; CHECK-LABEL: fcmole2xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+-; CHECK-NEXT: mov s2, v1.s[1]
+-; CHECK-NEXT: mov s3, v0.s[1]
+-; CHECK-NEXT: fcmp s3, s2
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: csetm w9, le
+-; CHECK-NEXT: fmov s0, w9
+-; CHECK-NEXT: mov v0.s[1], w8
+-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: fcmge v0.2s, v1.2s, v0.2s
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmole2xfloat_fast:
+@@ -4675,25 +4665,7 @@ define <2 x i32> @fcmole2xfloat_fast(<2 x float> %A, <2 x float> %B) {
+ define <4 x i32> @fcmole4xfloat_fast(<4 x float> %A, <4 x float> %B) {
+ ; CHECK-LABEL: fcmole4xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov s2, v1.s[1]
+-; CHECK-NEXT: mov s3, v0.s[1]
+-; CHECK-NEXT: mov s4, v0.s[2]
+-; CHECK-NEXT: fcmp s3, s2
+-; CHECK-NEXT: mov s3, v1.s[2]
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: mov s1, v1.s[3]
+-; CHECK-NEXT: mov s0, v0.s[3]
+-; CHECK-NEXT: csetm w9, le
+-; CHECK-NEXT: fcmp s4, s3
+-; CHECK-NEXT: fmov s2, w9
+-; CHECK-NEXT: mov v2.s[1], w8
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: mov v2.s[2], w8
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: mov v2.s[3], w8
+-; CHECK-NEXT: mov v0.16b, v2.16b
++; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmole4xfloat_fast:
+@@ -4710,14 +4682,7 @@ define <4 x i32> @fcmole4xfloat_fast(<4 x float> %A, <4 x float> %B) {
+ define <2 x i64> @fcmole2xdouble_fast(<2 x double> %A, <2 x double> %B) {
+ ; CHECK-LABEL: fcmole2xdouble_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov d2, v1.d[1]
+-; CHECK-NEXT: mov d3, v0.d[1]
+-; CHECK-NEXT: fcmp d3, d2
+-; CHECK-NEXT: csetm x8, le
+-; CHECK-NEXT: fcmp d0, d1
+-; CHECK-NEXT: csetm x9, le
+-; CHECK-NEXT: fmov d0, x9
+-; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: fcmge v0.2d, v1.2d, v0.2d
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmole2xdouble_fast:
+@@ -4734,17 +4699,7 @@ define <2 x i64> @fcmole2xdouble_fast(<2 x double> %A, <2 x double> %B) {
+ define <2 x i32> @fcmolt2xfloat_fast(<2 x float> %A, <2 x float> %B) {
+ ; CHECK-LABEL: fcmolt2xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+-; CHECK-NEXT: mov s2, v1.s[1]
+-; CHECK-NEXT: mov s3, v0.s[1]
+-; CHECK-NEXT: fcmp s3, s2
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: csetm w9, lt
+-; CHECK-NEXT: fmov s0, w9
+-; CHECK-NEXT: mov v0.s[1], w8
+-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: fcmgt v0.2s, v1.2s, v0.2s
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmolt2xfloat_fast:
+@@ -4761,25 +4716,7 @@ define <2 x i32> @fcmolt2xfloat_fast(<2 x float> %A, <2 x float> %B) {
+ define <4 x i32> @fcmolt4xfloat_fast(<4 x float> %A, <4 x float> %B) {
+ ; CHECK-LABEL: fcmolt4xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov s2, v1.s[1]
+-; CHECK-NEXT: mov s3, v0.s[1]
+-; CHECK-NEXT: mov s4, v0.s[2]
+-; CHECK-NEXT: fcmp s3, s2
+-; CHECK-NEXT: mov s3, v1.s[2]
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: mov s1, v1.s[3]
+-; CHECK-NEXT: mov s0, v0.s[3]
+-; CHECK-NEXT: csetm w9, lt
+-; CHECK-NEXT: fcmp s4, s3
+-; CHECK-NEXT: fmov s2, w9
+-; CHECK-NEXT: mov v2.s[1], w8
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: mov v2.s[2], w8
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: mov v2.s[3], w8
+-; CHECK-NEXT: mov v0.16b, v2.16b
++; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmolt4xfloat_fast:
+@@ -4796,14 +4733,7 @@ define <4 x i32> @fcmolt4xfloat_fast(<4 x float> %A, <4 x float> %B) {
+ define <2 x i64> @fcmolt2xdouble_fast(<2 x double> %A, <2 x double> %B) {
+ ; CHECK-LABEL: fcmolt2xdouble_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov d2, v1.d[1]
+-; CHECK-NEXT: mov d3, v0.d[1]
+-; CHECK-NEXT: fcmp d3, d2
+-; CHECK-NEXT: csetm x8, lt
+-; CHECK-NEXT: fcmp d0, d1
+-; CHECK-NEXT: csetm x9, lt
+-; CHECK-NEXT: fmov d0, x9
+-; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: fcmgt v0.2d, v1.2d, v0.2d
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmolt2xdouble_fast:
+@@ -5181,17 +5111,7 @@ define <2 x i64> @fcmugt2xdouble_fast(<2 x double> %A, <2 x double> %B) {
+ define <2 x i32> @fcmule2xfloat_fast(<2 x float> %A, <2 x float> %B) {
+ ; CHECK-LABEL: fcmule2xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+-; CHECK-NEXT: mov s2, v1.s[1]
+-; CHECK-NEXT: mov s3, v0.s[1]
+-; CHECK-NEXT: fcmp s3, s2
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: csetm w9, le
+-; CHECK-NEXT: fmov s0, w9
+-; CHECK-NEXT: mov v0.s[1], w8
+-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: fcmge v0.2s, v1.2s, v0.2s
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmule2xfloat_fast:
+@@ -5209,25 +5129,7 @@ define <2 x i32> @fcmule2xfloat_fast(<2 x float> %A, <2 x float> %B) {
+ define <4 x i32> @fcmule4xfloat_fast(<4 x float> %A, <4 x float> %B) {
+ ; CHECK-LABEL: fcmule4xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov s2, v1.s[1]
+-; CHECK-NEXT: mov s3, v0.s[1]
+-; CHECK-NEXT: mov s4, v0.s[2]
+-; CHECK-NEXT: fcmp s3, s2
+-; CHECK-NEXT: mov s3, v1.s[2]
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: mov s1, v1.s[3]
+-; CHECK-NEXT: mov s0, v0.s[3]
+-; CHECK-NEXT: csetm w9, le
+-; CHECK-NEXT: fcmp s4, s3
+-; CHECK-NEXT: fmov s2, w9
+-; CHECK-NEXT: mov v2.s[1], w8
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: mov v2.s[2], w8
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: mov v2.s[3], w8
+-; CHECK-NEXT: mov v0.16b, v2.16b
++; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmule4xfloat_fast:
+@@ -5245,14 +5147,7 @@ define <4 x i32> @fcmule4xfloat_fast(<4 x float> %A, <4 x float> %B) {
+ define <2 x i64> @fcmule2xdouble_fast(<2 x double> %A, <2 x double> %B) {
+ ; CHECK-LABEL: fcmule2xdouble_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov d2, v1.d[1]
+-; CHECK-NEXT: mov d3, v0.d[1]
+-; CHECK-NEXT: fcmp d3, d2
+-; CHECK-NEXT: csetm x8, le
+-; CHECK-NEXT: fcmp d0, d1
+-; CHECK-NEXT: csetm x9, le
+-; CHECK-NEXT: fmov d0, x9
+-; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: fcmge v0.2d, v1.2d, v0.2d
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmule2xdouble_fast:
+@@ -5270,17 +5165,7 @@ define <2 x i64> @fcmule2xdouble_fast(<2 x double> %A, <2 x double> %B) {
+ define <2 x i32> @fcmult2xfloat_fast(<2 x float> %A, <2 x float> %B) {
+ ; CHECK-LABEL: fcmult2xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+-; CHECK-NEXT: mov s2, v1.s[1]
+-; CHECK-NEXT: mov s3, v0.s[1]
+-; CHECK-NEXT: fcmp s3, s2
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: csetm w9, lt
+-; CHECK-NEXT: fmov s0, w9
+-; CHECK-NEXT: mov v0.s[1], w8
+-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: fcmgt v0.2s, v1.2s, v0.2s
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmult2xfloat_fast:
+@@ -5298,25 +5183,7 @@ define <2 x i32> @fcmult2xfloat_fast(<2 x float> %A, <2 x float> %B) {
+ define <4 x i32> @fcmult4xfloat_fast(<4 x float> %A, <4 x float> %B) {
+ ; CHECK-LABEL: fcmult4xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov s2, v1.s[1]
+-; CHECK-NEXT: mov s3, v0.s[1]
+-; CHECK-NEXT: mov s4, v0.s[2]
+-; CHECK-NEXT: fcmp s3, s2
+-; CHECK-NEXT: mov s3, v1.s[2]
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: mov s1, v1.s[3]
+-; CHECK-NEXT: mov s0, v0.s[3]
+-; CHECK-NEXT: csetm w9, lt
+-; CHECK-NEXT: fcmp s4, s3
+-; CHECK-NEXT: fmov s2, w9
+-; CHECK-NEXT: mov v2.s[1], w8
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: mov v2.s[2], w8
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: mov v2.s[3], w8
+-; CHECK-NEXT: mov v0.16b, v2.16b
++; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmult4xfloat_fast:
+@@ -5334,14 +5201,7 @@ define <4 x i32> @fcmult4xfloat_fast(<4 x float> %A, <4 x float> %B) {
+ define <2 x i64> @fcmult2xdouble_fast(<2 x double> %A, <2 x double> %B) {
+ ; CHECK-LABEL: fcmult2xdouble_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov d2, v1.d[1]
+-; CHECK-NEXT: mov d3, v0.d[1]
+-; CHECK-NEXT: fcmp d3, d2
+-; CHECK-NEXT: csetm x8, lt
+-; CHECK-NEXT: fcmp d0, d1
+-; CHECK-NEXT: csetm x9, lt
+-; CHECK-NEXT: fmov d0, x9
+-; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: fcmgt v0.2d, v1.2d, v0.2d
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmult2xdouble_fast:
+@@ -5567,15 +5427,7 @@ define <2 x i64> @fcmogtz2xdouble_fast(<2 x double> %A) {
+ define <2 x i32> @fcmoltz2xfloat_fast(<2 x float> %A) {
+ ; CHECK-LABEL: fcmoltz2xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+-; CHECK-NEXT: mov s1, v0.s[1]
+-; CHECK-NEXT: fcmp s1, #0.0
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: fcmp s0, #0.0
+-; CHECK-NEXT: csetm w9, lt
+-; CHECK-NEXT: fmov s0, w9
+-; CHECK-NEXT: mov v0.s[1], w8
+-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: fcmlt v0.2s, v0.2s, #0.0
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmoltz2xfloat_fast:
+@@ -5592,22 +5444,7 @@ define <2 x i32> @fcmoltz2xfloat_fast(<2 x float> %A) {
+ define <4 x i32> @fcmoltz4xfloat_fast(<4 x float> %A) {
+ ; CHECK-LABEL: fcmoltz4xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov s1, v0.s[1]
+-; CHECK-NEXT: mov s2, v0.s[2]
+-; CHECK-NEXT: fcmp s1, #0.0
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: fcmp s0, #0.0
+-; CHECK-NEXT: mov s0, v0.s[3]
+-; CHECK-NEXT: csetm w9, lt
+-; CHECK-NEXT: fcmp s2, #0.0
+-; CHECK-NEXT: fmov s1, w9
+-; CHECK-NEXT: mov v1.s[1], w8
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: fcmp s0, #0.0
+-; CHECK-NEXT: mov v1.s[2], w8
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: mov v1.s[3], w8
+-; CHECK-NEXT: mov v0.16b, v1.16b
++; CHECK-NEXT: fcmlt v0.4s, v0.4s, #0.0
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmoltz4xfloat_fast:
+@@ -5624,13 +5461,7 @@ define <4 x i32> @fcmoltz4xfloat_fast(<4 x float> %A) {
+ define <2 x i64> @fcmoltz2xdouble_fast(<2 x double> %A) {
+ ; CHECK-LABEL: fcmoltz2xdouble_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov d1, v0.d[1]
+-; CHECK-NEXT: fcmp d1, #0.0
+-; CHECK-NEXT: csetm x8, lt
+-; CHECK-NEXT: fcmp d0, #0.0
+-; CHECK-NEXT: csetm x9, lt
+-; CHECK-NEXT: fmov d0, x9
+-; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: fcmlt v0.2d, v0.2d, #0.0
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmoltz2xdouble_fast:
+@@ -5647,15 +5478,7 @@ define <2 x i64> @fcmoltz2xdouble_fast(<2 x double> %A) {
+ define <2 x i32> @fcmolez2xfloat_fast(<2 x float> %A) {
+ ; CHECK-LABEL: fcmolez2xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+-; CHECK-NEXT: mov s1, v0.s[1]
+-; CHECK-NEXT: fcmp s1, #0.0
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, #0.0
+-; CHECK-NEXT: csetm w9, le
+-; CHECK-NEXT: fmov s0, w9
+-; CHECK-NEXT: mov v0.s[1], w8
+-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: fcmle v0.2s, v0.2s, #0.0
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmolez2xfloat_fast:
+@@ -5672,22 +5495,7 @@ define <2 x i32> @fcmolez2xfloat_fast(<2 x float> %A) {
+ define <4 x i32> @fcmolez4xfloat_fast(<4 x float> %A) {
+ ; CHECK-LABEL: fcmolez4xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov s1, v0.s[1]
+-; CHECK-NEXT: mov s2, v0.s[2]
+-; CHECK-NEXT: fcmp s1, #0.0
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, #0.0
+-; CHECK-NEXT: mov s0, v0.s[3]
+-; CHECK-NEXT: csetm w9, le
+-; CHECK-NEXT: fcmp s2, #0.0
+-; CHECK-NEXT: fmov s1, w9
+-; CHECK-NEXT: mov v1.s[1], w8
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, #0.0
+-; CHECK-NEXT: mov v1.s[2], w8
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: mov v1.s[3], w8
+-; CHECK-NEXT: mov v0.16b, v1.16b
++; CHECK-NEXT: fcmle v0.4s, v0.4s, #0.0
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmolez4xfloat_fast:
+@@ -5704,13 +5512,7 @@ define <4 x i32> @fcmolez4xfloat_fast(<4 x float> %A) {
+ define <2 x i64> @fcmolez2xdouble_fast(<2 x double> %A) {
+ ; CHECK-LABEL: fcmolez2xdouble_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov d1, v0.d[1]
+-; CHECK-NEXT: fcmp d1, #0.0
+-; CHECK-NEXT: csetm x8, le
+-; CHECK-NEXT: fcmp d0, #0.0
+-; CHECK-NEXT: csetm x9, le
+-; CHECK-NEXT: fmov d0, x9
+-; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: fcmle v0.2d, v0.2d, #0.0
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmolez2xdouble_fast:
+@@ -6018,15 +5820,7 @@ define <2 x i64> @fcmugtz2xdouble_fast(<2 x double> %A) {
+ define <2 x i32> @fcmultz2xfloat_fast(<2 x float> %A) {
+ ; CHECK-LABEL: fcmultz2xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+-; CHECK-NEXT: mov s1, v0.s[1]
+-; CHECK-NEXT: fcmp s1, #0.0
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: fcmp s0, #0.0
+-; CHECK-NEXT: csetm w9, lt
+-; CHECK-NEXT: fmov s0, w9
+-; CHECK-NEXT: mov v0.s[1], w8
+-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: fcmlt v0.2s, v0.2s, #0.0
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmultz2xfloat_fast:
+@@ -6044,22 +5838,7 @@ define <2 x i32> @fcmultz2xfloat_fast(<2 x float> %A) {
+ define <4 x i32> @fcmultz4xfloat_fast(<4 x float> %A) {
+ ; CHECK-LABEL: fcmultz4xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov s1, v0.s[1]
+-; CHECK-NEXT: mov s2, v0.s[2]
+-; CHECK-NEXT: fcmp s1, #0.0
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: fcmp s0, #0.0
+-; CHECK-NEXT: mov s0, v0.s[3]
+-; CHECK-NEXT: csetm w9, lt
+-; CHECK-NEXT: fcmp s2, #0.0
+-; CHECK-NEXT: fmov s1, w9
+-; CHECK-NEXT: mov v1.s[1], w8
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: fcmp s0, #0.0
+-; CHECK-NEXT: mov v1.s[2], w8
+-; CHECK-NEXT: csetm w8, lt
+-; CHECK-NEXT: mov v1.s[3], w8
+-; CHECK-NEXT: mov v0.16b, v1.16b
++; CHECK-NEXT: fcmlt v0.4s, v0.4s, #0.0
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmultz4xfloat_fast:
+@@ -6077,13 +5856,7 @@ define <4 x i32> @fcmultz4xfloat_fast(<4 x float> %A) {
+ define <2 x i64> @fcmultz2xdouble_fast(<2 x double> %A) {
+ ; CHECK-LABEL: fcmultz2xdouble_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov d1, v0.d[1]
+-; CHECK-NEXT: fcmp d1, #0.0
+-; CHECK-NEXT: csetm x8, lt
+-; CHECK-NEXT: fcmp d0, #0.0
+-; CHECK-NEXT: csetm x9, lt
+-; CHECK-NEXT: fmov d0, x9
+-; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: fcmlt v0.2d, v0.2d, #0.0
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmultz2xdouble_fast:
+@@ -6102,15 +5875,7 @@ define <2 x i64> @fcmultz2xdouble_fast(<2 x double> %A) {
+ define <2 x i32> @fcmulez2xfloat_fast(<2 x float> %A) {
+ ; CHECK-LABEL: fcmulez2xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+-; CHECK-NEXT: mov s1, v0.s[1]
+-; CHECK-NEXT: fcmp s1, #0.0
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, #0.0
+-; CHECK-NEXT: csetm w9, le
+-; CHECK-NEXT: fmov s0, w9
+-; CHECK-NEXT: mov v0.s[1], w8
+-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
++; CHECK-NEXT: fcmle v0.2s, v0.2s, #0.0
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmulez2xfloat_fast:
+@@ -6128,22 +5893,7 @@ define <2 x i32> @fcmulez2xfloat_fast(<2 x float> %A) {
+ define <4 x i32> @fcmulez4xfloat_fast(<4 x float> %A) {
+ ; CHECK-LABEL: fcmulez4xfloat_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov s1, v0.s[1]
+-; CHECK-NEXT: mov s2, v0.s[2]
+-; CHECK-NEXT: fcmp s1, #0.0
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, #0.0
+-; CHECK-NEXT: mov s0, v0.s[3]
+-; CHECK-NEXT: csetm w9, le
+-; CHECK-NEXT: fcmp s2, #0.0
+-; CHECK-NEXT: fmov s1, w9
+-; CHECK-NEXT: mov v1.s[1], w8
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, #0.0
+-; CHECK-NEXT: mov v1.s[2], w8
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: mov v1.s[3], w8
+-; CHECK-NEXT: mov v0.16b, v1.16b
++; CHECK-NEXT: fcmle v0.4s, v0.4s, #0.0
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmulez4xfloat_fast:
+@@ -6161,13 +5911,7 @@ define <4 x i32> @fcmulez4xfloat_fast(<4 x float> %A) {
+ define <2 x i64> @fcmulez2xdouble_fast(<2 x double> %A) {
+ ; CHECK-LABEL: fcmulez2xdouble_fast:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov d1, v0.d[1]
+-; CHECK-NEXT: fcmp d1, #0.0
+-; CHECK-NEXT: csetm x8, le
+-; CHECK-NEXT: fcmp d0, #0.0
+-; CHECK-NEXT: csetm x9, le
+-; CHECK-NEXT: fmov d0, x9
+-; CHECK-NEXT: mov v0.d[1], x8
++; CHECK-NEXT: fcmle v0.2d, v0.2d, #0.0
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmulez2xdouble_fast:
+@@ -6313,26 +6057,9 @@ define <2 x i64> @fcmunoz2xdouble_fast(<2 x double> %A) {
+ define <4 x i32> @fcmule4xfloat_fast_zext(<4 x float> %A, <4 x float> %B) {
+ ; CHECK-LABEL: fcmule4xfloat_fast_zext:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov s3, v1.s[1]
+-; CHECK-NEXT: mov s4, v0.s[1]
+ ; CHECK-NEXT: movi v2.4s, #1
+-; CHECK-NEXT: fcmp s4, s3
+-; CHECK-NEXT: mov s3, v1.s[2]
+-; CHECK-NEXT: mov s4, v0.s[2]
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: mov s1, v1.s[3]
+-; CHECK-NEXT: mov s0, v0.s[3]
+-; CHECK-NEXT: csetm w9, le
+-; CHECK-NEXT: fcmp s4, s3
+-; CHECK-NEXT: fmov s3, w9
+-; CHECK-NEXT: mov v3.s[1], w8
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: mov v3.s[2], w8
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: mov v3.s[3], w8
+-; CHECK-NEXT: and v0.16b, v3.16b, v2.16b
++; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s
++; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmule4xfloat_fast_zext:
+@@ -6351,25 +6078,8 @@ define <4 x i32> @fcmule4xfloat_fast_zext(<4 x float> %A, <4 x float> %B) {
+ define <4 x i1> @fcmule4xfloat_fast_aext(<4 x float> %A, <4 x float> %B) {
+ ; CHECK-LABEL: fcmule4xfloat_fast_aext:
+ ; CHECK: // %bb.0:
+-; CHECK-NEXT: mov s2, v1.s[1]
+-; CHECK-NEXT: mov s3, v0.s[1]
+-; CHECK-NEXT: fcmp s3, s2
+-; CHECK-NEXT: mov s2, v1.s[2]
+-; CHECK-NEXT: mov s3, v0.s[2]
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: mov s1, v1.s[3]
+-; CHECK-NEXT: mov s0, v0.s[3]
+-; CHECK-NEXT: csetm w9, le
+-; CHECK-NEXT: fcmp s3, s2
+-; CHECK-NEXT: fmov s4, w9
+-; CHECK-NEXT: mov v4.s[1], w8
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: fcmp s0, s1
+-; CHECK-NEXT: mov v4.s[2], w8
+-; CHECK-NEXT: csetm w8, le
+-; CHECK-NEXT: mov v4.s[3], w8
+-; CHECK-NEXT: xtn v0.4h, v4.4s
++; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s
++; CHECK-NEXT: xtn v0.4h, v0.4s
+ ; CHECK-NEXT: ret
+ ;
+ ; GISEL-LABEL: fcmule4xfloat_fast_aext:
+--
+2.34.1
+
diff --git a/patches/cherry/d9633d149022054bdac90bd3d03a240dbdb46f7e.patch b/patches/cherry/d9633d149022054bdac90bd3d03a240dbdb46f7e.patch
new file mode 100644
index 0000000..51e504b
--- /dev/null
+++ b/patches/cherry/d9633d149022054bdac90bd3d03a240dbdb46f7e.patch
@@ -0,0 +1,408 @@
+From d9633d149022054bdac90bd3d03a240dbdb46f7e Mon Sep 17 00:00:00 2001
+From: David Green <david.green@arm.com>
+Date: Mon, 7 Mar 2022 09:42:54 +0000
+Subject: [PATCH] [AArch64] Turn truncating buildvectors into truncates
+
+When lowering large v16f32->v16i8 fp_to_si_sat, the fp_to_si_sat node is
+split several times, creating an illegal v4i8 concat that gets expanded
+into a BUILD_VECTOR. After some combining and other legalisation, it
+ends up as a buildvector that extracts from 4 vectors, looking like
+BUILDVECTOR(a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3). That is
+really a v16i32->v16i8 truncate in disguise.
+
+This adds a ReconstructTruncateFromBuildVector method to detect the
+pattern, converting it back into the legal "concat(trunc(concat(trunc(a),
+trunc(b))), trunc(concat(trunc(c), trunc(d))))" tree. The extracted
+nodes could also be v4i16, in which case the truncates are not needed.
+All those truncates and concats then become uzip1's, which is much
+better than expanding by moving vector lanes around.
+
+Differential Revision: https://reviews.llvm.org/D119469
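+
+To make the resulting tree concrete, a hedged NEON-intrinsics sketch
+(ours, not from the commit; the function name is illustrative) of the
+uzp1 chain for four v4i32 inputs. On little-endian AArch64 each uzp1
+keeps the even-numbered (low-half) elements, so the chain computes the
+16-lane truncate:
+
+  #include <arm_neon.h>
+
+  uint8x16_t trunc_v16i32_v16i8(uint32x4_t a, uint32x4_t b,
+                                uint32x4_t c, uint32x4_t d) {
+    // uzp1 on the u16 view concatenates trunc(a) and trunc(b).
+    uint16x8_t lo = vuzp1q_u16(vreinterpretq_u16_u32(a),
+                               vreinterpretq_u16_u32(b));
+    uint16x8_t hi = vuzp1q_u16(vreinterpretq_u16_u32(c),
+                               vreinterpretq_u16_u32(d));
+    // A final uzp1 on the u8 view narrows the two v8i16 to one v16i8.
+    return vuzp1q_u8(vreinterpretq_u8_u16(lo), vreinterpretq_u8_u16(hi));
+  }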
+---
+ .../Target/AArch64/AArch64ISelLowering.cpp | 56 ++++++++
+ .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 57 ++------
+ .../test/CodeGen/AArch64/fptoui-sat-vector.ll | 51 ++-----
+ .../CodeGen/AArch64/neon-extracttruncate.ll | 133 ++----------------
+ 4 files changed, 92 insertions(+), 205 deletions(-)
+
+diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+index 51f17b37a8d6..dd421970e99f 100644
+--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+@@ -9252,6 +9252,56 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+ return true;
+ }
+
++// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
++// v4i32s. This is really a truncate, which we can construct out of (legal)
++// concats and truncate nodes.
++static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
++ if (V.getValueType() != MVT::v16i8)
++ return SDValue();
++ assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
++
++ for (unsigned X = 0; X < 4; X++) {
++ // Check the first item in each group is an extract from lane 0 of a v4i32
++ // or v4i16.
++ SDValue BaseExt = V.getOperand(X * 4);
++ if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
++ (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
++ BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
++ !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
++ BaseExt.getConstantOperandVal(1) != 0)
++ return SDValue();
++ SDValue Base = BaseExt.getOperand(0);
++ // And check the other items are extracts from the same vector.
++ for (unsigned Y = 1; Y < 4; Y++) {
++ SDValue Ext = V.getOperand(X * 4 + Y);
++ if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
++ Ext.getOperand(0) != Base ||
++ !isa<ConstantSDNode>(Ext.getOperand(1)) ||
++ Ext.getConstantOperandVal(1) != Y)
++ return SDValue();
++ }
++ }
++
++ // Turn the buildvector into a series of truncates and concats, which will
++ // become uzp1's. Any v4i32s we found get truncated to v4i16, which are
++ // concatenated together to produce 2 v8i16. These are both truncated and
++ // concatenated together.
++ SDLoc DL(V);
++ SDValue Trunc[4] = {
++ V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
++ V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
++ for (int I = 0; I < 4; I++)
++ if (Trunc[I].getValueType() == MVT::v4i32)
++ Trunc[I] = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, Trunc[I]);
++ SDValue Concat0 =
++ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
++ SDValue Concat1 =
++ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
++ SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
++ SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
++ return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
++}
++
+ /// Check if a vector shuffle corresponds to a DUP instructions with a larger
+ /// element width than the vector lane type. If that is the case the function
+ /// returns true and writes the value of the DUP instruction lane operand into
+@@ -10871,6 +10921,12 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+ return SDValue();
+ }
+
++ // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
++ // v4i32s. This is really a truncate, which we can construct out of (legal)
++ // concats and truncate nodes.
++ if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
++ return M;
++
+ // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
+ if (NumElts >= 4) {
+ if (SDValue shuffle = ReconstructShuffle(Op, DAG))
+diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+index ebfe8e4a20d0..244c65312e0e 100644
+--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
++++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+@@ -3004,55 +3004,22 @@ define <16 x i8> @test_signed_v16f32_v16i8(<16 x float> %f) {
+ ; CHECK-LABEL: test_signed_v16f32_v16i8:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: movi v4.4s, #127
++; CHECK-NEXT: fcvtzs v3.4s, v3.4s
++; CHECK-NEXT: fcvtzs v2.4s, v2.4s
++; CHECK-NEXT: fcvtzs v1.4s, v1.4s
+ ; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+ ; CHECK-NEXT: mvni v5.4s, #127
+-; CHECK-NEXT: fcvtzs v1.4s, v1.4s
+-; CHECK-NEXT: fcvtzs v2.4s, v2.4s
+-; CHECK-NEXT: smin v0.4s, v0.4s, v4.4s
+-; CHECK-NEXT: smin v1.4s, v1.4s, v4.4s
++; CHECK-NEXT: smin v3.4s, v3.4s, v4.4s
+ ; CHECK-NEXT: smin v2.4s, v2.4s, v4.4s
+-; CHECK-NEXT: smax v0.4s, v0.4s, v5.4s
+-; CHECK-NEXT: smax v1.4s, v1.4s, v5.4s
+-; CHECK-NEXT: smax v2.4s, v2.4s, v5.4s
+-; CHECK-NEXT: xtn v6.4h, v0.4s
+-; CHECK-NEXT: umov w8, v6.h[0]
+-; CHECK-NEXT: umov w9, v6.h[1]
+-; CHECK-NEXT: xtn v1.4h, v1.4s
+-; CHECK-NEXT: fmov s0, w8
+-; CHECK-NEXT: umov w8, v6.h[2]
+-; CHECK-NEXT: mov v0.b[1], w9
+-; CHECK-NEXT: mov v0.b[2], w8
+-; CHECK-NEXT: umov w8, v6.h[3]
+-; CHECK-NEXT: mov v0.b[3], w8
+-; CHECK-NEXT: umov w8, v1.h[0]
+-; CHECK-NEXT: mov v0.b[4], w8
+-; CHECK-NEXT: umov w8, v1.h[1]
+-; CHECK-NEXT: mov v0.b[5], w8
+-; CHECK-NEXT: umov w8, v1.h[2]
+-; CHECK-NEXT: mov v0.b[6], w8
+-; CHECK-NEXT: umov w8, v1.h[3]
+-; CHECK-NEXT: xtn v1.4h, v2.4s
+-; CHECK-NEXT: fcvtzs v2.4s, v3.4s
+-; CHECK-NEXT: mov v0.b[7], w8
+-; CHECK-NEXT: umov w8, v1.h[0]
+-; CHECK-NEXT: smin v2.4s, v2.4s, v4.4s
+-; CHECK-NEXT: mov v0.b[8], w8
+-; CHECK-NEXT: umov w8, v1.h[1]
++; CHECK-NEXT: smin v1.4s, v1.4s, v4.4s
++; CHECK-NEXT: smin v0.4s, v0.4s, v4.4s
++; CHECK-NEXT: smax v3.4s, v3.4s, v5.4s
+ ; CHECK-NEXT: smax v2.4s, v2.4s, v5.4s
+-; CHECK-NEXT: mov v0.b[9], w8
+-; CHECK-NEXT: umov w8, v1.h[2]
+-; CHECK-NEXT: mov v0.b[10], w8
+-; CHECK-NEXT: umov w8, v1.h[3]
+-; CHECK-NEXT: xtn v1.4h, v2.4s
+-; CHECK-NEXT: mov v0.b[11], w8
+-; CHECK-NEXT: umov w8, v1.h[0]
+-; CHECK-NEXT: mov v0.b[12], w8
+-; CHECK-NEXT: umov w8, v1.h[1]
+-; CHECK-NEXT: mov v0.b[13], w8
+-; CHECK-NEXT: umov w8, v1.h[2]
+-; CHECK-NEXT: mov v0.b[14], w8
+-; CHECK-NEXT: umov w8, v1.h[3]
+-; CHECK-NEXT: mov v0.b[15], w8
++; CHECK-NEXT: smax v1.4s, v1.4s, v5.4s
++; CHECK-NEXT: smax v0.4s, v0.4s, v5.4s
++; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h
++; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
++; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b
+ ; CHECK-NEXT: ret
+ %x = call <16 x i8> @llvm.fptosi.sat.v16f32.v16i8(<16 x float> %f)
+ ret <16 x i8> %x
+diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+index cbb8b8a51126..d8d4b6f8b98c 100644
+--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
++++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+@@ -2515,50 +2515,17 @@ define <16 x i8> @test_unsigned_v16f32_v16i8(<16 x float> %f) {
+ ; CHECK-LABEL: test_unsigned_v16f32_v16i8:
+ ; CHECK: // %bb.0:
+ ; CHECK-NEXT: movi v4.2d, #0x0000ff000000ff
+-; CHECK-NEXT: fcvtzu v0.4s, v0.4s
+-; CHECK-NEXT: fcvtzu v1.4s, v1.4s
++; CHECK-NEXT: fcvtzu v3.4s, v3.4s
+ ; CHECK-NEXT: fcvtzu v2.4s, v2.4s
+-; CHECK-NEXT: umin v0.4s, v0.4s, v4.4s
+-; CHECK-NEXT: umin v1.4s, v1.4s, v4.4s
+-; CHECK-NEXT: umin v2.4s, v2.4s, v4.4s
+-; CHECK-NEXT: xtn v5.4h, v0.4s
+-; CHECK-NEXT: xtn v1.4h, v1.4s
+-; CHECK-NEXT: umov w8, v5.h[0]
+-; CHECK-NEXT: umov w9, v5.h[1]
+-; CHECK-NEXT: fmov s0, w8
+-; CHECK-NEXT: umov w8, v5.h[2]
+-; CHECK-NEXT: mov v0.b[1], w9
+-; CHECK-NEXT: mov v0.b[2], w8
+-; CHECK-NEXT: umov w8, v5.h[3]
+-; CHECK-NEXT: mov v0.b[3], w8
+-; CHECK-NEXT: umov w8, v1.h[0]
+-; CHECK-NEXT: mov v0.b[4], w8
+-; CHECK-NEXT: umov w8, v1.h[1]
+-; CHECK-NEXT: mov v0.b[5], w8
+-; CHECK-NEXT: umov w8, v1.h[2]
+-; CHECK-NEXT: mov v0.b[6], w8
+-; CHECK-NEXT: umov w8, v1.h[3]
+-; CHECK-NEXT: xtn v1.4h, v2.4s
+-; CHECK-NEXT: fcvtzu v2.4s, v3.4s
+-; CHECK-NEXT: mov v0.b[7], w8
+-; CHECK-NEXT: umov w8, v1.h[0]
++; CHECK-NEXT: fcvtzu v1.4s, v1.4s
++; CHECK-NEXT: fcvtzu v0.4s, v0.4s
++; CHECK-NEXT: umin v3.4s, v3.4s, v4.4s
+ ; CHECK-NEXT: umin v2.4s, v2.4s, v4.4s
+-; CHECK-NEXT: mov v0.b[8], w8
+-; CHECK-NEXT: umov w8, v1.h[1]
+-; CHECK-NEXT: mov v0.b[9], w8
+-; CHECK-NEXT: umov w8, v1.h[2]
+-; CHECK-NEXT: mov v0.b[10], w8
+-; CHECK-NEXT: umov w8, v1.h[3]
+-; CHECK-NEXT: xtn v1.4h, v2.4s
+-; CHECK-NEXT: mov v0.b[11], w8
+-; CHECK-NEXT: umov w8, v1.h[0]
+-; CHECK-NEXT: mov v0.b[12], w8
+-; CHECK-NEXT: umov w8, v1.h[1]
+-; CHECK-NEXT: mov v0.b[13], w8
+-; CHECK-NEXT: umov w8, v1.h[2]
+-; CHECK-NEXT: mov v0.b[14], w8
+-; CHECK-NEXT: umov w8, v1.h[3]
+-; CHECK-NEXT: mov v0.b[15], w8
++; CHECK-NEXT: umin v1.4s, v1.4s, v4.4s
++; CHECK-NEXT: umin v0.4s, v0.4s, v4.4s
++; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h
++; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
++; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b
+ ; CHECK-NEXT: ret
+ %x = call <16 x i8> @llvm.fptoui.sat.v16f32.v16i8(<16 x float> %f)
+ ret <16 x i8> %x
+diff --git a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
+index 14cc333120c7..dd7dd44bedf7 100644
+--- a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
++++ b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
+@@ -84,43 +84,13 @@ entry:
+ define <16 x i8> @extract_4_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
+ ; CHECK-LABEL: extract_4_v4i16:
+ ; CHECK: // %bb.0: // %entry
+-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+-; CHECK-NEXT: umov w9, v0.h[0]
+-; CHECK-NEXT: umov w10, v0.h[1]
+-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+ ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+-; CHECK-NEXT: umov w8, v2.h[0]
++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+ ; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
+-; CHECK-NEXT: fmov s4, w9
+-; CHECK-NEXT: umov w9, v0.h[2]
+-; CHECK-NEXT: mov v4.b[1], w10
+-; CHECK-NEXT: umov w10, v0.h[3]
+-; CHECK-NEXT: mov v4.b[2], w9
+-; CHECK-NEXT: umov w9, v1.h[0]
+-; CHECK-NEXT: mov v4.b[3], w10
+-; CHECK-NEXT: umov w10, v1.h[1]
+-; CHECK-NEXT: mov v4.b[4], w9
+-; CHECK-NEXT: umov w9, v1.h[2]
+-; CHECK-NEXT: mov v4.b[5], w10
+-; CHECK-NEXT: umov w10, v1.h[3]
+-; CHECK-NEXT: mov v4.b[6], w9
+-; CHECK-NEXT: umov w9, v2.h[1]
+-; CHECK-NEXT: mov v4.b[7], w10
+-; CHECK-NEXT: mov v4.b[8], w8
+-; CHECK-NEXT: umov w8, v2.h[2]
+-; CHECK-NEXT: mov v4.b[9], w9
+-; CHECK-NEXT: umov w9, v2.h[3]
+-; CHECK-NEXT: mov v4.b[10], w8
+-; CHECK-NEXT: umov w8, v3.h[0]
+-; CHECK-NEXT: mov v4.b[11], w9
+-; CHECK-NEXT: umov w9, v3.h[1]
+-; CHECK-NEXT: mov v4.b[12], w8
+-; CHECK-NEXT: umov w8, v3.h[2]
+-; CHECK-NEXT: mov v4.b[13], w9
+-; CHECK-NEXT: umov w9, v3.h[3]
+-; CHECK-NEXT: mov v4.b[14], w8
+-; CHECK-NEXT: mov v4.b[15], w9
+-; CHECK-NEXT: mov v0.16b, v4.16b
++; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
++; CHECK-NEXT: mov v2.d[1], v3.d[0]
++; CHECK-NEXT: mov v0.d[1], v1.d[0]
++; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b
+ ; CHECK-NEXT: ret
+ entry:
+ %a0 = extractelement <4 x i16> %a, i32 0
+@@ -177,36 +147,9 @@ entry:
+ define <16 x i8> @extract_4_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
+ ; CHECK-LABEL: extract_4_v4i32:
+ ; CHECK: // %bb.0: // %entry
+-; CHECK-NEXT: mov w8, v0.s[1]
+-; CHECK-NEXT: mov w9, v0.s[2]
+-; CHECK-NEXT: mov w10, v0.s[3]
+-; CHECK-NEXT: mov v0.b[1], w8
+-; CHECK-NEXT: fmov w8, s1
+-; CHECK-NEXT: mov v0.b[2], w9
+-; CHECK-NEXT: mov w9, v1.s[1]
+-; CHECK-NEXT: mov v0.b[3], w10
+-; CHECK-NEXT: mov v0.b[4], w8
+-; CHECK-NEXT: mov w8, v1.s[2]
+-; CHECK-NEXT: mov v0.b[5], w9
+-; CHECK-NEXT: mov w9, v1.s[3]
+-; CHECK-NEXT: mov v0.b[6], w8
+-; CHECK-NEXT: fmov w8, s2
+-; CHECK-NEXT: mov v0.b[7], w9
+-; CHECK-NEXT: mov w9, v2.s[1]
+-; CHECK-NEXT: mov v0.b[8], w8
+-; CHECK-NEXT: mov w8, v2.s[2]
+-; CHECK-NEXT: mov v0.b[9], w9
+-; CHECK-NEXT: mov w9, v2.s[3]
+-; CHECK-NEXT: mov v0.b[10], w8
+-; CHECK-NEXT: fmov w8, s3
+-; CHECK-NEXT: mov v0.b[11], w9
+-; CHECK-NEXT: mov w9, v3.s[1]
+-; CHECK-NEXT: mov v0.b[12], w8
+-; CHECK-NEXT: mov w8, v3.s[2]
+-; CHECK-NEXT: mov v0.b[13], w9
+-; CHECK-NEXT: mov w9, v3.s[3]
+-; CHECK-NEXT: mov v0.b[14], w8
+-; CHECK-NEXT: mov v0.b[15], w9
++; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h
++; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
++; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b
+ ; CHECK-NEXT: ret
+ entry:
+ %a0 = extractelement <4 x i32> %a, i32 0
+@@ -263,41 +206,12 @@ entry:
+ define <16 x i8> @extract_4_mixed(<4 x i16> %a, <4 x i32> %b, <4 x i32> %c, <4 x i16> %d) {
+ ; CHECK-LABEL: extract_4_mixed:
+ ; CHECK: // %bb.0: // %entry
+-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+-; CHECK-NEXT: umov w8, v0.h[0]
+-; CHECK-NEXT: umov w9, v0.h[1]
++; CHECK-NEXT: xtn v2.4h, v2.4s
+ ; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
+-; CHECK-NEXT: fmov s4, w8
+-; CHECK-NEXT: umov w8, v0.h[2]
+-; CHECK-NEXT: mov v4.b[1], w9
+-; CHECK-NEXT: umov w9, v0.h[3]
+-; CHECK-NEXT: mov v4.b[2], w8
+-; CHECK-NEXT: fmov w8, s1
+-; CHECK-NEXT: mov v4.b[3], w9
+-; CHECK-NEXT: mov w9, v1.s[1]
+-; CHECK-NEXT: mov v4.b[4], w8
+-; CHECK-NEXT: mov w8, v1.s[2]
+-; CHECK-NEXT: mov v4.b[5], w9
+-; CHECK-NEXT: mov w9, v1.s[3]
+-; CHECK-NEXT: mov v4.b[6], w8
+-; CHECK-NEXT: fmov w8, s2
+-; CHECK-NEXT: mov v4.b[7], w9
+-; CHECK-NEXT: mov w9, v2.s[1]
+-; CHECK-NEXT: mov v4.b[8], w8
+-; CHECK-NEXT: mov w8, v2.s[2]
+-; CHECK-NEXT: mov v4.b[9], w9
+-; CHECK-NEXT: mov w9, v2.s[3]
+-; CHECK-NEXT: mov v4.b[10], w8
+-; CHECK-NEXT: umov w8, v3.h[0]
+-; CHECK-NEXT: mov v4.b[11], w9
+-; CHECK-NEXT: umov w9, v3.h[1]
+-; CHECK-NEXT: mov v4.b[12], w8
+-; CHECK-NEXT: umov w8, v3.h[2]
+-; CHECK-NEXT: mov v4.b[13], w9
+-; CHECK-NEXT: umov w9, v3.h[3]
+-; CHECK-NEXT: mov v4.b[14], w8
+-; CHECK-NEXT: mov v4.b[15], w9
+-; CHECK-NEXT: mov v0.16b, v4.16b
++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
++; CHECK-NEXT: xtn2 v0.8h, v1.4s
++; CHECK-NEXT: mov v2.d[1], v3.d[0]
++; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b
+ ; CHECK-NEXT: ret
+ entry:
+ %a0 = extractelement <4 x i16> %a, i32 0
+@@ -440,25 +354,8 @@ entry:
+ define <16 x i8> @extract_4_v4i32_one(<4 x i32> %a) {
+ ; CHECK-LABEL: extract_4_v4i32_one:
+ ; CHECK: // %bb.0: // %entry
+-; CHECK-NEXT: mov w8, v0.s[1]
+-; CHECK-NEXT: fmov w9, s0
+-; CHECK-NEXT: mov w10, v0.s[2]
+-; CHECK-NEXT: mov w11, v0.s[3]
+-; CHECK-NEXT: mov v0.b[1], w8
+-; CHECK-NEXT: mov v0.b[2], w10
+-; CHECK-NEXT: mov v0.b[3], w11
+-; CHECK-NEXT: mov v0.b[4], w9
+-; CHECK-NEXT: mov v0.b[5], w8
+-; CHECK-NEXT: mov v0.b[6], w10
+-; CHECK-NEXT: mov v0.b[7], w11
+-; CHECK-NEXT: mov v0.b[8], w9
+-; CHECK-NEXT: mov v0.b[9], w8
+-; CHECK-NEXT: mov v0.b[10], w10
+-; CHECK-NEXT: mov v0.b[11], w11
+-; CHECK-NEXT: mov v0.b[12], w9
+-; CHECK-NEXT: mov v0.b[13], w8
+-; CHECK-NEXT: mov v0.b[14], w10
+-; CHECK-NEXT: mov v0.b[15], w11
++; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
++; CHECK-NEXT: uzp1 v0.16b, v0.16b, v0.16b
+ ; CHECK-NEXT: ret
+ entry:
+ %a0 = extractelement <4 x i32> %a, i32 0
+--
+2.34.1
+