diff options
author | Pavel Iliin <Pavel.Iliin@arm.com> | 2022-10-19 14:07:26 +0100 |
---|---|---|
committer | Pirama Arumuga Nainar <pirama@google.com> | 2022-11-30 05:04:27 +0000 |
commit | 91fdeab43d29b1f228113859da8ee238bc8c2f16 (patch) | |
tree | 28d683a8957973b0291ba54157248813208d755a | |
parent | ecc27f8f2bfef88a9a579b39a64898326e36bfc0 (diff) | |
download | llvm_android-llvm-r450784.tar.gz |
[patches] Cherry-pick CLs for: truncating buildvectors into truncates,
support for FMA intrinsics to shouldSinkOperands, vector FP cmp, maximum
VF with shouldMaximizeVectorBandwidth
(tag: llvm-r450784)
Change-Id: If19bb09ce7675c18bb07586d215a61c82e4dbac6
-rw-r--r-- | patches/PATCHES.json | 140 | ||||
-rw-r--r-- | patches/cherry/786c687810a5e3db4c64312018de25c65527c40c.patch | 245 | ||||
-rw-r--r-- | patches/cherry/7a605ab7bfbc681c34335684f45b7da32d495db1.patch | 727 | ||||
-rw-r--r-- | patches/cherry/84ccd015e7dd3ca57c4a9366ecd2b9a7430f505d.patch | 509 | ||||
-rw-r--r-- | patches/cherry/86617256864ebcbda03b6ce843deeb6a41a85800.patch | 206 | ||||
-rw-r--r--[-rwxr-xr-x] | patches/cherry/AArch64-Use-Tbl.patch | 0 | ||||
-rw-r--r-- | patches/cherry/Loop-Vectorizer-shouldMaximizeVectorBandwidth.patch | 527 | ||||
-rw-r--r-- | patches/cherry/a8de8cab7006bc885804e8a2c0a6902702521cfe.patch | 1910 | ||||
-rw-r--r-- | patches/cherry/a9a012086a917dff367bb63de2d63782b23111fc.patch | 72 | ||||
-rw-r--r-- | patches/cherry/bb362d890f0d51c250818711d4a9b0b51cea7bc6.patch | 1507 | ||||
-rw-r--r-- | patches/cherry/bf268a05cd9294854ffccc3158c0e673069bed4a.patch | 609 | ||||
-rw-r--r-- | patches/cherry/d9633d149022054bdac90bd3d03a240dbdb46f7e.patch | 408 |
12 files changed, 6860 insertions, 0 deletions
diff --git a/patches/PATCHES.json b/patches/PATCHES.json index 497732f..382e45e 100644 --- a/patches/PATCHES.json +++ b/patches/PATCHES.json @@ -1094,5 +1094,145 @@ "from": 450784, "until": null } + }, + { + "metadata": { + "info": [], + "title": "[UPSTREAM] [AArch64] Add extra fptoint_sat tests for larger than legal types. NFC" + }, + "platforms": [ + "android" + ], + "rel_patch_path": "cherry/bb362d890f0d51c250818711d4a9b0b51cea7bc6.patch", + "version_range": { + "from": 450784, + "until": null + } + }, + { + "metadata": { + "info": [], + "title": "[UPSTREAM] [AArch64] Use simd mov to materialize big fp constants" + }, + "platforms": [ + "android" + ], + "rel_patch_path": "cherry/7a605ab7bfbc681c34335684f45b7da32d495db1.patch", + "version_range": { + "from": 450784, + "until": null + } + }, + { + "metadata": { + "info": [], + "title": "[UPSTREAM] [AArch64] Some tests to show reconstructing truncates. NFC" + }, + "platforms": [ + "android" + ], + "rel_patch_path": "cherry/84ccd015e7dd3ca57c4a9366ecd2b9a7430f505d.patch", + "version_range": { + "from": 450784, + "until": null + } + }, + { + "metadata": { + "info": [], + "title": "[UPSTREAM] [AArch64] Turn truncating buildvectors into truncates." + }, + "platforms": [ + "android" + ], + "rel_patch_path": "cherry/d9633d149022054bdac90bd3d03a240dbdb46f7e.patch", + "version_range": { + "from": 450784, + "until": null + } + }, + { + "metadata": { + "info": [], + "title": "[UPSTREAM] [AArch64] Add tests with free shuffles for indexed fma variants." + }, + "platforms": [ + "android" + ], + "rel_patch_path": "cherry/86617256864ebcbda03b6ce843deeb6a41a85800.patch", + "version_range": { + "from": 450784, + "until": null + } + }, + { + "metadata": { + "info": [], + "title": "[UPSTREAM] [AArch64] Add additional tests for sinking free shuffles for FMAs." 
+ }, + "platforms": [ + "android" + ], + "rel_patch_path": "cherry/a9a012086a917dff367bb63de2d63782b23111fc.patch", + "version_range": { + "from": 450784, + "until": null + } + }, + { + "metadata": { + "info": [], + "title": "[UPSTRREAM] [AArch64] Add support for FMA intrinsics to shouldSinkOperands." + }, + "platforms": [ + "android" + ], + "rel_patch_path": "cherry/786c687810a5e3db4c64312018de25c65527c40c.patch", + "version_range": { + "from": 450784, + "until": null + } + }, + { + "metadata": { + "info": [], + "title": "[UPSTRREAM] [AArch64] Add fcmp fast math tests" + }, + "platforms": [ + "android" + ], + "rel_patch_path": "cherry/a8de8cab7006bc885804e8a2c0a6902702521cfe.patch", + "version_range": { + "from": 450784, + "until": null + } + }, + { + "metadata": { + "info": [], + "title": "[UPSTRREAM] [PATCH] [AArch64] Emit vector FP cmp when LE is used with fast-math" + }, + "platforms": [ + "android" + ], + "rel_patch_path": "cherry/bf268a05cd9294854ffccc3158c0e673069bed4a.patch", + "version_range": { + "from": 450784, + "until": null + } + }, + { + "metadata": { + "info": [], + "title": "[MERGED] [UPSTREAM] [AArch64] Set maximum VF with shouldMaximizeVectorBandwidth" + }, + "platforms": [ + "android" + ], + "rel_patch_path": "cherry/Loop-Vectorizer-shouldMaximizeVectorBandwidth.patch", + "version_range": { + "from": 450784, + "until": null + } } ] diff --git a/patches/cherry/786c687810a5e3db4c64312018de25c65527c40c.patch b/patches/cherry/786c687810a5e3db4c64312018de25c65527c40c.patch new file mode 100644 index 0000000..3e06f7b --- /dev/null +++ b/patches/cherry/786c687810a5e3db4c64312018de25c65527c40c.patch @@ -0,0 +1,245 @@ +From 786c687810a5e3db4c64312018de25c65527c40c Mon Sep 17 00:00:00 2001 +From: Florian Hahn <flo@fhahn.com> +Date: Fri, 27 May 2022 10:37:02 +0100 +Subject: [PATCH] [AArch64] Add support for FMA intrinsics to + shouldSinkOperands. 
+ +If the fma operates on a legal vector type, the indexed variants can be +used, if the second operand is a splat of a valid index. + +Reviewed By: dmgreen + +Differential Revision: https://reviews.llvm.org/D126234 +--- + .../Target/AArch64/AArch64ISelLowering.cpp | 6 +- + .../AArch64/sink-free-instructions.ll | 127 +++++++++++------- + 2 files changed, 81 insertions(+), 52 deletions(-) + +diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +index e31a58da0831..d31008496ea4 100644 +--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp ++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +@@ -12545,6 +12545,11 @@ bool AArch64TargetLowering::shouldSinkOperands( + } + LLVM_FALLTHROUGH; + ++ case Intrinsic::fma: ++ if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() && ++ !Subtarget->hasFullFP16()) ++ return false; ++ LLVM_FALLTHROUGH; + case Intrinsic::aarch64_neon_sqdmull: + case Intrinsic::aarch64_neon_sqdmulh: + case Intrinsic::aarch64_neon_sqrdmulh: +@@ -12568,7 +12573,6 @@ bool AArch64TargetLowering::shouldSinkOperands( + Ops.push_back(&II->getArgOperandUse(0)); + Ops.push_back(&II->getArgOperandUse(1)); + return true; +- + default: + return false; + } +diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll +index 5d7a26f65784..fc60b119225c 100644 +--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll ++++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll +@@ -1,5 +1,6 @@ + ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +-; RUN: opt < %s -codegenprepare -S | FileCheck %s ++; RUN: opt < %s -codegenprepare -S | FileCheck --check-prefixes=CHECK,NOFP16 %s ++; RUN: opt < %s -codegenprepare -S -mattr=+fullfp16 | FileCheck --check-prefixes=CHECK,FULLFP16 %s + + target datalayout = 
"e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-unknown" +@@ -498,29 +499,53 @@ if.else: + declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>) + + define <8 x half> @sink_shufflevector_fma_v8f16(i1 %c, <8 x half> %a, <8 x half> %b) { +-; CHECK-LABEL: @sink_shufflevector_fma_v8f16( +-; CHECK-NEXT: entry: +-; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer +-; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> +-; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> +-; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +-; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +-; CHECK-NEXT: [[S5:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> +-; CHECK-NEXT: [[S6:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6> +-; CHECK-NEXT: [[S7:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +-; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +-; CHECK: if.then: +-; CHECK-NEXT: [[R_0:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[S0]], <8 x half> [[B]]) +-; CHECK-NEXT: [[R_1:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_0]], <8 x half> [[S1]], <8 x half> [[B]]) +-; CHECK-NEXT: [[R_2:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_1]], <8 x half> [[S2]], <8 x half> [[B]]) +-; CHECK-NEXT: 
[[R_3:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_2]], <8 x half> [[S3]], <8 x half> [[B]]) +-; CHECK-NEXT: ret <8 x half> [[R_3]] +-; CHECK: if.else: +-; CHECK-NEXT: [[R_4:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[S4]], <8 x half> [[B]]) +-; CHECK-NEXT: [[R_5:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_4]], <8 x half> [[S5]], <8 x half> [[B]]) +-; CHECK-NEXT: [[R_6:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_5]], <8 x half> [[S6]], <8 x half> [[B]]) +-; CHECK-NEXT: [[R_7:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_6]], <8 x half> [[S7]], <8 x half> [[B]]) +-; CHECK-NEXT: ret <8 x half> [[R_7]] ++; NOFP16-LABEL: @sink_shufflevector_fma_v8f16( ++; NOFP16-NEXT: entry: ++; NOFP16-NEXT: [[S0:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer ++; NOFP16-NEXT: [[S1:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ++; NOFP16-NEXT: [[S2:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> ++; NOFP16-NEXT: [[S3:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ++; NOFP16-NEXT: [[S4:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> ++; NOFP16-NEXT: [[S5:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> ++; NOFP16-NEXT: [[S6:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6> ++; NOFP16-NEXT: [[S7:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> ++; NOFP16-NEXT: br i1 [[C:%.*]], label 
[[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ++; NOFP16: if.then: ++; NOFP16-NEXT: [[R_0:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[S0]], <8 x half> [[B]]) ++; NOFP16-NEXT: [[R_1:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_0]], <8 x half> [[S1]], <8 x half> [[B]]) ++; NOFP16-NEXT: [[R_2:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_1]], <8 x half> [[S2]], <8 x half> [[B]]) ++; NOFP16-NEXT: [[R_3:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_2]], <8 x half> [[S3]], <8 x half> [[B]]) ++; NOFP16-NEXT: ret <8 x half> [[R_3]] ++; NOFP16: if.else: ++; NOFP16-NEXT: [[R_4:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[S4]], <8 x half> [[B]]) ++; NOFP16-NEXT: [[R_5:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_4]], <8 x half> [[S5]], <8 x half> [[B]]) ++; NOFP16-NEXT: [[R_6:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_5]], <8 x half> [[S6]], <8 x half> [[B]]) ++; NOFP16-NEXT: [[R_7:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_6]], <8 x half> [[S7]], <8 x half> [[B]]) ++; NOFP16-NEXT: ret <8 x half> [[R_7]] ++; ++; FULLFP16-LABEL: @sink_shufflevector_fma_v8f16( ++; FULLFP16-NEXT: entry: ++; FULLFP16-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ++; FULLFP16: if.then: ++; FULLFP16-NEXT: [[TMP0:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer ++; FULLFP16-NEXT: [[R_0:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[TMP0]], <8 x half> [[B]]) ++; FULLFP16-NEXT: [[TMP1:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ++; FULLFP16-NEXT: [[R_1:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_0]], <8 x half> [[TMP1]], <8 x half> [[B]]) ++; FULLFP16-NEXT: [[TMP2:%.*]] = shufflevector <8 x half> [[A]], <8 x half> 
poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> ++; FULLFP16-NEXT: [[R_2:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_1]], <8 x half> [[TMP2]], <8 x half> [[B]]) ++; FULLFP16-NEXT: [[TMP3:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ++; FULLFP16-NEXT: [[R_3:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_2]], <8 x half> [[TMP3]], <8 x half> [[B]]) ++; FULLFP16-NEXT: ret <8 x half> [[R_3]] ++; FULLFP16: if.else: ++; FULLFP16-NEXT: [[TMP4:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> ++; FULLFP16-NEXT: [[R_4:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[TMP4]], <8 x half> [[B]]) ++; FULLFP16-NEXT: [[TMP5:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> ++; FULLFP16-NEXT: [[R_5:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_4]], <8 x half> [[TMP5]], <8 x half> [[B]]) ++; FULLFP16-NEXT: [[TMP6:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6> ++; FULLFP16-NEXT: [[R_6:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_5]], <8 x half> [[TMP6]], <8 x half> [[B]]) ++; FULLFP16-NEXT: [[TMP7:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> ++; FULLFP16-NEXT: [[R_7:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_6]], <8 x half> [[TMP7]], <8 x half> [[B]]) ++; FULLFP16-NEXT: ret <8 x half> [[R_7]] + ; + entry: + %s0 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> zeroinitializer +@@ -553,18 +578,18 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) + define <4 x float> @sink_shufflevector_fma_v4f32(i1 %c, <8 x 
float> %a, <4 x float> %b) { + ; CHECK-LABEL: @sink_shufflevector_fma_v4f32( + ; CHECK-NEXT: entry: +-; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> zeroinitializer +-; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +-; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> +-; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] + ; CHECK: if.then: +-; CHECK-NEXT: [[R_0:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[S0]], <4 x float> [[B]]) +-; CHECK-NEXT: [[R_1:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_0]], <4 x float> [[S1]], <4 x float> [[B]]) ++; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> zeroinitializer ++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[TMP0]], <4 x float> [[B]]) ++; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_0]], <4 x float> [[TMP1]], <4 x float> [[B]]) + ; CHECK-NEXT: ret <4 x float> [[R_1]] + ; CHECK: if.else: +-; CHECK-NEXT: [[R_2:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[S2]], <4 x float> [[B]]) +-; CHECK-NEXT: [[R_3:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_2]], <4 x float> [[S3]], <4 x float> [[B]]) ++; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> ++; CHECK-NEXT: [[R_2:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x 
float> [[TMP2]], <4 x float> [[B]]) ++; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ++; CHECK-NEXT: [[R_3:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_2]], <4 x float> [[TMP3]], <4 x float> [[B]]) + ; CHECK-NEXT: ret <4 x float> [[R_3]] + ; + entry: +@@ -588,18 +613,18 @@ if.else: + define <4 x float> @sink_shufflevector_first_arg_fma_v4f3(i1 %c, <8 x float> %a, <4 x float> %b) { + ; CHECK-LABEL: @sink_shufflevector_first_arg_fma_v4f3( + ; CHECK-NEXT: entry: +-; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> zeroinitializer +-; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +-; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> +-; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] + ; CHECK: if.then: +-; CHECK-NEXT: [[R_0:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S0]], <4 x float> [[B:%.*]], <4 x float> [[B]]) +-; CHECK-NEXT: [[R_1:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S1]], <4 x float> [[R_0]], <4 x float> [[B]]) ++; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> zeroinitializer ++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP0]], <4 x float> [[B:%.*]], <4 x float> [[B]]) ++; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[R_0]], <4 x float> [[B]]) + ; CHECK-NEXT: ret <4 x float> [[R_1]] + ; CHECK: if.else: +-; CHECK-NEXT: [[R_2:%.*]] = 
tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S2]], <4 x float> [[B]], <4 x float> [[B]]) +-; CHECK-NEXT: [[R_3:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S3]], <4 x float> [[R_2]], <4 x float> [[B]]) ++; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> ++; CHECK-NEXT: [[R_2:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP2]], <4 x float> [[B]], <4 x float> [[B]]) ++; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ++; CHECK-NEXT: [[R_3:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP3]], <4 x float> [[R_2]], <4 x float> [[B]]) + ; CHECK-NEXT: ret <4 x float> [[R_3]] + ; + entry: +@@ -627,14 +652,14 @@ declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) + define <2 x double> @sink_shufflevector_fma_v2f64(i1 %c, <2 x double> %a, <2 x double> %b) { + ; CHECK-LABEL: @sink_shufflevector_fma_v2f64( + ; CHECK-NEXT: entry: +-; CHECK-NEXT: [[S0:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> zeroinitializer +-; CHECK-NEXT: [[S1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 1> + ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] + ; CHECK: if.then: +-; CHECK-NEXT: [[R_0:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B:%.*]], <2 x double> [[S0]], <2 x double> [[B]]) ++; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> zeroinitializer ++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B:%.*]], <2 x double> [[TMP0]], <2 x double> [[B]]) + ; CHECK-NEXT: ret <2 x double> [[R_0]] + ; CHECK: if.else: +-; CHECK-NEXT: [[R_1:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[S1]], <2 x double> [[B]]) ++; CHECK-NEXT: 
[[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[TMP1]], <2 x double> [[B]]) + ; CHECK-NEXT: ret <2 x double> [[R_1]] + ; + entry: +@@ -654,10 +679,10 @@ if.else: + define <4 x float> @do_not_sink_out_of_range_shufflevector_fma_v4f32(i1 %c, <8 x float> %a, <4 x float> %b) { + ; CHECK-LABEL: @do_not_sink_out_of_range_shufflevector_fma_v4f32( + ; CHECK-NEXT: entry: +-; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4> + ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] + ; CHECK: if.then: +-; CHECK-NEXT: [[R:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[S4]], <4 x float> [[B]]) ++; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4> ++; CHECK-NEXT: [[R:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[TMP0]], <4 x float> [[B]]) + ; CHECK-NEXT: ret <4 x float> [[R]] + ; CHECK: if.else: + ; CHECK-NEXT: ret <4 x float> zeroinitializer +@@ -679,20 +704,20 @@ declare <5 x float> @llvm.fma.v5f32(<5 x float>, <5 x float>, <5 x float>) + define <5 x float> @sink_shufflevector_fma_v5f32(i1 %c, <8 x float> %a, <5 x float> %b) { + ; CHECK-LABEL: @sink_shufflevector_fma_v5f32( + ; CHECK-NEXT: entry: +-; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <5 x i32> zeroinitializer +-; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 1, i32 1, i32 1, i32 1, i32 4> ++; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <5 x i32> <i32 1, i32 1, i32 1, i32 1, i32 4> + ; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 2, i32 2, i32 2, i32 2, 
i32 4> + ; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 3, i32 3, i32 3, i32 3, i32 4> +-; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4> + ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] + ; CHECK: if.then: +-; CHECK-NEXT: [[R_0:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[B:%.*]], <5 x float> [[S0]], <5 x float> [[B]]) ++; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> zeroinitializer ++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[B:%.*]], <5 x float> [[TMP0]], <5 x float> [[B]]) + ; CHECK-NEXT: [[R_1:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[R_0]], <5 x float> [[S1]], <5 x float> [[B]]) + ; CHECK-NEXT: ret <5 x float> [[R_1]] + ; CHECK: if.else: + ; CHECK-NEXT: [[R_2:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[B]], <5 x float> [[S2]], <5 x float> [[B]]) + ; CHECK-NEXT: [[R_3:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[R_2]], <5 x float> [[S3]], <5 x float> [[B]]) +-; CHECK-NEXT: [[R_4:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[R_3]], <5 x float> [[S4]], <5 x float> [[B]]) ++; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4> ++; CHECK-NEXT: [[R_4:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[R_3]], <5 x float> [[TMP1]], <5 x float> [[B]]) + ; CHECK-NEXT: ret <5 x float> [[R_4]] + ; + entry: +-- +2.34.1 + diff --git a/patches/cherry/7a605ab7bfbc681c34335684f45b7da32d495db1.patch b/patches/cherry/7a605ab7bfbc681c34335684f45b7da32d495db1.patch new file mode 100644 index 0000000..19568db --- /dev/null +++ b/patches/cherry/7a605ab7bfbc681c34335684f45b7da32d495db1.patch @@ -0,0 +1,727 @@ +From 7a605ab7bfbc681c34335684f45b7da32d495db1 Mon 
Sep 17 00:00:00 2001 +From: zhongyunde <zhongyunde@huawei.com> +Date: Fri, 4 Mar 2022 22:44:14 +0800 +Subject: [PATCH] [AArch64] Use simd mov to materialize big fp constants + +mov w8, #1325400064 + fmov s0, w8 ==> movi v0.2s, 0x4f, lsl 24 +Fix https://github.com/llvm/llvm-project/issues/53651 + +Reviewed By: dmgreen, fhahn + +Differential Revision: https://reviews.llvm.org/D120452 +--- + .../lib/Target/AArch64/AArch64InstrFormats.td | 14 ++ + llvm/lib/Target/AArch64/AArch64InstrInfo.td | 8 ++ + llvm/test/CodeGen/AArch64/fabs.ll | 3 +- + llvm/test/CodeGen/AArch64/fcvt-fixed.ll | 136 ++++++++---------- + llvm/test/CodeGen/AArch64/fpimm.ll | 5 +- + .../test/CodeGen/AArch64/fptosi-sat-scalar.ll | 20 ++- + .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 76 +++++----- + .../CodeGen/AArch64/remat-const-float-simd.ll | 33 +++++ + .../AArch64/vecreduce-fadd-legalization.ll | 3 +- + 9 files changed, 157 insertions(+), 141 deletions(-) + create mode 100644 llvm/test/CodeGen/AArch64/remat-const-float-simd.ll + +diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td +index 659d2a62b8c4..74dccb85a66e 100644 +--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td ++++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td +@@ -1178,6 +1178,13 @@ def fpimm32XForm : SDNodeXForm<fpimm, [{ + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>; + ++def fpimm32SIMDModImmType4XForm : SDNodeXForm<fpimm, [{ ++ uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType4(N->getValueAPF() ++ .bitcastToAPInt() ++ .getZExtValue()); ++ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); ++ }]>; ++ + def fpimm64XForm : SDNodeXForm<fpimm, [{ + APFloat InVal = N->getValueAPF(); + uint32_t enc = AArch64_AM::getFP64Imm(InVal); +@@ -1199,6 +1206,13 @@ def fpimm32 : Operand<f32>, + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; + } ++ ++def fpimm32SIMDModImmType4 : FPImmLeaf<f32, [{ ++ uint64_t Enc = 
Imm.bitcastToAPInt().getZExtValue(); ++ return Enc != 0 && AArch64_AM::isAdvSIMDModImmType4(Enc << 32 | Enc); ++ }], fpimm32SIMDModImmType4XForm> { ++} ++ + def fpimm64 : Operand<f64>, + FPImmLeaf<f64, [{ + return AArch64_AM::getFP64Imm(Imm) != -1; +diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td +index 1152f8b20a7b..3b50a2e5ece4 100644 +--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td ++++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td +@@ -6145,6 +6145,14 @@ def : Pat<(v8i8 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>; + let isReMaterializable = 1, isAsCheapAsAMove = 1 in + defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">; + ++let Predicates = [HasNEON] in { ++ // Using the MOVI to materialize fp constants. ++ def : Pat<(f32 fpimm32SIMDModImmType4:$in), ++ (EXTRACT_SUBREG (MOVIv2i32 (fpimm32SIMDModImmType4XForm f32:$in), ++ (i32 24)), ++ ssub)>; ++} ++ + def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; + def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; + def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll +index bc6b32770d4c..23bf7a699195 100644 +--- a/llvm/test/CodeGen/AArch64/fabs.ll ++++ b/llvm/test/CodeGen/AArch64/fabs.ll +@@ -22,9 +22,8 @@ define double @not_fabs(double %x) #0 { + define float @still_not_fabs(float %x) #0 { + ; CHECK-LABEL: still_not_fabs: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov w8, #-2147483648 ++; CHECK-NEXT: movi v1.2s, #128, lsl #24 + ; CHECK-NEXT: fneg s2, s0 +-; CHECK-NEXT: fmov s1, w8 + ; CHECK-NEXT: fcmp s0, s1 + ; CHECK-NEXT: fcsel s0, s0, s2, ge + ; CHECK-NEXT: ret +diff --git a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll +index 79978af6f80e..296be831da76 100644 +--- a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll ++++ 
b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll +@@ -87,9 +87,8 @@ define i64 @fcvtzs_f64_i64_64(double %dbl) { + define i32 @fcvtzs_f16_i32_7(half %flt) { + ; CHECK-NO16-LABEL: fcvtzs_f16_i32_7: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1124073472 ++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -108,9 +107,8 @@ define i32 @fcvtzs_f16_i32_7(half %flt) { + define i32 @fcvtzs_f16_i32_15(half %flt) { + ; CHECK-NO16-LABEL: fcvtzs_f16_i32_15: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1191182336 ++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -129,9 +127,8 @@ define i32 @fcvtzs_f16_i32_15(half %flt) { + define i64 @fcvtzs_f16_i64_7(half %flt) { + ; CHECK-NO16-LABEL: fcvtzs_f16_i64_7: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1124073472 ++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -150,9 +147,8 @@ define i64 @fcvtzs_f16_i64_7(half %flt) { + define i64 @fcvtzs_f16_i64_15(half %flt) { + ; CHECK-NO16-LABEL: fcvtzs_f16_i64_15: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1191182336 ++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -253,9 +249,8 @@ define i64 @fcvtzu_f64_i64_64(double %dbl) { + define i32 @fcvtzu_f16_i32_7(half %flt) { + ; CHECK-NO16-LABEL: fcvtzu_f16_i32_7: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1124073472 ++; 
CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -274,9 +269,8 @@ define i32 @fcvtzu_f16_i32_7(half %flt) { + define i32 @fcvtzu_f16_i32_15(half %flt) { + ; CHECK-NO16-LABEL: fcvtzu_f16_i32_15: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1191182336 ++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -295,9 +289,8 @@ define i32 @fcvtzu_f16_i32_15(half %flt) { + define i64 @fcvtzu_f16_i64_7(half %flt) { + ; CHECK-NO16-LABEL: fcvtzu_f16_i64_7: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1124073472 ++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -316,9 +309,8 @@ define i64 @fcvtzu_f16_i64_7(half %flt) { + define i64 @fcvtzu_f16_i64_15(half %flt) { + ; CHECK-NO16-LABEL: fcvtzu_f16_i64_15: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1191182336 ++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -419,12 +411,11 @@ define double @scvtf_f64_i64_64(i64 %long) { + define half @scvtf_f16_i32_7(i32 %int) { + ; CHECK-NO16-LABEL: scvtf_f16_i32_7: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: scvtf s0, w0 +-; CHECK-NO16-NEXT: mov w8, #1124073472 +-; CHECK-NO16-NEXT: fmov s1, w8 +-; CHECK-NO16-NEXT: fcvt h0, s0 +-; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fdiv s0, s0, s1 ++; CHECK-NO16-NEXT: scvtf s1, w0 ++; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24 ++; CHECK-NO16-NEXT: fcvt h1, 
s1 ++; CHECK-NO16-NEXT: fcvt s1, h1 ++; CHECK-NO16-NEXT: fdiv s0, s1, s0 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: ret + ; +@@ -440,12 +431,11 @@ define half @scvtf_f16_i32_7(i32 %int) { + define half @scvtf_f16_i32_15(i32 %int) { + ; CHECK-NO16-LABEL: scvtf_f16_i32_15: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: scvtf s0, w0 +-; CHECK-NO16-NEXT: mov w8, #1191182336 +-; CHECK-NO16-NEXT: fmov s1, w8 +-; CHECK-NO16-NEXT: fcvt h0, s0 +-; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fdiv s0, s0, s1 ++; CHECK-NO16-NEXT: scvtf s1, w0 ++; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24 ++; CHECK-NO16-NEXT: fcvt h1, s1 ++; CHECK-NO16-NEXT: fcvt s1, h1 ++; CHECK-NO16-NEXT: fdiv s0, s1, s0 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: ret + ; +@@ -461,12 +451,11 @@ define half @scvtf_f16_i32_15(i32 %int) { + define half @scvtf_f16_i64_7(i64 %long) { + ; CHECK-NO16-LABEL: scvtf_f16_i64_7: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: scvtf s0, x0 +-; CHECK-NO16-NEXT: mov w8, #1124073472 +-; CHECK-NO16-NEXT: fmov s1, w8 +-; CHECK-NO16-NEXT: fcvt h0, s0 +-; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fdiv s0, s0, s1 ++; CHECK-NO16-NEXT: scvtf s1, x0 ++; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24 ++; CHECK-NO16-NEXT: fcvt h1, s1 ++; CHECK-NO16-NEXT: fcvt s1, h1 ++; CHECK-NO16-NEXT: fdiv s0, s1, s0 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: ret + ; +@@ -482,12 +471,11 @@ define half @scvtf_f16_i64_7(i64 %long) { + define half @scvtf_f16_i64_15(i64 %long) { + ; CHECK-NO16-LABEL: scvtf_f16_i64_15: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: scvtf s0, x0 +-; CHECK-NO16-NEXT: mov w8, #1191182336 +-; CHECK-NO16-NEXT: fmov s1, w8 +-; CHECK-NO16-NEXT: fcvt h0, s0 +-; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fdiv s0, s0, s1 ++; CHECK-NO16-NEXT: scvtf s1, x0 ++; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24 ++; CHECK-NO16-NEXT: fcvt h1, s1 ++; CHECK-NO16-NEXT: fcvt s1, h1 ++; CHECK-NO16-NEXT: fdiv s0, s1, s0 + ; CHECK-NO16-NEXT: fcvt h0, s0 + 
; CHECK-NO16-NEXT: ret + ; +@@ -585,12 +573,11 @@ define double @ucvtf_f64_i64_64(i64 %long) { + define half @ucvtf_f16_i32_7(i32 %int) { + ; CHECK-NO16-LABEL: ucvtf_f16_i32_7: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: ucvtf s0, w0 +-; CHECK-NO16-NEXT: mov w8, #1124073472 +-; CHECK-NO16-NEXT: fmov s1, w8 +-; CHECK-NO16-NEXT: fcvt h0, s0 +-; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fdiv s0, s0, s1 ++; CHECK-NO16-NEXT: ucvtf s1, w0 ++; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24 ++; CHECK-NO16-NEXT: fcvt h1, s1 ++; CHECK-NO16-NEXT: fcvt s1, h1 ++; CHECK-NO16-NEXT: fdiv s0, s1, s0 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: ret + ; +@@ -606,12 +593,11 @@ define half @ucvtf_f16_i32_7(i32 %int) { + define half @ucvtf_f16_i32_15(i32 %int) { + ; CHECK-NO16-LABEL: ucvtf_f16_i32_15: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: ucvtf s0, w0 +-; CHECK-NO16-NEXT: mov w8, #1191182336 +-; CHECK-NO16-NEXT: fmov s1, w8 +-; CHECK-NO16-NEXT: fcvt h0, s0 +-; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fdiv s0, s0, s1 ++; CHECK-NO16-NEXT: ucvtf s1, w0 ++; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24 ++; CHECK-NO16-NEXT: fcvt h1, s1 ++; CHECK-NO16-NEXT: fcvt s1, h1 ++; CHECK-NO16-NEXT: fdiv s0, s1, s0 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: ret + ; +@@ -627,12 +613,11 @@ define half @ucvtf_f16_i32_15(i32 %int) { + define half @ucvtf_f16_i64_7(i64 %long) { + ; CHECK-NO16-LABEL: ucvtf_f16_i64_7: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: ucvtf s0, x0 +-; CHECK-NO16-NEXT: mov w8, #1124073472 +-; CHECK-NO16-NEXT: fmov s1, w8 +-; CHECK-NO16-NEXT: fcvt h0, s0 +-; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fdiv s0, s0, s1 ++; CHECK-NO16-NEXT: ucvtf s1, x0 ++; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24 ++; CHECK-NO16-NEXT: fcvt h1, s1 ++; CHECK-NO16-NEXT: fcvt s1, h1 ++; CHECK-NO16-NEXT: fdiv s0, s1, s0 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: ret + ; +@@ -648,12 +633,11 @@ define half @ucvtf_f16_i64_7(i64 %long) { + define half 
@ucvtf_f16_i64_15(i64 %long) { + ; CHECK-NO16-LABEL: ucvtf_f16_i64_15: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: ucvtf s0, x0 +-; CHECK-NO16-NEXT: mov w8, #1191182336 +-; CHECK-NO16-NEXT: fmov s1, w8 +-; CHECK-NO16-NEXT: fcvt h0, s0 +-; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fdiv s0, s0, s1 ++; CHECK-NO16-NEXT: ucvtf s1, x0 ++; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24 ++; CHECK-NO16-NEXT: fcvt h1, s1 ++; CHECK-NO16-NEXT: fcvt s1, h1 ++; CHECK-NO16-NEXT: fdiv s0, s1, s0 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: ret + ; +@@ -749,9 +733,8 @@ define i64 @fcvtzs_sat_f64_i64_64(double %dbl) { + define i32 @fcvtzs_sat_f16_i32_7(half %dbl) { + ; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_7: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1124073472 ++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -770,9 +753,8 @@ define i32 @fcvtzs_sat_f16_i32_7(half %dbl) { + define i32 @fcvtzs_sat_f16_i32_15(half %dbl) { + ; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_15: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1191182336 ++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -791,9 +773,8 @@ define i32 @fcvtzs_sat_f16_i32_15(half %dbl) { + define i64 @fcvtzs_sat_f16_i64_7(half %dbl) { + ; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_7: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1124073472 ++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -812,9 +793,8 @@ define i64 @fcvtzs_sat_f16_i64_7(half %dbl) { + define i64 @fcvtzs_sat_f16_i64_15(half %dbl) { + 
; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_15: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1191182336 ++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -912,9 +892,8 @@ define i64 @fcvtzu_sat_f64_i64_64(double %dbl) { + define i32 @fcvtzu_sat_f16_i32_7(half %dbl) { + ; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_7: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1124073472 ++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -933,9 +912,8 @@ define i32 @fcvtzu_sat_f16_i32_7(half %dbl) { + define i32 @fcvtzu_sat_f16_i32_15(half %dbl) { + ; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_15: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1191182336 ++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -954,9 +932,8 @@ define i32 @fcvtzu_sat_f16_i32_15(half %dbl) { + define i64 @fcvtzu_sat_f16_i64_7(half %dbl) { + ; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_7: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1124073472 ++; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +@@ -975,9 +952,8 @@ define i64 @fcvtzu_sat_f16_i64_7(half %dbl) { + define i64 @fcvtzu_sat_f16_i64_15(half %dbl) { + ; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_15: + ; CHECK-NO16: // %bb.0: +-; CHECK-NO16-NEXT: mov w8, #1191182336 ++; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 + ; CHECK-NO16-NEXT: fcvt s0, h0 +-; CHECK-NO16-NEXT: fmov s1, w8 + ; 
CHECK-NO16-NEXT: fmul s0, s0, s1 + ; CHECK-NO16-NEXT: fcvt h0, s0 + ; CHECK-NO16-NEXT: fcvt s0, h0 +diff --git a/llvm/test/CodeGen/AArch64/fpimm.ll b/llvm/test/CodeGen/AArch64/fpimm.ll +index 4c732f589147..10233ded3236 100644 +--- a/llvm/test/CodeGen/AArch64/fpimm.ll ++++ b/llvm/test/CodeGen/AArch64/fpimm.ll +@@ -1,5 +1,5 @@ + ; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs < %s | FileCheck %s +-; RUN: llc -mtriple=aarch64-apple-darwin -code-model=large -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LARGE ++; RUN: llc -mtriple=aarch64-apple-darwin -code-model=large -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LARGE + ; RUN: llc -mtriple=aarch64-none-eabi -code-model=tiny -verify-machineinstrs < %s | FileCheck %s + + @varf32 = global float 0.0 +@@ -15,8 +15,7 @@ define void @check_float() { + + %newval2 = fadd float %val, 128.0 + store volatile float %newval2, float* @varf32 +-; CHECK-DAG: mov [[W128:w[0-9]+]], #1124073472 +-; CHECK-DAG: fmov {{s[0-9]+}}, [[W128]] ++; CHECK-DAG: movi [[REG:v[0-9s]+]].2s, #67, lsl #24 + + ; CHECK: ret + ret void +diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll +index 70f9031123d7..729f531d3a50 100644 +--- a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll ++++ b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll +@@ -131,11 +131,10 @@ define i100 @test_signed_i100_f32(float %f) nounwind { + ; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill + ; CHECK-NEXT: fmov s8, s0 + ; CHECK-NEXT: bl __fixsfti +-; CHECK-NEXT: mov w8, #-251658240 ++; CHECK-NEXT: movi v0.2s, #241, lsl #24 ++; CHECK-NEXT: mov w8, #1895825407 + ; CHECK-NEXT: mov x10, #34359738367 + ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload +-; CHECK-NEXT: fmov s0, w8 +-; CHECK-NEXT: mov w8, #1895825407 + ; CHECK-NEXT: fcmp s8, s0 + ; CHECK-NEXT: fmov s0, w8 + ; CHECK-NEXT: mov x8, #-34359738368 +@@ -160,11 +159,10 @@ define i128 @test_signed_i128_f32(float %f) nounwind 
{ + ; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill + ; CHECK-NEXT: fmov s8, s0 + ; CHECK-NEXT: bl __fixsfti +-; CHECK-NEXT: mov w8, #-16777216 ++; CHECK-NEXT: movi v0.2s, #255, lsl #24 ++; CHECK-NEXT: mov w8, #2130706431 + ; CHECK-NEXT: mov x10, #9223372036854775807 + ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload +-; CHECK-NEXT: fmov s0, w8 +-; CHECK-NEXT: mov w8, #2130706431 + ; CHECK-NEXT: fcmp s8, s0 + ; CHECK-NEXT: fmov s0, w8 + ; CHECK-NEXT: mov x8, #-9223372036854775808 +@@ -575,11 +573,10 @@ define i100 @test_signed_i100_f16(half %f) nounwind { + ; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill + ; CHECK-NEXT: fmov s0, s8 + ; CHECK-NEXT: bl __fixsfti +-; CHECK-NEXT: mov w8, #-251658240 ++; CHECK-NEXT: movi v0.2s, #241, lsl #24 ++; CHECK-NEXT: mov w8, #1895825407 + ; CHECK-NEXT: mov x10, #34359738367 + ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload +-; CHECK-NEXT: fmov s0, w8 +-; CHECK-NEXT: mov w8, #1895825407 + ; CHECK-NEXT: fcmp s8, s0 + ; CHECK-NEXT: fmov s0, w8 + ; CHECK-NEXT: mov x8, #-34359738368 +@@ -605,11 +602,10 @@ define i128 @test_signed_i128_f16(half %f) nounwind { + ; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill + ; CHECK-NEXT: fmov s0, s8 + ; CHECK-NEXT: bl __fixsfti +-; CHECK-NEXT: mov w8, #-16777216 ++; CHECK-NEXT: movi v0.2s, #255, lsl #24 ++; CHECK-NEXT: mov w8, #2130706431 + ; CHECK-NEXT: mov x10, #9223372036854775807 + ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload +-; CHECK-NEXT: fmov s0, w8 +-; CHECK-NEXT: mov w8, #2130706431 + ; CHECK-NEXT: fcmp s8, s0 + ; CHECK-NEXT: fmov s0, w8 + ; CHECK-NEXT: mov x8, #-9223372036854775808 +diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +index 9fc4455972dc..55d4abc962fc 100644 +--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll ++++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +@@ -827,15 +827,14 @@ define <2 x i100> @test_signed_v2f32_v2i100(<2 x float> %f) { + ; CHECK-NEXT: str q0, 
[sp] // 16-byte Folded Spill + ; CHECK-NEXT: fmov s0, s8 + ; CHECK-NEXT: bl __fixsfti +-; CHECK-NEXT: mov w8, #-251658240 ++; CHECK-NEXT: movi v9.2s, #241, lsl #24 ++; CHECK-NEXT: mov w8, #1895825407 + ; CHECK-NEXT: mov x21, #-34359738368 + ; CHECK-NEXT: mov x22, #34359738367 + ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ++; CHECK-NEXT: fmov s10, w8 + ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +-; CHECK-NEXT: fmov s9, w8 +-; CHECK-NEXT: mov w8, #1895825407 + ; CHECK-NEXT: fcmp s8, s9 +-; CHECK-NEXT: fmov s10, w8 + ; CHECK-NEXT: csel x8, xzr, x0, lt + ; CHECK-NEXT: csel x9, x21, x1, lt + ; CHECK-NEXT: fcmp s8, s10 +@@ -894,15 +893,14 @@ define <2 x i128> @test_signed_v2f32_v2i128(<2 x float> %f) { + ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill + ; CHECK-NEXT: fmov s0, s8 + ; CHECK-NEXT: bl __fixsfti +-; CHECK-NEXT: mov w8, #-16777216 ++; CHECK-NEXT: movi v9.2s, #255, lsl #24 ++; CHECK-NEXT: mov w8, #2130706431 + ; CHECK-NEXT: mov x21, #-9223372036854775808 + ; CHECK-NEXT: mov x22, #9223372036854775807 + ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ++; CHECK-NEXT: fmov s10, w8 + ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +-; CHECK-NEXT: fmov s9, w8 +-; CHECK-NEXT: mov w8, #2130706431 + ; CHECK-NEXT: fcmp s8, s9 +-; CHECK-NEXT: fmov s10, w8 + ; CHECK-NEXT: csel x8, xzr, x0, lt + ; CHECK-NEXT: csel x9, x21, x1, lt + ; CHECK-NEXT: fcmp s8, s10 +@@ -1106,20 +1104,19 @@ define <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { + ; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill + ; CHECK-NEXT: fmov s0, s8 + ; CHECK-NEXT: bl __fixsfti +-; CHECK-NEXT: mov w8, #-251658240 ++; CHECK-NEXT: movi v9.2s, #241, lsl #24 ++; CHECK-NEXT: mov w8, #1895825407 + ; CHECK-NEXT: mov x25, #-34359738368 + ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload + ; CHECK-NEXT: mov x26, #34359738367 +-; CHECK-NEXT: fmov s9, w8 +-; CHECK-NEXT: mov w8, #1895825407 +-; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +-; CHECK-NEXT: fcmp s8, s9 + ; 
CHECK-NEXT: fmov s10, w8 +-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ++; CHECK-NEXT: fcmp s8, s9 ++; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 + ; CHECK-NEXT: csel x8, xzr, x0, lt + ; CHECK-NEXT: csel x9, x25, x1, lt + ; CHECK-NEXT: fcmp s8, s10 ++; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ++; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 + ; CHECK-NEXT: csel x9, x26, x9, gt + ; CHECK-NEXT: csinv x8, x8, xzr, le + ; CHECK-NEXT: fcmp s8, s8 +@@ -1211,20 +1208,19 @@ define <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) { + ; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill + ; CHECK-NEXT: fmov s0, s8 + ; CHECK-NEXT: bl __fixsfti +-; CHECK-NEXT: mov w8, #-16777216 ++; CHECK-NEXT: movi v9.2s, #255, lsl #24 ++; CHECK-NEXT: mov w8, #2130706431 + ; CHECK-NEXT: mov x25, #-9223372036854775808 + ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload + ; CHECK-NEXT: mov x26, #9223372036854775807 +-; CHECK-NEXT: fmov s9, w8 +-; CHECK-NEXT: mov w8, #2130706431 +-; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +-; CHECK-NEXT: fcmp s8, s9 + ; CHECK-NEXT: fmov s10, w8 +-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ++; CHECK-NEXT: fcmp s8, s9 ++; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 + ; CHECK-NEXT: csel x8, xzr, x0, lt + ; CHECK-NEXT: csel x9, x25, x1, lt + ; CHECK-NEXT: fcmp s8, s10 ++; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ++; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 + ; CHECK-NEXT: csel x9, x26, x9, gt + ; CHECK-NEXT: csinv x8, x8, xzr, le + ; CHECK-NEXT: fcmp s8, s8 +@@ -1862,15 +1858,14 @@ define <4 x i100> @test_signed_v4f16_v4i100(<4 x half> %f) { + ; CHECK-NEXT: fcvt s8, h1 + ; CHECK-NEXT: fmov s0, s8 + ; CHECK-NEXT: bl __fixsfti +-; CHECK-NEXT: mov w8, #-251658240 ++; CHECK-NEXT: movi v9.2s, #241, lsl #24 ++; CHECK-NEXT: mov w8, #1895825407 + ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload + ; 
CHECK-NEXT: mov x25, #-34359738368 + ; CHECK-NEXT: mov x26, #34359738367 +-; CHECK-NEXT: fmov s9, w8 +-; CHECK-NEXT: mov w8, #1895825407 +-; CHECK-NEXT: mov h0, v0.h[2] +-; CHECK-NEXT: fcmp s8, s9 + ; CHECK-NEXT: fmov s10, w8 ++; CHECK-NEXT: fcmp s8, s9 ++; CHECK-NEXT: mov h0, v0.h[2] + ; CHECK-NEXT: csel x8, xzr, x0, lt + ; CHECK-NEXT: csel x9, x25, x1, lt + ; CHECK-NEXT: fcmp s8, s10 +@@ -1970,15 +1965,14 @@ define <4 x i128> @test_signed_v4f16_v4i128(<4 x half> %f) { + ; CHECK-NEXT: fcvt s8, h1 + ; CHECK-NEXT: fmov s0, s8 + ; CHECK-NEXT: bl __fixsfti +-; CHECK-NEXT: mov w8, #-16777216 ++; CHECK-NEXT: movi v9.2s, #255, lsl #24 ++; CHECK-NEXT: mov w8, #2130706431 + ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload + ; CHECK-NEXT: mov x25, #-9223372036854775808 + ; CHECK-NEXT: mov x26, #9223372036854775807 +-; CHECK-NEXT: fmov s9, w8 +-; CHECK-NEXT: mov w8, #2130706431 +-; CHECK-NEXT: mov h0, v0.h[2] +-; CHECK-NEXT: fcmp s8, s9 + ; CHECK-NEXT: fmov s10, w8 ++; CHECK-NEXT: fcmp s8, s9 ++; CHECK-NEXT: mov h0, v0.h[2] + ; CHECK-NEXT: csel x8, xzr, x0, lt + ; CHECK-NEXT: csel x9, x25, x1, lt + ; CHECK-NEXT: fcmp s8, s10 +@@ -2618,15 +2612,14 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { + ; CHECK-NEXT: fcvt s8, h0 + ; CHECK-NEXT: fmov s0, s8 + ; CHECK-NEXT: bl __fixsfti +-; CHECK-NEXT: mov w8, #-251658240 ++; CHECK-NEXT: movi v10.2s, #241, lsl #24 ++; CHECK-NEXT: mov w8, #1895825407 + ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload + ; CHECK-NEXT: mov x25, #-34359738368 + ; CHECK-NEXT: mov x23, #34359738367 +-; CHECK-NEXT: fmov s10, w8 +-; CHECK-NEXT: mov w8, #1895825407 +-; CHECK-NEXT: mov h0, v0.h[3] +-; CHECK-NEXT: fcmp s8, s10 + ; CHECK-NEXT: fmov s9, w8 ++; CHECK-NEXT: fcmp s8, s10 ++; CHECK-NEXT: mov h0, v0.h[3] + ; CHECK-NEXT: csel x8, xzr, x0, lt + ; CHECK-NEXT: csel x9, x25, x1, lt + ; CHECK-NEXT: fcmp s8, s9 +@@ -2827,15 +2820,14 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { + ; CHECK-NEXT: str q0, [sp, #32] // 
16-byte Folded Spill + ; CHECK-NEXT: fmov s0, s8 + ; CHECK-NEXT: bl __fixsfti +-; CHECK-NEXT: mov w8, #-16777216 ++; CHECK-NEXT: movi v10.2s, #255, lsl #24 ++; CHECK-NEXT: mov w8, #2130706431 + ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload + ; CHECK-NEXT: mov x21, #-9223372036854775808 + ; CHECK-NEXT: mov x22, #9223372036854775807 +-; CHECK-NEXT: fmov s10, w8 +-; CHECK-NEXT: mov w8, #2130706431 +-; CHECK-NEXT: mov h0, v0.h[1] +-; CHECK-NEXT: fcmp s8, s10 + ; CHECK-NEXT: fmov s9, w8 ++; CHECK-NEXT: fcmp s8, s10 ++; CHECK-NEXT: mov h0, v0.h[1] + ; CHECK-NEXT: csel x8, xzr, x0, lt + ; CHECK-NEXT: csel x9, x21, x1, lt + ; CHECK-NEXT: fcmp s8, s9 +diff --git a/llvm/test/CodeGen/AArch64/remat-const-float-simd.ll b/llvm/test/CodeGen/AArch64/remat-const-float-simd.ll +new file mode 100644 +index 000000000000..cdb8b86fc398 +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/remat-const-float-simd.ll +@@ -0,0 +1,33 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -mattr=+neon | FileCheck %s --check-prefixes=CHECK,CHECK-NEON ++; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -mattr=-neon | FileCheck %s --check-prefixes=CHECK,CHECK-SCALAR ++ ++; Check that big fp constants can be rematerialized with movi ++target triple = "aarch64-unknown-linux-gnu" ++ ++; float foo(void) { return float(2147483648); } ++define float @foo() { ++; CHECK-LABEL: foo: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEON-NEXT: movi v0.2s, #79, lsl #24 ++; CHECK-SCALAR-NEXT: mov w8, #1325400064 ++; CHECK-SCALAR-NEXT: fmov s0, w8 ++; CHECK-NEXT: ret ++entry: ++ ret float 0x41E0000000000000 ++} ++ ++; float foo2(float p) { return p + float(2147483648); } ++define float @foo2(float %f) { ++; CHECK-LABEL: foo2: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEON-NEXT: movi v1.2s, #79, lsl #24 ++; CHECK-NEON-NEXT: fadd s0, s0, s1 ++; CHECK-SCALAR-NEXT: mov w8, #1325400064 ++; 
CHECK-SCALAR-NEXT: fmov s1, w8 ++; CHECK-SCALAR-NEXT: fadd s0, s0, s1 ++; CHECK-NEXT: ret ++entry: ++ %p = fadd float %f, 0x41E0000000000000 ++ ret float %p ++} +diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +index 285139c30896..a2e5a8a1b4c4 100644 +--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll ++++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +@@ -48,8 +48,7 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind { + define float @test_v3f32(<3 x float> %a) nounwind { + ; CHECK-LABEL: test_v3f32: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov w8, #-2147483648 +-; CHECK-NEXT: fmov s1, w8 ++; CHECK-NEXT: movi v1.2s, #128, lsl #24 + ; CHECK-NEXT: mov v0.s[3], v1.s[0] + ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s + ; CHECK-NEXT: faddp s0, v0.2s +-- +2.34.1 + diff --git a/patches/cherry/84ccd015e7dd3ca57c4a9366ecd2b9a7430f505d.patch b/patches/cherry/84ccd015e7dd3ca57c4a9366ecd2b9a7430f505d.patch new file mode 100644 index 0000000..8d640a5 --- /dev/null +++ b/patches/cherry/84ccd015e7dd3ca57c4a9366ecd2b9a7430f505d.patch @@ -0,0 +1,509 @@ +From 84ccd015e7dd3ca57c4a9366ecd2b9a7430f505d Mon Sep 17 00:00:00 2001 +From: David Green <david.green@arm.com> +Date: Sat, 5 Mar 2022 18:35:43 +0000 +Subject: [PATCH] [AArch64] Some tests to show reconstructing truncates. 
NFC + +--- + .../CodeGen/AArch64/neon-extracttruncate.ll | 490 ++++++++++++++++++ + 1 file changed, 490 insertions(+) + create mode 100644 llvm/test/CodeGen/AArch64/neon-extracttruncate.ll + +diff --git a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll +new file mode 100644 +index 000000000000..14cc333120c7 +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll +@@ -0,0 +1,490 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s ++ ++define <8 x i8> @extract_2_v4i16(<4 x i16> %a, <4 x i16> %b) { ++; CHECK-LABEL: extract_2_v4i16: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b ++; CHECK-NEXT: ret ++entry: ++ %a0 = extractelement <4 x i16> %a, i32 0 ++ %a1 = extractelement <4 x i16> %a, i32 1 ++ %a2 = extractelement <4 x i16> %a, i32 2 ++ %a3 = extractelement <4 x i16> %a, i32 3 ++ %b0 = extractelement <4 x i16> %b, i32 0 ++ %b1 = extractelement <4 x i16> %b, i32 1 ++ %b2 = extractelement <4 x i16> %b, i32 2 ++ %b3 = extractelement <4 x i16> %b, i32 3 ++ %t0 = trunc i16 %a0 to i8 ++ %t1 = trunc i16 %a1 to i8 ++ %t2 = trunc i16 %a2 to i8 ++ %t3 = trunc i16 %a3 to i8 ++ %t4 = trunc i16 %b0 to i8 ++ %t5 = trunc i16 %b1 to i8 ++ %t6 = trunc i16 %b2 to i8 ++ %t7 = trunc i16 %b3 to i8 ++ %i0 = insertelement <8 x i8> undef, i8 %t0, i32 0 ++ %i1 = insertelement <8 x i8> %i0, i8 %t1, i32 1 ++ %i2 = insertelement <8 x i8> %i1, i8 %t2, i32 2 ++ %i3 = insertelement <8 x i8> %i2, i8 %t3, i32 3 ++ %i4 = insertelement <8 x i8> %i3, i8 %t4, i32 4 ++ %i5 = insertelement <8 x i8> %i4, i8 %t5, i32 5 ++ %i6 = insertelement <8 x i8> %i5, i8 %t6, i32 6 ++ %i7 = insertelement <8 x i8> %i6, i8 %t7, i32 7 ++ ret <8 x i8> %i7 ++} ++ ++define <8 x i8> @extract_2_v4i32(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: extract_2_v4i32: ++; CHECK: // %bb.0: // %entry ++; 
CHECK-NEXT: mov w8, v0.s[1] ++; CHECK-NEXT: mov w9, v0.s[2] ++; CHECK-NEXT: mov w10, v0.s[3] ++; CHECK-NEXT: mov v0.b[1], w8 ++; CHECK-NEXT: fmov w8, s1 ++; CHECK-NEXT: mov v0.b[2], w9 ++; CHECK-NEXT: mov w9, v1.s[1] ++; CHECK-NEXT: mov v0.b[3], w10 ++; CHECK-NEXT: mov v0.b[4], w8 ++; CHECK-NEXT: mov w8, v1.s[2] ++; CHECK-NEXT: mov v0.b[5], w9 ++; CHECK-NEXT: mov w9, v1.s[3] ++; CHECK-NEXT: mov v0.b[6], w8 ++; CHECK-NEXT: mov v0.b[7], w9 ++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: ret ++entry: ++ %a0 = extractelement <4 x i32> %a, i32 0 ++ %a1 = extractelement <4 x i32> %a, i32 1 ++ %a2 = extractelement <4 x i32> %a, i32 2 ++ %a3 = extractelement <4 x i32> %a, i32 3 ++ %b0 = extractelement <4 x i32> %b, i32 0 ++ %b1 = extractelement <4 x i32> %b, i32 1 ++ %b2 = extractelement <4 x i32> %b, i32 2 ++ %b3 = extractelement <4 x i32> %b, i32 3 ++ %t0 = trunc i32 %a0 to i8 ++ %t1 = trunc i32 %a1 to i8 ++ %t2 = trunc i32 %a2 to i8 ++ %t3 = trunc i32 %a3 to i8 ++ %t4 = trunc i32 %b0 to i8 ++ %t5 = trunc i32 %b1 to i8 ++ %t6 = trunc i32 %b2 to i8 ++ %t7 = trunc i32 %b3 to i8 ++ %i0 = insertelement <8 x i8> undef, i8 %t0, i32 0 ++ %i1 = insertelement <8 x i8> %i0, i8 %t1, i32 1 ++ %i2 = insertelement <8 x i8> %i1, i8 %t2, i32 2 ++ %i3 = insertelement <8 x i8> %i2, i8 %t3, i32 3 ++ %i4 = insertelement <8 x i8> %i3, i8 %t4, i32 4 ++ %i5 = insertelement <8 x i8> %i4, i8 %t5, i32 5 ++ %i6 = insertelement <8 x i8> %i5, i8 %t6, i32 6 ++ %i7 = insertelement <8 x i8> %i6, i8 %t7, i32 7 ++ ret <8 x i8> %i7 ++} ++ ++define <16 x i8> @extract_4_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { ++; CHECK-LABEL: extract_4_v4i16: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ++; CHECK-NEXT: umov w9, v0.h[0] ++; CHECK-NEXT: umov w10, v0.h[1] ++; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ++; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 ++; CHECK-NEXT: umov w8, v2.h[0] ++; CHECK-NEXT: // kill: def $d3 
killed $d3 def $q3 ++; CHECK-NEXT: fmov s4, w9 ++; CHECK-NEXT: umov w9, v0.h[2] ++; CHECK-NEXT: mov v4.b[1], w10 ++; CHECK-NEXT: umov w10, v0.h[3] ++; CHECK-NEXT: mov v4.b[2], w9 ++; CHECK-NEXT: umov w9, v1.h[0] ++; CHECK-NEXT: mov v4.b[3], w10 ++; CHECK-NEXT: umov w10, v1.h[1] ++; CHECK-NEXT: mov v4.b[4], w9 ++; CHECK-NEXT: umov w9, v1.h[2] ++; CHECK-NEXT: mov v4.b[5], w10 ++; CHECK-NEXT: umov w10, v1.h[3] ++; CHECK-NEXT: mov v4.b[6], w9 ++; CHECK-NEXT: umov w9, v2.h[1] ++; CHECK-NEXT: mov v4.b[7], w10 ++; CHECK-NEXT: mov v4.b[8], w8 ++; CHECK-NEXT: umov w8, v2.h[2] ++; CHECK-NEXT: mov v4.b[9], w9 ++; CHECK-NEXT: umov w9, v2.h[3] ++; CHECK-NEXT: mov v4.b[10], w8 ++; CHECK-NEXT: umov w8, v3.h[0] ++; CHECK-NEXT: mov v4.b[11], w9 ++; CHECK-NEXT: umov w9, v3.h[1] ++; CHECK-NEXT: mov v4.b[12], w8 ++; CHECK-NEXT: umov w8, v3.h[2] ++; CHECK-NEXT: mov v4.b[13], w9 ++; CHECK-NEXT: umov w9, v3.h[3] ++; CHECK-NEXT: mov v4.b[14], w8 ++; CHECK-NEXT: mov v4.b[15], w9 ++; CHECK-NEXT: mov v0.16b, v4.16b ++; CHECK-NEXT: ret ++entry: ++ %a0 = extractelement <4 x i16> %a, i32 0 ++ %a1 = extractelement <4 x i16> %a, i32 1 ++ %a2 = extractelement <4 x i16> %a, i32 2 ++ %a3 = extractelement <4 x i16> %a, i32 3 ++ %b0 = extractelement <4 x i16> %b, i32 0 ++ %b1 = extractelement <4 x i16> %b, i32 1 ++ %b2 = extractelement <4 x i16> %b, i32 2 ++ %b3 = extractelement <4 x i16> %b, i32 3 ++ %c0 = extractelement <4 x i16> %c, i32 0 ++ %c1 = extractelement <4 x i16> %c, i32 1 ++ %c2 = extractelement <4 x i16> %c, i32 2 ++ %c3 = extractelement <4 x i16> %c, i32 3 ++ %d0 = extractelement <4 x i16> %d, i32 0 ++ %d1 = extractelement <4 x i16> %d, i32 1 ++ %d2 = extractelement <4 x i16> %d, i32 2 ++ %d3 = extractelement <4 x i16> %d, i32 3 ++ %t0 = trunc i16 %a0 to i8 ++ %t1 = trunc i16 %a1 to i8 ++ %t2 = trunc i16 %a2 to i8 ++ %t3 = trunc i16 %a3 to i8 ++ %t4 = trunc i16 %b0 to i8 ++ %t5 = trunc i16 %b1 to i8 ++ %t6 = trunc i16 %b2 to i8 ++ %t7 = trunc i16 %b3 to i8 ++ %t8 = trunc i16 %c0 to i8 
++ %t9 = trunc i16 %c1 to i8 ++ %t10 = trunc i16 %c2 to i8 ++ %t11 = trunc i16 %c3 to i8 ++ %t12 = trunc i16 %d0 to i8 ++ %t13 = trunc i16 %d1 to i8 ++ %t14 = trunc i16 %d2 to i8 ++ %t15 = trunc i16 %d3 to i8 ++ %i0 = insertelement <16 x i8> undef, i8 %t0, i32 0 ++ %i1 = insertelement <16 x i8> %i0, i8 %t1, i32 1 ++ %i2 = insertelement <16 x i8> %i1, i8 %t2, i32 2 ++ %i3 = insertelement <16 x i8> %i2, i8 %t3, i32 3 ++ %i4 = insertelement <16 x i8> %i3, i8 %t4, i32 4 ++ %i5 = insertelement <16 x i8> %i4, i8 %t5, i32 5 ++ %i6 = insertelement <16 x i8> %i5, i8 %t6, i32 6 ++ %i7 = insertelement <16 x i8> %i6, i8 %t7, i32 7 ++ %i8 = insertelement <16 x i8> %i7, i8 %t8, i32 8 ++ %i9 = insertelement <16 x i8> %i8, i8 %t9, i32 9 ++ %i10 = insertelement <16 x i8> %i9, i8 %t10, i32 10 ++ %i11 = insertelement <16 x i8> %i10, i8 %t11, i32 11 ++ %i12 = insertelement <16 x i8> %i11, i8 %t12, i32 12 ++ %i13 = insertelement <16 x i8> %i12, i8 %t13, i32 13 ++ %i14 = insertelement <16 x i8> %i13, i8 %t14, i32 14 ++ %i15 = insertelement <16 x i8> %i14, i8 %t15, i32 15 ++ ret <16 x i8> %i15 ++} ++ ++define <16 x i8> @extract_4_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ++; CHECK-LABEL: extract_4_v4i32: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: mov w8, v0.s[1] ++; CHECK-NEXT: mov w9, v0.s[2] ++; CHECK-NEXT: mov w10, v0.s[3] ++; CHECK-NEXT: mov v0.b[1], w8 ++; CHECK-NEXT: fmov w8, s1 ++; CHECK-NEXT: mov v0.b[2], w9 ++; CHECK-NEXT: mov w9, v1.s[1] ++; CHECK-NEXT: mov v0.b[3], w10 ++; CHECK-NEXT: mov v0.b[4], w8 ++; CHECK-NEXT: mov w8, v1.s[2] ++; CHECK-NEXT: mov v0.b[5], w9 ++; CHECK-NEXT: mov w9, v1.s[3] ++; CHECK-NEXT: mov v0.b[6], w8 ++; CHECK-NEXT: fmov w8, s2 ++; CHECK-NEXT: mov v0.b[7], w9 ++; CHECK-NEXT: mov w9, v2.s[1] ++; CHECK-NEXT: mov v0.b[8], w8 ++; CHECK-NEXT: mov w8, v2.s[2] ++; CHECK-NEXT: mov v0.b[9], w9 ++; CHECK-NEXT: mov w9, v2.s[3] ++; CHECK-NEXT: mov v0.b[10], w8 ++; CHECK-NEXT: fmov w8, s3 ++; CHECK-NEXT: mov v0.b[11], w9 ++; CHECK-NEXT: 
mov w9, v3.s[1] ++; CHECK-NEXT: mov v0.b[12], w8 ++; CHECK-NEXT: mov w8, v3.s[2] ++; CHECK-NEXT: mov v0.b[13], w9 ++; CHECK-NEXT: mov w9, v3.s[3] ++; CHECK-NEXT: mov v0.b[14], w8 ++; CHECK-NEXT: mov v0.b[15], w9 ++; CHECK-NEXT: ret ++entry: ++ %a0 = extractelement <4 x i32> %a, i32 0 ++ %a1 = extractelement <4 x i32> %a, i32 1 ++ %a2 = extractelement <4 x i32> %a, i32 2 ++ %a3 = extractelement <4 x i32> %a, i32 3 ++ %b0 = extractelement <4 x i32> %b, i32 0 ++ %b1 = extractelement <4 x i32> %b, i32 1 ++ %b2 = extractelement <4 x i32> %b, i32 2 ++ %b3 = extractelement <4 x i32> %b, i32 3 ++ %c0 = extractelement <4 x i32> %c, i32 0 ++ %c1 = extractelement <4 x i32> %c, i32 1 ++ %c2 = extractelement <4 x i32> %c, i32 2 ++ %c3 = extractelement <4 x i32> %c, i32 3 ++ %d0 = extractelement <4 x i32> %d, i32 0 ++ %d1 = extractelement <4 x i32> %d, i32 1 ++ %d2 = extractelement <4 x i32> %d, i32 2 ++ %d3 = extractelement <4 x i32> %d, i32 3 ++ %t0 = trunc i32 %a0 to i8 ++ %t1 = trunc i32 %a1 to i8 ++ %t2 = trunc i32 %a2 to i8 ++ %t3 = trunc i32 %a3 to i8 ++ %t4 = trunc i32 %b0 to i8 ++ %t5 = trunc i32 %b1 to i8 ++ %t6 = trunc i32 %b2 to i8 ++ %t7 = trunc i32 %b3 to i8 ++ %t8 = trunc i32 %c0 to i8 ++ %t9 = trunc i32 %c1 to i8 ++ %t10 = trunc i32 %c2 to i8 ++ %t11 = trunc i32 %c3 to i8 ++ %t12 = trunc i32 %d0 to i8 ++ %t13 = trunc i32 %d1 to i8 ++ %t14 = trunc i32 %d2 to i8 ++ %t15 = trunc i32 %d3 to i8 ++ %i0 = insertelement <16 x i8> undef, i8 %t0, i32 0 ++ %i1 = insertelement <16 x i8> %i0, i8 %t1, i32 1 ++ %i2 = insertelement <16 x i8> %i1, i8 %t2, i32 2 ++ %i3 = insertelement <16 x i8> %i2, i8 %t3, i32 3 ++ %i4 = insertelement <16 x i8> %i3, i8 %t4, i32 4 ++ %i5 = insertelement <16 x i8> %i4, i8 %t5, i32 5 ++ %i6 = insertelement <16 x i8> %i5, i8 %t6, i32 6 ++ %i7 = insertelement <16 x i8> %i6, i8 %t7, i32 7 ++ %i8 = insertelement <16 x i8> %i7, i8 %t8, i32 8 ++ %i9 = insertelement <16 x i8> %i8, i8 %t9, i32 9 ++ %i10 = insertelement <16 x i8> %i9, i8 %t10, i32 10 ++ %i11 
= insertelement <16 x i8> %i10, i8 %t11, i32 11 ++ %i12 = insertelement <16 x i8> %i11, i8 %t12, i32 12 ++ %i13 = insertelement <16 x i8> %i12, i8 %t13, i32 13 ++ %i14 = insertelement <16 x i8> %i13, i8 %t14, i32 14 ++ %i15 = insertelement <16 x i8> %i14, i8 %t15, i32 15 ++ ret <16 x i8> %i15 ++} ++ ++define <16 x i8> @extract_4_mixed(<4 x i16> %a, <4 x i32> %b, <4 x i32> %c, <4 x i16> %d) { ++; CHECK-LABEL: extract_4_mixed: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ++; CHECK-NEXT: umov w8, v0.h[0] ++; CHECK-NEXT: umov w9, v0.h[1] ++; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 ++; CHECK-NEXT: fmov s4, w8 ++; CHECK-NEXT: umov w8, v0.h[2] ++; CHECK-NEXT: mov v4.b[1], w9 ++; CHECK-NEXT: umov w9, v0.h[3] ++; CHECK-NEXT: mov v4.b[2], w8 ++; CHECK-NEXT: fmov w8, s1 ++; CHECK-NEXT: mov v4.b[3], w9 ++; CHECK-NEXT: mov w9, v1.s[1] ++; CHECK-NEXT: mov v4.b[4], w8 ++; CHECK-NEXT: mov w8, v1.s[2] ++; CHECK-NEXT: mov v4.b[5], w9 ++; CHECK-NEXT: mov w9, v1.s[3] ++; CHECK-NEXT: mov v4.b[6], w8 ++; CHECK-NEXT: fmov w8, s2 ++; CHECK-NEXT: mov v4.b[7], w9 ++; CHECK-NEXT: mov w9, v2.s[1] ++; CHECK-NEXT: mov v4.b[8], w8 ++; CHECK-NEXT: mov w8, v2.s[2] ++; CHECK-NEXT: mov v4.b[9], w9 ++; CHECK-NEXT: mov w9, v2.s[3] ++; CHECK-NEXT: mov v4.b[10], w8 ++; CHECK-NEXT: umov w8, v3.h[0] ++; CHECK-NEXT: mov v4.b[11], w9 ++; CHECK-NEXT: umov w9, v3.h[1] ++; CHECK-NEXT: mov v4.b[12], w8 ++; CHECK-NEXT: umov w8, v3.h[2] ++; CHECK-NEXT: mov v4.b[13], w9 ++; CHECK-NEXT: umov w9, v3.h[3] ++; CHECK-NEXT: mov v4.b[14], w8 ++; CHECK-NEXT: mov v4.b[15], w9 ++; CHECK-NEXT: mov v0.16b, v4.16b ++; CHECK-NEXT: ret ++entry: ++ %a0 = extractelement <4 x i16> %a, i32 0 ++ %a1 = extractelement <4 x i16> %a, i32 1 ++ %a2 = extractelement <4 x i16> %a, i32 2 ++ %a3 = extractelement <4 x i16> %a, i32 3 ++ %b0 = extractelement <4 x i32> %b, i32 0 ++ %b1 = extractelement <4 x i32> %b, i32 1 ++ %b2 = extractelement <4 x i32> %b, i32 2 ++ %b3 = extractelement <4 x i32> %b, 
i32 3 ++ %c0 = extractelement <4 x i32> %c, i32 0 ++ %c1 = extractelement <4 x i32> %c, i32 1 ++ %c2 = extractelement <4 x i32> %c, i32 2 ++ %c3 = extractelement <4 x i32> %c, i32 3 ++ %d0 = extractelement <4 x i16> %d, i32 0 ++ %d1 = extractelement <4 x i16> %d, i32 1 ++ %d2 = extractelement <4 x i16> %d, i32 2 ++ %d3 = extractelement <4 x i16> %d, i32 3 ++ %t0 = trunc i16 %a0 to i8 ++ %t1 = trunc i16 %a1 to i8 ++ %t2 = trunc i16 %a2 to i8 ++ %t3 = trunc i16 %a3 to i8 ++ %t4 = trunc i32 %b0 to i8 ++ %t5 = trunc i32 %b1 to i8 ++ %t6 = trunc i32 %b2 to i8 ++ %t7 = trunc i32 %b3 to i8 ++ %t8 = trunc i32 %c0 to i8 ++ %t9 = trunc i32 %c1 to i8 ++ %t10 = trunc i32 %c2 to i8 ++ %t11 = trunc i32 %c3 to i8 ++ %t12 = trunc i16 %d0 to i8 ++ %t13 = trunc i16 %d1 to i8 ++ %t14 = trunc i16 %d2 to i8 ++ %t15 = trunc i16 %d3 to i8 ++ %i0 = insertelement <16 x i8> undef, i8 %t0, i32 0 ++ %i1 = insertelement <16 x i8> %i0, i8 %t1, i32 1 ++ %i2 = insertelement <16 x i8> %i1, i8 %t2, i32 2 ++ %i3 = insertelement <16 x i8> %i2, i8 %t3, i32 3 ++ %i4 = insertelement <16 x i8> %i3, i8 %t4, i32 4 ++ %i5 = insertelement <16 x i8> %i4, i8 %t5, i32 5 ++ %i6 = insertelement <16 x i8> %i5, i8 %t6, i32 6 ++ %i7 = insertelement <16 x i8> %i6, i8 %t7, i32 7 ++ %i8 = insertelement <16 x i8> %i7, i8 %t8, i32 8 ++ %i9 = insertelement <16 x i8> %i8, i8 %t9, i32 9 ++ %i10 = insertelement <16 x i8> %i9, i8 %t10, i32 10 ++ %i11 = insertelement <16 x i8> %i10, i8 %t11, i32 11 ++ %i12 = insertelement <16 x i8> %i11, i8 %t12, i32 12 ++ %i13 = insertelement <16 x i8> %i12, i8 %t13, i32 13 ++ %i14 = insertelement <16 x i8> %i13, i8 %t14, i32 14 ++ %i15 = insertelement <16 x i8> %i14, i8 %t15, i32 15 ++ ret <16 x i8> %i15 ++} ++ ++define <16 x i8> @extract_4_v4i32_badindex(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ++; CHECK-LABEL: extract_4_v4i32_badindex: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: mov w8, v0.s[1] ++; CHECK-NEXT: mov w9, v0.s[2] ++; CHECK-NEXT: mov w10, v0.s[3] ++; 
CHECK-NEXT: mov v0.b[1], w8 ++; CHECK-NEXT: fmov w8, s1 ++; CHECK-NEXT: mov v0.b[2], w9 ++; CHECK-NEXT: mov w9, v1.s[2] ++; CHECK-NEXT: mov v0.b[3], w10 ++; CHECK-NEXT: mov v0.b[4], w8 ++; CHECK-NEXT: mov w8, v1.s[1] ++; CHECK-NEXT: mov v0.b[5], w9 ++; CHECK-NEXT: mov w9, v1.s[3] ++; CHECK-NEXT: mov v0.b[6], w8 ++; CHECK-NEXT: fmov w8, s2 ++; CHECK-NEXT: mov v0.b[7], w9 ++; CHECK-NEXT: mov w9, v2.s[1] ++; CHECK-NEXT: mov v0.b[8], w8 ++; CHECK-NEXT: mov w8, v2.s[2] ++; CHECK-NEXT: mov v0.b[9], w9 ++; CHECK-NEXT: mov w9, v2.s[3] ++; CHECK-NEXT: mov v0.b[10], w8 ++; CHECK-NEXT: fmov w8, s3 ++; CHECK-NEXT: mov v0.b[11], w9 ++; CHECK-NEXT: mov w9, v3.s[1] ++; CHECK-NEXT: mov v0.b[12], w8 ++; CHECK-NEXT: mov w8, v3.s[2] ++; CHECK-NEXT: mov v0.b[13], w9 ++; CHECK-NEXT: mov w9, v3.s[3] ++; CHECK-NEXT: mov v0.b[14], w8 ++; CHECK-NEXT: mov v0.b[15], w9 ++; CHECK-NEXT: ret ++entry: ++ %a0 = extractelement <4 x i32> %a, i32 0 ++ %a1 = extractelement <4 x i32> %a, i32 1 ++ %a2 = extractelement <4 x i32> %a, i32 2 ++ %a3 = extractelement <4 x i32> %a, i32 3 ++ %b0 = extractelement <4 x i32> %b, i32 0 ++ %b1 = extractelement <4 x i32> %b, i32 2 ++ %b2 = extractelement <4 x i32> %b, i32 1 ++ %b3 = extractelement <4 x i32> %b, i32 3 ++ %c0 = extractelement <4 x i32> %c, i32 0 ++ %c1 = extractelement <4 x i32> %c, i32 1 ++ %c2 = extractelement <4 x i32> %c, i32 2 ++ %c3 = extractelement <4 x i32> %c, i32 3 ++ %d0 = extractelement <4 x i32> %d, i32 0 ++ %d1 = extractelement <4 x i32> %d, i32 1 ++ %d2 = extractelement <4 x i32> %d, i32 2 ++ %d3 = extractelement <4 x i32> %d, i32 3 ++ %t0 = trunc i32 %a0 to i8 ++ %t1 = trunc i32 %a1 to i8 ++ %t2 = trunc i32 %a2 to i8 ++ %t3 = trunc i32 %a3 to i8 ++ %t4 = trunc i32 %b0 to i8 ++ %t5 = trunc i32 %b1 to i8 ++ %t6 = trunc i32 %b2 to i8 ++ %t7 = trunc i32 %b3 to i8 ++ %t8 = trunc i32 %c0 to i8 ++ %t9 = trunc i32 %c1 to i8 ++ %t10 = trunc i32 %c2 to i8 ++ %t11 = trunc i32 %c3 to i8 ++ %t12 = trunc i32 %d0 to i8 ++ %t13 = trunc i32 %d1 to i8 
++ %t14 = trunc i32 %d2 to i8 ++ %t15 = trunc i32 %d3 to i8 ++ %i0 = insertelement <16 x i8> undef, i8 %t0, i32 0 ++ %i1 = insertelement <16 x i8> %i0, i8 %t1, i32 1 ++ %i2 = insertelement <16 x i8> %i1, i8 %t2, i32 2 ++ %i3 = insertelement <16 x i8> %i2, i8 %t3, i32 3 ++ %i4 = insertelement <16 x i8> %i3, i8 %t4, i32 4 ++ %i5 = insertelement <16 x i8> %i4, i8 %t5, i32 5 ++ %i6 = insertelement <16 x i8> %i5, i8 %t6, i32 6 ++ %i7 = insertelement <16 x i8> %i6, i8 %t7, i32 7 ++ %i8 = insertelement <16 x i8> %i7, i8 %t8, i32 8 ++ %i9 = insertelement <16 x i8> %i8, i8 %t9, i32 9 ++ %i10 = insertelement <16 x i8> %i9, i8 %t10, i32 10 ++ %i11 = insertelement <16 x i8> %i10, i8 %t11, i32 11 ++ %i12 = insertelement <16 x i8> %i11, i8 %t12, i32 12 ++ %i13 = insertelement <16 x i8> %i12, i8 %t13, i32 13 ++ %i14 = insertelement <16 x i8> %i13, i8 %t14, i32 14 ++ %i15 = insertelement <16 x i8> %i14, i8 %t15, i32 15 ++ ret <16 x i8> %i15 ++} ++ ++define <16 x i8> @extract_4_v4i32_one(<4 x i32> %a) { ++; CHECK-LABEL: extract_4_v4i32_one: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: mov w8, v0.s[1] ++; CHECK-NEXT: fmov w9, s0 ++; CHECK-NEXT: mov w10, v0.s[2] ++; CHECK-NEXT: mov w11, v0.s[3] ++; CHECK-NEXT: mov v0.b[1], w8 ++; CHECK-NEXT: mov v0.b[2], w10 ++; CHECK-NEXT: mov v0.b[3], w11 ++; CHECK-NEXT: mov v0.b[4], w9 ++; CHECK-NEXT: mov v0.b[5], w8 ++; CHECK-NEXT: mov v0.b[6], w10 ++; CHECK-NEXT: mov v0.b[7], w11 ++; CHECK-NEXT: mov v0.b[8], w9 ++; CHECK-NEXT: mov v0.b[9], w8 ++; CHECK-NEXT: mov v0.b[10], w10 ++; CHECK-NEXT: mov v0.b[11], w11 ++; CHECK-NEXT: mov v0.b[12], w9 ++; CHECK-NEXT: mov v0.b[13], w8 ++; CHECK-NEXT: mov v0.b[14], w10 ++; CHECK-NEXT: mov v0.b[15], w11 ++; CHECK-NEXT: ret ++entry: ++ %a0 = extractelement <4 x i32> %a, i32 0 ++ %a1 = extractelement <4 x i32> %a, i32 1 ++ %a2 = extractelement <4 x i32> %a, i32 2 ++ %a3 = extractelement <4 x i32> %a, i32 3 ++ %t0 = trunc i32 %a0 to i8 ++ %t1 = trunc i32 %a1 to i8 ++ %t2 = trunc i32 %a2 to i8 ++ %t3 = trunc 
i32 %a3 to i8 ++ %i0 = insertelement <16 x i8> undef, i8 %t0, i32 0 ++ %i1 = insertelement <16 x i8> %i0, i8 %t1, i32 1 ++ %i2 = insertelement <16 x i8> %i1, i8 %t2, i32 2 ++ %i3 = insertelement <16 x i8> %i2, i8 %t3, i32 3 ++ %i4 = insertelement <16 x i8> %i3, i8 %t0, i32 4 ++ %i5 = insertelement <16 x i8> %i4, i8 %t1, i32 5 ++ %i6 = insertelement <16 x i8> %i5, i8 %t2, i32 6 ++ %i7 = insertelement <16 x i8> %i6, i8 %t3, i32 7 ++ %i8 = insertelement <16 x i8> %i7, i8 %t0, i32 8 ++ %i9 = insertelement <16 x i8> %i8, i8 %t1, i32 9 ++ %i10 = insertelement <16 x i8> %i9, i8 %t2, i32 10 ++ %i11 = insertelement <16 x i8> %i10, i8 %t3, i32 11 ++ %i12 = insertelement <16 x i8> %i11, i8 %t0, i32 12 ++ %i13 = insertelement <16 x i8> %i12, i8 %t1, i32 13 ++ %i14 = insertelement <16 x i8> %i13, i8 %t2, i32 14 ++ %i15 = insertelement <16 x i8> %i14, i8 %t3, i32 15 ++ ret <16 x i8> %i15 ++} ++ +-- +2.34.1 + diff --git a/patches/cherry/86617256864ebcbda03b6ce843deeb6a41a85800.patch b/patches/cherry/86617256864ebcbda03b6ce843deeb6a41a85800.patch new file mode 100644 index 0000000..8e8dfb8 --- /dev/null +++ b/patches/cherry/86617256864ebcbda03b6ce843deeb6a41a85800.patch @@ -0,0 +1,206 @@ +From 86617256864ebcbda03b6ce843deeb6a41a85800 Mon Sep 17 00:00:00 2001 +From: Florian Hahn <flo@fhahn.com> +Date: Mon, 23 May 2022 20:27:42 +0100 +Subject: [PATCH] [AArch64] Add tests with free shuffles for indexed fma + variants. + +The new tests contain examples where shuffles are free, because indexed +fma instructions can be used. 
+--- + .../AArch64/sink-free-instructions.ll | 183 ++++++++++++++++++ + 1 file changed, 183 insertions(+) + +diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll +index 94164c08a4b3..244d2c35bbac 100644 +--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll ++++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll +@@ -494,3 +494,186 @@ if.else: + %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %s3, <8 x i8> %s4) + ret <8 x i16> %vmull1 + } ++ ++declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>) ++ ++define <8 x half> @sink_shufflevector_fma_v8f16(i1 %c, <8 x half> %a, <8 x half> %b) { ++; CHECK-LABEL: @sink_shufflevector_fma_v8f16( ++; CHECK-NEXT: entry: ++; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer ++; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ++; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> ++; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ++; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> ++; CHECK-NEXT: [[S5:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> ++; CHECK-NEXT: [[S6:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6> ++; CHECK-NEXT: [[S7:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> ++; 
CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ++; CHECK: if.then: ++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[S0]], <8 x half> [[B]]) ++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_0]], <8 x half> [[S1]], <8 x half> [[B]]) ++; CHECK-NEXT: [[R_2:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_1]], <8 x half> [[S2]], <8 x half> [[B]]) ++; CHECK-NEXT: [[R_3:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_2]], <8 x half> [[S3]], <8 x half> [[B]]) ++; CHECK-NEXT: ret <8 x half> [[R_3]] ++; CHECK: if.else: ++; CHECK-NEXT: [[R_4:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[S4]], <8 x half> [[B]]) ++; CHECK-NEXT: [[R_5:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_4]], <8 x half> [[S5]], <8 x half> [[B]]) ++; CHECK-NEXT: [[R_6:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_5]], <8 x half> [[S6]], <8 x half> [[B]]) ++; CHECK-NEXT: [[R_7:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_6]], <8 x half> [[S7]], <8 x half> [[B]]) ++; CHECK-NEXT: ret <8 x half> [[R_7]] ++; ++entry: ++ %s0 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> zeroinitializer ++ %s1 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ++ %s2 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> ++ %s3 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ++ %s4 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> ++ %s5 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> ++ %s6 = shufflevector <8 x half> %a, <8 x half> poison, <8 
x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6> ++ %s7 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> ++ br i1 %c, label %if.then, label %if.else ++ ++if.then: ++ %r.0 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %s0, <8 x half> %b) ++ %r.1 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %r.0, <8 x half> %s1, <8 x half> %b) ++ %r.2 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %r.1, <8 x half> %s2, <8 x half> %b) ++ %r.3 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %r.2, <8 x half> %s3, <8 x half> %b) ++ ret <8 x half> %r.3 ++ ++if.else: ++ %r.4 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %s4, <8 x half> %b) ++ %r.5 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %r.4, <8 x half> %s5, <8 x half> %b) ++ %r.6 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %r.5, <8 x half> %s6, <8 x half> %b) ++ %r.7 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %r.6, <8 x half> %s7, <8 x half> %b) ++ ret <8 x half> %r.7 ++} ++ ++declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) ++ ++define <4 x float> @sink_shufflevector_fma_v4f32(i1 %c, <8 x float> %a, <4 x float> %b) { ++; CHECK-LABEL: @sink_shufflevector_fma_v4f32( ++; CHECK-NEXT: entry: ++; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> zeroinitializer ++; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ++; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> ++; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ++; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ++; CHECK: if.then: ++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <4 x float> 
@llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[S0]], <4 x float> [[B]]) ++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_0]], <4 x float> [[S1]], <4 x float> [[B]]) ++; CHECK-NEXT: ret <4 x float> [[R_1]] ++; CHECK: if.else: ++; CHECK-NEXT: [[R_2:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[S2]], <4 x float> [[B]]) ++; CHECK-NEXT: [[R_3:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_2]], <4 x float> [[S3]], <4 x float> [[B]]) ++; CHECK-NEXT: ret <4 x float> [[R_3]] ++; ++entry: ++ %s0 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> zeroinitializer ++ %s1 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ++ %s2 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> ++ %s3 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ++ br i1 %c, label %if.then, label %if.else ++ ++if.then: ++ %r.0 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %s0, <4 x float> %b) ++ %r.1 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %r.0, <4 x float> %s1, <4 x float> %b) ++ ret <4 x float> %r.1 ++ ++if.else: ++ %r.2 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %s2, <4 x float> %b) ++ %r.3 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %r.2, <4 x float> %s3, <4 x float> %b) ++ ret <4 x float> %r.3 ++} ++ ++declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) ++ ++define <2 x double> @sink_shufflevector_fma_v2f64(i1 %c, <2 x double> %a, <2 x double> %b) { ++; CHECK-LABEL: @sink_shufflevector_fma_v2f64( ++; CHECK-NEXT: entry: ++; CHECK-NEXT: [[S0:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> zeroinitializer ++; CHECK-NEXT: [[S1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ++; CHECK-NEXT: br 
i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ++; CHECK: if.then: ++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B:%.*]], <2 x double> [[S0]], <2 x double> [[B]]) ++; CHECK-NEXT: ret <2 x double> [[R_0]] ++; CHECK: if.else: ++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[S1]], <2 x double> [[B]]) ++; CHECK-NEXT: ret <2 x double> [[R_1]] ++; ++entry: ++ %s0 = shufflevector <2 x double> %a, <2 x double> poison, <2 x i32> zeroinitializer ++ %s1 = shufflevector <2 x double> %a, <2 x double> poison, <2 x i32> <i32 1, i32 1> ++ br i1 %c, label %if.then, label %if.else ++ ++if.then: ++ %r.0 = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %s0, <2 x double> %b) ++ ret <2 x double> %r.0 ++ ++if.else: ++ %r.1 = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %s1, <2 x double> %b) ++ ret <2 x double> %r.1 ++} ++ ++define <4 x float> @do_not_sink_out_of_range_shufflevector_fma_v4f32(i1 %c, <8 x float> %a, <4 x float> %b) { ++; CHECK-LABEL: @do_not_sink_out_of_range_shufflevector_fma_v4f32( ++; CHECK-NEXT: entry: ++; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4> ++; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ++; CHECK: if.then: ++; CHECK-NEXT: [[R:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[S4]], <4 x float> [[B]]) ++; CHECK-NEXT: ret <4 x float> [[R]] ++; CHECK: if.else: ++; CHECK-NEXT: ret <4 x float> zeroinitializer ++; ++entry: ++ %s4 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4> ++ br i1 %c, label %if.then, label %if.else ++ ++if.then: ++ %r = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %s4, <4 x float> %b) ++ ret <4 x float> %r ++ ++if.else: ++ ret <4 x float> zeroinitializer ++} ++ 
++declare <5 x float> @llvm.fma.v5f32(<5 x float>, <5 x float>, <5 x float>) ++ ++define <5 x float> @do_not_sink_shufflevector_fma_v5f32(i1 %c, <8 x float> %a, <5 x float> %b) { ++; CHECK-LABEL: @do_not_sink_shufflevector_fma_v5f32( ++; CHECK-NEXT: entry: ++; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <5 x i32> zeroinitializer ++; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 1, i32 1, i32 1, i32 1, i32 4> ++; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 2, i32 2, i32 2, i32 2, i32 4> ++; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 3, i32 3, i32 3, i32 3, i32 4> ++; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4> ++; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ++; CHECK: if.then: ++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[B:%.*]], <5 x float> [[S0]], <5 x float> [[B]]) ++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[R_0]], <5 x float> [[S1]], <5 x float> [[B]]) ++; CHECK-NEXT: ret <5 x float> [[R_1]] ++; CHECK: if.else: ++; CHECK-NEXT: [[R_2:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[B]], <5 x float> [[S2]], <5 x float> [[B]]) ++; CHECK-NEXT: [[R_3:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[R_2]], <5 x float> [[S3]], <5 x float> [[B]]) ++; CHECK-NEXT: [[R_4:%.*]] = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> [[R_3]], <5 x float> [[S4]], <5 x float> [[B]]) ++; CHECK-NEXT: ret <5 x float> [[R_4]] ++; ++entry: ++ %s0 = shufflevector <8 x float> %a, <8 x float> poison, <5 x i32> zeroinitializer ++ %s1 = shufflevector <8 x float> %a, <8 x float> poison, <5 x i32> <i32 1, i32 1, i32 1, i32 1, i32 4> ++ %s2 = shufflevector <8 x float> %a, <8 x float> 
poison, <5 x i32> <i32 2, i32 2, i32 2, i32 2, i32 4> ++ %s3 = shufflevector <8 x float> %a, <8 x float> poison, <5 x i32> <i32 3, i32 3, i32 3, i32 3, i32 4> ++ %s4 = shufflevector <8 x float> %a, <8 x float> poison, <5 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4> ++ br i1 %c, label %if.then, label %if.else ++ ++if.then: ++ %r.0 = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> %b, <5 x float> %s0, <5 x float> %b) ++ %r.1 = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> %r.0, <5 x float> %s1, <5 x float> %b) ++ ret <5 x float> %r.1 ++ ++if.else: ++ %r.2 = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> %b, <5 x float> %s2, <5 x float> %b) ++ %r.3 = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> %r.2, <5 x float> %s3, <5 x float> %b) ++ %r.4 = tail call fast <5 x float> @llvm.fma.v5f32(<5 x float> %r.3, <5 x float> %s4, <5 x float> %b) ++ ret <5 x float> %r.4 ++} +-- +2.34.1 + diff --git a/patches/cherry/AArch64-Use-Tbl.patch b/patches/cherry/AArch64-Use-Tbl.patch index f9521be..f9521be 100755..100644 --- a/patches/cherry/AArch64-Use-Tbl.patch +++ b/patches/cherry/AArch64-Use-Tbl.patch diff --git a/patches/cherry/Loop-Vectorizer-shouldMaximizeVectorBandwidth.patch b/patches/cherry/Loop-Vectorizer-shouldMaximizeVectorBandwidth.patch new file mode 100644 index 0000000..b4afeb5 --- /dev/null +++ b/patches/cherry/Loop-Vectorizer-shouldMaximizeVectorBandwidth.patch @@ -0,0 +1,527 @@ +commit 7e4b5c2e864ebd1c1a3a0203171143e311dd2a96 (HEAD) +Author: Peter Waller <peter.waller@arm.com> +Date: Mon May 16 20:59:17 2022 +0000 + + [LV] Improve register pressure estimate at high VFs + +commit 4f81e1af2d1de9d902709cbaff727ba198cd5410 +Author: Jingu Kang <jingu.kang@arm.com> +Date: Tue Apr 5 13:16:10 2022 +0100 + + [AArch64] Set maximum VF with shouldMaximizeVectorBandwidth +--- +diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h +index 7412e050322e..1179971ad13b 100644 +--- 
a/llvm/include/llvm/Analysis/TargetTransformInfo.h ++++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h +@@ -727,7 +727,7 @@ public: + bool isTypeLegal(Type *Ty) const; + + /// Returns the estimated number of registers required to represent \p Ty. +- InstructionCost getRegUsageForType(Type *Ty) const; ++ unsigned getRegUsageForType(Type *Ty) const; + + /// Return true if switches should be turned into lookup tables for the + /// target. +@@ -934,7 +934,8 @@ public: + /// creating vectors that span multiple vector registers. + /// If false, the vectorization factor will be chosen based on the + /// size of the widest element type. +- bool shouldMaximizeVectorBandwidth() const; ++ /// \p K Register Kind for vectorization. ++ bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const; + + /// \return The minimum vectorization factor for types of given element + /// bit width, or 0 if there is no minimum VF. The returned value only +@@ -1571,7 +1572,7 @@ public: + virtual bool isProfitableToHoist(Instruction *I) = 0; + virtual bool useAA() = 0; + virtual bool isTypeLegal(Type *Ty) = 0; +- virtual InstructionCost getRegUsageForType(Type *Ty) = 0; ++ virtual unsigned getRegUsageForType(Type *Ty) = 0; + virtual bool shouldBuildLookupTables() = 0; + virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0; + virtual bool shouldBuildRelLookupTables() = 0; +@@ -1618,7 +1619,8 @@ public: + virtual unsigned getMinVectorRegisterBitWidth() const = 0; + virtual Optional<unsigned> getMaxVScale() const = 0; + virtual Optional<unsigned> getVScaleForTuning() const = 0; +- virtual bool shouldMaximizeVectorBandwidth() const = 0; ++ virtual bool ++ shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const = 0; + virtual ElementCount getMinimumVF(unsigned ElemWidth, + bool IsScalable) const = 0; + virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0; +@@ -2001,7 +2003,7 @@ public: + } + bool useAA() override { return 
Impl.useAA(); } + bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); } +- InstructionCost getRegUsageForType(Type *Ty) override { ++ unsigned getRegUsageForType(Type *Ty) override { + return Impl.getRegUsageForType(Ty); + } + bool shouldBuildLookupTables() override { +@@ -2108,8 +2110,9 @@ public: + Optional<unsigned> getVScaleForTuning() const override { + return Impl.getVScaleForTuning(); + } +- bool shouldMaximizeVectorBandwidth() const override { +- return Impl.shouldMaximizeVectorBandwidth(); ++ bool shouldMaximizeVectorBandwidth( ++ TargetTransformInfo::RegisterKind K) const override { ++ return Impl.shouldMaximizeVectorBandwidth(K); + } + ElementCount getMinimumVF(unsigned ElemWidth, + bool IsScalable) const override { +diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +index a32744f8d58b..28ce1690202d 100644 +--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h ++++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +@@ -310,7 +310,7 @@ public: + + bool isTypeLegal(Type *Ty) const { return false; } + +- InstructionCost getRegUsageForType(Type *Ty) const { return 1; } ++ unsigned getRegUsageForType(Type *Ty) const { return 1; } + + bool shouldBuildLookupTables() const { return true; } + +@@ -415,7 +415,10 @@ public: + Optional<unsigned> getMaxVScale() const { return None; } + Optional<unsigned> getVScaleForTuning() const { return None; } + +- bool shouldMaximizeVectorBandwidth() const { return false; } ++ bool ++ shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const { ++ return false; ++ } + + ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const { + return ElementCount::get(0, IsScalable); +diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h +index 0b2737628923..39c8eaf6206b 100644 +--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h ++++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h +@@ 
-362,10 +362,9 @@ public: + return getTLI()->isTypeLegal(VT); + } + +- InstructionCost getRegUsageForType(Type *Ty) { +- InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first; +- assert(Val >= 0 && "Negative cost!"); +- return Val; ++ unsigned getRegUsageForType(Type *Ty) { ++ EVT ETy = getTLI()->getValueType(DL, Ty); ++ return getTLI()->getNumRegisters(Ty->getContext(), ETy); + } + + InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, +diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp +index 25e9dee98e13..7ec752990620 100644 +--- a/llvm/lib/Analysis/TargetTransformInfo.cpp ++++ b/llvm/lib/Analysis/TargetTransformInfo.cpp +@@ -470,7 +470,7 @@ bool TargetTransformInfo::isTypeLegal(Type *Ty) const { + return TTIImpl->isTypeLegal(Ty); + } + +-InstructionCost TargetTransformInfo::getRegUsageForType(Type *Ty) const { ++unsigned TargetTransformInfo::getRegUsageForType(Type *Ty) const { + return TTIImpl->getRegUsageForType(Ty); + } + +@@ -623,8 +623,9 @@ Optional<unsigned> TargetTransformInfo::getVScaleForTuning() const { + return TTIImpl->getVScaleForTuning(); + } + +-bool TargetTransformInfo::shouldMaximizeVectorBandwidth() const { +- return TTIImpl->shouldMaximizeVectorBandwidth(); ++bool TargetTransformInfo::shouldMaximizeVectorBandwidth( ++ TargetTransformInfo::RegisterKind K) const { ++ return TTIImpl->shouldMaximizeVectorBandwidth(K); + } + + ElementCount TargetTransformInfo::getMinimumVF(unsigned ElemWidth, +diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +index b2ffdf949d8b..c245b29b6d8a 100644 +--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp ++++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +@@ -50,6 +50,12 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, + return (CallerBits & CalleeBits) == CalleeBits; + } + ++bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( ++ 
TargetTransformInfo::RegisterKind K) const { ++ assert(K != TargetTransformInfo::RGK_Scalar); ++ return K == TargetTransformInfo::RGK_FixedWidthVector; ++} ++ + /// Calculate the cost of materializing a 64-bit value. This helper + /// method might only calculate a fraction of a larger immediate. Therefore it + /// is valid to return a cost of ZERO. +diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +index a6029b9f2445..b7b11d196f1c 100644 +--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h ++++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +@@ -135,6 +135,8 @@ public: + return ST->getVScaleForTuning(); + } + ++ bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const; ++ + /// Try to return an estimate cost factor that can be used as a multiplier + /// when scalarizing an operation for a vector with ElementCount \p VF. + /// For scalable vectors this currently takes the most pessimistic view based +diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +index 9e637dfc3e16..7bc7bbf10614 100644 +--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h ++++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +@@ -86,12 +86,11 @@ public: + unsigned getMinVectorRegisterBitWidth() const; + ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const; + +- bool shouldMaximizeVectorBandwidth() const { ++ bool ++ shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const { + return true; + } +- bool supportsEfficientVectorElementLoadStore() { +- return false; +- } ++ bool supportsEfficientVectorElementLoadStore() { return false; } + bool hasBranchDivergence() { + return false; + } +diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +index 99e6774a02e4..26ac8d872800 100644 +--- 
a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp ++++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +@@ -276,7 +276,7 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, + BaseT::getPeelingPreferences(L, SE, PP); + } + +-InstructionCost RISCVTTIImpl::getRegUsageForType(Type *Ty) { ++unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) { + TypeSize Size = Ty->getPrimitiveSizeInBits(); + if (Ty->isVectorTy()) { + if (Size.isScalable() && ST->hasVInstructions()) +diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +index e79c4f75712b..959a1433e689 100644 +--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h ++++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +@@ -60,7 +60,7 @@ public: + + TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const; + +- InstructionCost getRegUsageForType(Type *Ty); ++ unsigned getRegUsageForType(Type *Ty); + + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP, +diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +index 46ff0994e04e..c41726b11aca 100644 +--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp ++++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +@@ -5560,9 +5560,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( + return ElementCount::getFixed(ClampedConstTripCount); + } + ++ TargetTransformInfo::RegisterKind RegKind = ++ ComputeScalableMaxVF ? 
TargetTransformInfo::RGK_ScalableVector ++ : TargetTransformInfo::RGK_FixedWidthVector; + ElementCount MaxVF = MaxVectorElementCount; +- if (TTI.shouldMaximizeVectorBandwidth() || +- (MaximizeBandwidth && isScalarEpilogueAllowed())) { ++ if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && ++ TTI.shouldMaximizeVectorBandwidth(RegKind))) { + auto MaxVectorElementCountMaxBW = ElementCount::get( + PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), + ComputeScalableMaxVF); +@@ -6319,16 +6322,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { + + LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + +- // A lambda that gets the register usage for the given type and VF. +- const auto &TTICapture = TTI; +- auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { ++ auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { + if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) + return 0; +- InstructionCost::CostType RegUsage = +- *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); +- assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && +- "Nonsensical values for register usage."); +- return RegUsage; ++ return TTI.getRegUsageForType(VectorType::get(Ty, VF)); + }; + + for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { +diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll +index 371d209bafff..a1ca0fea7972 100644 +--- a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll ++++ b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll +@@ -4,11 +4,12 @@ + ; are not profitable. + + ; Test with a loop that contains memory accesses of i8 and i32 types. 
The +-; default maximum VF for NEON is 4. And while we don't have an instruction to +-; load 4 x i8, vectorization might still be profitable. ++; maximum VF for NEON is calculated by 128/size of smallest type in loop. ++; And while we don't have an instruction to load 4 x i8, vectorization ++; might still be profitable. + define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) { + ; CHECK-LABEL: @test_load_i8_store_i32( +-; CHECK: <4 x i8> ++; CHECK: <16 x i8> + ; + entry: + br label %loop +@@ -32,7 +33,7 @@ exit: + ; Same as test_load_i8_store_i32, but with types flipped for load and store. + define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) { + ; CHECK-LABEL: @test_load_i32_store_i8( +-; CHECK: <4 x i8> ++; CHECK: <16 x i8> + ; + entry: + br label %loop +@@ -84,7 +85,7 @@ exit: + ; vectorization factor. + define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) { + ; CHECK-LABEL: @test_load_i8_store_i64_large +-; CHECK: <2 x i64> ++; CHECK: <8 x i64> + ; + entry: + br label %loop +diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll +new file mode 100644 +index 000000000000..f0dc8e502769 +--- /dev/null ++++ b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll +@@ -0,0 +1,57 @@ ++; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s ++; REQUIRES: asserts ++ ++target triple = "aarch64" ++ ++; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume. 
++ ++; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from <stdin> ++; CHECK: LV(REG): VF = 32 ++; CHECK-NEXT: LV(REG): Found max usage: 2 item ++; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers ++; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers ++ ++define i1 @or_reduction_neon(i32 %arg, ptr %ptr) { ++entry: ++ br label %loop ++exit: ++ ret i1 %reduction_next ++loop: ++ %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ] ++ %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ] ++ %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction ++ %loaded = load i32, ptr %gep ++ %i1 = icmp eq i32 %loaded, %induction ++ %reduction_next = or i1 %i1, %reduction ++ %induction_next = add nuw i32 %induction, 1 ++ %cond = icmp eq i32 %induction_next, %arg ++ br i1 %cond, label %exit, label %loop, !llvm.loop !32 ++} ++ ++; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve' ++; CHECK: LV(REG): VF = 64 ++; CHECK-NEXT: LV(REG): Found max usage: 2 item ++; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers ++; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers ++ ++define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" { ++entry: ++ br label %loop ++exit: ++ ret i1 %reduction_next ++loop: ++ %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ] ++ %reduction = phi i1 [ true, %entry ], [ %reduction_next, %loop ] ++ %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction ++ %loaded = load i32, ptr %gep ++ %i1 = icmp eq i32 %loaded, %induction ++ %reduction_next = or i1 %i1, %reduction ++ %induction_next = add nuw i32 %induction, 1 ++ %cond = icmp eq i32 %induction_next, %arg ++ br i1 %cond, label %exit, label %loop, !llvm.loop !64 ++} ++ ++!32 = distinct !{!32, !33} ++!33 = !{!"llvm.loop.vectorize.width", i32 32} ++!64 = distinct !{!64, !65} ++!65 = !{!"llvm.loop.vectorize.width", i32 64} +diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +index e6e43375204d..28eabe382dfb 100644 +--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll ++++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +@@ -116,9 +116,9 @@ for.body: ; preds = %entry, %for.body + } + + ; CHECK-LABEL: @add_d( +-; CHECK: load <4 x i16> +-; CHECK: add nsw <4 x i32> +-; CHECK: store <4 x i32> ++; CHECK: load <8 x i16> ++; CHECK: add nsw <8 x i32> ++; CHECK: store <8 x i32> + define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 { + entry: + %cmp7 = icmp sgt i32 %len, 0 +diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll +index a95c0aa6f375..071255c4f4f0 100644 +--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll ++++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll +@@ -123,16 +123,16 @@ for.body: + ; } + ; + ; CHECK: vector.body: +-; CHECK: phi <8 x i16> +-; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8> +-; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16> +-; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8> +-; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16> +-; CHECK: add <8 x i16> +-; CHECK: add <8 x i16> ++; CHECK: phi <16 x i16> ++; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8> ++; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16> ++; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8> ++; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16> ++; CHECK: add <16 x i16> ++; CHECK: add <16 x i16> + ; + ; CHECK: middle.block: +-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> ++; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> + ; CHECK: zext i16 [[Rdx]] to i32 + ; + define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* 
nocapture readonly %b, i32 %n) { +diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll +index 27868480c23b..262236075f7c 100644 +--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll ++++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll +@@ -29,7 +29,7 @@ + ; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1). + + ; VF-4: <4 x i32> +-; VF-VSCALE4: <vscale x 4 x i32> ++; VF-VSCALE4: <16 x i32> + define void @test0(i32* %a, i8* %b, i32* %c) #0 { + entry: + br label %loop +diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll +index 9bd9c31d32d3..1d2c70db11cf 100644 +--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll ++++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll +@@ -9,9 +9,9 @@ + define void @test0(i32* %a, i8* %b, i32* %c) #0 { + ; CHECK: LV: Checking a loop in "test0" + ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 +-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4 ++; CHECK_SCALABLE_ON: LV: Selecting VF: 16 + ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF +-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 ++; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 + ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16 + ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: vscale x 16 + entry: +@@ -40,9 +40,9 @@ exit: + define void @test1(i32* %a, i8* %b) #0 { + ; CHECK: LV: Checking a loop in "test1" + ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 +-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4 ++; CHECK_SCALABLE_ON: LV: Selecting VF: 16 + ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF +-; CHECK_SCALABLE_DISABLED: 
LV: Selecting VF: 4 ++; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 + ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4 + ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 + entry: +@@ -72,9 +72,9 @@ exit: + define void @test2(i32* %a, i8* %b) #0 { + ; CHECK: LV: Checking a loop in "test2" + ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2 +-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 2 ++; CHECK_SCALABLE_ON: LV: Selecting VF: 16 + ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF +-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 ++; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 + ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2 + ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 + entry: +@@ -104,9 +104,9 @@ exit: + define void @test3(i32* %a, i8* %b) #0 { + ; CHECK: LV: Checking a loop in "test3" + ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1 +-; CHECK_SCALABLE_ON: LV: Selecting VF: 4 ++; CHECK_SCALABLE_ON: LV: Selecting VF: 16 + ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF +-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 ++; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 + ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1 + ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 + entry: +diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll +index 4d0886f4d953..43ef43c11507 100644 +--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll ++++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll +@@ -83,11 +83,11 @@ for.end: + define void @uniform_store_i1(i1* noalias %dst, i64* noalias %start, i64 %N) { + ; CHECK-LABEL: @uniform_store_i1 + ; CHECK: vector.body +-; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <2 x i64*> {{.*}}, i64 1 +-; CHECK: %[[ICMP:.*]] = icmp eq <2 x i64*> %[[GEP]], %[[SPLAT:.*]] +-; CHECK: %[[EXTRACT1:.*]] = 
extractelement <2 x i1> %[[ICMP]], i32 0 ++; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <64 x i64*> {{.*}}, i64 1 ++; CHECK: %[[ICMP:.*]] = icmp eq <64 x i64*> %[[GEP]], %[[SPLAT:.*]] ++; CHECK: %[[EXTRACT1:.*]] = extractelement <64 x i1> %[[ICMP]], i32 0 + ; CHECK: store i1 %[[EXTRACT1]], i1* %dst +-; CHECK: %[[EXTRACT2:.*]] = extractelement <2 x i1> %[[ICMP]], i32 1 ++; CHECK: %[[EXTRACT2:.*]] = extractelement <64 x i1> %[[ICMP]], i32 1 + ; CHECK: store i1 %[[EXTRACT2]], i1* %dst + ; CHECK-NOT: vscale + entry: +diff --git a/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll +new file mode 100644 +index 000000000000..4cab716c7544 +--- /dev/null ++++ b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll +@@ -0,0 +1,32 @@ ++; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s ++; REQUIRES: asserts ++ ++target triple = "x86_64" ++ ++; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume. 
++ ++; CHECK-LABEL: LV: Checking a loop in 'or_reduction_avx' from <stdin> ++; CHECK: LV(REG): VF = 64 ++; CHECK-NEXT: LV(REG): Found max usage: 2 item ++; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers ++; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers ++ ++define i1 @or_reduction_avx(i32 %arg, ptr %ptr) "target-features"="+avx" { ++entry: ++ br label %loop ++exit: ++ ret i1 %reduction_next ++loop: ++ %induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ] ++ %reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ] ++ %gep = getelementptr inbounds i32, ptr %ptr, i32 %induction ++ %loaded = load i32, ptr %gep ++ %i1 = icmp eq i32 %loaded, %induction ++ %reduction_next = or i1 %i1, %reduction ++ %induction_next = add nuw i32 %induction, 1 ++ %cond = icmp eq i32 %induction_next, %arg ++ br i1 %cond, label %exit, label %loop, !llvm.loop !64 ++} ++ ++!64 = distinct !{!64, !65} ++!65 = !{!"llvm.loop.vectorize.width", i32 64} diff --git a/patches/cherry/a8de8cab7006bc885804e8a2c0a6902702521cfe.patch b/patches/cherry/a8de8cab7006bc885804e8a2c0a6902702521cfe.patch new file mode 100644 index 0000000..99b9594 --- /dev/null +++ b/patches/cherry/a8de8cab7006bc885804e8a2c0a6902702521cfe.patch @@ -0,0 +1,1910 @@ +From a8de8cab7006bc885804e8a2c0a6902702521cfe Mon Sep 17 00:00:00 2001 +From: Cullen Rhodes <cullen.rhodes@arm.com> +Date: Fri, 22 Jul 2022 07:26:54 +0000 +Subject: [PATCH] [AArch64] Add fcmp fast math tests + +Reviewed By: paulwalker-arm + +Differential Revision: https://reviews.llvm.org/D130094 +--- + .../AArch64/neon-compare-instructions.ll | 1887 +++++++++++++++++ + 1 file changed, 1887 insertions(+) + +diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll +index bd665955eb99..dcb0ca631c5b 100644 +--- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll ++++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll +@@ -4494,3 +4494,1890 @@ 
define <2 x i64> @fcmunoz2xdouble(<2 x double> %A) { + ret <2 x i64> %tmp4 + + } ++ ++define <2 x i32> @fcmoeq2xfloat_fast(<2 x float> %A, <2 x float> %B) { ++; CHECK-LABEL: fcmoeq2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmoeq2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmeq v0.2s, v0.2s, v1.2s ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast oeq <2 x float> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmoeq4xfloat_fast(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmoeq4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmoeq4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmeq v0.4s, v0.4s, v1.4s ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast oeq <4 x float> %A, %B ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++define <2 x i64> @fcmoeq2xdouble_fast(<2 x double> %A, <2 x double> %B) { ++; CHECK-LABEL: fcmoeq2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2d, v0.2d, v1.2d ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmoeq2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmeq v0.2d, v0.2d, v1.2d ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast oeq <2 x double> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmoge2xfloat_fast(<2 x float> %A, <2 x float> %B) { ++; CHECK-LABEL: fcmoge2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v0.2s, v0.2s, v1.2s ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmoge2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.2s, v0.2s, v1.2s ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 
++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast oge <2 x float> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmoge4xfloat_fast(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmoge4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v0.4s, v0.4s, v1.4s ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmoge4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.4s, v0.4s, v1.4s ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast oge <4 x float> %A, %B ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++define <2 x i64> @fcmoge2xdouble_fast(<2 x double> %A, <2 x double> %B) { ++; CHECK-LABEL: fcmoge2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v0.2d, v0.2d, v1.2d ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmoge2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.2d, v0.2d, v1.2d ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast oge <2 x double> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmogt2xfloat_fast(<2 x float> %A, <2 x float> %B) { ++; CHECK-LABEL: fcmogt2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmgt v0.2s, v0.2s, v1.2s ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmogt2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.2s, v0.2s, v1.2s ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ogt <2 x float> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmogt4xfloat_fast(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmogt4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmgt v0.4s, v0.4s, v1.4s ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmogt4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.4s, v0.4s, v1.4s ++; GISEL-NEXT: shl 
v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ogt <4 x float> %A, %B ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++define <2 x i64> @fcmogt2xdouble_fast(<2 x double> %A, <2 x double> %B) { ++; CHECK-LABEL: fcmogt2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmgt v0.2d, v0.2d, v1.2d ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmogt2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.2d, v0.2d, v1.2d ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ogt <2 x double> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmole2xfloat_fast(<2 x float> %A, <2 x float> %B) { ++; CHECK-LABEL: fcmole2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ++; CHECK-NEXT: mov s2, v1.s[1] ++; CHECK-NEXT: mov s3, v0.s[1] ++; CHECK-NEXT: fcmp s3, s2 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: csetm w9, le ++; CHECK-NEXT: fmov s0, w9 ++; CHECK-NEXT: mov v0.s[1], w8 ++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmole2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.2s, v1.2s, v0.2s ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ole <2 x float> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmole4xfloat_fast(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmole4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov s2, v1.s[1] ++; CHECK-NEXT: mov s3, v0.s[1] ++; CHECK-NEXT: mov s4, v0.s[2] ++; CHECK-NEXT: fcmp s3, s2 ++; CHECK-NEXT: mov s3, v1.s[2] ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: mov s1, v1.s[3] ++; CHECK-NEXT: mov s0, v0.s[3] ++; 
CHECK-NEXT: csetm w9, le ++; CHECK-NEXT: fcmp s4, s3 ++; CHECK-NEXT: fmov s2, w9 ++; CHECK-NEXT: mov v2.s[1], w8 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: mov v2.s[2], w8 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: mov v2.s[3], w8 ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmole4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.4s, v1.4s, v0.4s ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ole <4 x float> %A, %B ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmole2xdouble_fast(<2 x double> %A, <2 x double> %B) { ++; CHECK-LABEL: fcmole2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d2, v1.d[1] ++; CHECK-NEXT: mov d3, v0.d[1] ++; CHECK-NEXT: fcmp d3, d2 ++; CHECK-NEXT: csetm x8, le ++; CHECK-NEXT: fcmp d0, d1 ++; CHECK-NEXT: csetm x9, le ++; CHECK-NEXT: fmov d0, x9 ++; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmole2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.2d, v1.2d, v0.2d ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ole <2 x double> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmolt2xfloat_fast(<2 x float> %A, <2 x float> %B) { ++; CHECK-LABEL: fcmolt2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ++; CHECK-NEXT: mov s2, v1.s[1] ++; CHECK-NEXT: mov s3, v0.s[1] ++; CHECK-NEXT: fcmp s3, s2 ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: csetm w9, lt ++; CHECK-NEXT: fmov s0, w9 ++; CHECK-NEXT: mov v0.s[1], w8 ++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmolt2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.2s, v1.2s, 
v0.2s ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast olt <2 x float> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmolt4xfloat_fast(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmolt4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov s2, v1.s[1] ++; CHECK-NEXT: mov s3, v0.s[1] ++; CHECK-NEXT: mov s4, v0.s[2] ++; CHECK-NEXT: fcmp s3, s2 ++; CHECK-NEXT: mov s3, v1.s[2] ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: mov s1, v1.s[3] ++; CHECK-NEXT: mov s0, v0.s[3] ++; CHECK-NEXT: csetm w9, lt ++; CHECK-NEXT: fcmp s4, s3 ++; CHECK-NEXT: fmov s2, w9 ++; CHECK-NEXT: mov v2.s[1], w8 ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: mov v2.s[2], w8 ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: mov v2.s[3], w8 ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmolt4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.4s, v1.4s, v0.4s ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast olt <4 x float> %A, %B ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmolt2xdouble_fast(<2 x double> %A, <2 x double> %B) { ++; CHECK-LABEL: fcmolt2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d2, v1.d[1] ++; CHECK-NEXT: mov d3, v0.d[1] ++; CHECK-NEXT: fcmp d3, d2 ++; CHECK-NEXT: csetm x8, lt ++; CHECK-NEXT: fcmp d0, d1 ++; CHECK-NEXT: csetm x9, lt ++; CHECK-NEXT: fmov d0, x9 ++; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmolt2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.2d, v1.2d, v0.2d ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast olt <2 x double> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x 
i32> @fcmone2xfloat_fast(<2 x float> %A, <2 x float> %B) { ++; CHECK-LABEL: fcmone2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s ++; CHECK-NEXT: mvn v0.8b, v0.8b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmone2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v2.2s, v0.2s, v1.2s ++; GISEL-NEXT: fcmgt v0.2s, v1.2s, v0.2s ++; GISEL-NEXT: orr v0.8b, v0.8b, v2.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast one <2 x float> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmone4xfloat_fast(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmone4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s ++; CHECK-NEXT: mvn v0.16b, v0.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmone4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v2.4s, v0.4s, v1.4s ++; GISEL-NEXT: fcmgt v0.4s, v1.4s, v0.4s ++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast one <4 x float> %A, %B ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmone2xdouble_fast(<2 x double> %A, <2 x double> %B) { ++; CHECK-LABEL: fcmone2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2d, v0.2d, v1.2d ++; CHECK-NEXT: mvn v0.16b, v0.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmone2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v2.2d, v0.2d, v1.2d ++; GISEL-NEXT: fcmgt v0.2d, v1.2d, v0.2d ++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast one <2 x double> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmord2xfloat_fast(<2 x float> %A, <2 x float> %B) { ++; CHECK-LABEL: fcmord2xfloat_fast: ++; 
CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v2.2s, v0.2s, v1.2s ++; CHECK-NEXT: fcmgt v0.2s, v1.2s, v0.2s ++; CHECK-NEXT: orr v0.8b, v0.8b, v2.8b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmord2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v2.2s, v0.2s, v1.2s ++; GISEL-NEXT: fcmgt v0.2s, v1.2s, v0.2s ++; GISEL-NEXT: orr v0.8b, v0.8b, v2.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ord <2 x float> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmord4xfloat_fast(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmord4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v2.4s, v0.4s, v1.4s ++; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s ++; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmord4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v2.4s, v0.4s, v1.4s ++; GISEL-NEXT: fcmgt v0.4s, v1.4s, v0.4s ++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ord <4 x float> %A, %B ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmord2xdouble_fast(<2 x double> %A, <2 x double> %B) { ++; CHECK-LABEL: fcmord2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v2.2d, v0.2d, v1.2d ++; CHECK-NEXT: fcmgt v0.2d, v1.2d, v0.2d ++; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmord2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v2.2d, v0.2d, v1.2d ++; GISEL-NEXT: fcmgt v0.2d, v1.2d, v0.2d ++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ord <2 x double> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++ ++define <2 x i32> @fcmuno2xfloat_fast(<2 x float> %A, <2 x 
float> %B) { ++; CHECK-LABEL: fcmuno2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v2.2s, v0.2s, v1.2s ++; CHECK-NEXT: fcmgt v0.2s, v1.2s, v0.2s ++; CHECK-NEXT: orr v0.8b, v0.8b, v2.8b ++; CHECK-NEXT: mvn v0.8b, v0.8b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmuno2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v2.2s, v0.2s, v1.2s ++; GISEL-NEXT: fcmgt v0.2s, v1.2s, v0.2s ++; GISEL-NEXT: orr v0.8b, v0.8b, v2.8b ++; GISEL-NEXT: mvn v0.8b, v0.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast uno <2 x float> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmuno4xfloat_fast(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmuno4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v2.4s, v0.4s, v1.4s ++; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s ++; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ++; CHECK-NEXT: mvn v0.16b, v0.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmuno4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v2.4s, v0.4s, v1.4s ++; GISEL-NEXT: fcmgt v0.4s, v1.4s, v0.4s ++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast uno <4 x float> %A, %B ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmuno2xdouble_fast(<2 x double> %A, <2 x double> %B) { ++; CHECK-LABEL: fcmuno2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v2.2d, v0.2d, v1.2d ++; CHECK-NEXT: fcmgt v0.2d, v1.2d, v0.2d ++; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ++; CHECK-NEXT: mvn v0.16b, v0.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmuno2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v2.2d, v0.2d, v1.2d ++; GISEL-NEXT: fcmgt v0.2d, v1.2d, v0.2d ++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl 
v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast uno <2 x double> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmueq2xfloat_fast(<2 x float> %A, <2 x float> %B) { ++; CHECK-LABEL: fcmueq2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmueq2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v2.2s, v0.2s, v1.2s ++; GISEL-NEXT: fcmgt v0.2s, v1.2s, v0.2s ++; GISEL-NEXT: orr v0.8b, v0.8b, v2.8b ++; GISEL-NEXT: mvn v0.8b, v0.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ueq <2 x float> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmueq4xfloat_fast(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmueq4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmueq4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v2.4s, v0.4s, v1.4s ++; GISEL-NEXT: fcmgt v0.4s, v1.4s, v0.4s ++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ueq <4 x float> %A, %B ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmueq2xdouble_fast(<2 x double> %A, <2 x double> %B) { ++; CHECK-LABEL: fcmueq2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2d, v0.2d, v1.2d ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmueq2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v2.2d, v0.2d, v1.2d ++; GISEL-NEXT: fcmgt v0.2d, v1.2d, v0.2d ++; GISEL-NEXT: orr v0.16b, v0.16b, v2.16b ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ueq 
<2 x double> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmuge2xfloat_fast(<2 x float> %A, <2 x float> %B) { ++; CHECK-LABEL: fcmuge2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v0.2s, v0.2s, v1.2s ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmuge2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.2s, v1.2s, v0.2s ++; GISEL-NEXT: mvn v0.8b, v0.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast uge <2 x float> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmuge4xfloat_fast(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmuge4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v0.4s, v0.4s, v1.4s ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmuge4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.4s, v1.4s, v0.4s ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast uge <4 x float> %A, %B ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmuge2xdouble_fast(<2 x double> %A, <2 x double> %B) { ++; CHECK-LABEL: fcmuge2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v0.2d, v0.2d, v1.2d ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmuge2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.2d, v1.2d, v0.2d ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast uge <2 x double> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmugt2xfloat_fast(<2 x float> %A, <2 x float> %B) { ++; CHECK-LABEL: fcmugt2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmgt v0.2s, v0.2s, v1.2s ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmugt2xfloat_fast: ++; GISEL: // %bb.0: ++; 
GISEL-NEXT: fcmge v0.2s, v1.2s, v0.2s ++; GISEL-NEXT: mvn v0.8b, v0.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ugt <2 x float> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmugt4xfloat_fast(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmugt4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmgt v0.4s, v0.4s, v1.4s ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmugt4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.4s, v1.4s, v0.4s ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ugt <4 x float> %A, %B ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmugt2xdouble_fast(<2 x double> %A, <2 x double> %B) { ++; CHECK-LABEL: fcmugt2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmgt v0.2d, v0.2d, v1.2d ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmugt2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.2d, v1.2d, v0.2d ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ugt <2 x double> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmule2xfloat_fast(<2 x float> %A, <2 x float> %B) { ++; CHECK-LABEL: fcmule2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ++; CHECK-NEXT: mov s2, v1.s[1] ++; CHECK-NEXT: mov s3, v0.s[1] ++; CHECK-NEXT: fcmp s3, s2 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: csetm w9, le ++; CHECK-NEXT: fmov s0, w9 ++; CHECK-NEXT: mov v0.s[1], w8 ++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmule2xfloat_fast: ++; GISEL: // %bb.0: ++; 
GISEL-NEXT: fcmgt v0.2s, v0.2s, v1.2s ++; GISEL-NEXT: mvn v0.8b, v0.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ule <2 x float> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmule4xfloat_fast(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmule4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov s2, v1.s[1] ++; CHECK-NEXT: mov s3, v0.s[1] ++; CHECK-NEXT: mov s4, v0.s[2] ++; CHECK-NEXT: fcmp s3, s2 ++; CHECK-NEXT: mov s3, v1.s[2] ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: mov s1, v1.s[3] ++; CHECK-NEXT: mov s0, v0.s[3] ++; CHECK-NEXT: csetm w9, le ++; CHECK-NEXT: fcmp s4, s3 ++; CHECK-NEXT: fmov s2, w9 ++; CHECK-NEXT: mov v2.s[1], w8 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: mov v2.s[2], w8 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: mov v2.s[3], w8 ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmule4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.4s, v0.4s, v1.4s ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ule <4 x float> %A, %B ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmule2xdouble_fast(<2 x double> %A, <2 x double> %B) { ++; CHECK-LABEL: fcmule2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d2, v1.d[1] ++; CHECK-NEXT: mov d3, v0.d[1] ++; CHECK-NEXT: fcmp d3, d2 ++; CHECK-NEXT: csetm x8, le ++; CHECK-NEXT: fcmp d0, d1 ++; CHECK-NEXT: csetm x9, le ++; CHECK-NEXT: fmov d0, x9 ++; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmule2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.2d, v0.2d, v1.2d ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret 
++ %tmp3 = fcmp fast ule <2 x double> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmult2xfloat_fast(<2 x float> %A, <2 x float> %B) { ++; CHECK-LABEL: fcmult2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ++; CHECK-NEXT: mov s2, v1.s[1] ++; CHECK-NEXT: mov s3, v0.s[1] ++; CHECK-NEXT: fcmp s3, s2 ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: csetm w9, lt ++; CHECK-NEXT: fmov s0, w9 ++; CHECK-NEXT: mov v0.s[1], w8 ++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmult2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.2s, v0.2s, v1.2s ++; GISEL-NEXT: mvn v0.8b, v0.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ult <2 x float> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmult4xfloat_fast(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmult4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov s2, v1.s[1] ++; CHECK-NEXT: mov s3, v0.s[1] ++; CHECK-NEXT: mov s4, v0.s[2] ++; CHECK-NEXT: fcmp s3, s2 ++; CHECK-NEXT: mov s3, v1.s[2] ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: mov s1, v1.s[3] ++; CHECK-NEXT: mov s0, v0.s[3] ++; CHECK-NEXT: csetm w9, lt ++; CHECK-NEXT: fcmp s4, s3 ++; CHECK-NEXT: fmov s2, w9 ++; CHECK-NEXT: mov v2.s[1], w8 ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: mov v2.s[2], w8 ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: mov v2.s[3], w8 ++; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmult4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.4s, v0.4s, v1.4s ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = 
fcmp fast ult <4 x float> %A, %B ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmult2xdouble_fast(<2 x double> %A, <2 x double> %B) { ++; CHECK-LABEL: fcmult2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d2, v1.d[1] ++; CHECK-NEXT: mov d3, v0.d[1] ++; CHECK-NEXT: fcmp d3, d2 ++; CHECK-NEXT: csetm x8, lt ++; CHECK-NEXT: fcmp d0, d1 ++; CHECK-NEXT: csetm x9, lt ++; CHECK-NEXT: fmov d0, x9 ++; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmult2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.2d, v0.2d, v1.2d ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ult <2 x double> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmune2xfloat_fast(<2 x float> %A, <2 x float> %B) { ++; CHECK-LABEL: fcmune2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s ++; CHECK-NEXT: mvn v0.8b, v0.8b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmune2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmeq v0.2s, v0.2s, v1.2s ++; GISEL-NEXT: mvn v0.8b, v0.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast une <2 x float> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmune4xfloat_fast(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmune4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s ++; CHECK-NEXT: mvn v0.16b, v0.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmune4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmeq v0.4s, v0.4s, v1.4s ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast une <4 x float> %A, %B ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x 
i32> %tmp4 ++} ++ ++define <2 x i64> @fcmune2xdouble_fast(<2 x double> %A, <2 x double> %B) { ++; CHECK-LABEL: fcmune2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2d, v0.2d, v1.2d ++; CHECK-NEXT: mvn v0.16b, v0.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmune2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmeq v0.2d, v0.2d, v1.2d ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast une <2 x double> %A, %B ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmoeqz2xfloat_fast(<2 x float> %A) { ++; CHECK-LABEL: fcmoeqz2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2s, v0.2s, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmoeqz2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmeq v0.2s, v0.2s, #0.0 ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast oeq <2 x float> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmoeqz4xfloat_fast(<4 x float> %A) { ++; CHECK-LABEL: fcmoeqz4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmoeqz4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmeq v0.4s, v0.4s, #0.0 ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast oeq <4 x float> %A, zeroinitializer ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++define <2 x i64> @fcmoeqz2xdouble_fast(<2 x double> %A) { ++; CHECK-LABEL: fcmoeqz2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmoeqz2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmeq v0.2d, v0.2d, #0.0 ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; 
GISEL-NEXT: ret ++ %tmp3 = fcmp fast oeq <2 x double> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++ ++define <2 x i32> @fcmogez2xfloat_fast(<2 x float> %A) { ++; CHECK-LABEL: fcmogez2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v0.2s, v0.2s, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmogez2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.2s, v0.2s, #0.0 ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast oge <2 x float> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmogez4xfloat_fast(<4 x float> %A) { ++; CHECK-LABEL: fcmogez4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v0.4s, v0.4s, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmogez4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.4s, v0.4s, #0.0 ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast oge <4 x float> %A, zeroinitializer ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++define <2 x i64> @fcmogez2xdouble_fast(<2 x double> %A) { ++; CHECK-LABEL: fcmogez2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v0.2d, v0.2d, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmogez2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.2d, v0.2d, #0.0 ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast oge <2 x double> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmogtz2xfloat_fast(<2 x float> %A) { ++; CHECK-LABEL: fcmogtz2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmgt v0.2s, v0.2s, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmogtz2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.2s, v0.2s, #0.0 ++; GISEL-NEXT: shl v0.2s, v0.2s, 
#31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ogt <2 x float> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmogtz4xfloat_fast(<4 x float> %A) { ++; CHECK-LABEL: fcmogtz4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmgt v0.4s, v0.4s, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmogtz4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.4s, v0.4s, #0.0 ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ogt <4 x float> %A, zeroinitializer ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++define <2 x i64> @fcmogtz2xdouble_fast(<2 x double> %A) { ++; CHECK-LABEL: fcmogtz2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmgt v0.2d, v0.2d, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmogtz2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.2d, v0.2d, #0.0 ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ogt <2 x double> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmoltz2xfloat_fast(<2 x float> %A) { ++; CHECK-LABEL: fcmoltz2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ++; CHECK-NEXT: mov s1, v0.s[1] ++; CHECK-NEXT: fcmp s1, #0.0 ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: fcmp s0, #0.0 ++; CHECK-NEXT: csetm w9, lt ++; CHECK-NEXT: fmov s0, w9 ++; CHECK-NEXT: mov v0.s[1], w8 ++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmoltz2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmlt v0.2s, v0.2s, #0.0 ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast olt <2 x float> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} 
++ ++define <4 x i32> @fcmoltz4xfloat_fast(<4 x float> %A) { ++; CHECK-LABEL: fcmoltz4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov s1, v0.s[1] ++; CHECK-NEXT: mov s2, v0.s[2] ++; CHECK-NEXT: fcmp s1, #0.0 ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: fcmp s0, #0.0 ++; CHECK-NEXT: mov s0, v0.s[3] ++; CHECK-NEXT: csetm w9, lt ++; CHECK-NEXT: fcmp s2, #0.0 ++; CHECK-NEXT: fmov s1, w9 ++; CHECK-NEXT: mov v1.s[1], w8 ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: fcmp s0, #0.0 ++; CHECK-NEXT: mov v1.s[2], w8 ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: mov v1.s[3], w8 ++; CHECK-NEXT: mov v0.16b, v1.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmoltz4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmlt v0.4s, v0.4s, #0.0 ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast olt <4 x float> %A, zeroinitializer ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmoltz2xdouble_fast(<2 x double> %A) { ++; CHECK-LABEL: fcmoltz2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d1, v0.d[1] ++; CHECK-NEXT: fcmp d1, #0.0 ++; CHECK-NEXT: csetm x8, lt ++; CHECK-NEXT: fcmp d0, #0.0 ++; CHECK-NEXT: csetm x9, lt ++; CHECK-NEXT: fmov d0, x9 ++; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmoltz2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmlt v0.2d, v0.2d, #0.0 ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast olt <2 x double> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmolez2xfloat_fast(<2 x float> %A) { ++; CHECK-LABEL: fcmolez2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ++; CHECK-NEXT: mov s1, v0.s[1] ++; CHECK-NEXT: fcmp s1, #0.0 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, #0.0 ++; CHECK-NEXT: csetm w9, le ++; CHECK-NEXT: fmov s0, w9 ++; 
CHECK-NEXT: mov v0.s[1], w8 ++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmolez2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmle v0.2s, v0.2s, #0.0 ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ole <2 x float> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmolez4xfloat_fast(<4 x float> %A) { ++; CHECK-LABEL: fcmolez4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov s1, v0.s[1] ++; CHECK-NEXT: mov s2, v0.s[2] ++; CHECK-NEXT: fcmp s1, #0.0 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, #0.0 ++; CHECK-NEXT: mov s0, v0.s[3] ++; CHECK-NEXT: csetm w9, le ++; CHECK-NEXT: fcmp s2, #0.0 ++; CHECK-NEXT: fmov s1, w9 ++; CHECK-NEXT: mov v1.s[1], w8 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, #0.0 ++; CHECK-NEXT: mov v1.s[2], w8 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: mov v1.s[3], w8 ++; CHECK-NEXT: mov v0.16b, v1.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmolez4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmle v0.4s, v0.4s, #0.0 ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ole <4 x float> %A, zeroinitializer ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmolez2xdouble_fast(<2 x double> %A) { ++; CHECK-LABEL: fcmolez2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d1, v0.d[1] ++; CHECK-NEXT: fcmp d1, #0.0 ++; CHECK-NEXT: csetm x8, le ++; CHECK-NEXT: fcmp d0, #0.0 ++; CHECK-NEXT: csetm x9, le ++; CHECK-NEXT: fmov d0, x9 ++; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmolez2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmle v0.2d, v0.2d, #0.0 ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ole <2 x double> %A, 
zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmonez2xfloat_fast(<2 x float> %A) { ++; CHECK-LABEL: fcmonez2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2s, v0.2s, #0.0 ++; CHECK-NEXT: mvn v0.8b, v0.8b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmonez2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v1.2s, v0.2s, #0.0 ++; GISEL-NEXT: fcmlt v0.2s, v0.2s, #0.0 ++; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast one <2 x float> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmonez4xfloat_fast(<4 x float> %A) { ++; CHECK-LABEL: fcmonez4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0 ++; CHECK-NEXT: mvn v0.16b, v0.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmonez4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v1.4s, v0.4s, #0.0 ++; GISEL-NEXT: fcmlt v0.4s, v0.4s, #0.0 ++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast one <4 x float> %A, zeroinitializer ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmonez2xdouble_fast(<2 x double> %A) { ++; CHECK-LABEL: fcmonez2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0 ++; CHECK-NEXT: mvn v0.16b, v0.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmonez2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v1.2d, v0.2d, #0.0 ++; GISEL-NEXT: fcmlt v0.2d, v0.2d, #0.0 ++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast one <2 x double> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> 
@fcmordz2xfloat_fast(<2 x float> %A) { ++; CHECK-LABEL: fcmordz2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v1.2s, v0.2s, #0.0 ++; CHECK-NEXT: fcmlt v0.2s, v0.2s, #0.0 ++; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmordz2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v1.2s, v0.2s, #0.0 ++; GISEL-NEXT: fcmlt v0.2s, v0.2s, #0.0 ++; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ord <2 x float> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmordz4xfloat_fast(<4 x float> %A) { ++; CHECK-LABEL: fcmordz4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v1.4s, v0.4s, #0.0 ++; CHECK-NEXT: fcmlt v0.4s, v0.4s, #0.0 ++; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmordz4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v1.4s, v0.4s, #0.0 ++; GISEL-NEXT: fcmlt v0.4s, v0.4s, #0.0 ++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ord <4 x float> %A, zeroinitializer ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmordz2xdouble_fast(<2 x double> %A) { ++; CHECK-LABEL: fcmordz2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v1.2d, v0.2d, #0.0 ++; CHECK-NEXT: fcmlt v0.2d, v0.2d, #0.0 ++; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmordz2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v1.2d, v0.2d, #0.0 ++; GISEL-NEXT: fcmlt v0.2d, v0.2d, #0.0 ++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ord <2 x double> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x 
i64> %tmp4 ++} ++ ++define <2 x i32> @fcmueqz2xfloat_fast(<2 x float> %A) { ++; CHECK-LABEL: fcmueqz2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2s, v0.2s, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmueqz2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v1.2s, v0.2s, #0.0 ++; GISEL-NEXT: fcmlt v0.2s, v0.2s, #0.0 ++; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b ++; GISEL-NEXT: mvn v0.8b, v0.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ueq <2 x float> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmueqz4xfloat_fast(<4 x float> %A) { ++; CHECK-LABEL: fcmueqz4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmueqz4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v1.4s, v0.4s, #0.0 ++; GISEL-NEXT: fcmlt v0.4s, v0.4s, #0.0 ++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ueq <4 x float> %A, zeroinitializer ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmueqz2xdouble_fast(<2 x double> %A) { ++; CHECK-LABEL: fcmueqz2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmueqz2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v1.2d, v0.2d, #0.0 ++; GISEL-NEXT: fcmlt v0.2d, v0.2d, #0.0 ++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ueq <2 x double> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmugez2xfloat_fast(<2 x float> %A) { ++; CHECK-LABEL: 
fcmugez2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v0.2s, v0.2s, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmugez2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmlt v0.2s, v0.2s, #0.0 ++; GISEL-NEXT: mvn v0.8b, v0.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast uge <2 x float> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmugez4xfloat_fast(<4 x float> %A) { ++; CHECK-LABEL: fcmugez4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v0.4s, v0.4s, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmugez4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmlt v0.4s, v0.4s, #0.0 ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast uge <4 x float> %A, zeroinitializer ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmugez2xdouble_fast(<2 x double> %A) { ++; CHECK-LABEL: fcmugez2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v0.2d, v0.2d, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmugez2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmlt v0.2d, v0.2d, #0.0 ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast uge <2 x double> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmugtz2xfloat_fast(<2 x float> %A) { ++; CHECK-LABEL: fcmugtz2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmgt v0.2s, v0.2s, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmugtz2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmle v0.2s, v0.2s, #0.0 ++; GISEL-NEXT: mvn v0.8b, v0.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ugt 
<2 x float> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmugtz4xfloat_fast(<4 x float> %A) { ++; CHECK-LABEL: fcmugtz4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmgt v0.4s, v0.4s, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmugtz4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmle v0.4s, v0.4s, #0.0 ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ugt <4 x float> %A, zeroinitializer ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmugtz2xdouble_fast(<2 x double> %A) { ++; CHECK-LABEL: fcmugtz2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmgt v0.2d, v0.2d, #0.0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmugtz2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmle v0.2d, v0.2d, #0.0 ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ugt <2 x double> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmultz2xfloat_fast(<2 x float> %A) { ++; CHECK-LABEL: fcmultz2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ++; CHECK-NEXT: mov s1, v0.s[1] ++; CHECK-NEXT: fcmp s1, #0.0 ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: fcmp s0, #0.0 ++; CHECK-NEXT: csetm w9, lt ++; CHECK-NEXT: fmov s0, w9 ++; CHECK-NEXT: mov v0.s[1], w8 ++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmultz2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.2s, v0.2s, #0.0 ++; GISEL-NEXT: mvn v0.8b, v0.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ult <2 x float> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 
x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmultz4xfloat_fast(<4 x float> %A) { ++; CHECK-LABEL: fcmultz4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov s1, v0.s[1] ++; CHECK-NEXT: mov s2, v0.s[2] ++; CHECK-NEXT: fcmp s1, #0.0 ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: fcmp s0, #0.0 ++; CHECK-NEXT: mov s0, v0.s[3] ++; CHECK-NEXT: csetm w9, lt ++; CHECK-NEXT: fcmp s2, #0.0 ++; CHECK-NEXT: fmov s1, w9 ++; CHECK-NEXT: mov v1.s[1], w8 ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: fcmp s0, #0.0 ++; CHECK-NEXT: mov v1.s[2], w8 ++; CHECK-NEXT: csetm w8, lt ++; CHECK-NEXT: mov v1.s[3], w8 ++; CHECK-NEXT: mov v0.16b, v1.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmultz4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.4s, v0.4s, #0.0 ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ult <4 x float> %A, zeroinitializer ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmultz2xdouble_fast(<2 x double> %A) { ++; CHECK-LABEL: fcmultz2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d1, v0.d[1] ++; CHECK-NEXT: fcmp d1, #0.0 ++; CHECK-NEXT: csetm x8, lt ++; CHECK-NEXT: fcmp d0, #0.0 ++; CHECK-NEXT: csetm x9, lt ++; CHECK-NEXT: fmov d0, x9 ++; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmultz2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v0.2d, v0.2d, #0.0 ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ult <2 x double> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++; ULE with zero = !OGT ++define <2 x i32> @fcmulez2xfloat_fast(<2 x float> %A) { ++; CHECK-LABEL: fcmulez2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ++; CHECK-NEXT: mov s1, v0.s[1] ++; CHECK-NEXT: fcmp s1, #0.0 ++; 
CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, #0.0 ++; CHECK-NEXT: csetm w9, le ++; CHECK-NEXT: fmov s0, w9 ++; CHECK-NEXT: mov v0.s[1], w8 ++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmulez2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.2s, v0.2s, #0.0 ++; GISEL-NEXT: mvn v0.8b, v0.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ule <2 x float> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmulez4xfloat_fast(<4 x float> %A) { ++; CHECK-LABEL: fcmulez4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov s1, v0.s[1] ++; CHECK-NEXT: mov s2, v0.s[2] ++; CHECK-NEXT: fcmp s1, #0.0 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, #0.0 ++; CHECK-NEXT: mov s0, v0.s[3] ++; CHECK-NEXT: csetm w9, le ++; CHECK-NEXT: fcmp s2, #0.0 ++; CHECK-NEXT: fmov s1, w9 ++; CHECK-NEXT: mov v1.s[1], w8 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, #0.0 ++; CHECK-NEXT: mov v1.s[2], w8 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: mov v1.s[3], w8 ++; CHECK-NEXT: mov v0.16b, v1.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmulez4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.4s, v0.4s, #0.0 ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ule <4 x float> %A, zeroinitializer ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmulez2xdouble_fast(<2 x double> %A) { ++; CHECK-LABEL: fcmulez2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d1, v0.d[1] ++; CHECK-NEXT: fcmp d1, #0.0 ++; CHECK-NEXT: csetm x8, le ++; CHECK-NEXT: fcmp d0, #0.0 ++; CHECK-NEXT: csetm x9, le ++; CHECK-NEXT: fmov d0, x9 ++; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmulez2xdouble_fast: ++; GISEL: // %bb.0: ++; 
GISEL-NEXT: fcmgt v0.2d, v0.2d, #0.0 ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ule <2 x double> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmunez2xfloat_fast(<2 x float> %A) { ++; CHECK-LABEL: fcmunez2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2s, v0.2s, #0.0 ++; CHECK-NEXT: mvn v0.8b, v0.8b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmunez2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmeq v0.2s, v0.2s, #0.0 ++; GISEL-NEXT: mvn v0.8b, v0.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast une <2 x float> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmunez4xfloat_fast(<4 x float> %A) { ++; CHECK-LABEL: fcmunez4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0 ++; CHECK-NEXT: mvn v0.16b, v0.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmunez4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmeq v0.4s, v0.4s, #0.0 ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast une <4 x float> %A, zeroinitializer ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmunez2xdouble_fast(<2 x double> %A) { ++; CHECK-LABEL: fcmunez2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0 ++; CHECK-NEXT: mvn v0.16b, v0.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmunez2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmeq v0.2d, v0.2d, #0.0 ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast une <2 x double> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 
to <2 x i64> ++ ret <2 x i64> %tmp4 ++} ++ ++define <2 x i32> @fcmunoz2xfloat_fast(<2 x float> %A) { ++; CHECK-LABEL: fcmunoz2xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v1.2s, v0.2s, #0.0 ++; CHECK-NEXT: fcmlt v0.2s, v0.2s, #0.0 ++; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ++; CHECK-NEXT: mvn v0.8b, v0.8b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmunoz2xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v1.2s, v0.2s, #0.0 ++; GISEL-NEXT: fcmlt v0.2s, v0.2s, #0.0 ++; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b ++; GISEL-NEXT: mvn v0.8b, v0.8b ++; GISEL-NEXT: shl v0.2s, v0.2s, #31 ++; GISEL-NEXT: sshr v0.2s, v0.2s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast uno <2 x float> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ++ ret <2 x i32> %tmp4 ++} ++ ++define <4 x i32> @fcmunoz4xfloat_fast(<4 x float> %A) { ++; CHECK-LABEL: fcmunoz4xfloat_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v1.4s, v0.4s, #0.0 ++; CHECK-NEXT: fcmlt v0.4s, v0.4s, #0.0 ++; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ++; CHECK-NEXT: mvn v0.16b, v0.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmunoz4xfloat_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v1.4s, v0.4s, #0.0 ++; GISEL-NEXT: fcmlt v0.4s, v0.4s, #0.0 ++; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.4s, v0.4s, #31 ++; GISEL-NEXT: sshr v0.4s, v0.4s, #31 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast uno <4 x float> %A, zeroinitializer ++ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++define <2 x i64> @fcmunoz2xdouble_fast(<2 x double> %A) { ++; CHECK-LABEL: fcmunoz2xdouble_fast: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcmge v1.2d, v0.2d, #0.0 ++; CHECK-NEXT: fcmlt v0.2d, v0.2d, #0.0 ++; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ++; CHECK-NEXT: mvn v0.16b, v0.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmunoz2xdouble_fast: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmge v1.2d, v0.2d, #0.0 ++; GISEL-NEXT: fcmlt v0.2d, v0.2d, #0.0 ++; GISEL-NEXT: orr 
v0.16b, v0.16b, v1.16b ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: shl v0.2d, v0.2d, #63 ++; GISEL-NEXT: sshr v0.2d, v0.2d, #63 ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast uno <2 x double> %A, zeroinitializer ++ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ++ ret <2 x i64> %tmp4 ++ ++} ++ ++; Test SETCC fast-math flags are propagated when combining zext(setcc). ++define <4 x i32> @fcmule4xfloat_fast_zext(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmule4xfloat_fast_zext: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov s3, v1.s[1] ++; CHECK-NEXT: mov s4, v0.s[1] ++; CHECK-NEXT: movi v2.4s, #1 ++; CHECK-NEXT: fcmp s4, s3 ++; CHECK-NEXT: mov s3, v1.s[2] ++; CHECK-NEXT: mov s4, v0.s[2] ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: mov s1, v1.s[3] ++; CHECK-NEXT: mov s0, v0.s[3] ++; CHECK-NEXT: csetm w9, le ++; CHECK-NEXT: fcmp s4, s3 ++; CHECK-NEXT: fmov s3, w9 ++; CHECK-NEXT: mov v3.s[1], w8 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: mov v3.s[2], w8 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: mov v3.s[3], w8 ++; CHECK-NEXT: and v0.16b, v3.16b, v2.16b ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmule4xfloat_fast_zext: ++; GISEL: // %bb.0: ++; GISEL-NEXT: adrp x8, .LCPI322_0 ++; GISEL-NEXT: fcmgt v0.4s, v0.4s, v1.4s ++; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI322_0] ++; GISEL-NEXT: bic v0.16b, v1.16b, v0.16b ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ule <4 x float> %A, %B ++ %tmp4 = zext <4 x i1> %tmp3 to <4 x i32> ++ ret <4 x i32> %tmp4 ++} ++ ++; Test SETCC fast-math flags are propagated when combining aext(setcc). 
++define <4 x i1> @fcmule4xfloat_fast_aext(<4 x float> %A, <4 x float> %B) { ++; CHECK-LABEL: fcmule4xfloat_fast_aext: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov s2, v1.s[1] ++; CHECK-NEXT: mov s3, v0.s[1] ++; CHECK-NEXT: fcmp s3, s2 ++; CHECK-NEXT: mov s2, v1.s[2] ++; CHECK-NEXT: mov s3, v0.s[2] ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: mov s1, v1.s[3] ++; CHECK-NEXT: mov s0, v0.s[3] ++; CHECK-NEXT: csetm w9, le ++; CHECK-NEXT: fcmp s3, s2 ++; CHECK-NEXT: fmov s4, w9 ++; CHECK-NEXT: mov v4.s[1], w8 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: fcmp s0, s1 ++; CHECK-NEXT: mov v4.s[2], w8 ++; CHECK-NEXT: csetm w8, le ++; CHECK-NEXT: mov v4.s[3], w8 ++; CHECK-NEXT: xtn v0.4h, v4.4s ++; CHECK-NEXT: ret ++; ++; GISEL-LABEL: fcmule4xfloat_fast_aext: ++; GISEL: // %bb.0: ++; GISEL-NEXT: fcmgt v0.4s, v0.4s, v1.4s ++; GISEL-NEXT: mvn v0.16b, v0.16b ++; GISEL-NEXT: xtn v0.4h, v0.4s ++; GISEL-NEXT: ret ++ %tmp3 = fcmp fast ule <4 x float> %A, %B ++ ret <4 x i1> %tmp3 ++} +-- +2.34.1 + diff --git a/patches/cherry/a9a012086a917dff367bb63de2d63782b23111fc.patch b/patches/cherry/a9a012086a917dff367bb63de2d63782b23111fc.patch new file mode 100644 index 0000000..c6abbb2 --- /dev/null +++ b/patches/cherry/a9a012086a917dff367bb63de2d63782b23111fc.patch @@ -0,0 +1,72 @@ +From a9a012086a917dff367bb63de2d63782b23111fc Mon Sep 17 00:00:00 2001 +From: Florian Hahn <flo@fhahn.com> +Date: Thu, 26 May 2022 10:35:38 +0100 +Subject: [PATCH] [AArch64] Add additional tests for sinking free shuffles for + FMAs. 
+ +--- + .../AArch64/sink-free-instructions.ll | 41 ++++++++++++++++++- + 1 file changed, 39 insertions(+), 2 deletions(-) + +diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll +index 244d2c35bbac..5d7a26f65784 100644 +--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll ++++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll +@@ -585,6 +585,43 @@ if.else: + ret <4 x float> %r.3 + } + ++define <4 x float> @sink_shufflevector_first_arg_fma_v4f3(i1 %c, <8 x float> %a, <4 x float> %b) { ++; CHECK-LABEL: @sink_shufflevector_first_arg_fma_v4f3( ++; CHECK-NEXT: entry: ++; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> zeroinitializer ++; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ++; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> ++; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ++; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ++; CHECK: if.then: ++; CHECK-NEXT: [[R_0:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S0]], <4 x float> [[B:%.*]], <4 x float> [[B]]) ++; CHECK-NEXT: [[R_1:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S1]], <4 x float> [[R_0]], <4 x float> [[B]]) ++; CHECK-NEXT: ret <4 x float> [[R_1]] ++; CHECK: if.else: ++; CHECK-NEXT: [[R_2:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S2]], <4 x float> [[B]], <4 x float> [[B]]) ++; CHECK-NEXT: [[R_3:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[S3]], <4 x float> [[R_2]], <4 x float> [[B]]) ++; CHECK-NEXT: ret <4 x float> [[R_3]] ++; ++entry: ++ %s0 = shufflevector <8 x float> %a, <8 x float> poison, <4 x 
i32> zeroinitializer ++ %s1 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ++ %s2 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> ++ %s3 = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ++ br i1 %c, label %if.then, label %if.else ++ ++if.then: ++ %r.0 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %s0, <4 x float> %b, <4 x float> %b) ++ %r.1 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %s1, <4 x float> %r.0, <4 x float> %b) ++ ret <4 x float> %r.1 ++ ++if.else: ++ %r.2 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %s2, <4 x float> %b, <4 x float> %b) ++ %r.3 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %s3, <4 x float> %r.2, <4 x float> %b) ++ ret <4 x float> %r.3 ++} ++ ++ ++ + declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) + + define <2 x double> @sink_shufflevector_fma_v2f64(i1 %c, <2 x double> %a, <2 x double> %b) { +@@ -639,8 +676,8 @@ if.else: + + declare <5 x float> @llvm.fma.v5f32(<5 x float>, <5 x float>, <5 x float>) + +-define <5 x float> @do_not_sink_shufflevector_fma_v5f32(i1 %c, <8 x float> %a, <5 x float> %b) { +-; CHECK-LABEL: @do_not_sink_shufflevector_fma_v5f32( ++define <5 x float> @sink_shufflevector_fma_v5f32(i1 %c, <8 x float> %a, <5 x float> %b) { ++; CHECK-LABEL: @sink_shufflevector_fma_v5f32( + ; CHECK-NEXT: entry: + ; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <5 x i32> zeroinitializer + ; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <5 x i32> <i32 1, i32 1, i32 1, i32 1, i32 4> +-- +2.34.1 + diff --git a/patches/cherry/bb362d890f0d51c250818711d4a9b0b51cea7bc6.patch b/patches/cherry/bb362d890f0d51c250818711d4a9b0b51cea7bc6.patch new file mode 100644 index 0000000..e7d8543 --- /dev/null +++ b/patches/cherry/bb362d890f0d51c250818711d4a9b0b51cea7bc6.patch @@ -0,0 
+1,1507 @@ +From bb362d890f0d51c250818711d4a9b0b51cea7bc6 Mon Sep 17 00:00:00 2001 +From: David Green <david.green@arm.com> +Date: Thu, 10 Feb 2022 21:04:41 +0000 +Subject: [PATCH] [AArch64] Add extra fptoint_sat tests for larger than legal + types. NFC + +--- + .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 821 ++++++++++++++++++ + .../test/CodeGen/AArch64/fptoui-sat-vector.ll | 656 ++++++++++++++ + 2 files changed, 1477 insertions(+) + +diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +index 55e018783f04..3625bd6011fb 100644 +--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll ++++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +@@ -2976,3 +2976,824 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { + } + + ++declare <8 x i8> @llvm.fptosi.sat.v8f32.v8i8(<8 x float> %f) ++declare <8 x i16> @llvm.fptosi.sat.v8f32.v8i16(<8 x float> %f) ++declare <16 x i8> @llvm.fptosi.sat.v16f32.v16i8(<16 x float> %f) ++declare <16 x i16> @llvm.fptosi.sat.v16f32.v16i16(<16 x float> %f) ++ ++declare <16 x i8> @llvm.fptosi.sat.v16f16.v16i8(<16 x half> %f) ++declare <16 x i16> @llvm.fptosi.sat.v16f16.v16i16(<16 x half> %f) ++ ++declare <8 x i8> @llvm.fptosi.sat.v8f64.v8i8(<8 x double> %f) ++declare <8 x i16> @llvm.fptosi.sat.v8f64.v8i16(<8 x double> %f) ++declare <16 x i8> @llvm.fptosi.sat.v16f64.v16i8(<16 x double> %f) ++declare <16 x i16> @llvm.fptosi.sat.v16f64.v16i16(<16 x double> %f) ++ ++define <8 x i8> @test_signed_v8f32_v8i8(<8 x float> %f) { ++; CHECK-LABEL: test_signed_v8f32_v8i8: ++; CHECK: // %bb.0: ++; CHECK-NEXT: movi v2.4s, #127 ++; CHECK-NEXT: fcvtzs v1.4s, v1.4s ++; CHECK-NEXT: fcvtzs v0.4s, v0.4s ++; CHECK-NEXT: mvni v3.4s, #127 ++; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s ++; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s ++; CHECK-NEXT: smax v1.4s, v1.4s, v3.4s ++; CHECK-NEXT: smax v0.4s, v0.4s, v3.4s ++; CHECK-NEXT: xtn v1.4h, v1.4s ++; CHECK-NEXT: xtn v0.4h, v0.4s ++; CHECK-NEXT: uzp1 v0.8b, v0.8b, 
v1.8b ++; CHECK-NEXT: ret ++ %x = call <8 x i8> @llvm.fptosi.sat.v8f32.v8i8(<8 x float> %f) ++ ret <8 x i8> %x ++} ++ ++define <16 x i8> @test_signed_v16f32_v16i8(<16 x float> %f) { ++; CHECK-LABEL: test_signed_v16f32_v16i8: ++; CHECK: // %bb.0: ++; CHECK-NEXT: movi v4.4s, #127 ++; CHECK-NEXT: fcvtzs v0.4s, v0.4s ++; CHECK-NEXT: mvni v5.4s, #127 ++; CHECK-NEXT: fcvtzs v1.4s, v1.4s ++; CHECK-NEXT: fcvtzs v2.4s, v2.4s ++; CHECK-NEXT: smin v0.4s, v0.4s, v4.4s ++; CHECK-NEXT: smin v1.4s, v1.4s, v4.4s ++; CHECK-NEXT: smin v2.4s, v2.4s, v4.4s ++; CHECK-NEXT: smax v0.4s, v0.4s, v5.4s ++; CHECK-NEXT: smax v1.4s, v1.4s, v5.4s ++; CHECK-NEXT: smax v2.4s, v2.4s, v5.4s ++; CHECK-NEXT: xtn v6.4h, v0.4s ++; CHECK-NEXT: umov w8, v6.h[0] ++; CHECK-NEXT: umov w9, v6.h[1] ++; CHECK-NEXT: xtn v1.4h, v1.4s ++; CHECK-NEXT: fmov s0, w8 ++; CHECK-NEXT: umov w8, v6.h[2] ++; CHECK-NEXT: mov v0.b[1], w9 ++; CHECK-NEXT: mov v0.b[2], w8 ++; CHECK-NEXT: umov w8, v6.h[3] ++; CHECK-NEXT: mov v0.b[3], w8 ++; CHECK-NEXT: umov w8, v1.h[0] ++; CHECK-NEXT: mov v0.b[4], w8 ++; CHECK-NEXT: umov w8, v1.h[1] ++; CHECK-NEXT: mov v0.b[5], w8 ++; CHECK-NEXT: umov w8, v1.h[2] ++; CHECK-NEXT: mov v0.b[6], w8 ++; CHECK-NEXT: umov w8, v1.h[3] ++; CHECK-NEXT: xtn v1.4h, v2.4s ++; CHECK-NEXT: fcvtzs v2.4s, v3.4s ++; CHECK-NEXT: mov v0.b[7], w8 ++; CHECK-NEXT: umov w8, v1.h[0] ++; CHECK-NEXT: smin v2.4s, v2.4s, v4.4s ++; CHECK-NEXT: mov v0.b[8], w8 ++; CHECK-NEXT: umov w8, v1.h[1] ++; CHECK-NEXT: smax v2.4s, v2.4s, v5.4s ++; CHECK-NEXT: mov v0.b[9], w8 ++; CHECK-NEXT: umov w8, v1.h[2] ++; CHECK-NEXT: mov v0.b[10], w8 ++; CHECK-NEXT: umov w8, v1.h[3] ++; CHECK-NEXT: xtn v1.4h, v2.4s ++; CHECK-NEXT: mov v0.b[11], w8 ++; CHECK-NEXT: umov w8, v1.h[0] ++; CHECK-NEXT: mov v0.b[12], w8 ++; CHECK-NEXT: umov w8, v1.h[1] ++; CHECK-NEXT: mov v0.b[13], w8 ++; CHECK-NEXT: umov w8, v1.h[2] ++; CHECK-NEXT: mov v0.b[14], w8 ++; CHECK-NEXT: umov w8, v1.h[3] ++; CHECK-NEXT: mov v0.b[15], w8 ++; CHECK-NEXT: ret ++ %x = call <16 x 
i8> @llvm.fptosi.sat.v16f32.v16i8(<16 x float> %f) ++ ret <16 x i8> %x ++} ++ ++define <8 x i16> @test_signed_v8f32_v8i16(<8 x float> %f) { ++; CHECK-LABEL: test_signed_v8f32_v8i16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcvtzs v0.4s, v0.4s ++; CHECK-NEXT: fcvtzs v1.4s, v1.4s ++; CHECK-NEXT: sqxtn v0.4h, v0.4s ++; CHECK-NEXT: sqxtn2 v0.8h, v1.4s ++; CHECK-NEXT: ret ++ %x = call <8 x i16> @llvm.fptosi.sat.v8f32.v8i16(<8 x float> %f) ++ ret <8 x i16> %x ++} ++ ++define <16 x i16> @test_signed_v16f32_v16i16(<16 x float> %f) { ++; CHECK-LABEL: test_signed_v16f32_v16i16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: fcvtzs v0.4s, v0.4s ++; CHECK-NEXT: fcvtzs v2.4s, v2.4s ++; CHECK-NEXT: fcvtzs v4.4s, v1.4s ++; CHECK-NEXT: fcvtzs v3.4s, v3.4s ++; CHECK-NEXT: sqxtn v0.4h, v0.4s ++; CHECK-NEXT: sqxtn v1.4h, v2.4s ++; CHECK-NEXT: sqxtn2 v0.8h, v4.4s ++; CHECK-NEXT: sqxtn2 v1.8h, v3.4s ++; CHECK-NEXT: ret ++ %x = call <16 x i16> @llvm.fptosi.sat.v16f32.v16i16(<16 x float> %f) ++ ret <16 x i16> %x ++} ++ ++ ++ ++define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) { ++; CHECK-CVT-LABEL: test_signed_v16f16_v16i8: ++; CHECK-CVT: // %bb.0: ++; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h ++; CHECK-CVT-NEXT: mov w8, #127 ++; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h ++; CHECK-CVT-NEXT: mov w9, #-128 ++; CHECK-CVT-NEXT: mov s3, v2.s[1] ++; CHECK-CVT-NEXT: fcvtzs w11, s2 ++; CHECK-CVT-NEXT: fcvtzs w10, s3 ++; CHECK-CVT-NEXT: mov s3, v2.s[2] ++; CHECK-CVT-NEXT: mov s2, v2.s[3] ++; CHECK-CVT-NEXT: cmp w10, #127 ++; CHECK-CVT-NEXT: csel w10, w10, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w12, s3 ++; CHECK-CVT-NEXT: cmn w10, #128 ++; CHECK-CVT-NEXT: mov s3, v1.s[1] ++; CHECK-CVT-NEXT: csel w10, w10, w9, gt ++; CHECK-CVT-NEXT: cmp w11, #127 ++; CHECK-CVT-NEXT: csel w11, w11, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w14, s2 ++; CHECK-CVT-NEXT: cmn w11, #128 ++; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h ++; CHECK-CVT-NEXT: csel w11, w11, w9, gt ++; CHECK-CVT-NEXT: cmp w12, #127 ++; CHECK-CVT-NEXT: csel w12, w12, w8, lt ++; 
CHECK-CVT-NEXT: fcvtzs w15, s3 ++; CHECK-CVT-NEXT: cmn w12, #128 ++; CHECK-CVT-NEXT: mov s3, v1.s[2] ++; CHECK-CVT-NEXT: csel w13, w12, w9, gt ++; CHECK-CVT-NEXT: cmp w14, #127 ++; CHECK-CVT-NEXT: csel w12, w14, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w14, s1 ++; CHECK-CVT-NEXT: cmn w12, #128 ++; CHECK-CVT-NEXT: mov s1, v1.s[3] ++; CHECK-CVT-NEXT: csel w12, w12, w9, gt ++; CHECK-CVT-NEXT: cmp w15, #127 ++; CHECK-CVT-NEXT: csel w15, w15, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w16, s3 ++; CHECK-CVT-NEXT: cmn w15, #128 ++; CHECK-CVT-NEXT: mov s3, v2.s[1] ++; CHECK-CVT-NEXT: csel w15, w15, w9, gt ++; CHECK-CVT-NEXT: cmp w14, #127 ++; CHECK-CVT-NEXT: csel w14, w14, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w17, s1 ++; CHECK-CVT-NEXT: cmn w14, #128 ++; CHECK-CVT-NEXT: mov s1, v2.s[2] ++; CHECK-CVT-NEXT: csel w14, w14, w9, gt ++; CHECK-CVT-NEXT: cmp w16, #127 ++; CHECK-CVT-NEXT: csel w16, w16, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w18, s3 ++; CHECK-CVT-NEXT: cmn w16, #128 ++; CHECK-CVT-NEXT: fcvtzs w0, s2 ++; CHECK-CVT-NEXT: csel w16, w16, w9, gt ++; CHECK-CVT-NEXT: cmp w17, #127 ++; CHECK-CVT-NEXT: csel w17, w17, w8, lt ++; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h ++; CHECK-CVT-NEXT: cmn w17, #128 ++; CHECK-CVT-NEXT: mov s2, v2.s[3] ++; CHECK-CVT-NEXT: csel w17, w17, w9, gt ++; CHECK-CVT-NEXT: cmp w18, #127 ++; CHECK-CVT-NEXT: csel w18, w18, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w1, s1 ++; CHECK-CVT-NEXT: cmn w18, #128 ++; CHECK-CVT-NEXT: mov s1, v0.s[1] ++; CHECK-CVT-NEXT: csel w18, w18, w9, gt ++; CHECK-CVT-NEXT: cmp w0, #127 ++; CHECK-CVT-NEXT: csel w0, w0, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w2, s2 ++; CHECK-CVT-NEXT: cmn w0, #128 ++; CHECK-CVT-NEXT: fcvtzs w4, s0 ++; CHECK-CVT-NEXT: csel w0, w0, w9, gt ++; CHECK-CVT-NEXT: cmp w1, #127 ++; CHECK-CVT-NEXT: csel w1, w1, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w3, s1 ++; CHECK-CVT-NEXT: cmn w1, #128 ++; CHECK-CVT-NEXT: mov s1, v0.s[2] ++; CHECK-CVT-NEXT: csel w1, w1, w9, gt ++; CHECK-CVT-NEXT: cmp w2, #127 ++; CHECK-CVT-NEXT: csel w2, w2, w8, lt ++; 
CHECK-CVT-NEXT: fmov s2, w11 ++; CHECK-CVT-NEXT: cmn w2, #128 ++; CHECK-CVT-NEXT: fmov s3, w14 ++; CHECK-CVT-NEXT: csel w2, w2, w9, gt ++; CHECK-CVT-NEXT: cmp w3, #127 ++; CHECK-CVT-NEXT: csel w3, w3, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w14, s1 ++; CHECK-CVT-NEXT: cmn w3, #128 ++; CHECK-CVT-NEXT: mov s0, v0.s[3] ++; CHECK-CVT-NEXT: csel w3, w3, w9, gt ++; CHECK-CVT-NEXT: cmp w4, #127 ++; CHECK-CVT-NEXT: csel w11, w4, w8, lt ++; CHECK-CVT-NEXT: fmov s4, w0 ++; CHECK-CVT-NEXT: cmn w11, #128 ++; CHECK-CVT-NEXT: csel w11, w11, w9, gt ++; CHECK-CVT-NEXT: cmp w14, #127 ++; CHECK-CVT-NEXT: mov v2.s[1], w10 ++; CHECK-CVT-NEXT: csel w10, w14, w8, lt ++; CHECK-CVT-NEXT: mov v3.s[1], w15 ++; CHECK-CVT-NEXT: cmn w10, #128 ++; CHECK-CVT-NEXT: fmov s1, w11 ++; CHECK-CVT-NEXT: csel w10, w10, w9, gt ++; CHECK-CVT-NEXT: fcvtzs w11, s0 ++; CHECK-CVT-NEXT: mov v4.s[1], w18 ++; CHECK-CVT-NEXT: mov v1.s[1], w3 ++; CHECK-CVT-NEXT: cmp w11, #127 ++; CHECK-CVT-NEXT: csel w8, w11, w8, lt ++; CHECK-CVT-NEXT: mov v2.s[2], w13 ++; CHECK-CVT-NEXT: cmn w8, #128 ++; CHECK-CVT-NEXT: mov v3.s[2], w16 ++; CHECK-CVT-NEXT: csel w8, w8, w9, gt ++; CHECK-CVT-NEXT: mov v4.s[2], w1 ++; CHECK-CVT-NEXT: mov v1.s[2], w10 ++; CHECK-CVT-NEXT: mov v2.s[3], w12 ++; CHECK-CVT-NEXT: mov v3.s[3], w17 ++; CHECK-CVT-NEXT: mov v4.s[3], w2 ++; CHECK-CVT-NEXT: mov v1.s[3], w8 ++; CHECK-CVT-NEXT: uzp1 v0.8h, v3.8h, v2.8h ++; CHECK-CVT-NEXT: uzp1 v1.8h, v1.8h, v4.8h ++; CHECK-CVT-NEXT: uzp1 v0.16b, v1.16b, v0.16b ++; CHECK-CVT-NEXT: ret ++; ++; CHECK-FP16-LABEL: test_signed_v16f16_v16i8: ++; CHECK-FP16: // %bb.0: ++; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h ++; CHECK-FP16-NEXT: fcvtzs v1.8h, v1.8h ++; CHECK-FP16-NEXT: sqxtn v0.8b, v0.8h ++; CHECK-FP16-NEXT: sqxtn2 v0.16b, v1.8h ++; CHECK-FP16-NEXT: ret ++ %x = call <16 x i8> @llvm.fptosi.sat.v16f16.v16i8(<16 x half> %f) ++ ret <16 x i8> %x ++} ++ ++define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { ++; CHECK-CVT-LABEL: test_signed_v16f16_v16i16: ++; CHECK-CVT: // 
%bb.0: ++; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h ++; CHECK-CVT-NEXT: mov w8, #32767 ++; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h ++; CHECK-CVT-NEXT: mov w9, #-32768 ++; CHECK-CVT-NEXT: mov s3, v2.s[1] ++; CHECK-CVT-NEXT: fcvtzs w11, s2 ++; CHECK-CVT-NEXT: fcvtzs w10, s3 ++; CHECK-CVT-NEXT: mov s3, v2.s[2] ++; CHECK-CVT-NEXT: mov s2, v2.s[3] ++; CHECK-CVT-NEXT: cmp w10, w8 ++; CHECK-CVT-NEXT: csel w10, w10, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w12, s3 ++; CHECK-CVT-NEXT: cmn w10, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: mov s3, v0.s[1] ++; CHECK-CVT-NEXT: csel w10, w10, w9, gt ++; CHECK-CVT-NEXT: cmp w11, w8 ++; CHECK-CVT-NEXT: csel w11, w11, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w14, s2 ++; CHECK-CVT-NEXT: cmn w11, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h ++; CHECK-CVT-NEXT: csel w11, w11, w9, gt ++; CHECK-CVT-NEXT: cmp w12, w8 ++; CHECK-CVT-NEXT: csel w12, w12, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w15, s3 ++; CHECK-CVT-NEXT: cmn w12, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: mov s3, v0.s[2] ++; CHECK-CVT-NEXT: csel w13, w12, w9, gt ++; CHECK-CVT-NEXT: cmp w14, w8 ++; CHECK-CVT-NEXT: csel w12, w14, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w14, s0 ++; CHECK-CVT-NEXT: cmn w12, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: mov s0, v0.s[3] ++; CHECK-CVT-NEXT: csel w12, w12, w9, gt ++; CHECK-CVT-NEXT: cmp w15, w8 ++; CHECK-CVT-NEXT: csel w15, w15, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w16, s3 ++; CHECK-CVT-NEXT: cmn w15, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: mov s3, v2.s[1] ++; CHECK-CVT-NEXT: csel w15, w15, w9, gt ++; CHECK-CVT-NEXT: cmp w14, w8 ++; CHECK-CVT-NEXT: csel w14, w14, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w17, s0 ++; CHECK-CVT-NEXT: cmn w14, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: fcvtl v0.4s, v1.4h ++; CHECK-CVT-NEXT: csel w14, w14, w9, gt ++; CHECK-CVT-NEXT: cmp w16, w8 ++; CHECK-CVT-NEXT: csel w16, w16, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w18, s3 ++; CHECK-CVT-NEXT: cmn w16, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: mov s1, v2.s[2] ++; CHECK-CVT-NEXT: csel w16, 
w16, w9, gt ++; CHECK-CVT-NEXT: cmp w17, w8 ++; CHECK-CVT-NEXT: csel w17, w17, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w0, s2 ++; CHECK-CVT-NEXT: cmn w17, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: mov s2, v2.s[3] ++; CHECK-CVT-NEXT: csel w17, w17, w9, gt ++; CHECK-CVT-NEXT: cmp w18, w8 ++; CHECK-CVT-NEXT: csel w18, w18, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w1, s1 ++; CHECK-CVT-NEXT: cmn w18, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: mov s1, v0.s[1] ++; CHECK-CVT-NEXT: csel w18, w18, w9, gt ++; CHECK-CVT-NEXT: cmp w0, w8 ++; CHECK-CVT-NEXT: csel w0, w0, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w2, s2 ++; CHECK-CVT-NEXT: cmn w0, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: fcvtzs w4, s0 ++; CHECK-CVT-NEXT: csel w0, w0, w9, gt ++; CHECK-CVT-NEXT: cmp w1, w8 ++; CHECK-CVT-NEXT: csel w1, w1, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w3, s1 ++; CHECK-CVT-NEXT: cmn w1, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: mov s1, v0.s[2] ++; CHECK-CVT-NEXT: csel w1, w1, w9, gt ++; CHECK-CVT-NEXT: cmp w2, w8 ++; CHECK-CVT-NEXT: csel w2, w2, w8, lt ++; CHECK-CVT-NEXT: fmov s2, w11 ++; CHECK-CVT-NEXT: cmn w2, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: fmov s3, w14 ++; CHECK-CVT-NEXT: csel w2, w2, w9, gt ++; CHECK-CVT-NEXT: cmp w3, w8 ++; CHECK-CVT-NEXT: csel w3, w3, w8, lt ++; CHECK-CVT-NEXT: fcvtzs w14, s1 ++; CHECK-CVT-NEXT: cmn w3, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: mov s0, v0.s[3] ++; CHECK-CVT-NEXT: csel w3, w3, w9, gt ++; CHECK-CVT-NEXT: cmp w4, w8 ++; CHECK-CVT-NEXT: csel w11, w4, w8, lt ++; CHECK-CVT-NEXT: fmov s4, w0 ++; CHECK-CVT-NEXT: cmn w11, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: csel w11, w11, w9, gt ++; CHECK-CVT-NEXT: cmp w14, w8 ++; CHECK-CVT-NEXT: mov v2.s[1], w10 ++; CHECK-CVT-NEXT: csel w10, w14, w8, lt ++; CHECK-CVT-NEXT: mov v3.s[1], w15 ++; CHECK-CVT-NEXT: cmn w10, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: fmov s1, w11 ++; CHECK-CVT-NEXT: csel w10, w10, w9, gt ++; CHECK-CVT-NEXT: fcvtzs w11, s0 ++; CHECK-CVT-NEXT: mov v4.s[1], w18 ++; CHECK-CVT-NEXT: mov v1.s[1], w3 ++; 
CHECK-CVT-NEXT: cmp w11, w8 ++; CHECK-CVT-NEXT: csel w8, w11, w8, lt ++; CHECK-CVT-NEXT: mov v2.s[2], w13 ++; CHECK-CVT-NEXT: cmn w8, #8, lsl #12 // =32768 ++; CHECK-CVT-NEXT: mov v3.s[2], w16 ++; CHECK-CVT-NEXT: csel w8, w8, w9, gt ++; CHECK-CVT-NEXT: mov v4.s[2], w1 ++; CHECK-CVT-NEXT: mov v1.s[2], w10 ++; CHECK-CVT-NEXT: mov v2.s[3], w12 ++; CHECK-CVT-NEXT: mov v3.s[3], w17 ++; CHECK-CVT-NEXT: mov v4.s[3], w2 ++; CHECK-CVT-NEXT: mov v1.s[3], w8 ++; CHECK-CVT-NEXT: uzp1 v0.8h, v3.8h, v2.8h ++; CHECK-CVT-NEXT: uzp1 v1.8h, v1.8h, v4.8h ++; CHECK-CVT-NEXT: ret ++; ++; CHECK-FP16-LABEL: test_signed_v16f16_v16i16: ++; CHECK-FP16: // %bb.0: ++; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h ++; CHECK-FP16-NEXT: fcvtzs v1.8h, v1.8h ++; CHECK-FP16-NEXT: ret ++ %x = call <16 x i16> @llvm.fptosi.sat.v16f16.v16i16(<16 x half> %f) ++ ret <16 x i16> %x ++} ++ ++define <8 x i8> @test_signed_v8f64_v8i8(<8 x double> %f) { ++; CHECK-LABEL: test_signed_v8f64_v8i8: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d4, v0.d[1] ++; CHECK-NEXT: mov w8, #127 ++; CHECK-NEXT: fcvtzs w11, d0 ++; CHECK-NEXT: mov w9, #-128 ++; CHECK-NEXT: mov d0, v2.d[1] ++; CHECK-NEXT: fcvtzs w13, d1 ++; CHECK-NEXT: fcvtzs w15, d3 ++; CHECK-NEXT: fcvtzs w10, d4 ++; CHECK-NEXT: mov d4, v1.d[1] ++; CHECK-NEXT: mov d1, v3.d[1] ++; CHECK-NEXT: fcvtzs w14, d0 ++; CHECK-NEXT: cmp w10, #127 ++; CHECK-NEXT: csel w10, w10, w8, lt ++; CHECK-NEXT: fcvtzs w12, d4 ++; CHECK-NEXT: cmn w10, #128 ++; CHECK-NEXT: csel w10, w10, w9, gt ++; CHECK-NEXT: cmp w11, #127 ++; CHECK-NEXT: csel w11, w11, w8, lt ++; CHECK-NEXT: cmn w11, #128 ++; CHECK-NEXT: csel w11, w11, w9, gt ++; CHECK-NEXT: cmp w12, #127 ++; CHECK-NEXT: csel w12, w12, w8, lt ++; CHECK-NEXT: cmn w12, #128 ++; CHECK-NEXT: csel w12, w12, w9, gt ++; CHECK-NEXT: cmp w13, #127 ++; CHECK-NEXT: fmov s0, w11 ++; CHECK-NEXT: csel w11, w13, w8, lt ++; CHECK-NEXT: cmn w11, #128 ++; CHECK-NEXT: fcvtzs w13, d2 ++; CHECK-NEXT: csel w11, w11, w9, gt ++; CHECK-NEXT: cmp w14, #127 ++; CHECK-NEXT: 
mov v0.s[1], w10 ++; CHECK-NEXT: csel w10, w14, w8, lt ++; CHECK-NEXT: cmn w10, #128 ++; CHECK-NEXT: fmov s2, w11 ++; CHECK-NEXT: csel w10, w10, w9, gt ++; CHECK-NEXT: cmp w13, #127 ++; CHECK-NEXT: mov w11, v0.s[1] ++; CHECK-NEXT: csel w13, w13, w8, lt ++; CHECK-NEXT: mov v2.s[1], w12 ++; CHECK-NEXT: cmn w13, #128 ++; CHECK-NEXT: fcvtzs w12, d1 ++; CHECK-NEXT: csel w13, w13, w9, gt ++; CHECK-NEXT: mov v0.b[1], w11 ++; CHECK-NEXT: fmov w14, s2 ++; CHECK-NEXT: cmp w12, #127 ++; CHECK-NEXT: fmov s1, w13 ++; CHECK-NEXT: csel w12, w12, w8, lt ++; CHECK-NEXT: cmn w12, #128 ++; CHECK-NEXT: mov w11, v2.s[1] ++; CHECK-NEXT: mov v0.b[2], w14 ++; CHECK-NEXT: csel w12, w12, w9, gt ++; CHECK-NEXT: cmp w15, #127 ++; CHECK-NEXT: mov v1.s[1], w10 ++; CHECK-NEXT: csel w8, w15, w8, lt ++; CHECK-NEXT: cmn w8, #128 ++; CHECK-NEXT: csel w8, w8, w9, gt ++; CHECK-NEXT: mov v0.b[3], w11 ++; CHECK-NEXT: fmov w9, s1 ++; CHECK-NEXT: fmov s2, w8 ++; CHECK-NEXT: mov w8, v1.s[1] ++; CHECK-NEXT: mov v0.b[4], w9 ++; CHECK-NEXT: mov v2.s[1], w12 ++; CHECK-NEXT: mov v0.b[5], w8 ++; CHECK-NEXT: fmov w8, s2 ++; CHECK-NEXT: mov w9, v2.s[1] ++; CHECK-NEXT: mov v0.b[6], w8 ++; CHECK-NEXT: mov v0.b[7], w9 ++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: ret ++ %x = call <8 x i8> @llvm.fptosi.sat.v8f64.v8i8(<8 x double> %f) ++ ret <8 x i8> %x ++} ++ ++define <16 x i8> @test_signed_v16f64_v16i8(<16 x double> %f) { ++; CHECK-LABEL: test_signed_v16f64_v16i8: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d16, v0.d[1] ++; CHECK-NEXT: mov w8, #127 ++; CHECK-NEXT: fcvtzs w11, d0 ++; CHECK-NEXT: mov w9, #-128 ++; CHECK-NEXT: fcvtzs w13, d1 ++; CHECK-NEXT: mov d0, v2.d[1] ++; CHECK-NEXT: fcvtzs w14, d2 ++; CHECK-NEXT: fcvtzs w10, d16 ++; CHECK-NEXT: mov d16, v1.d[1] ++; CHECK-NEXT: mov d1, v3.d[1] ++; CHECK-NEXT: fcvtzs w15, d0 ++; CHECK-NEXT: cmp w10, #127 ++; CHECK-NEXT: csel w10, w10, w8, lt ++; CHECK-NEXT: fcvtzs w12, d16 ++; CHECK-NEXT: cmn w10, #128 ++; CHECK-NEXT: csel w10, w10, w9, gt ++; 
CHECK-NEXT: cmp w11, #127 ++; CHECK-NEXT: csel w11, w11, w8, lt ++; CHECK-NEXT: cmn w11, #128 ++; CHECK-NEXT: csel w11, w11, w9, gt ++; CHECK-NEXT: cmp w12, #127 ++; CHECK-NEXT: csel w12, w12, w8, lt ++; CHECK-NEXT: cmn w12, #128 ++; CHECK-NEXT: csel w12, w12, w9, gt ++; CHECK-NEXT: cmp w13, #127 ++; CHECK-NEXT: csel w13, w13, w8, lt ++; CHECK-NEXT: fmov s0, w11 ++; CHECK-NEXT: cmn w13, #128 ++; CHECK-NEXT: csel w11, w13, w9, gt ++; CHECK-NEXT: cmp w15, #127 ++; CHECK-NEXT: mov v0.s[1], w10 ++; CHECK-NEXT: csel w10, w15, w8, lt ++; CHECK-NEXT: cmn w10, #128 ++; CHECK-NEXT: fcvtzs w13, d3 ++; CHECK-NEXT: fmov s2, w11 ++; CHECK-NEXT: csel w10, w10, w9, gt ++; CHECK-NEXT: cmp w14, #127 ++; CHECK-NEXT: fcvtzs w11, d1 ++; CHECK-NEXT: mov w15, v0.s[1] ++; CHECK-NEXT: csel w14, w14, w8, lt ++; CHECK-NEXT: mov v2.s[1], w12 ++; CHECK-NEXT: cmn w14, #128 ++; CHECK-NEXT: csel w12, w14, w9, gt ++; CHECK-NEXT: cmp w11, #127 ++; CHECK-NEXT: csel w11, w11, w8, lt ++; CHECK-NEXT: mov d1, v4.d[1] ++; CHECK-NEXT: mov v0.b[1], w15 ++; CHECK-NEXT: cmn w11, #128 ++; CHECK-NEXT: fmov w14, s2 ++; CHECK-NEXT: csel w11, w11, w9, gt ++; CHECK-NEXT: fmov s3, w12 ++; CHECK-NEXT: cmp w13, #127 ++; CHECK-NEXT: mov w12, v2.s[1] ++; CHECK-NEXT: csel w13, w13, w8, lt ++; CHECK-NEXT: mov v0.b[2], w14 ++; CHECK-NEXT: cmn w13, #128 ++; CHECK-NEXT: mov v3.s[1], w10 ++; CHECK-NEXT: csel w13, w13, w9, gt ++; CHECK-NEXT: fcvtzs w15, d1 ++; CHECK-NEXT: fcvtzs w14, d4 ++; CHECK-NEXT: mov d1, v5.d[1] ++; CHECK-NEXT: mov v0.b[3], w12 ++; CHECK-NEXT: fmov s4, w13 ++; CHECK-NEXT: cmp w15, #127 ++; CHECK-NEXT: fmov w13, s3 ++; CHECK-NEXT: csel w10, w15, w8, lt ++; CHECK-NEXT: mov w12, v3.s[1] ++; CHECK-NEXT: cmn w10, #128 ++; CHECK-NEXT: fcvtzs w15, d1 ++; CHECK-NEXT: csel w10, w10, w9, gt ++; CHECK-NEXT: cmp w14, #127 ++; CHECK-NEXT: mov v0.b[4], w13 ++; CHECK-NEXT: csel w14, w14, w8, lt ++; CHECK-NEXT: mov v4.s[1], w11 ++; CHECK-NEXT: cmn w14, #128 ++; CHECK-NEXT: csel w14, w14, w9, gt ++; CHECK-NEXT: fcvtzs 
w13, d5 ++; CHECK-NEXT: cmp w15, #127 ++; CHECK-NEXT: mov d2, v6.d[1] ++; CHECK-NEXT: mov v0.b[5], w12 ++; CHECK-NEXT: csel w11, w15, w8, lt ++; CHECK-NEXT: fmov w12, s4 ++; CHECK-NEXT: cmn w11, #128 ++; CHECK-NEXT: fmov s1, w14 ++; CHECK-NEXT: csel w11, w11, w9, gt ++; CHECK-NEXT: cmp w13, #127 ++; CHECK-NEXT: mov w14, v4.s[1] ++; CHECK-NEXT: mov v0.b[6], w12 ++; CHECK-NEXT: csel w13, w13, w8, lt ++; CHECK-NEXT: mov v1.s[1], w10 ++; CHECK-NEXT: cmn w13, #128 ++; CHECK-NEXT: fcvtzs w15, d2 ++; CHECK-NEXT: csel w13, w13, w9, gt ++; CHECK-NEXT: fcvtzs w10, d6 ++; CHECK-NEXT: mov v0.b[7], w14 ++; CHECK-NEXT: cmp w15, #127 ++; CHECK-NEXT: fmov w14, s1 ++; CHECK-NEXT: csel w12, w15, w8, lt ++; CHECK-NEXT: fmov s2, w13 ++; CHECK-NEXT: mov w13, v1.s[1] ++; CHECK-NEXT: mov d1, v7.d[1] ++; CHECK-NEXT: cmn w12, #128 ++; CHECK-NEXT: fcvtzs w15, d7 ++; CHECK-NEXT: csel w12, w12, w9, gt ++; CHECK-NEXT: cmp w10, #127 ++; CHECK-NEXT: mov v0.b[8], w14 ++; CHECK-NEXT: csel w10, w10, w8, lt ++; CHECK-NEXT: mov v2.s[1], w11 ++; CHECK-NEXT: cmn w10, #128 ++; CHECK-NEXT: fcvtzs w11, d1 ++; CHECK-NEXT: csel w10, w10, w9, gt ++; CHECK-NEXT: mov v0.b[9], w13 ++; CHECK-NEXT: fmov w14, s2 ++; CHECK-NEXT: cmp w11, #127 ++; CHECK-NEXT: fmov s1, w10 ++; CHECK-NEXT: csel w10, w11, w8, lt ++; CHECK-NEXT: cmn w10, #128 ++; CHECK-NEXT: mov w13, v2.s[1] ++; CHECK-NEXT: mov v0.b[10], w14 ++; CHECK-NEXT: csel w10, w10, w9, gt ++; CHECK-NEXT: cmp w15, #127 ++; CHECK-NEXT: mov v1.s[1], w12 ++; CHECK-NEXT: csel w8, w15, w8, lt ++; CHECK-NEXT: cmn w8, #128 ++; CHECK-NEXT: csel w8, w8, w9, gt ++; CHECK-NEXT: mov v0.b[11], w13 ++; CHECK-NEXT: fmov w9, s1 ++; CHECK-NEXT: fmov s2, w8 ++; CHECK-NEXT: mov w8, v1.s[1] ++; CHECK-NEXT: mov v0.b[12], w9 ++; CHECK-NEXT: mov v2.s[1], w10 ++; CHECK-NEXT: mov v0.b[13], w8 ++; CHECK-NEXT: fmov w8, s2 ++; CHECK-NEXT: mov w9, v2.s[1] ++; CHECK-NEXT: mov v0.b[14], w8 ++; CHECK-NEXT: mov v0.b[15], w9 ++; CHECK-NEXT: ret ++ %x = call <16 x i8> 
@llvm.fptosi.sat.v16f64.v16i8(<16 x double> %f) ++ ret <16 x i8> %x ++} ++ ++define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) { ++; CHECK-LABEL: test_signed_v8f64_v8i16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d4, v0.d[1] ++; CHECK-NEXT: mov w8, #32767 ++; CHECK-NEXT: fcvtzs w10, d0 ++; CHECK-NEXT: mov w11, #-32768 ++; CHECK-NEXT: mov d0, v2.d[1] ++; CHECK-NEXT: fcvtzs w13, d1 ++; CHECK-NEXT: fcvtzs w15, d3 ++; CHECK-NEXT: fcvtzs w9, d4 ++; CHECK-NEXT: mov d4, v1.d[1] ++; CHECK-NEXT: mov d1, v3.d[1] ++; CHECK-NEXT: fcvtzs w14, d0 ++; CHECK-NEXT: cmp w9, w8 ++; CHECK-NEXT: csel w9, w9, w8, lt ++; CHECK-NEXT: fcvtzs w12, d4 ++; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768 ++; CHECK-NEXT: csel w9, w9, w11, gt ++; CHECK-NEXT: cmp w10, w8 ++; CHECK-NEXT: csel w10, w10, w8, lt ++; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 ++; CHECK-NEXT: csel w10, w10, w11, gt ++; CHECK-NEXT: cmp w12, w8 ++; CHECK-NEXT: csel w12, w12, w8, lt ++; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 ++; CHECK-NEXT: csel w12, w12, w11, gt ++; CHECK-NEXT: cmp w13, w8 ++; CHECK-NEXT: fmov s0, w10 ++; CHECK-NEXT: csel w10, w13, w8, lt ++; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 ++; CHECK-NEXT: fcvtzs w13, d2 ++; CHECK-NEXT: csel w10, w10, w11, gt ++; CHECK-NEXT: cmp w14, w8 ++; CHECK-NEXT: mov v0.s[1], w9 ++; CHECK-NEXT: csel w9, w14, w8, lt ++; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768 ++; CHECK-NEXT: fmov s2, w10 ++; CHECK-NEXT: csel w9, w9, w11, gt ++; CHECK-NEXT: cmp w13, w8 ++; CHECK-NEXT: mov w10, v0.s[1] ++; CHECK-NEXT: csel w13, w13, w8, lt ++; CHECK-NEXT: mov v2.s[1], w12 ++; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 ++; CHECK-NEXT: fcvtzs w12, d1 ++; CHECK-NEXT: csel w13, w13, w11, gt ++; CHECK-NEXT: mov v0.h[1], w10 ++; CHECK-NEXT: fmov w14, s2 ++; CHECK-NEXT: cmp w12, w8 ++; CHECK-NEXT: fmov s1, w13 ++; CHECK-NEXT: csel w12, w12, w8, lt ++; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 ++; CHECK-NEXT: mov w10, v2.s[1] ++; CHECK-NEXT: mov v0.h[2], w14 ++; CHECK-NEXT: csel w12, w12, 
w11, gt ++; CHECK-NEXT: cmp w15, w8 ++; CHECK-NEXT: mov v1.s[1], w9 ++; CHECK-NEXT: csel w8, w15, w8, lt ++; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768 ++; CHECK-NEXT: csel w8, w8, w11, gt ++; CHECK-NEXT: mov v0.h[3], w10 ++; CHECK-NEXT: fmov w9, s1 ++; CHECK-NEXT: fmov s2, w8 ++; CHECK-NEXT: mov w8, v1.s[1] ++; CHECK-NEXT: mov v0.h[4], w9 ++; CHECK-NEXT: mov v2.s[1], w12 ++; CHECK-NEXT: mov v0.h[5], w8 ++; CHECK-NEXT: fmov w8, s2 ++; CHECK-NEXT: mov w9, v2.s[1] ++; CHECK-NEXT: mov v0.h[6], w8 ++; CHECK-NEXT: mov v0.h[7], w9 ++; CHECK-NEXT: ret ++ %x = call <8 x i16> @llvm.fptosi.sat.v8f64.v8i16(<8 x double> %f) ++ ret <8 x i16> %x ++} ++ ++define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) { ++; CHECK-LABEL: test_signed_v16f64_v16i16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d16, v0.d[1] ++; CHECK-NEXT: mov w9, #32767 ++; CHECK-NEXT: fcvtzs w11, d0 ++; CHECK-NEXT: mov w8, #-32768 ++; CHECK-NEXT: mov d0, v2.d[1] ++; CHECK-NEXT: fcvtzs w12, d1 ++; CHECK-NEXT: fcvtzs w14, d2 ++; CHECK-NEXT: mov d2, v4.d[1] ++; CHECK-NEXT: fcvtzs w10, d16 ++; CHECK-NEXT: mov d16, v1.d[1] ++; CHECK-NEXT: mov d1, v3.d[1] ++; CHECK-NEXT: fcvtzs w16, d3 ++; CHECK-NEXT: fcvtzs w15, d0 ++; CHECK-NEXT: mov d3, v6.d[1] ++; CHECK-NEXT: cmp w10, w9 ++; CHECK-NEXT: csel w10, w10, w9, lt ++; CHECK-NEXT: fcvtzs w13, d16 ++; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 ++; CHECK-NEXT: fcvtzs w17, d1 ++; CHECK-NEXT: csel w10, w10, w8, gt ++; CHECK-NEXT: cmp w11, w9 ++; CHECK-NEXT: csel w11, w11, w9, lt ++; CHECK-NEXT: mov d1, v5.d[1] ++; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 ++; CHECK-NEXT: csel w11, w11, w8, gt ++; CHECK-NEXT: cmp w13, w9 ++; CHECK-NEXT: csel w13, w13, w9, lt ++; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 ++; CHECK-NEXT: csel w13, w13, w8, gt ++; CHECK-NEXT: cmp w12, w9 ++; CHECK-NEXT: csel w12, w12, w9, lt ++; CHECK-NEXT: fmov s0, w11 ++; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 ++; CHECK-NEXT: csel w12, w12, w8, gt ++; CHECK-NEXT: cmp w15, w9 ++; CHECK-NEXT: csel 
w15, w15, w9, lt ++; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 ++; CHECK-NEXT: csel w11, w15, w8, gt ++; CHECK-NEXT: cmp w14, w9 ++; CHECK-NEXT: csel w14, w14, w9, lt ++; CHECK-NEXT: fcvtzs w15, d4 ++; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 ++; CHECK-NEXT: csel w14, w14, w8, gt ++; CHECK-NEXT: cmp w17, w9 ++; CHECK-NEXT: mov v0.s[1], w10 ++; CHECK-NEXT: csel w10, w17, w9, lt ++; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 ++; CHECK-NEXT: fcvtzs w17, d2 ++; CHECK-NEXT: csel w10, w10, w8, gt ++; CHECK-NEXT: cmp w16, w9 ++; CHECK-NEXT: fmov s2, w12 ++; CHECK-NEXT: csel w12, w16, w9, lt ++; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 ++; CHECK-NEXT: mov w16, v0.s[1] ++; CHECK-NEXT: csel w12, w12, w8, gt ++; CHECK-NEXT: cmp w17, w9 ++; CHECK-NEXT: mov v2.s[1], w13 ++; CHECK-NEXT: csel w13, w17, w9, lt ++; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 ++; CHECK-NEXT: fcvtzs w17, d1 ++; CHECK-NEXT: csel w13, w13, w8, gt ++; CHECK-NEXT: cmp w15, w9 ++; CHECK-NEXT: csel w15, w15, w9, lt ++; CHECK-NEXT: fmov s4, w14 ++; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 ++; CHECK-NEXT: mov v0.h[1], w16 ++; CHECK-NEXT: fcvtzs w16, d5 ++; CHECK-NEXT: csel w15, w15, w8, gt ++; CHECK-NEXT: cmp w17, w9 ++; CHECK-NEXT: csel w17, w17, w9, lt ++; CHECK-NEXT: cmn w17, #8, lsl #12 // =32768 ++; CHECK-NEXT: csel w14, w17, w8, gt ++; CHECK-NEXT: cmp w16, w9 ++; CHECK-NEXT: fmov s1, w15 ++; CHECK-NEXT: csel w15, w16, w9, lt ++; CHECK-NEXT: fcvtzs w16, d3 ++; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 ++; CHECK-NEXT: mov v4.s[1], w11 ++; CHECK-NEXT: csel w11, w15, w8, gt ++; CHECK-NEXT: fcvtzs w15, d6 ++; CHECK-NEXT: mov v1.s[1], w13 ++; CHECK-NEXT: cmp w16, w9 ++; CHECK-NEXT: fmov s3, w11 ++; CHECK-NEXT: csel w16, w16, w9, lt ++; CHECK-NEXT: fmov w11, s2 ++; CHECK-NEXT: mov w13, v2.s[1] ++; CHECK-NEXT: mov d2, v7.d[1] ++; CHECK-NEXT: cmn w16, #8, lsl #12 // =32768 ++; CHECK-NEXT: csel w16, w16, w8, gt ++; CHECK-NEXT: cmp w15, w9 ++; CHECK-NEXT: mov v0.h[2], w11 ++; CHECK-NEXT: csel w11, w15, w9, 
lt ++; CHECK-NEXT: mov w15, v1.s[1] ++; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 ++; CHECK-NEXT: mov v3.s[1], w14 ++; CHECK-NEXT: fcvtzs w14, d2 ++; CHECK-NEXT: csel w11, w11, w8, gt ++; CHECK-NEXT: mov v0.h[3], w13 ++; CHECK-NEXT: mov v1.h[1], w15 ++; CHECK-NEXT: cmp w14, w9 ++; CHECK-NEXT: fmov w13, s3 ++; CHECK-NEXT: csel w14, w14, w9, lt ++; CHECK-NEXT: fcvtzs w15, d7 ++; CHECK-NEXT: fmov s2, w11 ++; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 ++; CHECK-NEXT: mov w11, v3.s[1] ++; CHECK-NEXT: mov v1.h[2], w13 ++; CHECK-NEXT: csel w13, w14, w8, gt ++; CHECK-NEXT: cmp w15, w9 ++; CHECK-NEXT: fmov s3, w12 ++; CHECK-NEXT: mov v2.s[1], w16 ++; CHECK-NEXT: csel w9, w15, w9, lt ++; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768 ++; CHECK-NEXT: fmov w12, s4 ++; CHECK-NEXT: csel w8, w9, w8, gt ++; CHECK-NEXT: mov w14, v4.s[1] ++; CHECK-NEXT: mov v1.h[3], w11 ++; CHECK-NEXT: fmov w11, s2 ++; CHECK-NEXT: mov w9, v2.s[1] ++; CHECK-NEXT: fmov s2, w8 ++; CHECK-NEXT: mov v0.h[4], w12 ++; CHECK-NEXT: mov v1.h[4], w11 ++; CHECK-NEXT: mov v3.s[1], w10 ++; CHECK-NEXT: mov v2.s[1], w13 ++; CHECK-NEXT: mov v0.h[5], w14 ++; CHECK-NEXT: mov v1.h[5], w9 ++; CHECK-NEXT: fmov w8, s3 ++; CHECK-NEXT: fmov w9, s2 ++; CHECK-NEXT: mov w10, v3.s[1] ++; CHECK-NEXT: mov w11, v2.s[1] ++; CHECK-NEXT: mov v0.h[6], w8 ++; CHECK-NEXT: mov v1.h[6], w9 ++; CHECK-NEXT: mov v0.h[7], w10 ++; CHECK-NEXT: mov v1.h[7], w11 ++; CHECK-NEXT: ret ++ %x = call <16 x i16> @llvm.fptosi.sat.v16f64.v16i16(<16 x double> %f) ++ ret <16 x i16> %x ++} +diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +index 017845d3624a..ace519684215 100644 +--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll ++++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +@@ -2480,3 +2480,659 @@ define <8 x i128> @test_unsigned_v8f16_v8i128(<8 x half> %f) { + %x = call <8 x i128> @llvm.fptoui.sat.v8f16.v8i128(<8 x half> %f) + ret <8 x i128> %x + } ++ ++ ++declare <8 x i8> 
@llvm.fptoui.sat.v8f32.v8i8(<8 x float> %f) ++declare <8 x i16> @llvm.fptoui.sat.v8f32.v8i16(<8 x float> %f) ++declare <16 x i8> @llvm.fptoui.sat.v16f32.v16i8(<16 x float> %f) ++declare <16 x i16> @llvm.fptoui.sat.v16f32.v16i16(<16 x float> %f) ++ ++declare <16 x i8> @llvm.fptoui.sat.v16f16.v16i8(<16 x half> %f) ++declare <16 x i16> @llvm.fptoui.sat.v16f16.v16i16(<16 x half> %f) ++ ++declare <8 x i8> @llvm.fptoui.sat.v8f64.v8i8(<8 x double> %f) ++declare <8 x i16> @llvm.fptoui.sat.v8f64.v8i16(<8 x double> %f) ++declare <16 x i8> @llvm.fptoui.sat.v16f64.v16i8(<16 x double> %f) ++declare <16 x i16> @llvm.fptoui.sat.v16f64.v16i16(<16 x double> %f) ++ ++define <8 x i8> @test_unsigned_v8f32_v8i8(<8 x float> %f) { ++; CHECK-LABEL: test_unsigned_v8f32_v8i8: ++; CHECK: // %bb.0: ++; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff ++; CHECK-NEXT: fcvtzu v1.4s, v1.4s ++; CHECK-NEXT: fcvtzu v0.4s, v0.4s ++; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s ++; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s ++; CHECK-NEXT: xtn v1.4h, v1.4s ++; CHECK-NEXT: xtn v0.4h, v0.4s ++; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b ++; CHECK-NEXT: ret ++ %x = call <8 x i8> @llvm.fptoui.sat.v8f32.v8i8(<8 x float> %f) ++ ret <8 x i8> %x ++} ++ ++define <16 x i8> @test_unsigned_v16f32_v16i8(<16 x float> %f) { ++; CHECK-LABEL: test_unsigned_v16f32_v16i8: ++; CHECK: // %bb.0: ++; CHECK-NEXT: movi v4.2d, #0x0000ff000000ff ++; CHECK-NEXT: fcvtzu v0.4s, v0.4s ++; CHECK-NEXT: fcvtzu v1.4s, v1.4s ++; CHECK-NEXT: fcvtzu v2.4s, v2.4s ++; CHECK-NEXT: umin v0.4s, v0.4s, v4.4s ++; CHECK-NEXT: umin v1.4s, v1.4s, v4.4s ++; CHECK-NEXT: umin v2.4s, v2.4s, v4.4s ++; CHECK-NEXT: xtn v5.4h, v0.4s ++; CHECK-NEXT: xtn v1.4h, v1.4s ++; CHECK-NEXT: umov w8, v5.h[0] ++; CHECK-NEXT: umov w9, v5.h[1] ++; CHECK-NEXT: fmov s0, w8 ++; CHECK-NEXT: umov w8, v5.h[2] ++; CHECK-NEXT: mov v0.b[1], w9 ++; CHECK-NEXT: mov v0.b[2], w8 ++; CHECK-NEXT: umov w8, v5.h[3] ++; CHECK-NEXT: mov v0.b[3], w8 ++; CHECK-NEXT: umov w8, v1.h[0] ++; CHECK-NEXT: mov v0.b[4], w8 
++; CHECK-NEXT: umov w8, v1.h[1] ++; CHECK-NEXT: mov v0.b[5], w8 ++; CHECK-NEXT: umov w8, v1.h[2] ++; CHECK-NEXT: mov v0.b[6], w8 ++; CHECK-NEXT: umov w8, v1.h[3] ++; CHECK-NEXT: xtn v1.4h, v2.4s ++; CHECK-NEXT: fcvtzu v2.4s, v3.4s ++; CHECK-NEXT: mov v0.b[7], w8 ++; CHECK-NEXT: umov w8, v1.h[0] ++; CHECK-NEXT: umin v2.4s, v2.4s, v4.4s ++; CHECK-NEXT: mov v0.b[8], w8 ++; CHECK-NEXT: umov w8, v1.h[1] ++; CHECK-NEXT: mov v0.b[9], w8 ++; CHECK-NEXT: umov w8, v1.h[2] ++; CHECK-NEXT: mov v0.b[10], w8 ++; CHECK-NEXT: umov w8, v1.h[3] ++; CHECK-NEXT: xtn v1.4h, v2.4s ++; CHECK-NEXT: mov v0.b[11], w8 ++; CHECK-NEXT: umov w8, v1.h[0] ++; CHECK-NEXT: mov v0.b[12], w8 ++; CHECK-NEXT: umov w8, v1.h[1] ++; CHECK-NEXT: mov v0.b[13], w8 ++; CHECK-NEXT: umov w8, v1.h[2] ++; CHECK-NEXT: mov v0.b[14], w8 ++; CHECK-NEXT: umov w8, v1.h[3] ++; CHECK-NEXT: mov v0.b[15], w8 ++; CHECK-NEXT: ret ++ %x = call <16 x i8> @llvm.fptoui.sat.v16f32.v16i8(<16 x float> %f) ++ ret <16 x i8> %x ++} ++ ++define <8 x i16> @test_unsigned_v8f32_v8i16(<8 x float> %f) { ++; CHECK-LABEL: test_unsigned_v8f32_v8i16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff ++; CHECK-NEXT: fcvtzu v1.4s, v1.4s ++; CHECK-NEXT: fcvtzu v0.4s, v0.4s ++; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s ++; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s ++; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ++; CHECK-NEXT: ret ++ %x = call <8 x i16> @llvm.fptoui.sat.v8f32.v8i16(<8 x float> %f) ++ ret <8 x i16> %x ++} ++ ++define <16 x i16> @test_unsigned_v16f32_v16i16(<16 x float> %f) { ++; CHECK-LABEL: test_unsigned_v16f32_v16i16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: movi v4.2d, #0x00ffff0000ffff ++; CHECK-NEXT: fcvtzu v1.4s, v1.4s ++; CHECK-NEXT: fcvtzu v0.4s, v0.4s ++; CHECK-NEXT: fcvtzu v3.4s, v3.4s ++; CHECK-NEXT: fcvtzu v2.4s, v2.4s ++; CHECK-NEXT: umin v1.4s, v1.4s, v4.4s ++; CHECK-NEXT: umin v0.4s, v0.4s, v4.4s ++; CHECK-NEXT: umin v3.4s, v3.4s, v4.4s ++; CHECK-NEXT: umin v2.4s, v2.4s, v4.4s ++; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ++; 
CHECK-NEXT: uzp1 v1.8h, v2.8h, v3.8h ++; CHECK-NEXT: ret ++ %x = call <16 x i16> @llvm.fptoui.sat.v16f32.v16i16(<16 x float> %f) ++ ret <16 x i16> %x ++} ++ ++ ++ ++define <16 x i8> @test_unsigned_v16f16_v16i8(<16 x half> %f) { ++; CHECK-CVT-LABEL: test_unsigned_v16f16_v16i8: ++; CHECK-CVT: // %bb.0: ++; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h ++; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h ++; CHECK-CVT-NEXT: fcvtl2 v5.4s, v0.8h ++; CHECK-CVT-NEXT: mov w8, #255 ++; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h ++; CHECK-CVT-NEXT: mov s3, v2.s[1] ++; CHECK-CVT-NEXT: mov s4, v2.s[2] ++; CHECK-CVT-NEXT: fcvtzu w9, s2 ++; CHECK-CVT-NEXT: mov s2, v2.s[3] ++; CHECK-CVT-NEXT: fcvtzu w12, s1 ++; CHECK-CVT-NEXT: fcvtzu w16, s5 ++; CHECK-CVT-NEXT: fcvtzu w2, s0 ++; CHECK-CVT-NEXT: fcvtzu w10, s3 ++; CHECK-CVT-NEXT: mov s3, v1.s[1] ++; CHECK-CVT-NEXT: fcvtzu w11, s4 ++; CHECK-CVT-NEXT: mov s4, v1.s[2] ++; CHECK-CVT-NEXT: mov s1, v1.s[3] ++; CHECK-CVT-NEXT: fcvtzu w13, s2 ++; CHECK-CVT-NEXT: cmp w10, #255 ++; CHECK-CVT-NEXT: mov s2, v5.s[1] ++; CHECK-CVT-NEXT: fcvtzu w14, s3 ++; CHECK-CVT-NEXT: csel w10, w10, w8, lo ++; CHECK-CVT-NEXT: cmp w9, #255 ++; CHECK-CVT-NEXT: fcvtzu w15, s4 ++; CHECK-CVT-NEXT: csel w9, w9, w8, lo ++; CHECK-CVT-NEXT: cmp w11, #255 ++; CHECK-CVT-NEXT: csel w11, w11, w8, lo ++; CHECK-CVT-NEXT: cmp w13, #255 ++; CHECK-CVT-NEXT: mov s3, v5.s[2] ++; CHECK-CVT-NEXT: fcvtzu w17, s1 ++; CHECK-CVT-NEXT: csel w13, w13, w8, lo ++; CHECK-CVT-NEXT: cmp w14, #255 ++; CHECK-CVT-NEXT: mov s4, v5.s[3] ++; CHECK-CVT-NEXT: fcvtzu w18, s2 ++; CHECK-CVT-NEXT: csel w14, w14, w8, lo ++; CHECK-CVT-NEXT: cmp w12, #255 ++; CHECK-CVT-NEXT: mov s1, v0.s[1] ++; CHECK-CVT-NEXT: csel w12, w12, w8, lo ++; CHECK-CVT-NEXT: cmp w15, #255 ++; CHECK-CVT-NEXT: fcvtzu w0, s3 ++; CHECK-CVT-NEXT: csel w15, w15, w8, lo ++; CHECK-CVT-NEXT: cmp w17, #255 ++; CHECK-CVT-NEXT: csel w17, w17, w8, lo ++; CHECK-CVT-NEXT: cmp w18, #255 ++; CHECK-CVT-NEXT: fmov s2, w9 ++; CHECK-CVT-NEXT: csel w9, w18, w8, lo ++; 
CHECK-CVT-NEXT: fcvtzu w18, s4 ++; CHECK-CVT-NEXT: cmp w16, #255 ++; CHECK-CVT-NEXT: fcvtzu w1, s1 ++; CHECK-CVT-NEXT: csel w16, w16, w8, lo ++; CHECK-CVT-NEXT: cmp w0, #255 ++; CHECK-CVT-NEXT: mov s1, v0.s[2] ++; CHECK-CVT-NEXT: csel w0, w0, w8, lo ++; CHECK-CVT-NEXT: cmp w18, #255 ++; CHECK-CVT-NEXT: mov v2.s[1], w10 ++; CHECK-CVT-NEXT: csel w10, w18, w8, lo ++; CHECK-CVT-NEXT: cmp w1, #255 ++; CHECK-CVT-NEXT: fmov s3, w12 ++; CHECK-CVT-NEXT: csel w18, w1, w8, lo ++; CHECK-CVT-NEXT: cmp w2, #255 ++; CHECK-CVT-NEXT: csel w1, w2, w8, lo ++; CHECK-CVT-NEXT: fmov s4, w16 ++; CHECK-CVT-NEXT: mov v2.s[2], w11 ++; CHECK-CVT-NEXT: fcvtzu w11, s1 ++; CHECK-CVT-NEXT: mov s0, v0.s[3] ++; CHECK-CVT-NEXT: fmov s1, w1 ++; CHECK-CVT-NEXT: mov v3.s[1], w14 ++; CHECK-CVT-NEXT: cmp w11, #255 ++; CHECK-CVT-NEXT: mov v4.s[1], w9 ++; CHECK-CVT-NEXT: csel w9, w11, w8, lo ++; CHECK-CVT-NEXT: mov v1.s[1], w18 ++; CHECK-CVT-NEXT: fcvtzu w11, s0 ++; CHECK-CVT-NEXT: mov v3.s[2], w15 ++; CHECK-CVT-NEXT: mov v4.s[2], w0 ++; CHECK-CVT-NEXT: mov v1.s[2], w9 ++; CHECK-CVT-NEXT: cmp w11, #255 ++; CHECK-CVT-NEXT: csel w8, w11, w8, lo ++; CHECK-CVT-NEXT: mov v2.s[3], w13 ++; CHECK-CVT-NEXT: mov v3.s[3], w17 ++; CHECK-CVT-NEXT: mov v4.s[3], w10 ++; CHECK-CVT-NEXT: mov v1.s[3], w8 ++; CHECK-CVT-NEXT: uzp1 v0.8h, v3.8h, v2.8h ++; CHECK-CVT-NEXT: uzp1 v1.8h, v1.8h, v4.8h ++; CHECK-CVT-NEXT: uzp1 v0.16b, v1.16b, v0.16b ++; CHECK-CVT-NEXT: ret ++; ++; CHECK-FP16-LABEL: test_unsigned_v16f16_v16i8: ++; CHECK-FP16: // %bb.0: ++; CHECK-FP16-NEXT: movi v2.2d, #0xff00ff00ff00ff ++; CHECK-FP16-NEXT: fcvtzu v1.8h, v1.8h ++; CHECK-FP16-NEXT: fcvtzu v0.8h, v0.8h ++; CHECK-FP16-NEXT: umin v1.8h, v1.8h, v2.8h ++; CHECK-FP16-NEXT: umin v0.8h, v0.8h, v2.8h ++; CHECK-FP16-NEXT: uzp1 v0.16b, v0.16b, v1.16b ++; CHECK-FP16-NEXT: ret ++ %x = call <16 x i8> @llvm.fptoui.sat.v16f16.v16i8(<16 x half> %f) ++ ret <16 x i8> %x ++} ++ ++define <16 x i16> @test_unsigned_v16f16_v16i16(<16 x half> %f) { ++; CHECK-CVT-LABEL: 
test_unsigned_v16f16_v16i16: ++; CHECK-CVT: // %bb.0: ++; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h ++; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h ++; CHECK-CVT-NEXT: fcvtl2 v5.4s, v1.8h ++; CHECK-CVT-NEXT: mov w8, #65535 ++; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h ++; CHECK-CVT-NEXT: mov s3, v2.s[1] ++; CHECK-CVT-NEXT: mov s4, v2.s[2] ++; CHECK-CVT-NEXT: fcvtzu w9, s2 ++; CHECK-CVT-NEXT: mov s2, v2.s[3] ++; CHECK-CVT-NEXT: fcvtzu w12, s0 ++; CHECK-CVT-NEXT: fcvtzu w16, s5 ++; CHECK-CVT-NEXT: fcvtzu w2, s1 ++; CHECK-CVT-NEXT: fcvtzu w10, s3 ++; CHECK-CVT-NEXT: mov s3, v0.s[1] ++; CHECK-CVT-NEXT: fcvtzu w11, s4 ++; CHECK-CVT-NEXT: mov s4, v0.s[2] ++; CHECK-CVT-NEXT: mov s0, v0.s[3] ++; CHECK-CVT-NEXT: fcvtzu w13, s2 ++; CHECK-CVT-NEXT: cmp w10, w8 ++; CHECK-CVT-NEXT: mov s2, v5.s[1] ++; CHECK-CVT-NEXT: fcvtzu w14, s3 ++; CHECK-CVT-NEXT: csel w10, w10, w8, lo ++; CHECK-CVT-NEXT: cmp w9, w8 ++; CHECK-CVT-NEXT: fcvtzu w15, s4 ++; CHECK-CVT-NEXT: csel w9, w9, w8, lo ++; CHECK-CVT-NEXT: cmp w11, w8 ++; CHECK-CVT-NEXT: csel w11, w11, w8, lo ++; CHECK-CVT-NEXT: cmp w13, w8 ++; CHECK-CVT-NEXT: mov s3, v5.s[2] ++; CHECK-CVT-NEXT: fcvtzu w17, s0 ++; CHECK-CVT-NEXT: csel w13, w13, w8, lo ++; CHECK-CVT-NEXT: cmp w14, w8 ++; CHECK-CVT-NEXT: mov s4, v5.s[3] ++; CHECK-CVT-NEXT: fcvtzu w18, s2 ++; CHECK-CVT-NEXT: csel w14, w14, w8, lo ++; CHECK-CVT-NEXT: cmp w12, w8 ++; CHECK-CVT-NEXT: mov s0, v1.s[1] ++; CHECK-CVT-NEXT: csel w12, w12, w8, lo ++; CHECK-CVT-NEXT: cmp w15, w8 ++; CHECK-CVT-NEXT: fcvtzu w0, s3 ++; CHECK-CVT-NEXT: csel w15, w15, w8, lo ++; CHECK-CVT-NEXT: cmp w17, w8 ++; CHECK-CVT-NEXT: csel w17, w17, w8, lo ++; CHECK-CVT-NEXT: cmp w18, w8 ++; CHECK-CVT-NEXT: fmov s2, w9 ++; CHECK-CVT-NEXT: csel w9, w18, w8, lo ++; CHECK-CVT-NEXT: fcvtzu w18, s4 ++; CHECK-CVT-NEXT: cmp w16, w8 ++; CHECK-CVT-NEXT: fcvtzu w1, s0 ++; CHECK-CVT-NEXT: csel w16, w16, w8, lo ++; CHECK-CVT-NEXT: cmp w0, w8 ++; CHECK-CVT-NEXT: mov s0, v1.s[2] ++; CHECK-CVT-NEXT: csel w0, w0, w8, lo ++; CHECK-CVT-NEXT: cmp w18, w8 
++; CHECK-CVT-NEXT: mov v2.s[1], w10 ++; CHECK-CVT-NEXT: csel w10, w18, w8, lo ++; CHECK-CVT-NEXT: cmp w1, w8 ++; CHECK-CVT-NEXT: fmov s3, w12 ++; CHECK-CVT-NEXT: csel w18, w1, w8, lo ++; CHECK-CVT-NEXT: cmp w2, w8 ++; CHECK-CVT-NEXT: csel w1, w2, w8, lo ++; CHECK-CVT-NEXT: fmov s4, w16 ++; CHECK-CVT-NEXT: mov v2.s[2], w11 ++; CHECK-CVT-NEXT: fcvtzu w11, s0 ++; CHECK-CVT-NEXT: mov s0, v1.s[3] ++; CHECK-CVT-NEXT: fmov s5, w1 ++; CHECK-CVT-NEXT: mov v3.s[1], w14 ++; CHECK-CVT-NEXT: cmp w11, w8 ++; CHECK-CVT-NEXT: mov v4.s[1], w9 ++; CHECK-CVT-NEXT: csel w9, w11, w8, lo ++; CHECK-CVT-NEXT: mov v5.s[1], w18 ++; CHECK-CVT-NEXT: fcvtzu w11, s0 ++; CHECK-CVT-NEXT: mov v3.s[2], w15 ++; CHECK-CVT-NEXT: mov v4.s[2], w0 ++; CHECK-CVT-NEXT: mov v5.s[2], w9 ++; CHECK-CVT-NEXT: cmp w11, w8 ++; CHECK-CVT-NEXT: csel w8, w11, w8, lo ++; CHECK-CVT-NEXT: mov v2.s[3], w13 ++; CHECK-CVT-NEXT: mov v3.s[3], w17 ++; CHECK-CVT-NEXT: mov v4.s[3], w10 ++; CHECK-CVT-NEXT: mov v5.s[3], w8 ++; CHECK-CVT-NEXT: uzp1 v0.8h, v3.8h, v2.8h ++; CHECK-CVT-NEXT: uzp1 v1.8h, v5.8h, v4.8h ++; CHECK-CVT-NEXT: ret ++; ++; CHECK-FP16-LABEL: test_unsigned_v16f16_v16i16: ++; CHECK-FP16: // %bb.0: ++; CHECK-FP16-NEXT: fcvtzu v0.8h, v0.8h ++; CHECK-FP16-NEXT: fcvtzu v1.8h, v1.8h ++; CHECK-FP16-NEXT: ret ++ %x = call <16 x i16> @llvm.fptoui.sat.v16f16.v16i16(<16 x half> %f) ++ ret <16 x i16> %x ++} ++ ++define <8 x i8> @test_unsigned_v8f64_v8i8(<8 x double> %f) { ++; CHECK-LABEL: test_unsigned_v8f64_v8i8: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d5, v0.d[1] ++; CHECK-NEXT: fcvtzu w10, d0 ++; CHECK-NEXT: mov d0, v1.d[1] ++; CHECK-NEXT: mov w8, #255 ++; CHECK-NEXT: fcvtzu w12, d1 ++; CHECK-NEXT: mov d4, v2.d[1] ++; CHECK-NEXT: fcvtzu w13, d3 ++; CHECK-NEXT: fcvtzu w9, d5 ++; CHECK-NEXT: fcvtzu w11, d0 ++; CHECK-NEXT: cmp w9, #255 ++; CHECK-NEXT: csel w9, w9, w8, lo ++; CHECK-NEXT: cmp w10, #255 ++; CHECK-NEXT: csel w10, w10, w8, lo ++; CHECK-NEXT: cmp w11, #255 ++; CHECK-NEXT: fmov s0, w10 ++; CHECK-NEXT: csel w10, 
w11, w8, lo ++; CHECK-NEXT: cmp w12, #255 ++; CHECK-NEXT: csel w11, w12, w8, lo ++; CHECK-NEXT: mov v0.s[1], w9 ++; CHECK-NEXT: fcvtzu w9, d4 ++; CHECK-NEXT: fmov s1, w11 ++; CHECK-NEXT: fcvtzu w11, d2 ++; CHECK-NEXT: cmp w9, #255 ++; CHECK-NEXT: mov d2, v3.d[1] ++; CHECK-NEXT: mov w12, v0.s[1] ++; CHECK-NEXT: csel w9, w9, w8, lo ++; CHECK-NEXT: mov v1.s[1], w10 ++; CHECK-NEXT: cmp w11, #255 ++; CHECK-NEXT: csel w10, w11, w8, lo ++; CHECK-NEXT: mov v0.b[1], w12 ++; CHECK-NEXT: fmov w11, s1 ++; CHECK-NEXT: fmov s4, w10 ++; CHECK-NEXT: fcvtzu w10, d2 ++; CHECK-NEXT: mov w12, v1.s[1] ++; CHECK-NEXT: mov v0.b[2], w11 ++; CHECK-NEXT: mov v4.s[1], w9 ++; CHECK-NEXT: cmp w10, #255 ++; CHECK-NEXT: csel w9, w10, w8, lo ++; CHECK-NEXT: cmp w13, #255 ++; CHECK-NEXT: csel w8, w13, w8, lo ++; CHECK-NEXT: mov v0.b[3], w12 ++; CHECK-NEXT: fmov w10, s4 ++; CHECK-NEXT: fmov s1, w8 ++; CHECK-NEXT: mov w8, v4.s[1] ++; CHECK-NEXT: mov v0.b[4], w10 ++; CHECK-NEXT: mov v1.s[1], w9 ++; CHECK-NEXT: mov v0.b[5], w8 ++; CHECK-NEXT: fmov w8, s1 ++; CHECK-NEXT: mov w9, v1.s[1] ++; CHECK-NEXT: mov v0.b[6], w8 ++; CHECK-NEXT: mov v0.b[7], w9 ++; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: ret ++ %x = call <8 x i8> @llvm.fptoui.sat.v8f64.v8i8(<8 x double> %f) ++ ret <8 x i8> %x ++} ++ ++define <16 x i8> @test_unsigned_v16f64_v16i8(<16 x double> %f) { ++; CHECK-LABEL: test_unsigned_v16f64_v16i8: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d16, v0.d[1] ++; CHECK-NEXT: fcvtzu w10, d0 ++; CHECK-NEXT: mov d0, v1.d[1] ++; CHECK-NEXT: mov w8, #255 ++; CHECK-NEXT: fcvtzu w12, d1 ++; CHECK-NEXT: mov d1, v2.d[1] ++; CHECK-NEXT: fcvtzu w9, d16 ++; CHECK-NEXT: fcvtzu w11, d0 ++; CHECK-NEXT: cmp w9, #255 ++; CHECK-NEXT: csel w9, w9, w8, lo ++; CHECK-NEXT: cmp w10, #255 ++; CHECK-NEXT: csel w10, w10, w8, lo ++; CHECK-NEXT: cmp w11, #255 ++; CHECK-NEXT: fmov s0, w10 ++; CHECK-NEXT: csel w10, w11, w8, lo ++; CHECK-NEXT: cmp w12, #255 ++; CHECK-NEXT: csel w11, w12, w8, lo ++; CHECK-NEXT: mov 
v0.s[1], w9 ++; CHECK-NEXT: fcvtzu w9, d1 ++; CHECK-NEXT: fmov s1, w11 ++; CHECK-NEXT: fcvtzu w11, d2 ++; CHECK-NEXT: cmp w9, #255 ++; CHECK-NEXT: mov d2, v3.d[1] ++; CHECK-NEXT: mov w12, v0.s[1] ++; CHECK-NEXT: csel w9, w9, w8, lo ++; CHECK-NEXT: mov v1.s[1], w10 ++; CHECK-NEXT: cmp w11, #255 ++; CHECK-NEXT: csel w11, w11, w8, lo ++; CHECK-NEXT: fcvtzu w10, d2 ++; CHECK-NEXT: mov d2, v4.d[1] ++; CHECK-NEXT: mov v0.b[1], w12 ++; CHECK-NEXT: fmov w13, s1 ++; CHECK-NEXT: mov w12, v1.s[1] ++; CHECK-NEXT: fmov s1, w11 ++; CHECK-NEXT: fcvtzu w11, d3 ++; CHECK-NEXT: cmp w10, #255 ++; CHECK-NEXT: mov v0.b[2], w13 ++; CHECK-NEXT: mov v1.s[1], w9 ++; CHECK-NEXT: csel w9, w10, w8, lo ++; CHECK-NEXT: cmp w11, #255 ++; CHECK-NEXT: fcvtzu w10, d2 ++; CHECK-NEXT: csel w11, w11, w8, lo ++; CHECK-NEXT: mov d2, v5.d[1] ++; CHECK-NEXT: mov v0.b[3], w12 ++; CHECK-NEXT: fmov w12, s1 ++; CHECK-NEXT: cmp w10, #255 ++; CHECK-NEXT: mov w13, v1.s[1] ++; CHECK-NEXT: fmov s1, w11 ++; CHECK-NEXT: fcvtzu w11, d4 ++; CHECK-NEXT: mov v0.b[4], w12 ++; CHECK-NEXT: mov v1.s[1], w9 ++; CHECK-NEXT: csel w9, w10, w8, lo ++; CHECK-NEXT: cmp w11, #255 ++; CHECK-NEXT: csel w10, w11, w8, lo ++; CHECK-NEXT: mov v0.b[5], w13 ++; CHECK-NEXT: fcvtzu w13, d2 ++; CHECK-NEXT: fmov w11, s1 ++; CHECK-NEXT: mov w12, v1.s[1] ++; CHECK-NEXT: fmov s1, w10 ++; CHECK-NEXT: fcvtzu w10, d5 ++; CHECK-NEXT: cmp w13, #255 ++; CHECK-NEXT: mov v0.b[6], w11 ++; CHECK-NEXT: mov d2, v6.d[1] ++; CHECK-NEXT: mov v1.s[1], w9 ++; CHECK-NEXT: csel w9, w13, w8, lo ++; CHECK-NEXT: cmp w10, #255 ++; CHECK-NEXT: fcvtzu w13, d6 ++; CHECK-NEXT: csel w10, w10, w8, lo ++; CHECK-NEXT: mov v0.b[7], w12 ++; CHECK-NEXT: fcvtzu w12, d2 ++; CHECK-NEXT: fmov w11, s1 ++; CHECK-NEXT: fmov s2, w10 ++; CHECK-NEXT: mov w10, v1.s[1] ++; CHECK-NEXT: cmp w12, #255 ++; CHECK-NEXT: mov d1, v7.d[1] ++; CHECK-NEXT: mov v0.b[8], w11 ++; CHECK-NEXT: mov v2.s[1], w9 ++; CHECK-NEXT: csel w9, w12, w8, lo ++; CHECK-NEXT: cmp w13, #255 ++; CHECK-NEXT: csel w11, w13, 
w8, lo ++; CHECK-NEXT: fcvtzu w13, d7 ++; CHECK-NEXT: mov v0.b[9], w10 ++; CHECK-NEXT: fmov w10, s2 ++; CHECK-NEXT: fmov s3, w11 ++; CHECK-NEXT: fcvtzu w11, d1 ++; CHECK-NEXT: mov w12, v2.s[1] ++; CHECK-NEXT: mov v0.b[10], w10 ++; CHECK-NEXT: mov v3.s[1], w9 ++; CHECK-NEXT: cmp w11, #255 ++; CHECK-NEXT: csel w9, w11, w8, lo ++; CHECK-NEXT: cmp w13, #255 ++; CHECK-NEXT: csel w8, w13, w8, lo ++; CHECK-NEXT: mov v0.b[11], w12 ++; CHECK-NEXT: fmov w10, s3 ++; CHECK-NEXT: fmov s1, w8 ++; CHECK-NEXT: mov w8, v3.s[1] ++; CHECK-NEXT: mov v0.b[12], w10 ++; CHECK-NEXT: mov v1.s[1], w9 ++; CHECK-NEXT: mov v0.b[13], w8 ++; CHECK-NEXT: fmov w8, s1 ++; CHECK-NEXT: mov w9, v1.s[1] ++; CHECK-NEXT: mov v0.b[14], w8 ++; CHECK-NEXT: mov v0.b[15], w9 ++; CHECK-NEXT: ret ++ %x = call <16 x i8> @llvm.fptoui.sat.v16f64.v16i8(<16 x double> %f) ++ ret <16 x i8> %x ++} ++ ++define <8 x i16> @test_unsigned_v8f64_v8i16(<8 x double> %f) { ++; CHECK-LABEL: test_unsigned_v8f64_v8i16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d5, v0.d[1] ++; CHECK-NEXT: fcvtzu w10, d0 ++; CHECK-NEXT: mov d0, v1.d[1] ++; CHECK-NEXT: mov w8, #65535 ++; CHECK-NEXT: fcvtzu w12, d1 ++; CHECK-NEXT: mov d4, v2.d[1] ++; CHECK-NEXT: fcvtzu w13, d3 ++; CHECK-NEXT: fcvtzu w9, d5 ++; CHECK-NEXT: fcvtzu w11, d0 ++; CHECK-NEXT: cmp w9, w8 ++; CHECK-NEXT: csel w9, w9, w8, lo ++; CHECK-NEXT: cmp w10, w8 ++; CHECK-NEXT: csel w10, w10, w8, lo ++; CHECK-NEXT: cmp w11, w8 ++; CHECK-NEXT: fmov s0, w10 ++; CHECK-NEXT: csel w10, w11, w8, lo ++; CHECK-NEXT: cmp w12, w8 ++; CHECK-NEXT: csel w11, w12, w8, lo ++; CHECK-NEXT: mov v0.s[1], w9 ++; CHECK-NEXT: fcvtzu w9, d4 ++; CHECK-NEXT: fmov s1, w11 ++; CHECK-NEXT: fcvtzu w11, d2 ++; CHECK-NEXT: cmp w9, w8 ++; CHECK-NEXT: mov d2, v3.d[1] ++; CHECK-NEXT: mov w12, v0.s[1] ++; CHECK-NEXT: csel w9, w9, w8, lo ++; CHECK-NEXT: mov v1.s[1], w10 ++; CHECK-NEXT: cmp w11, w8 ++; CHECK-NEXT: csel w10, w11, w8, lo ++; CHECK-NEXT: mov v0.h[1], w12 ++; CHECK-NEXT: fmov w11, s1 ++; CHECK-NEXT: fmov s4, w10 
++; CHECK-NEXT: fcvtzu w10, d2 ++; CHECK-NEXT: mov w12, v1.s[1] ++; CHECK-NEXT: mov v0.h[2], w11 ++; CHECK-NEXT: mov v4.s[1], w9 ++; CHECK-NEXT: cmp w10, w8 ++; CHECK-NEXT: csel w9, w10, w8, lo ++; CHECK-NEXT: cmp w13, w8 ++; CHECK-NEXT: csel w8, w13, w8, lo ++; CHECK-NEXT: mov v0.h[3], w12 ++; CHECK-NEXT: fmov w10, s4 ++; CHECK-NEXT: fmov s1, w8 ++; CHECK-NEXT: mov w8, v4.s[1] ++; CHECK-NEXT: mov v0.h[4], w10 ++; CHECK-NEXT: mov v1.s[1], w9 ++; CHECK-NEXT: mov v0.h[5], w8 ++; CHECK-NEXT: fmov w8, s1 ++; CHECK-NEXT: mov w9, v1.s[1] ++; CHECK-NEXT: mov v0.h[6], w8 ++; CHECK-NEXT: mov v0.h[7], w9 ++; CHECK-NEXT: ret ++ %x = call <8 x i16> @llvm.fptoui.sat.v8f64.v8i16(<8 x double> %f) ++ ret <8 x i16> %x ++} ++ ++define <16 x i16> @test_unsigned_v16f64_v16i16(<16 x double> %f) { ++; CHECK-LABEL: test_unsigned_v16f64_v16i16: ++; CHECK: // %bb.0: ++; CHECK-NEXT: mov d16, v0.d[1] ++; CHECK-NEXT: fcvtzu w9, d0 ++; CHECK-NEXT: mov d0, v1.d[1] ++; CHECK-NEXT: mov d17, v2.d[1] ++; CHECK-NEXT: fcvtzu w10, d1 ++; CHECK-NEXT: mov d1, v3.d[1] ++; CHECK-NEXT: mov w8, #65535 ++; CHECK-NEXT: fcvtzu w12, d2 ++; CHECK-NEXT: fcvtzu w11, d16 ++; CHECK-NEXT: mov d2, v4.d[1] ++; CHECK-NEXT: fcvtzu w13, d0 ++; CHECK-NEXT: fcvtzu w14, d17 ++; CHECK-NEXT: fcvtzu w15, d1 ++; CHECK-NEXT: fcvtzu w16, d3 ++; CHECK-NEXT: cmp w11, w8 ++; CHECK-NEXT: mov d1, v5.d[1] ++; CHECK-NEXT: csel w11, w11, w8, lo ++; CHECK-NEXT: cmp w9, w8 ++; CHECK-NEXT: csel w9, w9, w8, lo ++; CHECK-NEXT: cmp w13, w8 ++; CHECK-NEXT: csel w13, w13, w8, lo ++; CHECK-NEXT: cmp w10, w8 ++; CHECK-NEXT: csel w10, w10, w8, lo ++; CHECK-NEXT: cmp w14, w8 ++; CHECK-NEXT: csel w14, w14, w8, lo ++; CHECK-NEXT: cmp w12, w8 ++; CHECK-NEXT: csel w12, w12, w8, lo ++; CHECK-NEXT: cmp w15, w8 ++; CHECK-NEXT: fcvtzu w17, d2 ++; CHECK-NEXT: fmov s0, w9 ++; CHECK-NEXT: csel w9, w15, w8, lo ++; CHECK-NEXT: fcvtzu w15, d4 ++; CHECK-NEXT: cmp w16, w8 ++; CHECK-NEXT: fcvtzu w18, d1 ++; CHECK-NEXT: csel w16, w16, w8, lo ++; CHECK-NEXT: cmp w17, 
w8 ++; CHECK-NEXT: csel w17, w17, w8, lo ++; CHECK-NEXT: cmp w15, w8 ++; CHECK-NEXT: mov v0.s[1], w11 ++; CHECK-NEXT: fcvtzu w0, d5 ++; CHECK-NEXT: csel w11, w15, w8, lo ++; CHECK-NEXT: fmov s2, w10 ++; CHECK-NEXT: cmp w18, w8 ++; CHECK-NEXT: mov d4, v6.d[1] ++; CHECK-NEXT: csel w10, w18, w8, lo ++; CHECK-NEXT: cmp w0, w8 ++; CHECK-NEXT: fmov s1, w11 ++; CHECK-NEXT: csel w11, w0, w8, lo ++; CHECK-NEXT: mov v2.s[1], w13 ++; CHECK-NEXT: mov w13, v0.s[1] ++; CHECK-NEXT: fcvtzu w15, d4 ++; CHECK-NEXT: mov v1.s[1], w17 ++; CHECK-NEXT: fmov s3, w11 ++; CHECK-NEXT: mov d4, v7.d[1] ++; CHECK-NEXT: mov v0.h[1], w13 ++; CHECK-NEXT: fmov w11, s2 ++; CHECK-NEXT: mov v3.s[1], w10 ++; CHECK-NEXT: cmp w15, w8 ++; CHECK-NEXT: mov w10, v1.s[1] ++; CHECK-NEXT: mov w13, v2.s[1] ++; CHECK-NEXT: fmov s2, w12 ++; CHECK-NEXT: mov v0.h[2], w11 ++; CHECK-NEXT: fcvtzu w11, d6 ++; CHECK-NEXT: csel w12, w15, w8, lo ++; CHECK-NEXT: mov v1.h[1], w10 ++; CHECK-NEXT: fmov w10, s3 ++; CHECK-NEXT: cmp w11, w8 ++; CHECK-NEXT: csel w11, w11, w8, lo ++; CHECK-NEXT: mov v0.h[3], w13 ++; CHECK-NEXT: fcvtzu w13, d7 ++; CHECK-NEXT: mov v1.h[2], w10 ++; CHECK-NEXT: fmov s5, w11 ++; CHECK-NEXT: fcvtzu w10, d4 ++; CHECK-NEXT: mov w11, v3.s[1] ++; CHECK-NEXT: mov v2.s[1], w14 ++; CHECK-NEXT: fmov s3, w16 ++; CHECK-NEXT: mov v5.s[1], w12 ++; CHECK-NEXT: cmp w10, w8 ++; CHECK-NEXT: csel w10, w10, w8, lo ++; CHECK-NEXT: cmp w13, w8 ++; CHECK-NEXT: csel w8, w13, w8, lo ++; CHECK-NEXT: fmov w12, s2 ++; CHECK-NEXT: mov v1.h[3], w11 ++; CHECK-NEXT: fmov w13, s5 ++; CHECK-NEXT: mov w14, v2.s[1] ++; CHECK-NEXT: fmov s2, w8 ++; CHECK-NEXT: mov w11, v5.s[1] ++; CHECK-NEXT: mov v0.h[4], w12 ++; CHECK-NEXT: mov v1.h[4], w13 ++; CHECK-NEXT: mov v3.s[1], w9 ++; CHECK-NEXT: mov v2.s[1], w10 ++; CHECK-NEXT: mov v0.h[5], w14 ++; CHECK-NEXT: mov v1.h[5], w11 ++; CHECK-NEXT: fmov w8, s3 ++; CHECK-NEXT: fmov w9, s2 ++; CHECK-NEXT: mov w10, v3.s[1] ++; CHECK-NEXT: mov w11, v2.s[1] ++; CHECK-NEXT: mov v0.h[6], w8 ++; CHECK-NEXT: 
mov v1.h[6], w9 ++; CHECK-NEXT: mov v0.h[7], w10 ++; CHECK-NEXT: mov v1.h[7], w11 ++; CHECK-NEXT: ret ++ %x = call <16 x i16> @llvm.fptoui.sat.v16f64.v16i16(<16 x double> %f) ++ ret <16 x i16> %x ++} +-- +2.34.1 + diff --git a/patches/cherry/bf268a05cd9294854ffccc3158c0e673069bed4a.patch b/patches/cherry/bf268a05cd9294854ffccc3158c0e673069bed4a.patch new file mode 100644 index 0000000..9e87587 --- /dev/null +++ b/patches/cherry/bf268a05cd9294854ffccc3158c0e673069bed4a.patch @@ -0,0 +1,609 @@ +From bf268a05cd9294854ffccc3158c0e673069bed4a Mon Sep 17 00:00:00 2001 +From: Cullen Rhodes <cullen.rhodes@arm.com> +Date: Fri, 22 Jul 2022 07:27:12 +0000 +Subject: [PATCH] [AArch64] Emit vector FP cmp when LE is used with fast-math + +Reviewed By: paulwalker-arm + +Differential Revision: https://reviews.llvm.org/D130093 +--- + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9 + + .../Target/AArch64/AArch64ISelLowering.cpp | 7 +- + .../AArch64/neon-compare-instructions.ll | 346 ++---------------- + 3 files changed, 43 insertions(+), 319 deletions(-) + +diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +index 06c633e45ccd..803278e34db8 100644 +--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp ++++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +@@ -11843,6 +11843,9 @@ SDValue DAGCombiner::foldSextSetcc(SDNode *N) { + EVT N00VT = N00.getValueType(); + SDLoc DL(N); + ++ // Propagate fast-math-flags. ++ SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags()); ++ + // On some architectures (such as SSE/NEON/etc) the SETCC result type is + // the same size as the compared operands. Try to optimize sext(setcc()) + // if this is the case. +@@ -12384,6 +12387,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { + return V; + + if (N0.getOpcode() == ISD::SETCC) { ++ // Propagate fast-math-flags. ++ SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags()); ++ + // Only do this before legalize for now. 
+ if (!LegalOperations && VT.isVector() && + N0.getValueType().getVectorElementType() == MVT::i1) { +@@ -12575,6 +12581,9 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { + } + + if (N0.getOpcode() == ISD::SETCC) { ++ // Propagate fast-math-flags. ++ SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags()); ++ + // For vectors: + // aext(setcc) -> vsetcc + // aext(setcc) -> truncate(vsetcc) +diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +index 52f026456f02..1f6ce2d381ae 100644 +--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp ++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +@@ -11975,6 +11975,11 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, + if (IsZero) + return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); ++ case AArch64CC::LE: ++ if (!NoNans) ++ return SDValue(); ++ // If we ignore NaNs then we can use to the LS implementation. 
++ LLVM_FALLTHROUGH; + case AArch64CC::LS: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); +@@ -12079,7 +12084,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, + bool ShouldInvert; + changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); + +- bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; ++ bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs(); + SDValue Cmp = + EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); + if (!Cmp.getNode()) +diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll +index dcb0ca631c5b..ec210b4efc7b 100644 +--- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll ++++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll +@@ -4648,17 +4648,7 @@ define <2 x i64> @fcmogt2xdouble_fast(<2 x double> %A, <2 x double> %B) { + define <2 x i32> @fcmole2xfloat_fast(<2 x float> %A, <2 x float> %B) { + ; CHECK-LABEL: fcmole2xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +-; CHECK-NEXT: mov s2, v1.s[1] +-; CHECK-NEXT: mov s3, v0.s[1] +-; CHECK-NEXT: fcmp s3, s2 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: csetm w9, le +-; CHECK-NEXT: fmov s0, w9 +-; CHECK-NEXT: mov v0.s[1], w8 +-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: fcmge v0.2s, v1.2s, v0.2s + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmole2xfloat_fast: +@@ -4675,25 +4665,7 @@ define <2 x i32> @fcmole2xfloat_fast(<2 x float> %A, <2 x float> %B) { + define <4 x i32> @fcmole4xfloat_fast(<4 x float> %A, <4 x float> %B) { + ; CHECK-LABEL: fcmole4xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov s2, v1.s[1] +-; CHECK-NEXT: mov s3, v0.s[1] +-; CHECK-NEXT: mov s4, v0.s[2] +-; CHECK-NEXT: fcmp s3, s2 +-; CHECK-NEXT: mov s3, v1.s[2] +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: mov 
s1, v1.s[3] +-; CHECK-NEXT: mov s0, v0.s[3] +-; CHECK-NEXT: csetm w9, le +-; CHECK-NEXT: fcmp s4, s3 +-; CHECK-NEXT: fmov s2, w9 +-; CHECK-NEXT: mov v2.s[1], w8 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: mov v2.s[2], w8 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: mov v2.s[3], w8 +-; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmole4xfloat_fast: +@@ -4710,14 +4682,7 @@ define <4 x i32> @fcmole4xfloat_fast(<4 x float> %A, <4 x float> %B) { + define <2 x i64> @fcmole2xdouble_fast(<2 x double> %A, <2 x double> %B) { + ; CHECK-LABEL: fcmole2xdouble_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov d2, v1.d[1] +-; CHECK-NEXT: mov d3, v0.d[1] +-; CHECK-NEXT: fcmp d3, d2 +-; CHECK-NEXT: csetm x8, le +-; CHECK-NEXT: fcmp d0, d1 +-; CHECK-NEXT: csetm x9, le +-; CHECK-NEXT: fmov d0, x9 +-; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: fcmge v0.2d, v1.2d, v0.2d + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmole2xdouble_fast: +@@ -4734,17 +4699,7 @@ define <2 x i64> @fcmole2xdouble_fast(<2 x double> %A, <2 x double> %B) { + define <2 x i32> @fcmolt2xfloat_fast(<2 x float> %A, <2 x float> %B) { + ; CHECK-LABEL: fcmolt2xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +-; CHECK-NEXT: mov s2, v1.s[1] +-; CHECK-NEXT: mov s3, v0.s[1] +-; CHECK-NEXT: fcmp s3, s2 +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: csetm w9, lt +-; CHECK-NEXT: fmov s0, w9 +-; CHECK-NEXT: mov v0.s[1], w8 +-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: fcmgt v0.2s, v1.2s, v0.2s + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmolt2xfloat_fast: +@@ -4761,25 +4716,7 @@ define <2 x i32> @fcmolt2xfloat_fast(<2 x float> %A, <2 x float> %B) { + define <4 x i32> @fcmolt4xfloat_fast(<4 x float> %A, <4 x float> %B) { + ; CHECK-LABEL: fcmolt4xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov s2, 
v1.s[1] +-; CHECK-NEXT: mov s3, v0.s[1] +-; CHECK-NEXT: mov s4, v0.s[2] +-; CHECK-NEXT: fcmp s3, s2 +-; CHECK-NEXT: mov s3, v1.s[2] +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: mov s1, v1.s[3] +-; CHECK-NEXT: mov s0, v0.s[3] +-; CHECK-NEXT: csetm w9, lt +-; CHECK-NEXT: fcmp s4, s3 +-; CHECK-NEXT: fmov s2, w9 +-; CHECK-NEXT: mov v2.s[1], w8 +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: mov v2.s[2], w8 +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: mov v2.s[3], w8 +-; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmolt4xfloat_fast: +@@ -4796,14 +4733,7 @@ define <4 x i32> @fcmolt4xfloat_fast(<4 x float> %A, <4 x float> %B) { + define <2 x i64> @fcmolt2xdouble_fast(<2 x double> %A, <2 x double> %B) { + ; CHECK-LABEL: fcmolt2xdouble_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov d2, v1.d[1] +-; CHECK-NEXT: mov d3, v0.d[1] +-; CHECK-NEXT: fcmp d3, d2 +-; CHECK-NEXT: csetm x8, lt +-; CHECK-NEXT: fcmp d0, d1 +-; CHECK-NEXT: csetm x9, lt +-; CHECK-NEXT: fmov d0, x9 +-; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: fcmgt v0.2d, v1.2d, v0.2d + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmolt2xdouble_fast: +@@ -5181,17 +5111,7 @@ define <2 x i64> @fcmugt2xdouble_fast(<2 x double> %A, <2 x double> %B) { + define <2 x i32> @fcmule2xfloat_fast(<2 x float> %A, <2 x float> %B) { + ; CHECK-LABEL: fcmule2xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +-; CHECK-NEXT: mov s2, v1.s[1] +-; CHECK-NEXT: mov s3, v0.s[1] +-; CHECK-NEXT: fcmp s3, s2 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: csetm w9, le +-; CHECK-NEXT: fmov s0, w9 +-; CHECK-NEXT: mov v0.s[1], w8 +-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: fcmge v0.2s, v1.2s, v0.2s + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmule2xfloat_fast: +@@ -5209,25 +5129,7 @@ define <2 x i32> 
@fcmule2xfloat_fast(<2 x float> %A, <2 x float> %B) { + define <4 x i32> @fcmule4xfloat_fast(<4 x float> %A, <4 x float> %B) { + ; CHECK-LABEL: fcmule4xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov s2, v1.s[1] +-; CHECK-NEXT: mov s3, v0.s[1] +-; CHECK-NEXT: mov s4, v0.s[2] +-; CHECK-NEXT: fcmp s3, s2 +-; CHECK-NEXT: mov s3, v1.s[2] +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: mov s1, v1.s[3] +-; CHECK-NEXT: mov s0, v0.s[3] +-; CHECK-NEXT: csetm w9, le +-; CHECK-NEXT: fcmp s4, s3 +-; CHECK-NEXT: fmov s2, w9 +-; CHECK-NEXT: mov v2.s[1], w8 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: mov v2.s[2], w8 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: mov v2.s[3], w8 +-; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmule4xfloat_fast: +@@ -5245,14 +5147,7 @@ define <4 x i32> @fcmule4xfloat_fast(<4 x float> %A, <4 x float> %B) { + define <2 x i64> @fcmule2xdouble_fast(<2 x double> %A, <2 x double> %B) { + ; CHECK-LABEL: fcmule2xdouble_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov d2, v1.d[1] +-; CHECK-NEXT: mov d3, v0.d[1] +-; CHECK-NEXT: fcmp d3, d2 +-; CHECK-NEXT: csetm x8, le +-; CHECK-NEXT: fcmp d0, d1 +-; CHECK-NEXT: csetm x9, le +-; CHECK-NEXT: fmov d0, x9 +-; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: fcmge v0.2d, v1.2d, v0.2d + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmule2xdouble_fast: +@@ -5270,17 +5165,7 @@ define <2 x i64> @fcmule2xdouble_fast(<2 x double> %A, <2 x double> %B) { + define <2 x i32> @fcmult2xfloat_fast(<2 x float> %A, <2 x float> %B) { + ; CHECK-LABEL: fcmult2xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +-; CHECK-NEXT: mov s2, v1.s[1] +-; CHECK-NEXT: mov s3, v0.s[1] +-; CHECK-NEXT: fcmp s3, s2 +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: csetm w9, lt +-; CHECK-NEXT: fmov s0, w9 +-; CHECK-NEXT: mov 
v0.s[1], w8 +-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: fcmgt v0.2s, v1.2s, v0.2s + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmult2xfloat_fast: +@@ -5298,25 +5183,7 @@ define <2 x i32> @fcmult2xfloat_fast(<2 x float> %A, <2 x float> %B) { + define <4 x i32> @fcmult4xfloat_fast(<4 x float> %A, <4 x float> %B) { + ; CHECK-LABEL: fcmult4xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov s2, v1.s[1] +-; CHECK-NEXT: mov s3, v0.s[1] +-; CHECK-NEXT: mov s4, v0.s[2] +-; CHECK-NEXT: fcmp s3, s2 +-; CHECK-NEXT: mov s3, v1.s[2] +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: mov s1, v1.s[3] +-; CHECK-NEXT: mov s0, v0.s[3] +-; CHECK-NEXT: csetm w9, lt +-; CHECK-NEXT: fcmp s4, s3 +-; CHECK-NEXT: fmov s2, w9 +-; CHECK-NEXT: mov v2.s[1], w8 +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: mov v2.s[2], w8 +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: mov v2.s[3], w8 +-; CHECK-NEXT: mov v0.16b, v2.16b ++; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmult4xfloat_fast: +@@ -5334,14 +5201,7 @@ define <4 x i32> @fcmult4xfloat_fast(<4 x float> %A, <4 x float> %B) { + define <2 x i64> @fcmult2xdouble_fast(<2 x double> %A, <2 x double> %B) { + ; CHECK-LABEL: fcmult2xdouble_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov d2, v1.d[1] +-; CHECK-NEXT: mov d3, v0.d[1] +-; CHECK-NEXT: fcmp d3, d2 +-; CHECK-NEXT: csetm x8, lt +-; CHECK-NEXT: fcmp d0, d1 +-; CHECK-NEXT: csetm x9, lt +-; CHECK-NEXT: fmov d0, x9 +-; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: fcmgt v0.2d, v1.2d, v0.2d + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmult2xdouble_fast: +@@ -5567,15 +5427,7 @@ define <2 x i64> @fcmogtz2xdouble_fast(<2 x double> %A) { + define <2 x i32> @fcmoltz2xfloat_fast(<2 x float> %A) { + ; CHECK-LABEL: fcmoltz2xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +-; CHECK-NEXT: mov s1, v0.s[1] +-; CHECK-NEXT: fcmp s1, #0.0 +-; CHECK-NEXT: csetm w8, lt +-; 
CHECK-NEXT: fcmp s0, #0.0 +-; CHECK-NEXT: csetm w9, lt +-; CHECK-NEXT: fmov s0, w9 +-; CHECK-NEXT: mov v0.s[1], w8 +-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: fcmlt v0.2s, v0.2s, #0.0 + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmoltz2xfloat_fast: +@@ -5592,22 +5444,7 @@ define <2 x i32> @fcmoltz2xfloat_fast(<2 x float> %A) { + define <4 x i32> @fcmoltz4xfloat_fast(<4 x float> %A) { + ; CHECK-LABEL: fcmoltz4xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov s1, v0.s[1] +-; CHECK-NEXT: mov s2, v0.s[2] +-; CHECK-NEXT: fcmp s1, #0.0 +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: fcmp s0, #0.0 +-; CHECK-NEXT: mov s0, v0.s[3] +-; CHECK-NEXT: csetm w9, lt +-; CHECK-NEXT: fcmp s2, #0.0 +-; CHECK-NEXT: fmov s1, w9 +-; CHECK-NEXT: mov v1.s[1], w8 +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: fcmp s0, #0.0 +-; CHECK-NEXT: mov v1.s[2], w8 +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: mov v1.s[3], w8 +-; CHECK-NEXT: mov v0.16b, v1.16b ++; CHECK-NEXT: fcmlt v0.4s, v0.4s, #0.0 + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmoltz4xfloat_fast: +@@ -5624,13 +5461,7 @@ define <4 x i32> @fcmoltz4xfloat_fast(<4 x float> %A) { + define <2 x i64> @fcmoltz2xdouble_fast(<2 x double> %A) { + ; CHECK-LABEL: fcmoltz2xdouble_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov d1, v0.d[1] +-; CHECK-NEXT: fcmp d1, #0.0 +-; CHECK-NEXT: csetm x8, lt +-; CHECK-NEXT: fcmp d0, #0.0 +-; CHECK-NEXT: csetm x9, lt +-; CHECK-NEXT: fmov d0, x9 +-; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: fcmlt v0.2d, v0.2d, #0.0 + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmoltz2xdouble_fast: +@@ -5647,15 +5478,7 @@ define <2 x i64> @fcmoltz2xdouble_fast(<2 x double> %A) { + define <2 x i32> @fcmolez2xfloat_fast(<2 x float> %A) { + ; CHECK-LABEL: fcmolez2xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +-; CHECK-NEXT: mov s1, v0.s[1] +-; CHECK-NEXT: fcmp s1, #0.0 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, #0.0 +-; CHECK-NEXT: csetm w9, le +-; CHECK-NEXT: fmov 
s0, w9 +-; CHECK-NEXT: mov v0.s[1], w8 +-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: fcmle v0.2s, v0.2s, #0.0 + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmolez2xfloat_fast: +@@ -5672,22 +5495,7 @@ define <2 x i32> @fcmolez2xfloat_fast(<2 x float> %A) { + define <4 x i32> @fcmolez4xfloat_fast(<4 x float> %A) { + ; CHECK-LABEL: fcmolez4xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov s1, v0.s[1] +-; CHECK-NEXT: mov s2, v0.s[2] +-; CHECK-NEXT: fcmp s1, #0.0 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, #0.0 +-; CHECK-NEXT: mov s0, v0.s[3] +-; CHECK-NEXT: csetm w9, le +-; CHECK-NEXT: fcmp s2, #0.0 +-; CHECK-NEXT: fmov s1, w9 +-; CHECK-NEXT: mov v1.s[1], w8 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, #0.0 +-; CHECK-NEXT: mov v1.s[2], w8 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: mov v1.s[3], w8 +-; CHECK-NEXT: mov v0.16b, v1.16b ++; CHECK-NEXT: fcmle v0.4s, v0.4s, #0.0 + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmolez4xfloat_fast: +@@ -5704,13 +5512,7 @@ define <4 x i32> @fcmolez4xfloat_fast(<4 x float> %A) { + define <2 x i64> @fcmolez2xdouble_fast(<2 x double> %A) { + ; CHECK-LABEL: fcmolez2xdouble_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov d1, v0.d[1] +-; CHECK-NEXT: fcmp d1, #0.0 +-; CHECK-NEXT: csetm x8, le +-; CHECK-NEXT: fcmp d0, #0.0 +-; CHECK-NEXT: csetm x9, le +-; CHECK-NEXT: fmov d0, x9 +-; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: fcmle v0.2d, v0.2d, #0.0 + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmolez2xdouble_fast: +@@ -6018,15 +5820,7 @@ define <2 x i64> @fcmugtz2xdouble_fast(<2 x double> %A) { + define <2 x i32> @fcmultz2xfloat_fast(<2 x float> %A) { + ; CHECK-LABEL: fcmultz2xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +-; CHECK-NEXT: mov s1, v0.s[1] +-; CHECK-NEXT: fcmp s1, #0.0 +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: fcmp s0, #0.0 +-; CHECK-NEXT: csetm w9, lt +-; CHECK-NEXT: fmov s0, w9 +-; CHECK-NEXT: mov v0.s[1], w8 +-; CHECK-NEXT: // kill: def $d0 
killed $d0 killed $q0 ++; CHECK-NEXT: fcmlt v0.2s, v0.2s, #0.0 + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmultz2xfloat_fast: +@@ -6044,22 +5838,7 @@ define <2 x i32> @fcmultz2xfloat_fast(<2 x float> %A) { + define <4 x i32> @fcmultz4xfloat_fast(<4 x float> %A) { + ; CHECK-LABEL: fcmultz4xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov s1, v0.s[1] +-; CHECK-NEXT: mov s2, v0.s[2] +-; CHECK-NEXT: fcmp s1, #0.0 +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: fcmp s0, #0.0 +-; CHECK-NEXT: mov s0, v0.s[3] +-; CHECK-NEXT: csetm w9, lt +-; CHECK-NEXT: fcmp s2, #0.0 +-; CHECK-NEXT: fmov s1, w9 +-; CHECK-NEXT: mov v1.s[1], w8 +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: fcmp s0, #0.0 +-; CHECK-NEXT: mov v1.s[2], w8 +-; CHECK-NEXT: csetm w8, lt +-; CHECK-NEXT: mov v1.s[3], w8 +-; CHECK-NEXT: mov v0.16b, v1.16b ++; CHECK-NEXT: fcmlt v0.4s, v0.4s, #0.0 + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmultz4xfloat_fast: +@@ -6077,13 +5856,7 @@ define <4 x i32> @fcmultz4xfloat_fast(<4 x float> %A) { + define <2 x i64> @fcmultz2xdouble_fast(<2 x double> %A) { + ; CHECK-LABEL: fcmultz2xdouble_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov d1, v0.d[1] +-; CHECK-NEXT: fcmp d1, #0.0 +-; CHECK-NEXT: csetm x8, lt +-; CHECK-NEXT: fcmp d0, #0.0 +-; CHECK-NEXT: csetm x9, lt +-; CHECK-NEXT: fmov d0, x9 +-; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: fcmlt v0.2d, v0.2d, #0.0 + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmultz2xdouble_fast: +@@ -6102,15 +5875,7 @@ define <2 x i64> @fcmultz2xdouble_fast(<2 x double> %A) { + define <2 x i32> @fcmulez2xfloat_fast(<2 x float> %A) { + ; CHECK-LABEL: fcmulez2xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +-; CHECK-NEXT: mov s1, v0.s[1] +-; CHECK-NEXT: fcmp s1, #0.0 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, #0.0 +-; CHECK-NEXT: csetm w9, le +-; CHECK-NEXT: fmov s0, w9 +-; CHECK-NEXT: mov v0.s[1], w8 +-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ++; CHECK-NEXT: fcmle v0.2s, v0.2s, #0.0 + ; 
CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmulez2xfloat_fast: +@@ -6128,22 +5893,7 @@ define <2 x i32> @fcmulez2xfloat_fast(<2 x float> %A) { + define <4 x i32> @fcmulez4xfloat_fast(<4 x float> %A) { + ; CHECK-LABEL: fcmulez4xfloat_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov s1, v0.s[1] +-; CHECK-NEXT: mov s2, v0.s[2] +-; CHECK-NEXT: fcmp s1, #0.0 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, #0.0 +-; CHECK-NEXT: mov s0, v0.s[3] +-; CHECK-NEXT: csetm w9, le +-; CHECK-NEXT: fcmp s2, #0.0 +-; CHECK-NEXT: fmov s1, w9 +-; CHECK-NEXT: mov v1.s[1], w8 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, #0.0 +-; CHECK-NEXT: mov v1.s[2], w8 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: mov v1.s[3], w8 +-; CHECK-NEXT: mov v0.16b, v1.16b ++; CHECK-NEXT: fcmle v0.4s, v0.4s, #0.0 + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmulez4xfloat_fast: +@@ -6161,13 +5911,7 @@ define <4 x i32> @fcmulez4xfloat_fast(<4 x float> %A) { + define <2 x i64> @fcmulez2xdouble_fast(<2 x double> %A) { + ; CHECK-LABEL: fcmulez2xdouble_fast: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov d1, v0.d[1] +-; CHECK-NEXT: fcmp d1, #0.0 +-; CHECK-NEXT: csetm x8, le +-; CHECK-NEXT: fcmp d0, #0.0 +-; CHECK-NEXT: csetm x9, le +-; CHECK-NEXT: fmov d0, x9 +-; CHECK-NEXT: mov v0.d[1], x8 ++; CHECK-NEXT: fcmle v0.2d, v0.2d, #0.0 + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmulez2xdouble_fast: +@@ -6313,26 +6057,9 @@ define <2 x i64> @fcmunoz2xdouble_fast(<2 x double> %A) { + define <4 x i32> @fcmule4xfloat_fast_zext(<4 x float> %A, <4 x float> %B) { + ; CHECK-LABEL: fcmule4xfloat_fast_zext: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov s3, v1.s[1] +-; CHECK-NEXT: mov s4, v0.s[1] + ; CHECK-NEXT: movi v2.4s, #1 +-; CHECK-NEXT: fcmp s4, s3 +-; CHECK-NEXT: mov s3, v1.s[2] +-; CHECK-NEXT: mov s4, v0.s[2] +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: mov s1, v1.s[3] +-; CHECK-NEXT: mov s0, v0.s[3] +-; CHECK-NEXT: csetm w9, le +-; CHECK-NEXT: fcmp s4, s3 +-; CHECK-NEXT: fmov s3, w9 +-; CHECK-NEXT: mov 
v3.s[1], w8 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: mov v3.s[2], w8 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: mov v3.s[3], w8 +-; CHECK-NEXT: and v0.16b, v3.16b, v2.16b ++; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s ++; CHECK-NEXT: and v0.16b, v0.16b, v2.16b + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmule4xfloat_fast_zext: +@@ -6351,25 +6078,8 @@ define <4 x i32> @fcmule4xfloat_fast_zext(<4 x float> %A, <4 x float> %B) { + define <4 x i1> @fcmule4xfloat_fast_aext(<4 x float> %A, <4 x float> %B) { + ; CHECK-LABEL: fcmule4xfloat_fast_aext: + ; CHECK: // %bb.0: +-; CHECK-NEXT: mov s2, v1.s[1] +-; CHECK-NEXT: mov s3, v0.s[1] +-; CHECK-NEXT: fcmp s3, s2 +-; CHECK-NEXT: mov s2, v1.s[2] +-; CHECK-NEXT: mov s3, v0.s[2] +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: mov s1, v1.s[3] +-; CHECK-NEXT: mov s0, v0.s[3] +-; CHECK-NEXT: csetm w9, le +-; CHECK-NEXT: fcmp s3, s2 +-; CHECK-NEXT: fmov s4, w9 +-; CHECK-NEXT: mov v4.s[1], w8 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: fcmp s0, s1 +-; CHECK-NEXT: mov v4.s[2], w8 +-; CHECK-NEXT: csetm w8, le +-; CHECK-NEXT: mov v4.s[3], w8 +-; CHECK-NEXT: xtn v0.4h, v4.4s ++; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s ++; CHECK-NEXT: xtn v0.4h, v0.4s + ; CHECK-NEXT: ret + ; + ; GISEL-LABEL: fcmule4xfloat_fast_aext: +-- +2.34.1 + diff --git a/patches/cherry/d9633d149022054bdac90bd3d03a240dbdb46f7e.patch b/patches/cherry/d9633d149022054bdac90bd3d03a240dbdb46f7e.patch new file mode 100644 index 0000000..51e504b --- /dev/null +++ b/patches/cherry/d9633d149022054bdac90bd3d03a240dbdb46f7e.patch @@ -0,0 +1,408 @@ +From d9633d149022054bdac90bd3d03a240dbdb46f7e Mon Sep 17 00:00:00 2001 +From: David Green <david.green@arm.com> +Date: Mon, 7 Mar 2022 09:42:54 +0000 +Subject: [PATCH] [AArch64] Turn truncating buildvectors into truncates + +When lowering large v16f32->v16i8 fp_to_si_sat, the fp_to_si_sat node is +split several times, creating an illegal v4i8 concat that gets expanded +into a 
BUILD_VECTOR. After some combining and other legalisation, it +ends up the a buildvector that extracts from 4 vectors, looking like +BUILDVECTOR(a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3). That is +really an v16i32->v16i8 truncate in disguise. + +This adds a ReconstructTruncateFromBuildVector method to detect the +pattern, converting it back into the legal "concat(trunc(concat(trunc(a), +trunc(b))), trunc(concat(trunc(c), trunc(d))))" tree. The extracted +nodes could also be v4i16, in which case the truncates are not needed. +All those truncates and concats then become uzip1's, which is much +better than expanding by moving vector lanes around. + +Differential Revision: https://reviews.llvm.org/D119469 +--- + .../Target/AArch64/AArch64ISelLowering.cpp | 56 ++++++++ + .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 57 ++------ + .../test/CodeGen/AArch64/fptoui-sat-vector.ll | 51 ++----- + .../CodeGen/AArch64/neon-extracttruncate.ll | 133 ++---------------- + 4 files changed, 92 insertions(+), 205 deletions(-) + +diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +index 51f17b37a8d6..dd421970e99f 100644 +--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp ++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +@@ -9252,6 +9252,56 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { + return true; + } + ++// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from ++// v4i32s. This is really a truncate, which we can construct out of (legal) ++// concats and truncate nodes. ++static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) { ++ if (V.getValueType() != MVT::v16i8) ++ return SDValue(); ++ assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR"); ++ ++ for (unsigned X = 0; X < 4; X++) { ++ // Check the first item in each group is an extract from lane 0 of a v4i32 ++ // or v4i16. 
++ SDValue BaseExt = V.getOperand(X * 4); ++ if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || ++ (BaseExt.getOperand(0).getValueType() != MVT::v4i16 && ++ BaseExt.getOperand(0).getValueType() != MVT::v4i32) || ++ !isa<ConstantSDNode>(BaseExt.getOperand(1)) || ++ BaseExt.getConstantOperandVal(1) != 0) ++ return SDValue(); ++ SDValue Base = BaseExt.getOperand(0); ++ // And check the other items are extracts from the same vector. ++ for (unsigned Y = 1; Y < 4; Y++) { ++ SDValue Ext = V.getOperand(X * 4 + Y); ++ if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT || ++ Ext.getOperand(0) != Base || ++ !isa<ConstantSDNode>(Ext.getOperand(1)) || ++ Ext.getConstantOperandVal(1) != Y) ++ return SDValue(); ++ } ++ } ++ ++ // Turn the buildvector into a series of truncates and concates, which will ++ // become uzip1's. Any v4i32s we found get truncated to v4i16, which are ++ // concat together to produce 2 v8i16. These are both truncated and concat ++ // together. ++ SDLoc DL(V); ++ SDValue Trunc[4] = { ++ V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0), ++ V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)}; ++ for (int I = 0; I < 4; I++) ++ if (Trunc[I].getValueType() == MVT::v4i32) ++ Trunc[I] = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, Trunc[I]); ++ SDValue Concat0 = ++ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]); ++ SDValue Concat1 = ++ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]); ++ SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0); ++ SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1); ++ return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1); ++} ++ + /// Check if a vector shuffle corresponds to a DUP instructions with a larger + /// element width than the vector lane type. 
If that is the case the function + /// returns true and writes the value of the DUP instruction lane operand into +@@ -10871,6 +10921,12 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, + return SDValue(); + } + ++ // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from ++ // v4i32s. This is really a truncate, which we can construct out of (legal) ++ // concats and truncate nodes. ++ if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG)) ++ return M; ++ + // Empirical tests suggest this is rarely worth it for vectors of length <= 2. + if (NumElts >= 4) { + if (SDValue shuffle = ReconstructShuffle(Op, DAG)) +diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +index ebfe8e4a20d0..244c65312e0e 100644 +--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll ++++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +@@ -3004,55 +3004,22 @@ define <16 x i8> @test_signed_v16f32_v16i8(<16 x float> %f) { + ; CHECK-LABEL: test_signed_v16f32_v16i8: + ; CHECK: // %bb.0: + ; CHECK-NEXT: movi v4.4s, #127 ++; CHECK-NEXT: fcvtzs v3.4s, v3.4s ++; CHECK-NEXT: fcvtzs v2.4s, v2.4s ++; CHECK-NEXT: fcvtzs v1.4s, v1.4s + ; CHECK-NEXT: fcvtzs v0.4s, v0.4s + ; CHECK-NEXT: mvni v5.4s, #127 +-; CHECK-NEXT: fcvtzs v1.4s, v1.4s +-; CHECK-NEXT: fcvtzs v2.4s, v2.4s +-; CHECK-NEXT: smin v0.4s, v0.4s, v4.4s +-; CHECK-NEXT: smin v1.4s, v1.4s, v4.4s ++; CHECK-NEXT: smin v3.4s, v3.4s, v4.4s + ; CHECK-NEXT: smin v2.4s, v2.4s, v4.4s +-; CHECK-NEXT: smax v0.4s, v0.4s, v5.4s +-; CHECK-NEXT: smax v1.4s, v1.4s, v5.4s +-; CHECK-NEXT: smax v2.4s, v2.4s, v5.4s +-; CHECK-NEXT: xtn v6.4h, v0.4s +-; CHECK-NEXT: umov w8, v6.h[0] +-; CHECK-NEXT: umov w9, v6.h[1] +-; CHECK-NEXT: xtn v1.4h, v1.4s +-; CHECK-NEXT: fmov s0, w8 +-; CHECK-NEXT: umov w8, v6.h[2] +-; CHECK-NEXT: mov v0.b[1], w9 +-; CHECK-NEXT: mov v0.b[2], w8 +-; CHECK-NEXT: umov w8, v6.h[3] +-; CHECK-NEXT: mov v0.b[3], w8 +-; CHECK-NEXT: umov w8, v1.h[0] +-; 
CHECK-NEXT: mov v0.b[4], w8 +-; CHECK-NEXT: umov w8, v1.h[1] +-; CHECK-NEXT: mov v0.b[5], w8 +-; CHECK-NEXT: umov w8, v1.h[2] +-; CHECK-NEXT: mov v0.b[6], w8 +-; CHECK-NEXT: umov w8, v1.h[3] +-; CHECK-NEXT: xtn v1.4h, v2.4s +-; CHECK-NEXT: fcvtzs v2.4s, v3.4s +-; CHECK-NEXT: mov v0.b[7], w8 +-; CHECK-NEXT: umov w8, v1.h[0] +-; CHECK-NEXT: smin v2.4s, v2.4s, v4.4s +-; CHECK-NEXT: mov v0.b[8], w8 +-; CHECK-NEXT: umov w8, v1.h[1] ++; CHECK-NEXT: smin v1.4s, v1.4s, v4.4s ++; CHECK-NEXT: smin v0.4s, v0.4s, v4.4s ++; CHECK-NEXT: smax v3.4s, v3.4s, v5.4s + ; CHECK-NEXT: smax v2.4s, v2.4s, v5.4s +-; CHECK-NEXT: mov v0.b[9], w8 +-; CHECK-NEXT: umov w8, v1.h[2] +-; CHECK-NEXT: mov v0.b[10], w8 +-; CHECK-NEXT: umov w8, v1.h[3] +-; CHECK-NEXT: xtn v1.4h, v2.4s +-; CHECK-NEXT: mov v0.b[11], w8 +-; CHECK-NEXT: umov w8, v1.h[0] +-; CHECK-NEXT: mov v0.b[12], w8 +-; CHECK-NEXT: umov w8, v1.h[1] +-; CHECK-NEXT: mov v0.b[13], w8 +-; CHECK-NEXT: umov w8, v1.h[2] +-; CHECK-NEXT: mov v0.b[14], w8 +-; CHECK-NEXT: umov w8, v1.h[3] +-; CHECK-NEXT: mov v0.b[15], w8 ++; CHECK-NEXT: smax v1.4s, v1.4s, v5.4s ++; CHECK-NEXT: smax v0.4s, v0.4s, v5.4s ++; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h ++; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ++; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b + ; CHECK-NEXT: ret + %x = call <16 x i8> @llvm.fptosi.sat.v16f32.v16i8(<16 x float> %f) + ret <16 x i8> %x +diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +index cbb8b8a51126..d8d4b6f8b98c 100644 +--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll ++++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +@@ -2515,50 +2515,17 @@ define <16 x i8> @test_unsigned_v16f32_v16i8(<16 x float> %f) { + ; CHECK-LABEL: test_unsigned_v16f32_v16i8: + ; CHECK: // %bb.0: + ; CHECK-NEXT: movi v4.2d, #0x0000ff000000ff +-; CHECK-NEXT: fcvtzu v0.4s, v0.4s +-; CHECK-NEXT: fcvtzu v1.4s, v1.4s ++; CHECK-NEXT: fcvtzu v3.4s, v3.4s + ; CHECK-NEXT: fcvtzu v2.4s, v2.4s +-; CHECK-NEXT: umin 
v0.4s, v0.4s, v4.4s +-; CHECK-NEXT: umin v1.4s, v1.4s, v4.4s +-; CHECK-NEXT: umin v2.4s, v2.4s, v4.4s +-; CHECK-NEXT: xtn v5.4h, v0.4s +-; CHECK-NEXT: xtn v1.4h, v1.4s +-; CHECK-NEXT: umov w8, v5.h[0] +-; CHECK-NEXT: umov w9, v5.h[1] +-; CHECK-NEXT: fmov s0, w8 +-; CHECK-NEXT: umov w8, v5.h[2] +-; CHECK-NEXT: mov v0.b[1], w9 +-; CHECK-NEXT: mov v0.b[2], w8 +-; CHECK-NEXT: umov w8, v5.h[3] +-; CHECK-NEXT: mov v0.b[3], w8 +-; CHECK-NEXT: umov w8, v1.h[0] +-; CHECK-NEXT: mov v0.b[4], w8 +-; CHECK-NEXT: umov w8, v1.h[1] +-; CHECK-NEXT: mov v0.b[5], w8 +-; CHECK-NEXT: umov w8, v1.h[2] +-; CHECK-NEXT: mov v0.b[6], w8 +-; CHECK-NEXT: umov w8, v1.h[3] +-; CHECK-NEXT: xtn v1.4h, v2.4s +-; CHECK-NEXT: fcvtzu v2.4s, v3.4s +-; CHECK-NEXT: mov v0.b[7], w8 +-; CHECK-NEXT: umov w8, v1.h[0] ++; CHECK-NEXT: fcvtzu v1.4s, v1.4s ++; CHECK-NEXT: fcvtzu v0.4s, v0.4s ++; CHECK-NEXT: umin v3.4s, v3.4s, v4.4s + ; CHECK-NEXT: umin v2.4s, v2.4s, v4.4s +-; CHECK-NEXT: mov v0.b[8], w8 +-; CHECK-NEXT: umov w8, v1.h[1] +-; CHECK-NEXT: mov v0.b[9], w8 +-; CHECK-NEXT: umov w8, v1.h[2] +-; CHECK-NEXT: mov v0.b[10], w8 +-; CHECK-NEXT: umov w8, v1.h[3] +-; CHECK-NEXT: xtn v1.4h, v2.4s +-; CHECK-NEXT: mov v0.b[11], w8 +-; CHECK-NEXT: umov w8, v1.h[0] +-; CHECK-NEXT: mov v0.b[12], w8 +-; CHECK-NEXT: umov w8, v1.h[1] +-; CHECK-NEXT: mov v0.b[13], w8 +-; CHECK-NEXT: umov w8, v1.h[2] +-; CHECK-NEXT: mov v0.b[14], w8 +-; CHECK-NEXT: umov w8, v1.h[3] +-; CHECK-NEXT: mov v0.b[15], w8 ++; CHECK-NEXT: umin v1.4s, v1.4s, v4.4s ++; CHECK-NEXT: umin v0.4s, v0.4s, v4.4s ++; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h ++; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ++; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b + ; CHECK-NEXT: ret + %x = call <16 x i8> @llvm.fptoui.sat.v16f32.v16i8(<16 x float> %f) + ret <16 x i8> %x +diff --git a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll +index 14cc333120c7..dd7dd44bedf7 100644 +--- a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll 
++++ b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll +@@ -84,43 +84,13 @@ entry: + define <16 x i8> @extract_4_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { + ; CHECK-LABEL: extract_4_v4i16: + ; CHECK: // %bb.0: // %entry +-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +-; CHECK-NEXT: umov w9, v0.h[0] +-; CHECK-NEXT: umov w10, v0.h[1] +-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 + ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +-; CHECK-NEXT: umov w8, v2.h[0] ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 + ; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 +-; CHECK-NEXT: fmov s4, w9 +-; CHECK-NEXT: umov w9, v0.h[2] +-; CHECK-NEXT: mov v4.b[1], w10 +-; CHECK-NEXT: umov w10, v0.h[3] +-; CHECK-NEXT: mov v4.b[2], w9 +-; CHECK-NEXT: umov w9, v1.h[0] +-; CHECK-NEXT: mov v4.b[3], w10 +-; CHECK-NEXT: umov w10, v1.h[1] +-; CHECK-NEXT: mov v4.b[4], w9 +-; CHECK-NEXT: umov w9, v1.h[2] +-; CHECK-NEXT: mov v4.b[5], w10 +-; CHECK-NEXT: umov w10, v1.h[3] +-; CHECK-NEXT: mov v4.b[6], w9 +-; CHECK-NEXT: umov w9, v2.h[1] +-; CHECK-NEXT: mov v4.b[7], w10 +-; CHECK-NEXT: mov v4.b[8], w8 +-; CHECK-NEXT: umov w8, v2.h[2] +-; CHECK-NEXT: mov v4.b[9], w9 +-; CHECK-NEXT: umov w9, v2.h[3] +-; CHECK-NEXT: mov v4.b[10], w8 +-; CHECK-NEXT: umov w8, v3.h[0] +-; CHECK-NEXT: mov v4.b[11], w9 +-; CHECK-NEXT: umov w9, v3.h[1] +-; CHECK-NEXT: mov v4.b[12], w8 +-; CHECK-NEXT: umov w8, v3.h[2] +-; CHECK-NEXT: mov v4.b[13], w9 +-; CHECK-NEXT: umov w9, v3.h[3] +-; CHECK-NEXT: mov v4.b[14], w8 +-; CHECK-NEXT: mov v4.b[15], w9 +-; CHECK-NEXT: mov v0.16b, v4.16b ++; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ++; CHECK-NEXT: mov v2.d[1], v3.d[0] ++; CHECK-NEXT: mov v0.d[1], v1.d[0] ++; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b + ; CHECK-NEXT: ret + entry: + %a0 = extractelement <4 x i16> %a, i32 0 +@@ -177,36 +147,9 @@ entry: + define <16 x i8> @extract_4_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { + ; CHECK-LABEL: extract_4_v4i32: + ; CHECK: 
// %bb.0: // %entry +-; CHECK-NEXT: mov w8, v0.s[1] +-; CHECK-NEXT: mov w9, v0.s[2] +-; CHECK-NEXT: mov w10, v0.s[3] +-; CHECK-NEXT: mov v0.b[1], w8 +-; CHECK-NEXT: fmov w8, s1 +-; CHECK-NEXT: mov v0.b[2], w9 +-; CHECK-NEXT: mov w9, v1.s[1] +-; CHECK-NEXT: mov v0.b[3], w10 +-; CHECK-NEXT: mov v0.b[4], w8 +-; CHECK-NEXT: mov w8, v1.s[2] +-; CHECK-NEXT: mov v0.b[5], w9 +-; CHECK-NEXT: mov w9, v1.s[3] +-; CHECK-NEXT: mov v0.b[6], w8 +-; CHECK-NEXT: fmov w8, s2 +-; CHECK-NEXT: mov v0.b[7], w9 +-; CHECK-NEXT: mov w9, v2.s[1] +-; CHECK-NEXT: mov v0.b[8], w8 +-; CHECK-NEXT: mov w8, v2.s[2] +-; CHECK-NEXT: mov v0.b[9], w9 +-; CHECK-NEXT: mov w9, v2.s[3] +-; CHECK-NEXT: mov v0.b[10], w8 +-; CHECK-NEXT: fmov w8, s3 +-; CHECK-NEXT: mov v0.b[11], w9 +-; CHECK-NEXT: mov w9, v3.s[1] +-; CHECK-NEXT: mov v0.b[12], w8 +-; CHECK-NEXT: mov w8, v3.s[2] +-; CHECK-NEXT: mov v0.b[13], w9 +-; CHECK-NEXT: mov w9, v3.s[3] +-; CHECK-NEXT: mov v0.b[14], w8 +-; CHECK-NEXT: mov v0.b[15], w9 ++; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h ++; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ++; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b + ; CHECK-NEXT: ret + entry: + %a0 = extractelement <4 x i32> %a, i32 0 +@@ -263,41 +206,12 @@ entry: + define <16 x i8> @extract_4_mixed(<4 x i16> %a, <4 x i32> %b, <4 x i32> %c, <4 x i16> %d) { + ; CHECK-LABEL: extract_4_mixed: + ; CHECK: // %bb.0: // %entry +-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +-; CHECK-NEXT: umov w8, v0.h[0] +-; CHECK-NEXT: umov w9, v0.h[1] ++; CHECK-NEXT: xtn v2.4h, v2.4s + ; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 +-; CHECK-NEXT: fmov s4, w8 +-; CHECK-NEXT: umov w8, v0.h[2] +-; CHECK-NEXT: mov v4.b[1], w9 +-; CHECK-NEXT: umov w9, v0.h[3] +-; CHECK-NEXT: mov v4.b[2], w8 +-; CHECK-NEXT: fmov w8, s1 +-; CHECK-NEXT: mov v4.b[3], w9 +-; CHECK-NEXT: mov w9, v1.s[1] +-; CHECK-NEXT: mov v4.b[4], w8 +-; CHECK-NEXT: mov w8, v1.s[2] +-; CHECK-NEXT: mov v4.b[5], w9 +-; CHECK-NEXT: mov w9, v1.s[3] +-; CHECK-NEXT: mov v4.b[6], w8 +-; CHECK-NEXT: fmov 
w8, s2 +-; CHECK-NEXT: mov v4.b[7], w9 +-; CHECK-NEXT: mov w9, v2.s[1] +-; CHECK-NEXT: mov v4.b[8], w8 +-; CHECK-NEXT: mov w8, v2.s[2] +-; CHECK-NEXT: mov v4.b[9], w9 +-; CHECK-NEXT: mov w9, v2.s[3] +-; CHECK-NEXT: mov v4.b[10], w8 +-; CHECK-NEXT: umov w8, v3.h[0] +-; CHECK-NEXT: mov v4.b[11], w9 +-; CHECK-NEXT: umov w9, v3.h[1] +-; CHECK-NEXT: mov v4.b[12], w8 +-; CHECK-NEXT: umov w8, v3.h[2] +-; CHECK-NEXT: mov v4.b[13], w9 +-; CHECK-NEXT: umov w9, v3.h[3] +-; CHECK-NEXT: mov v4.b[14], w8 +-; CHECK-NEXT: mov v4.b[15], w9 +-; CHECK-NEXT: mov v0.16b, v4.16b ++; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ++; CHECK-NEXT: xtn2 v0.8h, v1.4s ++; CHECK-NEXT: mov v2.d[1], v3.d[0] ++; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b + ; CHECK-NEXT: ret + entry: + %a0 = extractelement <4 x i16> %a, i32 0 +@@ -440,25 +354,8 @@ entry: + define <16 x i8> @extract_4_v4i32_one(<4 x i32> %a) { + ; CHECK-LABEL: extract_4_v4i32_one: + ; CHECK: // %bb.0: // %entry +-; CHECK-NEXT: mov w8, v0.s[1] +-; CHECK-NEXT: fmov w9, s0 +-; CHECK-NEXT: mov w10, v0.s[2] +-; CHECK-NEXT: mov w11, v0.s[3] +-; CHECK-NEXT: mov v0.b[1], w8 +-; CHECK-NEXT: mov v0.b[2], w10 +-; CHECK-NEXT: mov v0.b[3], w11 +-; CHECK-NEXT: mov v0.b[4], w9 +-; CHECK-NEXT: mov v0.b[5], w8 +-; CHECK-NEXT: mov v0.b[6], w10 +-; CHECK-NEXT: mov v0.b[7], w11 +-; CHECK-NEXT: mov v0.b[8], w9 +-; CHECK-NEXT: mov v0.b[9], w8 +-; CHECK-NEXT: mov v0.b[10], w10 +-; CHECK-NEXT: mov v0.b[11], w11 +-; CHECK-NEXT: mov v0.b[12], w9 +-; CHECK-NEXT: mov v0.b[13], w8 +-; CHECK-NEXT: mov v0.b[14], w10 +-; CHECK-NEXT: mov v0.b[15], w11 ++; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h ++; CHECK-NEXT: uzp1 v0.16b, v0.16b, v0.16b + ; CHECK-NEXT: ret + entry: + %a0 = extractelement <4 x i32> %a, i32 0 +-- +2.34.1 + |