diff options
author | David LeGare <legare@google.com> | 2022-03-03 03:14:37 +0000 |
---|---|---|
committer | Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> | 2022-03-03 03:14:37 +0000 |
commit | fcbcefbbc5486d3fa1bf9ac6ce6bfd66fbbf750e (patch) | |
tree | b6854ef9548514cf0cbdd9cde93c3870fc5d7077 | |
parent | da247c27f63828ecaf579e5ca275a0428a7f453e (diff) | |
parent | 9a5a3f8d7a82e1094e9abf277468e6f8d140b9f1 (diff) | |
download | ppv-lite86-android13-mainline-go-adservices-release.tar.gz |
Update ppv-lite86 to 0.2.16 am: b990c176a5 am: 46785d4ae8 am: 9a5a3f8d7at_frc_odp_330442040t_frc_odp_330442000t_frc_ase_330444010android-13.0.0_r30android-13.0.0_r29android-13.0.0_r28android-13.0.0_r27android-13.0.0_r24android-13.0.0_r23android-13.0.0_r22android-13.0.0_r21android-13.0.0_r20android-13.0.0_r19android-13.0.0_r18android-13.0.0_r17android-13.0.0_r16aml_go_odp_330912000aml_go_ads_330915100aml_go_ads_330915000aml_go_ads_330913000android13-qpr1-s8-releaseandroid13-qpr1-s7-releaseandroid13-qpr1-s6-releaseandroid13-qpr1-s5-releaseandroid13-qpr1-s4-releaseandroid13-qpr1-s3-releaseandroid13-qpr1-s2-releaseandroid13-qpr1-s1-releaseandroid13-qpr1-releaseandroid13-mainline-go-adservices-releaseandroid13-frc-odp-releaseandroid13-dev
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/ppv-lite86/+/2005933
Change-Id: I892b5c10849aa4f168c513c451526de92151ced6
-rw-r--r-- | .cargo_vcs_info.json | 7 | ||||
-rw-r--r-- | Android.bp | 4 | ||||
-rw-r--r-- | CHANGELOG.md | 10 | ||||
-rw-r--r-- | Cargo.toml | 11 | ||||
-rw-r--r-- | Cargo.toml.orig | 2 | ||||
-rw-r--r-- | METADATA | 10 | ||||
-rw-r--r-- | src/generic.rs | 122 | ||||
-rw-r--r-- | src/soft.rs | 81 | ||||
-rw-r--r-- | src/types.rs | 22 | ||||
-rw-r--r-- | src/x86_64/mod.rs | 30 | ||||
-rw-r--r-- | src/x86_64/sse2.rs | 338 |
11 files changed, 413 insertions, 224 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json index adb1fc4..e6ee0e5 100644 --- a/.cargo_vcs_info.json +++ b/.cargo_vcs_info.json @@ -1,5 +1,6 @@ { "git": { - "sha1": "3012849c2d9c50228a780031e7c200b193a6b4fa" - } -} + "sha1": "4b1e1d655d05c9da29aa833ce705feedb3da760b" + }, + "path_in_vcs": "utils-simd/ppv-lite86" +}
\ No newline at end of file @@ -42,7 +42,7 @@ rust_library { host_supported: true, crate_name: "ppv_lite86", cargo_env_compat: true, - cargo_pkg_version: "0.2.10", + cargo_pkg_version: "0.2.16", srcs: ["src/lib.rs"], edition: "2018", features: [ @@ -60,7 +60,7 @@ rust_test { host_supported: true, crate_name: "ppv_lite86", cargo_env_compat: true, - cargo_pkg_version: "0.2.10", + cargo_pkg_version: "0.2.16", srcs: ["src/lib.rs"], test_suites: ["general-tests"], auto_gen_config: true, diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..6e34be3 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,10 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.2.16] +### Added +- add [u64; 4] conversion for generic vec256, to support BLAKE on non-x86. +- impl `From` (rather than just `Into`) for conversions between `*_storage` types and arrays. @@ -3,17 +3,16 @@ # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies -# to registry (e.g., crates.io) dependencies +# to registry (e.g., crates.io) dependencies. # -# If you believe there's an error in this file please file an -# issue against the rust-lang/cargo repository. If you're -# editing this file be aware that the upstream Cargo.toml -# will likely look very different (and much more reasonable) +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. [package] edition = "2018" name = "ppv-lite86" -version = "0.2.10" +version = "0.2.16" authors = ["The CryptoCorrosion Contributors"] description = "Implementation of the crypto-simd API for x86" keywords = ["crypto", "simd", "x86"] diff --git a/Cargo.toml.orig b/Cargo.toml.orig index 8f3fb52..b457f54 100644 --- a/Cargo.toml.orig +++ b/Cargo.toml.orig @@ -1,6 +1,6 @@ [package] name = "ppv-lite86" -version = "0.2.10" +version = "0.2.16" authors = ["The CryptoCorrosion Contributors"] edition = "2018" license = "MIT/Apache-2.0" @@ -7,13 +7,13 @@ third_party { } url { type: ARCHIVE - value: "https://static.crates.io/crates/ppv-lite86/ppv-lite86-0.2.10.crate" + value: "https://static.crates.io/crates/ppv-lite86/ppv-lite86-0.2.16.crate" } - version: "0.2.10" + version: "0.2.16" license_type: NOTICE last_upgrade_date { - year: 2020 - month: 11 - day: 2 + year: 2022 + month: 3 + day: 1 } } diff --git a/src/generic.rs b/src/generic.rs index f0e83d9..add6c48 100644 --- a/src/generic.rs +++ b/src/generic.rs @@ -11,38 +11,38 @@ pub union vec128_storage { q: [u64; 2], } impl From<[u32; 4]> for vec128_storage { - #[inline] + #[inline(always)] fn from(d: [u32; 4]) -> Self { Self { d } } } impl From<vec128_storage> for [u32; 4] { - #[inline] + #[inline(always)] fn from(d: vec128_storage) -> Self { unsafe { d.d } } } impl From<[u64; 2]> for vec128_storage { - #[inline] + #[inline(always)] fn from(q: [u64; 2]) -> Self { Self { q } } } impl From<vec128_storage> for [u64; 2] { - #[inline] + #[inline(always)] fn from(q: vec128_storage) -> Self { unsafe { q.q } } } impl Default for vec128_storage { - #[inline] + #[inline(always)] fn default() -> Self { Self { q: [0, 0] } } } impl Eq for vec128_storage {} impl PartialEq<vec128_storage> for vec128_storage { - #[inline] + #[inline(always)] fn eq(&self, rhs: &Self) -> bool { unsafe { self.q == rhs.q } } @@ -62,13 +62,21 @@ impl vec256_storage { } } impl From<vec256_storage> for [u64; 4] { - #[inline] + #[inline(always)] fn from(q: vec256_storage) -> Self { let [a, b]: [u64; 2] = q.v128[0].into(); let [c, d]: [u64; 2] = q.v128[1].into(); [a, b, c, d] } } +impl From<[u64; 4]> for vec256_storage { + #[inline(always)] + fn from([a, b, c, d]: [u64; 4]) -> Self { + Self { + v128: [[a, b].into(), [c, d].into()], + } + } +} #[derive(Clone, Copy, PartialEq, Eq, Default)] pub struct vec512_storage { v128: [vec128_storage; 4], @@ -84,6 +92,7 @@ impl vec512_storage { } } +#[inline(always)] fn dmap<T, F>(t: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, @@ -117,6 +126,7 @@ where unsafe { T::unpack(d) } } +#[inline(always)] fn qmap<T, F>(t: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, @@ -130,6 +140,7 @@ where unsafe { T::unpack(q) } } +#[inline(always)] fn qmap2<T, F>(a: T, b: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, @@ -145,14 +156,17 @@ where unsafe { T::unpack(q) } } +#[inline(always)] fn o_of_q(q: [u64; 2]) -> u128 { u128::from(q[0]) | (u128::from(q[1]) << 64) } +#[inline(always)] fn q_of_o(o: u128) -> [u64; 2] { [o as u64, (o >> 64) as u64] } +#[inline(always)] fn omap<T, F>(a: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, @@ -164,6 +178,7 @@ where unsafe { T::unpack(o) } } +#[inline(always)] fn omap2<T, F>(a: T, b: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, @@ -247,39 +262,39 @@ macro_rules! impl_bitops { } impl Swap64 for $vec { - #[inline] + #[inline(always)] fn swap1(self) -> Self { qmap(self, |x| { ((x & 0x5555555555555555) << 1) | ((x & 0xaaaaaaaaaaaaaaaa) >> 1) }) } - #[inline] + #[inline(always)] fn swap2(self) -> Self { qmap(self, |x| { ((x & 0x3333333333333333) << 2) | ((x & 0xcccccccccccccccc) >> 2) }) } - #[inline] + #[inline(always)] fn swap4(self) -> Self { qmap(self, |x| { ((x & 0x0f0f0f0f0f0f0f0f) << 4) | ((x & 0xf0f0f0f0f0f0f0f0) >> 4) }) } - #[inline] + #[inline(always)] fn swap8(self) -> Self { qmap(self, |x| { ((x & 0x00ff00ff00ff00ff) << 8) | ((x & 0xff00ff00ff00ff00) >> 8) }) } - #[inline] + #[inline(always)] fn swap16(self) -> Self { dmap(self, |x| x.rotate_left(16)) } - #[inline] + #[inline(always)] fn swap32(self) -> Self { qmap(self, |x| x.rotate_left(32)) } - #[inline] + #[inline(always)] fn swap64(self) -> Self { omap(self, |x| (x << 64) | (x >> 64)) } @@ -291,82 +306,83 @@ impl_bitops!(u64x2_generic); impl_bitops!(u128x1_generic); impl RotateEachWord32 for u32x4_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right7(self) -> Self { dmap(self, |x| x.rotate_right(7)) } - #[inline] + #[inline(always)] fn rotate_each_word_right8(self) -> Self { dmap(self, |x| x.rotate_right(8)) } - #[inline] + #[inline(always)] fn rotate_each_word_right11(self) -> Self { dmap(self, |x| x.rotate_right(11)) } - #[inline] + #[inline(always)] fn rotate_each_word_right12(self) -> Self { dmap(self, |x| x.rotate_right(12)) } - #[inline] + #[inline(always)] fn rotate_each_word_right16(self) -> Self { dmap(self, |x| x.rotate_right(16)) } - #[inline] + #[inline(always)] fn rotate_each_word_right20(self) -> Self { dmap(self, |x| x.rotate_right(20)) } - #[inline] + #[inline(always)] fn rotate_each_word_right24(self) -> Self { dmap(self, |x| x.rotate_right(24)) } - #[inline] + #[inline(always)] fn rotate_each_word_right25(self) -> Self { dmap(self, |x| x.rotate_right(25)) } } impl RotateEachWord32 for u64x2_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right7(self) -> Self { qmap(self, |x| x.rotate_right(7)) } - #[inline] + #[inline(always)] fn rotate_each_word_right8(self) -> Self { qmap(self, |x| x.rotate_right(8)) } - #[inline] + #[inline(always)] fn rotate_each_word_right11(self) -> Self { qmap(self, |x| x.rotate_right(11)) } - #[inline] + #[inline(always)] fn rotate_each_word_right12(self) -> Self { qmap(self, |x| x.rotate_right(12)) } - #[inline] + #[inline(always)] fn rotate_each_word_right16(self) -> Self { qmap(self, |x| x.rotate_right(16)) } - #[inline] + #[inline(always)] fn rotate_each_word_right20(self) -> Self { qmap(self, |x| x.rotate_right(20)) } - #[inline] + #[inline(always)] fn rotate_each_word_right24(self) -> Self { qmap(self, |x| x.rotate_right(24)) } - #[inline] + #[inline(always)] fn rotate_each_word_right25(self) -> Self { qmap(self, |x| x.rotate_right(25)) } } impl RotateEachWord64 for u64x2_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right32(self) -> Self { qmap(self, |x| x.rotate_right(32)) } } // workaround for koute/cargo-web#52 (u128::rotate_* broken with cargo web) +#[inline(always)] fn rotate_u128_right(x: u128, i: u32) -> u128 { (x >> i) | (x << (128 - i)) } @@ -377,41 +393,41 @@ fn test_rotate_u128() { } impl RotateEachWord32 for u128x1_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right7(self) -> Self { Self([rotate_u128_right(self.0[0], 7)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right8(self) -> Self { Self([rotate_u128_right(self.0[0], 8)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right11(self) -> Self { Self([rotate_u128_right(self.0[0], 11)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right12(self) -> Self { Self([rotate_u128_right(self.0[0], 12)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right16(self) -> Self { Self([rotate_u128_right(self.0[0], 16)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right20(self) -> Self { Self([rotate_u128_right(self.0[0], 20)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right24(self) -> Self { Self([rotate_u128_right(self.0[0], 24)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right25(self) -> Self { Self([rotate_u128_right(self.0[0], 25)]) } } impl RotateEachWord64 for u128x1_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right32(self) -> Self { Self([rotate_u128_right(self.0[0], 32)]) } @@ -430,7 +446,7 @@ impl Machine for GenericMachine { type u32x4x4 = u32x4x4_generic; type u64x2x4 = u64x2x4_generic; type u128x4 = u128x4_generic; - #[inline] + #[inline(always)] unsafe fn instance() -> Self { Self } @@ -607,6 +623,22 @@ pub type u32x4x4_generic = x4<u32x4_generic>; pub type u64x2x4_generic = x4<u64x2_generic>; pub type u128x4_generic = x4<u128x1_generic>; +impl Vector<[u32; 16]> for u32x4x4_generic { + fn to_scalars(self) -> [u32; 16] { + let [a, b, c, d] = self.0; + let a = a.0; + let b = b.0; + let c = c.0; + let d = d.0; + [ + a[0], a[1], a[2], a[3], // + b[0], b[1], b[2], b[3], // + c[0], c[1], c[2], c[3], // + d[0], d[1], d[2], d[3], // + ] + } +} + impl MultiLane<[u32; 4]> for u32x4_generic { #[inline(always)] fn to_lanes(self) -> [u32; 4] { @@ -747,7 +779,7 @@ impl u128x4<GenericMachine> for u128x4_generic {} #[macro_export] macro_rules! dispatch { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline] + #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] @@ -764,7 +796,7 @@ macro_rules! dispatch { #[macro_export] macro_rules! dispatch_light128 { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline] + #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] @@ -781,7 +813,7 @@ macro_rules! dispatch_light128 { #[macro_export] macro_rules! dispatch_light256 { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline] + #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] @@ -798,7 +830,7 @@ macro_rules! dispatch_light256 { #[macro_export] macro_rules! dispatch_light512 { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline] + #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] diff --git a/src/soft.rs b/src/soft.rs index 8976c48..0ae390c 100644 --- a/src/soft.rs +++ b/src/soft.rs @@ -175,26 +175,50 @@ impl<W: BSwap + Copy, G> BSwap for x2<W, G> { impl<W: StoreBytes + BSwap + Copy, G> StoreBytes for x2<W, G> { #[inline(always)] unsafe fn unsafe_read_le(input: &[u8]) -> Self { - let input = input.split_at(16); + let input = input.split_at(input.len() / 2); x2::new([W::unsafe_read_le(input.0), W::unsafe_read_le(input.1)]) } #[inline(always)] unsafe fn unsafe_read_be(input: &[u8]) -> Self { - x2::unsafe_read_le(input).bswap() + let input = input.split_at(input.len() / 2); + x2::new([W::unsafe_read_be(input.0), W::unsafe_read_be(input.1)]) } #[inline(always)] fn write_le(self, out: &mut [u8]) { - let out = out.split_at_mut(16); + let out = out.split_at_mut(out.len() / 2); self.0[0].write_le(out.0); self.0[1].write_le(out.1); } #[inline(always)] fn write_be(self, out: &mut [u8]) { - let out = out.split_at_mut(16); + let out = out.split_at_mut(out.len() / 2); self.0[0].write_be(out.0); self.0[1].write_be(out.1); } } +impl<W: Copy + LaneWords4, G: Copy> LaneWords4 for x2<W, G> { + #[inline(always)] + fn shuffle_lane_words2301(self) -> Self { + Self::new([ + self.0[0].shuffle_lane_words2301(), + self.0[1].shuffle_lane_words2301(), + ]) + } + #[inline(always)] + fn shuffle_lane_words1230(self) -> Self { + Self::new([ + self.0[0].shuffle_lane_words1230(), + self.0[1].shuffle_lane_words1230(), + ]) + } + #[inline(always)] + fn shuffle_lane_words3012(self) -> Self { + Self::new([ + self.0[0].shuffle_lane_words3012(), + self.0[1].shuffle_lane_words3012(), + ]) + } +} #[derive(Copy, Clone, Default)] #[allow(non_camel_case_types)] @@ -310,6 +334,20 @@ impl<W: Copy> Vec4<W> for x4<W> { self } } +impl<W: Copy> Vec4Ext<W> for x4<W> { + #[inline(always)] + fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) + where + Self: Sized, + { + ( + x4([a.0[0], b.0[0], c.0[0], d.0[0]]), + x4([a.0[1], b.0[1], c.0[1], d.0[1]]), + x4([a.0[2], b.0[2], c.0[2], d.0[2]]), + x4([a.0[3], b.0[3], c.0[3], d.0[3]]), + ) + } +} impl<W: Copy + Store<vec128_storage>> Store<vec512_storage> for x4<W> { #[inline(always)] unsafe fn unpack(p: vec512_storage) -> Self { @@ -368,30 +406,39 @@ impl<W: BSwap + Copy> BSwap for x4<W> { impl<W: StoreBytes + BSwap + Copy> StoreBytes for x4<W> { #[inline(always)] unsafe fn unsafe_read_le(input: &[u8]) -> Self { + let n = input.len() / 4; x4([ - W::unsafe_read_le(&input[0..16]), - W::unsafe_read_le(&input[16..32]), - W::unsafe_read_le(&input[32..48]), - W::unsafe_read_le(&input[48..64]), + W::unsafe_read_le(&input[..n]), + W::unsafe_read_le(&input[n..n * 2]), + W::unsafe_read_le(&input[n * 2..n * 3]), + W::unsafe_read_le(&input[n * 3..]), ]) } #[inline(always)] unsafe fn unsafe_read_be(input: &[u8]) -> Self { - x4::unsafe_read_le(input).bswap() + let n = input.len() / 4; + x4([ + W::unsafe_read_be(&input[..n]), + W::unsafe_read_be(&input[n..n * 2]), + W::unsafe_read_be(&input[n * 2..n * 3]), + W::unsafe_read_be(&input[n * 3..]), + ]) } #[inline(always)] fn write_le(self, out: &mut [u8]) { - self.0[0].write_le(&mut out[0..16]); - self.0[1].write_le(&mut out[16..32]); - self.0[2].write_le(&mut out[32..48]); - self.0[3].write_le(&mut out[48..64]); + let n = out.len() / 4; + self.0[0].write_le(&mut out[..n]); + self.0[1].write_le(&mut out[n..n * 2]); + self.0[2].write_le(&mut out[n * 2..n * 3]); + self.0[3].write_le(&mut out[n * 3..]); } #[inline(always)] fn write_be(self, out: &mut [u8]) { - self.0[0].write_be(&mut out[0..16]); - self.0[1].write_be(&mut out[16..32]); - self.0[2].write_be(&mut out[32..48]); - self.0[3].write_be(&mut out[48..64]); + let n = out.len() / 4; + self.0[0].write_be(&mut out[..n]); + self.0[1].write_be(&mut out[n..n * 2]); + self.0[2].write_be(&mut out[n * 2..n * 3]); + self.0[3].write_be(&mut out[n * 3..]); } } impl<W: Copy + LaneWords4> LaneWords4 for x4<W> { diff --git a/src/types.rs b/src/types.rs index a282670..f9f3bf1 100644 --- a/src/types.rs +++ b/src/types.rs @@ -71,6 +71,17 @@ pub trait Vec4<W> { fn extract(self, i: u32) -> W; fn insert(self, w: W, i: u32) -> Self; } +/// Vec4 functions which may not be implemented yet for all Vec4 types. +/// NOTE: functions in this trait may be moved to Vec4 in any patch release. To avoid breakage, +/// import Vec4Ext only together with Vec4, and don't qualify its methods. +pub trait Vec4Ext<W> { + fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) + where + Self: Sized; +} +pub trait Vector<T> { + fn to_scalars(self) -> T; +} // TODO: multiples of 4 should inherit this /// A vector composed of four words; depending on their size, operations may cross lanes. @@ -112,12 +123,7 @@ pub trait u32x4<M: Machine>: { } pub trait u64x2<M: Machine>: - BitOps64 - + Store<vec128_storage> - + ArithOps - + Vec2<u64> - + MultiLane<[u64; 2]> - + Into<vec128_storage> + BitOps64 + Store<vec128_storage> + ArithOps + Vec2<u64> + MultiLane<[u64; 2]> + Into<vec128_storage> { } pub trait u128x1<M: Machine>: @@ -132,6 +138,7 @@ pub trait u32x4x2<M: Machine>: + MultiLane<[M::u32x4; 2]> + ArithOps + Into<vec256_storage> + + StoreBytes { } pub trait u64x2x2<M: Machine>: @@ -169,10 +176,13 @@ pub trait u32x4x4<M: Machine>: BitOps32 + Store<vec512_storage> + Vec4<M::u32x4> + + Vec4Ext<M::u32x4> + + Vector<[u32; 16]> + MultiLane<[M::u32x4; 4]> + ArithOps + LaneWords4 + Into<vec512_storage> + + StoreBytes { } pub trait u64x2x4<M: Machine>: diff --git a/src/x86_64/mod.rs b/src/x86_64/mod.rs index d7455d0..937732d 100644 --- a/src/x86_64/mod.rs +++ b/src/x86_64/mod.rs @@ -79,7 +79,7 @@ where type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>; type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>; - type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>; + type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>; type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>; type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>; type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>; @@ -119,16 +119,16 @@ impl Store<vec128_storage> for vec128_storage { p } } -impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage { +impl<'a> From<&'a vec128_storage> for &'a [u32; 4] { #[inline(always)] - fn into(self) -> &'a [u32; 4] { - unsafe { &self.u32x4 } + fn from(x: &'a vec128_storage) -> Self { + unsafe { &x.u32x4 } } } -impl Into<vec128_storage> for [u32; 4] { +impl From<[u32; 4]> for vec128_storage { #[inline(always)] - fn into(self) -> vec128_storage { - vec128_storage { u32x4: self } + fn from(u32x4: [u32; 4]) -> Self { + vec128_storage { u32x4 } } } impl Default for vec128_storage { @@ -154,10 +154,10 @@ pub union vec256_storage { sse2: [vec128_storage; 2], avx: __m256i, } -impl Into<vec256_storage> for [u64; 4] { +impl From<[u64; 4]> for vec256_storage { #[inline(always)] - fn into(self) -> vec256_storage { - vec256_storage { u64x4: self } + fn from(u64x4: [u64; 4]) -> Self { + vec256_storage { u64x4 } } } impl Default for vec256_storage { @@ -167,9 +167,11 @@ impl Default for vec256_storage { } } impl vec256_storage { + #[inline(always)] pub fn new128(xs: [vec128_storage; 2]) -> Self { Self { sse2: xs } } + #[inline(always)] pub fn split128(self) -> [vec128_storage; 2] { unsafe { self.sse2 } } @@ -200,9 +202,11 @@ impl Default for vec512_storage { } } impl vec512_storage { + #[inline(always)] pub fn new128(xs: [vec128_storage; 4]) -> Self { Self { sse2: xs } } + #[inline(always)] pub fn split128(self) -> [vec128_storage; 4] { unsafe { self.sse2 } } @@ -217,10 +221,10 @@ impl PartialEq for vec512_storage { macro_rules! impl_into { ($storage:ident, $array:ty, $name:ident) => { - impl Into<$array> for $storage { + impl From<$storage> for $array { #[inline(always)] - fn into(self) -> $array { - unsafe { self.$name } + fn from(vec: $storage) -> Self { + unsafe { vec.$name } } } }; diff --git a/src/x86_64/sse2.rs b/src/x86_64/sse2.rs index bf0063f..97197a4 100644 --- a/src/x86_64/sse2.rs +++ b/src/x86_64/sse2.rs @@ -189,21 +189,21 @@ impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> { rotr_32!(rotate_each_word_right7, 7); rotr_32_s3!( rotate_each_word_right8, - 0x0c0f0e0d_080b0a09, - 0x04070605_00030201 + 0x0c0f_0e0d_080b_0a09, + 0x0407_0605_0003_0201 ); rotr_32!(rotate_each_word_right11, 11); rotr_32!(rotate_each_word_right12, 12); rotr_32_s3!( rotate_each_word_right16, - 0x0d0c0f0e_09080b0a, - 0x05040706_01000302 + 0x0d0c_0f0e_0908_0b0a, + 0x0504_0706_0100_0302 ); rotr_32!(rotate_each_word_right20, 20); rotr_32_s3!( rotate_each_word_right24, - 0x0e0d0c0f_0a09080b, - 0x06050407_02010003 + 0x0e0d_0c0f_0a09_080b, + 0x0605_0407_0201_0003 ); rotr_32!(rotate_each_word_right25, 25); } @@ -880,6 +880,13 @@ pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>; #[allow(non_camel_case_types)] pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>; +impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> { + #[inline(always)] + fn to_scalars(self) -> [u32; 16] { + unsafe { core::mem::transmute(self) } + } +} + impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI> where u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap, @@ -983,6 +990,8 @@ where Machine86<S3, S4, NI>: Machine, u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>, u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>, + u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>, + u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>, { } impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI> @@ -1004,14 +1013,6 @@ where { } -impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI> -where - u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap, - Avx2Machine<NI>: Machine, - u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>, - u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>, -{ -} impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI> where u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, @@ -1374,65 +1375,78 @@ mod test { pub mod avx2 { #![allow(non_camel_case_types)] - use crate::soft::x4; + use crate::soft::{x2, x4}; use crate::types::*; - use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2}; + use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0}; use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4}; use core::arch::x86_64::*; use core::marker::PhantomData; use core::ops::*; #[derive(Copy, Clone)] - pub struct u32x4x4_avx2<NI> { - x: [__m256i; 2], + pub struct u32x4x2_avx2<NI> { + x: __m256i, ni: PhantomData<NI>, } - impl<NI> u32x4x4_avx2<NI> { + impl<NI> u32x4x2_avx2<NI> { #[inline(always)] - fn new(x: [__m256i; 2]) -> Self { + fn new(x: __m256i) -> Self { Self { x, ni: PhantomData } } } - impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {} - impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> { + impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {} + impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> { #[inline(always)] - unsafe fn unpack(p: vec512_storage) -> Self { - Self::new([p.avx[0].avx, p.avx[1].avx]) + unsafe fn unpack(p: vec256_storage) -> Self { + Self::new(p.avx) } } - impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> { + impl<NI> StoreBytes for u32x4x2_avx2<NI> { #[inline(always)] - fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] { + unsafe fn unsafe_read_le(input: &[u8]) -> Self { + assert_eq!(input.len(), 32); + Self::new(_mm256_loadu_si256(input.as_ptr() as *const _)) + } + #[inline(always)] + unsafe fn unsafe_read_be(input: &[u8]) -> Self { + Self::unsafe_read_le(input).bswap() + } + #[inline(always)] + fn write_le(self, out: &mut [u8]) { unsafe { - [ - u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)), - u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)), - u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)), - u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)), - ] + assert_eq!(out.len(), 32); + _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x) } } #[inline(always)] - fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self { - Self::new(unsafe { + fn write_be(self, out: &mut [u8]) { + self.bswap().write_le(out) + } + } + impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> { + #[inline(always)] + fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] { + unsafe { [ - _mm256_setr_m128i(x[0].x, x[1].x), - _mm256_setr_m128i(x[2].x, x[3].x), + u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), + u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), ] - }) + } + } + #[inline(always)] + fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self { + Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) }) } } - impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { + impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> { #[inline(always)] fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { unsafe { match i { - 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)), - 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)), - 2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)), - 3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)), + 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), + 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), _ => panic!(), } } @@ -1441,55 +1455,21 @@ pub mod avx2 { fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { Self::new(unsafe { match i { - 0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]], - 1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]], - 2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)], - 3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)], + 0 => _mm256_inserti128_si256(self.x, w.x, 0), + 1 => _mm256_inserti128_si256(self.x, w.x, 1), _ => panic!(), } }) } } - impl<NI> LaneWords4 for u32x4x4_avx2<NI> { - #[inline(always)] - fn shuffle_lane_words1230(self) -> Self { - Self::new(unsafe { - [ - _mm256_shuffle_epi32(self.x[0], 0b1001_0011), - _mm256_shuffle_epi32(self.x[1], 0b1001_0011), - ] - }) - } - #[inline(always)] - fn shuffle_lane_words2301(self) -> Self { - Self::new(unsafe { - [ - _mm256_shuffle_epi32(self.x[0], 0b0100_1110), - _mm256_shuffle_epi32(self.x[1], 0b0100_1110), - ] - }) - } - #[inline(always)] - fn shuffle_lane_words3012(self) -> Self { - Self::new(unsafe { - [ - _mm256_shuffle_epi32(self.x[0], 0b0011_1001), - _mm256_shuffle_epi32(self.x[1], 0b0011_1001), - ] - }) - } - } - impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {} - impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {} + impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {} + impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {} macro_rules! shuf_lane_bytes { ($name:ident, $k0:expr, $k1:expr) => { #[inline(always)] fn $name(self) -> Self { Self::new(unsafe { - [ - _mm256_shuffle_epi8(self.x[0], _mm256_set_epi64x($k0, $k1, $k0, $k1)), - _mm256_shuffle_epi8(self.x[1], _mm256_set_epi64x($k0, $k1, $k0, $k1)), - ] + _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1)) }) } }; @@ -1499,52 +1479,41 @@ pub mod avx2 { #[inline(always)] fn $name(self) -> Self { Self::new(unsafe { - [ - _mm256_or_si256( - _mm256_srli_epi32(self.x[0], $i as i32), - _mm256_slli_epi32(self.x[0], 32 - $i as i32), - ), - _mm256_or_si256( - _mm256_srli_epi32(self.x[1], $i as i32), - _mm256_slli_epi32(self.x[1], 32 - $i as i32), - ), - ] + _mm256_or_si256( + _mm256_srli_epi32(self.x, $i as i32), + _mm256_slli_epi32(self.x, 32 - $i as i32), + ) }) } }; } - impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> { + impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> { rotr_32!(rotate_each_word_right7, 7); shuf_lane_bytes!( rotate_each_word_right8, - 0x0c0f0e0d_080b0a09, - 0x04070605_00030201 + 0x0c0f_0e0d_080b_0a09, + 0x0407_0605_0003_0201 ); rotr_32!(rotate_each_word_right11, 11); rotr_32!(rotate_each_word_right12, 12); shuf_lane_bytes!( rotate_each_word_right16, - 0x0d0c0f0e_09080b0a, - 0x05040706_01000302 + 0x0d0c_0f0e_0908_0b0a, + 0x0504_0706_0100_0302 ); rotr_32!(rotate_each_word_right20, 20); shuf_lane_bytes!( rotate_each_word_right24, - 0x0e0d0c0f_0a09080b, - 0x06050407_02010003 + 0x0e0d_0c0f_0a09_080b, + 0x0605_0407_0201_0003 ); rotr_32!(rotate_each_word_right25, 25); } - impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {} - impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage { + impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {} + impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage { #[inline(always)] - fn from(x: u32x4x4_avx2<NI>) -> Self { - Self { - avx: [ - vec256_storage { avx: x.x[0] }, - vec256_storage { avx: x.x[1] }, - ], - } + fn from(x: u32x4x2_avx2<NI>) -> Self { + Self { avx: x.x } } } @@ -1561,55 +1530,172 @@ pub mod avx2 { } }; } - impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor); - impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor); - impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand); - impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add); + impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor); + impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor); + impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand); + impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add); - macro_rules! impl_bitop_x2 { + macro_rules! impl_bitop { ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => { impl<NI> $Op for $vec<NI> { type Output = Self; #[inline(always)] fn $op_fn(self, rhs: Self) -> Self::Output { - Self::new(unsafe { - [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])] - }) + Self::new(unsafe { $impl_fn(self.x, rhs.x) }) } } }; } - impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256); - impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256); - impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256); - impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256); - impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32); + impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256); + impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256); + impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256); + impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256); + impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32); - impl<NI> Not for u32x4x4_avx2<NI> { + impl<NI> Not for u32x4x2_avx2<NI> { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { unsafe { let f = _mm256_set1_epi8(-0x7f); - Self::new([f, f]) ^ self + Self::new(f) ^ self } } } - impl<NI> BSwap for u32x4x4_avx2<NI> { + impl<NI> BSwap for u32x4x2_avx2<NI> { shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203); } - impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> + impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI> where NI: Copy, { #[inline(always)] + fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self { + Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) }) + } + } + + impl<NI> LaneWords4 for u32x4x2_avx2<NI> { + #[inline(always)] + fn shuffle_lane_words1230(self) -> Self { + Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) }) + } + #[inline(always)] + fn shuffle_lane_words2301(self) -> Self { + Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) }) + } + #[inline(always)] + fn shuffle_lane_words3012(self) -> Self { + Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) }) + } + } + + /////////////////////////////////////////////////////////////////////////////////////////// + + pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>; + impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {} + + impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> { + #[inline(always)] + unsafe fn unpack(p: vec512_storage) -> Self { + Self::new([ + u32x4x2_avx2::unpack(p.avx[0]), + u32x4x2_avx2::unpack(p.avx[1]), + ]) + } + } + impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> { + #[inline(always)] + fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] { + let [a, b] = self.0[0].to_lanes(); + let [c, d] = self.0[1].to_lanes(); + [a, b, c, d] + } + #[inline(always)] + fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self { + let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]); + let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]); + Self::new([ab, cd]) + } + } + impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { + #[inline(always)] + fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { + match i { + 0 => self.0[0].extract(0), + 1 => self.0[0].extract(1), + 2 => self.0[1].extract(0), + 3 => self.0[1].extract(1), + _ => panic!(), + } + } + #[inline(always)] + fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { + Self::new(match i { + 0 | 1 => [self.0[0].insert(w, i), self.0[1]], + 2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)], + _ => panic!(), + }) + } + } + impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { + #[inline(always)] + fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { + /* + * a00:a01 a10:a11 + * b00:b01 b10:b11 + * c00:c01 c10:c11 + * d00:d01 d10:d11 + * => + * a00:b00 c00:d00 + * a01:b01 c01:d01 + * a10:b10 c10:d10 + * a11:b11 c11:d11 + */ + unsafe { + let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20)); + let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31)); + let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20)); + let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31)); + let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20)); + let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31)); + let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20)); + let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31)); + ( + Self::new([ab00, cd00]), + Self::new([ab01, cd01]), + Self::new([ab10, cd10]), + Self::new([ab11, cd11]), + ) + } + } + } + impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> { + #[inline(always)] + fn to_scalars(self) -> [u32; 16] { + unsafe { core::mem::transmute(self) } + } + } + impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage { + #[inline(always)] + fn from(x: u32x4x4_avx2<NI>) -> Self { + Self { + avx: [ + vec256_storage { avx: x.0[0].x }, + vec256_storage { avx: x.0[1].x }, + ], + } + } + } + impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> { + #[inline(always)] fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self { Self::new(unsafe { [ - _mm256_setr_m128i(x.0[0].x, x.0[1].x), - _mm256_setr_m128i(x.0[2].x, x.0[3].x), + u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)), + u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)), ] }) } |