diff options
Diffstat (limited to 'src/fastcpy.rs')
-rw-r--r-- | src/fastcpy.rs | 145 |
1 files changed, 145 insertions, 0 deletions
//! # FastCpy
//!
//! The Rust compiler calls `memcpy` for slices of unknown length.
//! This crate provides a faster implementation of `memcpy` for slices up to 32 bytes (64 bytes
//! with `avx`).
//! If you know most of your copy operations are not too big, you can use `fastcpy` to speed up
//! your program.
//!
//! `fastcpy` is designed to contain not too much assembly, so the overhead is low.
//!
//! As a fallback, the standard `memcpy` is called.
//!
//! ## Double Copy Trick
//! `fastcpy` employs a double copy trick to copy slices of length 4-32 bytes (64 bytes with
//! `avx`).
//! E.g. a slice of length 6 can be copied with two unconditional copy operations:
//!
//! ```text
//! [1, 2, 3, 4, 5, 6]
//! [1, 2, 3, 4]
//! [3, 4, 5, 6]
//! ```

/// Copies every byte of `src` into `dst`, using fixed-size copies for short slices.
///
/// Slices of up to 32 bytes (64 bytes when compiled with the `avx` target feature) are copied
/// with fixed-size, possibly overlapping copies whose sizes are known at compile time. Longer
/// slices are handed to `copy_from_slice`.
///
/// # Panics
///
/// Panics if `src` and `dst` have different lengths.
#[inline]
// Propagate the caller location so a length-mismatch panic points at the call to `slice_copy`
// instead of a line inside this function.
#[track_caller]
pub fn slice_copy(src: &[u8], dst: &mut [u8]) {
    // Kept out of line and cold so the panic machinery does not bloat the inlined fast path.
    #[inline(never)]
    #[cold]
    #[track_caller]
    fn len_mismatch_fail(dst_len: usize, src_len: usize) -> ! {
        panic!(
            "source slice length ({}) does not match destination slice length ({})",
            src_len, dst_len,
        );
    }

    if src.len() != dst.len() {
        // Argument order must match the helper's `(dst_len, src_len)` signature, otherwise the
        // message reports the two lengths swapped.
        len_mismatch_fail(dst.len(), src.len());
    }
    let len = src.len();

    if src.is_empty() {
        return;
    }

    // 1..=3 bytes
    if len < 4 {
        short_copy(src, dst);
        return;
    }

    // 4..=7 bytes
    if len < 8 {
        double_copy_trick::<4>(src, dst);
        return;
    }

    // 8..=16 bytes
    if len <= 16 {
        double_copy_trick::<8>(src, dst);
        return;
    }

    // 17..=32 bytes
    if len <= 32 {
        double_copy_trick::<16>(src, dst);
        return;
    }

    // The code will use the vmovdqu instruction to copy 32 bytes at a time.
    // (Plain comment on purpose: a doc comment on a block statement is unused.)
    #[cfg(target_feature = "avx")]
    {
        // 33..=64 bytes
        if len <= 64 {
            double_copy_trick::<32>(src, dst);
            return;
        }
    }

    // For larger sizes we use the default, which calls memcpy
    // memcpy does some virtual memory tricks to copy large chunks of memory.
    //
    // The theory should be that the checks above don't cost much relative to the copy call for
    // larger copies.
    // The bounds checks in `copy_from_slice` are elided.
    dst.copy_from_slice(src);
}

/// Copies slices of length 1 to 3.
///
/// The caller (`slice_copy`) only invokes this with non-empty slices of equal length; indexing
/// `[0]` panics on an empty slice.
#[inline(always)]
fn short_copy(src: &[u8], dst: &mut [u8]) {
    let len = src.len();

    // length 1-3
    dst[0] = src[0];
    if len >= 2 {
        // Lengths 2 and 3 are covered by two (possibly overlapping) 2-byte copies.
        double_copy_trick::<2>(src, dst);
    }
}

/// Copies `src` into `dst` with two `SIZE`-byte copies: one anchored at the start and one
/// anchored at the end. The two windows overlap whenever `src.len() < 2 * SIZE`.
///
/// ```text
/// [1, 2, 3, 4, 5, 6]
/// [1, 2, 3, 4]
/// [3, 4, 5, 6]
/// ```
///
/// Covers the whole slice only when `SIZE <= src.len() <= 2 * SIZE`. Panics if either slice is
/// shorter than `SIZE`, or if `dst` is not the same length as `src`.
#[inline(always)]
fn double_copy_trick<const SIZE: usize>(src: &[u8], dst: &mut [u8]) {
    // Leading window.
    dst[0..SIZE].copy_from_slice(&src[0..SIZE]);
    // Trailing window; both ranges are derived from `src.len()`.
    dst[src.len() - SIZE..].copy_from_slice(&src[src.len() - SIZE..]);
}

#[cfg(test)]
mod tests {
    use super::slice_copy;
    use alloc::vec::Vec;
    use proptest::prelude::*;
    proptest! {
        #[test]
        fn test_fast_short_slice_copy(left: Vec<u8>) {
            let mut right = vec![0u8; left.len()];
            slice_copy(&left, &mut right);
            prop_assert_eq!(&left, &right);
        }
    }

    #[test]
    fn test_fast_short_slice_copy_edge_cases() {
        // Sweeps every length across all size-class boundaries and well into the fallback path.
        for len in 0..(512 * 2) {
            let left = (0..len).map(|i| i as u8).collect::<Vec<_>>();
            let mut right = vec![0u8; len];
            slice_copy(&left, &mut right);
            assert_eq!(left, right);
        }
    }

    #[test]
    fn test_fail2() {
        let left = vec![
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31, 32,
        ];
        let mut right = vec![0u8; left.len()];
        slice_copy(&left, &mut right);
        assert_eq!(left, right);
    }

    #[test]
    fn test_fail() {
        let left = vec![
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ];
        let mut right = vec![0u8; left.len()];
        slice_copy(&left, &mut right);
        assert_eq!(left, right);
    }

    // Regression test: the panic message must attribute each length to the right slice.
    #[test]
    #[should_panic(
        expected = "source slice length (3) does not match destination slice length (5)"
    )]
    fn test_len_mismatch_message() {
        let src = [1u8, 2, 3];
        let mut dst = [0u8; 5];
        slice_copy(&src, &mut dst);
    }
}