summaryrefslogtreecommitdiff
path: root/src/fastcpy.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/fastcpy.rs')
-rw-r--r--src/fastcpy.rs145
1 files changed, 145 insertions, 0 deletions
diff --git a/src/fastcpy.rs b/src/fastcpy.rs
new file mode 100644
index 0000000..8bbd480
--- /dev/null
+++ b/src/fastcpy.rs
@@ -0,0 +1,145 @@
//! # FastCpy
//!
//! The Rust compiler calls `memcpy` for slices of unknown length.
//! This crate provides a faster implementation of `memcpy` for slices of up to 32 bytes
//! (64 bytes with `avx`).
//! If you know that most of your copy operations are not too big, you can use `fastcpy` to speed
//! up your program.
//!
//! `fastcpy` is designed to contain not too much assembly, so the overhead is low.
//!
//! As a fallback, the standard `memcpy` is called.
//!
//! ## Double Copy Trick
//! `fastcpy` employs a double copy trick to copy slices of length 4-32 bytes (64 bytes with `avx`).
//! E.g. a slice of length 6 can be copied with two unconditional copy operations:
//!
//! ```text
//! [1, 2, 3, 4, 5, 6]
//! [1, 2, 3, 4]
//!       [3, 4, 5, 6]
//! ```
//!

+#[inline]
+pub fn slice_copy(src: &[u8], dst: &mut [u8]) {
+ #[inline(never)]
+ #[cold]
+ #[track_caller]
+ fn len_mismatch_fail(dst_len: usize, src_len: usize) -> ! {
+ panic!(
+ "source slice length ({}) does not match destination slice length ({})",
+ src_len, dst_len,
+ );
+ }
+
+ if src.len() != dst.len() {
+ len_mismatch_fail(src.len(), dst.len());
+ }
+ let len = src.len();
+
+ if src.is_empty() {
+ return;
+ }
+
+ if len < 4 {
+ short_copy(src, dst);
+ return;
+ }
+
+ if len < 8 {
+ double_copy_trick::<4>(src, dst);
+ return;
+ }
+
+ if len <= 16 {
+ double_copy_trick::<8>(src, dst);
+ return;
+ }
+
+ if len <= 32 {
+ double_copy_trick::<16>(src, dst);
+ return;
+ }
+
+ /// The code will use the vmovdqu instruction to copy 32 bytes at a time.
+ #[cfg(target_feature = "avx")]
+ {
+ if len <= 64 {
+ double_copy_trick::<32>(src, dst);
+ return;
+ }
+ }
+
+ // For larger sizes we use the default, which calls memcpy
+ // memcpy does some virtual memory tricks to copy large chunks of memory.
+ //
+ // The theory should be that the checks above don't cost much relative to the copy call for
+ // larger copies.
+ // The bounds checks in `copy_from_slice` are elided.
+ dst.copy_from_slice(src);
+}
+
+#[inline(always)]
+fn short_copy(src: &[u8], dst: &mut [u8]) {
+ let len = src.len();
+
+ // length 1-3
+ dst[0] = src[0];
+ if len >= 2 {
+ double_copy_trick::<2>(src, dst);
+ }
+}
+
/// Copies `src` into `dst` using two fixed-width copies of `SIZE` bytes each: one anchored at
/// the start and one anchored at the end of the slices. The two windows may overlap.
///
/// ```text
/// [1, 2, 3, 4, 5, 6]
/// [1, 2, 3, 4]
///       [3, 4, 5, 6]
/// ```
///
/// The whole slice is covered only when `SIZE <= src.len() <= 2 * SIZE`; the slice indexing
/// panics when the slices are shorter than `SIZE`.
#[inline(always)]
fn double_copy_trick<const SIZE: usize>(src: &[u8], dst: &mut [u8]) {
    // Head window: the first SIZE bytes.
    dst[..SIZE].copy_from_slice(&src[..SIZE]);

    // Tail window: the last SIZE bytes, positioned relative to the source length.
    let tail_start = src.len() - SIZE;
    dst[tail_start..].copy_from_slice(&src[tail_start..]);
}

#[cfg(test)]
mod tests {
    use super::slice_copy;
    use alloc::vec::Vec;
    use proptest::prelude::*;

    /// Copies `source` into a zero-filled buffer of the same length with `slice_copy` and
    /// asserts that the buffer then equals `source`.
    #[track_caller]
    fn assert_copy_round_trips(source: Vec<u8>) {
        let mut target = vec![0u8; source.len()];
        slice_copy(&source, &mut target);
        assert_eq!(source, target);
    }

    proptest! {
        // Property: for arbitrary byte vectors the destination equals the source after the copy.
        #[test]
        fn test_fast_short_slice_copy(input: Vec<u8>) {
            let mut output = vec![0u8; input.len()];
            slice_copy(&input, &mut output);
            prop_assert_eq!(&input, &output);
        }
    }

    // Exhaustively checks every length from 0 up to (but not including) 1024, which crosses all
    // of the size thresholds used by `slice_copy`.
    #[test]
    fn test_fast_short_slice_copy_edge_cases() {
        for len in 0..(512 * 2) {
            let pattern: Vec<u8> = (0..len).map(|idx| idx as u8).collect();
            assert_eq!(pattern.len(), len);
            assert_copy_round_trips(pattern);
        }
    }

    // 33 bytes holding the values 0..=32.
    #[test]
    fn test_fail2() {
        assert_copy_round_trips((0u8..=32).collect());
    }

    // A mostly-zero input with a single non-zero byte.
    #[test]
    fn test_fail() {
        assert_copy_round_trips(vec![
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ]);
    }
}