diff options
Diffstat (limited to 'src/block/hashtable.rs')
-rw-r--r-- | src/block/hashtable.rs | 161 |
1 files changed, 161 insertions, 0 deletions
diff --git a/src/block/hashtable.rs b/src/block/hashtable.rs new file mode 100644 index 0000000..7c29db7 --- /dev/null +++ b/src/block/hashtable.rs @@ -0,0 +1,161 @@ +use alloc::boxed::Box; +use core::convert::TryInto; + +/// The Hashtable trait used by the compression to store hashed bytes to their position. +/// `val` can be maximum the size of the input in bytes. +/// +/// `pos` can have a maximum value of u16::MAX or 65535 +/// If the hashtable is smaller it needs to reduce the pos to its space, e.g. by right +/// shifting. +/// +/// Duplication dictionary size. +/// +/// Every four bytes is assigned an entry. When this number is lower, fewer entries exists, and +/// thus collisions are more likely, hurting the compression ratio. + +/// hashes and right shifts to a maximum value of 16bit, 65535 +/// The right shift is done in order to not exceed, the hashtables capacity +#[inline] +fn hash(sequence: u32) -> u32 { + (sequence.wrapping_mul(2654435761_u32)) >> 16 +} + +/// hashes and right shifts to a maximum value of 16bit, 65535 +/// The right shift is done in order to not exceed, the hashtables capacity +#[cfg(target_pointer_width = "64")] +#[inline] +fn hash5(sequence: usize) -> u32 { + let primebytes = if cfg!(target_endian = "little") { + 889523592379_usize + } else { + 11400714785074694791_usize + }; + (((sequence << 24).wrapping_mul(primebytes)) >> 48) as u32 +} + +pub trait HashTable { + fn get_at(&self, pos: usize) -> usize; + fn put_at(&mut self, pos: usize, val: usize); + fn clear(&mut self); + #[inline] + #[cfg(target_pointer_width = "64")] + fn get_hash_at(input: &[u8], pos: usize) -> usize { + hash5(super::compress::get_batch_arch(input, pos)) as usize + } + #[inline] + #[cfg(target_pointer_width = "32")] + fn get_hash_at(input: &[u8], pos: usize) -> usize { + hash(super::compress::get_batch(input, pos)) as usize + } +} + +const HASHTABLE_SIZE_4K: usize = 4 * 1024; +const HASHTABLE_BIT_SHIFT_4K: usize = 4; + +#[derive(Debug)] +#[repr(align(64))] +pub struct HashTable4KU16 { + dict: Box<[u16; HASHTABLE_SIZE_4K]>, +} +impl HashTable4KU16 { + #[inline] + pub fn new() -> Self { + // This generates more efficient assembly in contrast to Box::new(slice), because of an + // optmized call alloc_zeroed, vs. alloc + memset + // try_into is optimized away + let dict = alloc::vec![0; HASHTABLE_SIZE_4K] + .into_boxed_slice() + .try_into() + .unwrap(); + Self { dict } + } +} +impl HashTable for HashTable4KU16 { + #[inline] + fn get_at(&self, hash: usize) -> usize { + self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] as usize + } + #[inline] + fn put_at(&mut self, hash: usize, val: usize) { + self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] = val as u16; + } + #[inline] + fn clear(&mut self) { + self.dict.fill(0); + } + #[inline] + fn get_hash_at(input: &[u8], pos: usize) -> usize { + hash(super::get_batch(input, pos)) as usize + } +} + +#[derive(Debug)] +pub struct HashTable4K { + dict: Box<[u32; HASHTABLE_SIZE_4K]>, +} +impl HashTable4K { + #[inline] + pub fn new() -> Self { + let dict = alloc::vec![0; HASHTABLE_SIZE_4K] + .into_boxed_slice() + .try_into() + .unwrap(); + Self { dict } + } + + #[cold] + #[allow(dead_code)] + pub fn reposition(&mut self, offset: u32) { + for i in self.dict.iter_mut() { + *i = i.saturating_sub(offset); + } + } +} +impl HashTable for HashTable4K { + #[inline] + fn get_at(&self, hash: usize) -> usize { + self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] as usize + } + #[inline] + fn put_at(&mut self, hash: usize, val: usize) { + self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] = val as u32; + } + #[inline] + fn clear(&mut self) { + self.dict.fill(0); + } +} + +const HASHTABLE_SIZE_8K: usize = 8 * 1024; +const HASH_TABLE_BIT_SHIFT_8K: usize = 3; + +#[derive(Debug)] +pub struct HashTable8K { + dict: Box<[u32; HASHTABLE_SIZE_8K]>, +} +#[allow(dead_code)] +impl HashTable8K { + #[inline] + pub fn new() -> Self { + let dict = alloc::vec![0; HASHTABLE_SIZE_8K] + .into_boxed_slice() + .try_into() + .unwrap(); + + Self { dict } + } +} +impl HashTable for HashTable8K { + #[inline] + fn get_at(&self, hash: usize) -> usize { + self.dict[hash >> HASH_TABLE_BIT_SHIFT_8K] as usize + } + #[inline] + fn put_at(&mut self, hash: usize, val: usize) { + self.dict[hash >> HASH_TABLE_BIT_SHIFT_8K] = val as u32; + } + #[inline] + fn clear(&mut self) { + self.dict.fill(0); + } +} |