// Copyright 2023 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

//! # pw_tokenizer_core
//!
//! This crate provides the core functionality of calculating a token hash
//! for a string or byte sequence. It is intended to be a minimal core for
//! both the main `pw_tokenizer` and `pw_tokenizer_macro` crates. Users
//! should prefer depending on `pw_tokenizer` instead of this crate.
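//!
//! A minimal usage sketch (the format string below is only illustrative; any
//! string or byte sequence can be hashed):
//!
//! ```
//! use pw_tokenizer_core::hash_string;
//!
//! // Hashing is deterministic: equal inputs always produce equal tokens.
//! let token = hash_string("The answer is: %d");
//! assert_eq!(token, hash_string("The answer is: %d"));
//! ```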
#![cfg_attr(not(feature = "std"), no_std)]

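/// Coefficient of the hash. This is the same 65599 constant used by the
/// other `pw_tokenizer` language implementations, which keeps tokens
/// consistent across languages.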
pub const HASH_CONSTANT: u32 = 65599u32;

/// Calculate the hash for a sequence of bytes.
///
/// ```
/// use pw_tokenizer_core::hash_bytes;
///
/// let hash = hash_bytes(&[0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07]);
/// assert_eq!(hash, 0x9e624642);
/// ```
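///
/// This is equivalent to [`hash_bytes_fixed`] with `len` set to the full
/// slice length:
///
/// ```
/// use pw_tokenizer_core::{hash_bytes, hash_bytes_fixed};
///
/// let data = [0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07];
/// assert_eq!(hash_bytes(&data), hash_bytes_fixed(&data, data.len()));
/// ```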
pub const fn hash_bytes(bytes: &[u8]) -> u32 {
    hash_bytes_fixed(bytes, bytes.len())
}

/// Calculate the hash for a sequence of bytes, examining at most `len` bytes.
///
/// ```
/// use pw_tokenizer_core::hash_bytes_fixed;
///
/// let hash = hash_bytes_fixed(&[0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07], 4);
/// assert_eq!(hash, 0x92c5d2ac);
/// ```
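///
/// Note that the hash is seeded with the length of the *full* slice, so
/// hashing a prefix in place differs from hashing that prefix as its own
/// slice:
///
/// ```
/// use pw_tokenizer_core::{hash_bytes, hash_bytes_fixed};
///
/// let data = [0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07];
/// // The seeds differ (7 vs. 4), so the hashes differ.
/// assert_ne!(hash_bytes_fixed(&data, 4), hash_bytes(&data[..4]));
/// ```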
pub const fn hash_bytes_fixed(bytes: &[u8], len: usize) -> u32 {
    // The length is hashed as if it were the first byte.
    let mut hash = bytes.len() as u32;
    let mut coefficient = HASH_CONSTANT;

    // Fixed-size hashes are seeded with the total length of the slice but
    // hash over at most `len` bytes.
    let bytes = if bytes.len() > len {
        bytes.split_at(len).0
    } else {
        bytes
    };

    // `for` loops are not allowed in const functions, as they rely on the
    // `Iterator` trait.
    let mut i = 0;
    while i < bytes.len() {
        // We call `u32::wrapping_*` instead of using `core::num::Wrapping` to
        // avoid using traits in a const function.
        hash = hash.wrapping_add(coefficient.wrapping_mul(bytes[i] as u32));
        coefficient = coefficient.wrapping_mul(HASH_CONSTANT);
        i += 1;
    }

    hash
}

/// Calculate the hash for a string.
///
/// ```
/// use pw_tokenizer_core::hash_string;
///
/// let hash = hash_string("I 💖 Pigweed");
/// assert_eq!(hash, 0xe318d1b3);
/// ```
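///
/// The string's UTF-8 bytes are hashed, so this matches [`hash_bytes`] on
/// `s.as_bytes()`:
///
/// ```
/// use pw_tokenizer_core::{hash_bytes, hash_string};
///
/// assert_eq!(hash_string("I 💖 Pigweed"), hash_bytes("I 💖 Pigweed".as_bytes()));
/// ```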
pub const fn hash_string(s: &str) -> u32 {
    hash_bytes(s.as_bytes())
}

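/// Magic value that marks tokenizer entries in a token database.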
pub const TOKENIZER_ENTRY_MAGIC: u32 = 0xBAA98DEE;

#[cfg(test)]
mod tests {
    use super::hash_bytes_fixed;

    struct TestCase {
        string: &'static [u8],
        hash_length: usize,
        hash: u32,
    }

    // Generated file defines `test_cases()`.
    include!("pw_tokenizer_core_test_cases.rs");

    #[test]
    fn hash() {
        for test in test_cases() {
            let hash = hash_bytes_fixed(test.string, test.hash_length);
            assert_eq!(
                hash, test.hash,
                "{:08x} != {:08x} string: {:x?} hash_size: {}",
                hash, test.hash, test.string, test.hash_length
            );
        }
    }
}