// Copyright 2023 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

//! `pw_tokenizer` - Efficient string handling and printf-style encoding.
//!
//! Logging is critical, but developers are often forced to choose between
//! additional logging and saving crucial flash space. The `pw_tokenizer` crate
//! helps address this by replacing printf-style strings with binary tokens
//! during compilation. This enables extensive logging with substantially less
//! memory usage.
//!
//! For a more in-depth explanation of the system's design and motivation,
//! see [Pigweed's pw_tokenizer module documentation](https://pigweed.dev/pw_tokenizer/).
//!
//! # Example
//!
//! ```
//! use pw_tokenizer::tokenize_to_buffer;
//!
//! # fn doctest() -> pw_status::Result<()> {
//! let mut buffer = [0u8; 1024];
//! let len = tokenize_to_buffer!(&mut buffer, "The answer is %d", 42)?;
//!
//! // 4 bytes used to encode the token and one to encode the value 42.  This
//! // is a **3.5x** reduction in size compared to the raw string!
//! assert_eq!(len, 5);
//! # Ok(())
//! # }
//! # doctest().unwrap();
//! ```
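//!
//! Integer arguments are zigzag encoded as varints, which is why the value 42
//! above fits in a single byte. A minimal sketch of the zigzag mapping
//! (illustrative only; `zigzag` below is a hypothetical helper, not an API of
//! this crate):
//!
//! ```
//! // Zigzag interleaves signed values so small magnitudes encode small:
//! // 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ... and 42 -> 84 (one varint byte).
//! fn zigzag(n: i32) -> u32 {
//!     ((n << 1) ^ (n >> 31)) as u32
//! }
//! assert_eq!(zigzag(42), 84);
//! assert_eq!(zigzag(-1), 1);
//! ```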

#![no_std]
#![deny(missing_docs)]

#[doc(hidden)]
pub mod internal;

#[doc(hidden)]
// Creating a __private namespace gives us a way to get to the modules we
// need from macros by doing:
//     use $crate::__private as __pw_tokenizer_crate;
//
// This is how proc-macro-generated code can reliably reference back to
// `pw_tokenizer` while still allowing a user to import it under a different
// name.
pub mod __private {
    pub use crate::*;
    pub use pw_stream::{Cursor, Seek, WriteInteger, WriteVarint};
    pub use pw_tokenizer_macro::{_token, _tokenize_to_buffer};
}

/// Return the [`u32`] token for the specified string and add it to the token
/// database.
///
/// This is where the magic happens in `pw_tokenizer`! ...and by magic we mean
/// hiding information in a special linker section that ends up in the final
/// ELF binary but does not get flashed to the device.
///
/// Two things are accomplished here:
/// 1) The string is hashed into its stable `u32` token.  This is the value
///    that is returned from the macro.
/// 2) A [token database entry](https://pigweed.dev/pw_tokenizer/design.html#binary-database-format)
///    is generated, assigned to a unique static symbol, and placed in a linker
///    section named `pw_tokenizer.entries.<TOKEN_HASH>`.  A
///    [linker script](https://pigweed.googlesource.com/pigweed/pigweed/+/refs/heads/main/pw_tokenizer/pw_tokenizer_linker_sections.ld)
///    is responsible for picking these symbols up and aggregating them into a
///    single `.pw_tokenizer.entries` section in the final binary; a sketch of
///    such an entry appears below.
///
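/// Conceptually, the emitted entry resembles the hand-written sketch below.
/// This is purely illustrative: the real macro output differs in entry layout
/// and naming, and `TOKEN_ENTRY`, `ENTRY_LEN`, and `ENTRY_BYTES` are
/// hypothetical placeholders.
///
/// ```ignore
/// // The entry bytes (a header plus the format string) are placed in a
/// // token-specific input section; #[used] keeps the otherwise-unreferenced
/// // symbol from being stripped before the linker script collects it.
/// #[link_section = "pw_tokenizer.entries.3537412730"]
/// #[used]
/// static TOKEN_ENTRY: [u8; ENTRY_LEN] = ENTRY_BYTES;
/// ```
///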
/// # Example
/// ```
/// use pw_tokenizer::token;
///
/// let token = token!("hello, \"world\"");
/// assert_eq!(token, 3537412730);
/// ```
///
/// Currently there is no support for encoding tokens to specific domains
/// or with "fixed lengths" per [`pw_tokenizer_core::hash_bytes_fixed`].
#[macro_export]
macro_rules! token {
    ($string:literal) => {{
        $crate::__private::_token!($string)
    }};
}

/// Tokenize a format string and arguments to an [`AsMut<[u8]>`] buffer and add
/// the format string's token to the token database.
///
/// See [`token`] for an explanation of how strings are tokenized and entries
/// are added to the token database.
///
/// Returns a [`pw_status::Result<usize>`] with the number of bytes written to
/// the buffer.
///
/// # Errors
/// - [`pw_status::Error::OutOfRange`] - Buffer is not large enough to fit
///   tokenized data.
/// - [`pw_status::Error::InvalidArgument`] - Invalid buffer was provided.
///
/// # Example
///
/// ```
/// use pw_tokenizer::tokenize_to_buffer;
///
/// # fn doctest() -> pw_status::Result<()> {
/// let mut buffer = [0u8; 1024];
/// let len = tokenize_to_buffer!(&mut buffer, "The answer is %d", 42)?;
///
/// // 4 bytes used to encode the token and one to encode the value 42.
/// assert_eq!(len, 5);
/// # Ok(())
/// # }
/// # doctest().unwrap();
/// ```
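///
/// The first four bytes written are the little-endian token, which host-side
/// tooling uses to look up the original format string. A minimal sketch of
/// extracting the token from the buffer (assuming the documented wire format;
/// this is not a full detokenizer):
///
/// ```
/// use pw_tokenizer::{token, tokenize_to_buffer};
///
/// # fn doctest() -> pw_status::Result<()> {
/// let mut buffer = [0u8; 1024];
/// let len = tokenize_to_buffer!(&mut buffer, "The answer is %d", 42)?;
/// assert_eq!(len, 5);
///
/// // The encoded token matches the one `token!` computes for the same string.
/// let encoded_token = u32::from_le_bytes(buffer[..4].try_into().unwrap());
/// assert_eq!(encoded_token, token!("The answer is %d"));
/// # Ok(())
/// # }
/// # doctest().unwrap();
/// ```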
#[macro_export]
macro_rules! tokenize_to_buffer {
    ($buffer:expr, $format_string:literal) => {{
        use $crate::__private as __pw_tokenizer_crate;
        __pw_tokenizer_crate::_tokenize_to_buffer!($buffer, $format_string)
    }};

    ($buffer:expr, $format_string:literal, $($args:expr),*) => {{
        use $crate::__private as __pw_tokenizer_crate;
        __pw_tokenizer_crate::_tokenize_to_buffer!($buffer, $format_string, $($args),*)
    }};
}

#[cfg(test)]
mod tests {
    use super::*;
    extern crate self as pw_tokenizer;

    // This is not meant to be an exhaustive test of tokenization, which is
    // covered by `pw_tokenizer_core`'s unit tests.  Rather, this tests that
    // the `token!` macro connects to that hashing correctly.
    #[test]
    fn test_token() {
        // Matches the value documented in the `token!` macro example above.
        assert_eq!(token!("hello, \"world\""), 3537412730);
    }

    macro_rules! tokenize_to_buffer_test {
        ($expected_data:expr, $buffer_len:expr, $fmt:expr) => {{
            let mut buffer = [0u8; $buffer_len];
            let len = tokenize_to_buffer!(&mut buffer, $fmt).unwrap();
            assert_eq!(&buffer[..len], $expected_data);
        }};

        ($expected_data:expr, $buffer_len:expr, $fmt:expr, $($args:expr),*) => {{
            let mut buffer = [0u8; $buffer_len];
            let len = tokenize_to_buffer!(&mut buffer, $fmt, $($args),*).unwrap();
            assert_eq!(&buffer[..len], $expected_data);
        }};
    }

    #[test]
    fn test_decimal_format() {
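        // Note the last byte of each expected buffer: it is the zigzag-encoded
        // argument value (1 -> 2, -1 -> 1, 0 -> 0).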
        tokenize_to_buffer_test!(
            &[0x52, 0x1c, 0xb0, 0x4c, 0x2], // expected buffer
            64,                             // buffer size
            "The answer is %d!",
            1
        );

        tokenize_to_buffer_test!(
            &[0x36, 0xd0, 0xfb, 0x69, 0x1], // expected buffer
            64,                             // buffer size
            "No! The answer is %d!",
            -1
        );

        tokenize_to_buffer_test!(
            &[0xa4, 0xad, 0x50, 0x54, 0x0], // expected buffer
            64,                             // buffer size
            "I think you'll find that the answer is %d!",
            0
        );
    }

    #[test]
    fn test_misc_integer_format() {
        // %d, %i, %o, %u, %x, %X all encode integers the same.
        tokenize_to_buffer_test!(
            &[0x57, 0x88, 0xc5, 0xd8, 0x2], // expected buffer
            64,                             // buffer size
            "The answer is %i!",
            1
        );

        tokenize_to_buffer_test!(
            &[0x5d, 0x70, 0x12, 0xb4, 0x2], // expected buffer
            64,                             // buffer size
            "The answer is %o!",
            1
        );

        tokenize_to_buffer_test!(
            &[0x63, 0x58, 0x5f, 0x8f, 0x2], // expected buffer
            64,                             // buffer size
            "The answer is %u!",
            1
        );

        tokenize_to_buffer_test!(
            &[0x66, 0xcc, 0x05, 0x7d, 0x2], // expected buffer
            64,                             // buffer size
            "The answer is %x!",
            1
        );

        tokenize_to_buffer_test!(
            &[0x46, 0x4c, 0x16, 0x96, 0x2], // expected buffer
            64,                             // buffer size
            "The answer is %X!",
            1
        );
    }

    #[test]
    fn test_string_format() {
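        // String arguments are encoded as a one-byte length prefix followed by
        // the string's bytes: 0x07 is the length of "Pigweed".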
        tokenize_to_buffer_test!(
            b"\x25\xf6\x2e\x66\x07Pigweed", // expected buffer
            64,                             // buffer size
            "Hello: %s!",
            "Pigweed"
        );
    }

    #[test]
    fn test_string_format_overflow() {
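        // When the argument does not fit, the string is truncated and the high
        // bit of the length prefix is set: 0x83 == 0x80 | 3 for the 3 bytes of
        // "Pig" that fit after the 4-byte token and the length byte.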
        tokenize_to_buffer_test!(
            b"\x25\xf6\x2e\x66\x83Pig", // expected buffer
            8,                          // buffer size
            "Hello: %s!",
            "Pigweed"
        );
    }

    #[test]
    fn test_char_format() {
        tokenize_to_buffer_test!(
            &[0x2e, 0x52, 0xac, 0xe4, 0x50], // expected buffer
            64,                              // buffer size
            "Hello: %cigweed",
            "P".as_bytes()[0]
        );
    }
}