aboutsummaryrefslogtreecommitdiff
path: root/src/state_id.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/state_id.rs')
-rw-r--r--src/state_id.rs291
1 files changed, 291 insertions, 0 deletions
diff --git a/src/state_id.rs b/src/state_id.rs
new file mode 100644
index 0000000..c9bac19
--- /dev/null
+++ b/src/state_id.rs
@@ -0,0 +1,291 @@
+use core::fmt::Debug;
+use core::hash::Hash;
+use core::mem::size_of;
+
+use byteorder::{ByteOrder, NativeEndian};
+
+#[cfg(feature = "std")]
+pub use self::std::*;
+
+#[cfg(feature = "std")]
+mod std {
+ use byteorder::ByteOrder;
+ use core::mem::size_of;
+ use error::{Error, Result};
+
+ use super::StateID;
+
+ /// Check that the premultiplication of the given state identifier can
+ /// fit into the representation indicated by `S`. If it cannot, or if it
+ /// overflows `usize` itself, then an error is returned.
+ pub fn premultiply_overflow_error<S: StateID>(
+ last_state: S,
+ alphabet_len: usize,
+ ) -> Result<()> {
+ let requested = match last_state.to_usize().checked_mul(alphabet_len) {
+ Some(requested) => requested,
+ None => return Err(Error::premultiply_overflow(0, 0)),
+ };
+ if requested > S::max_id() {
+ return Err(Error::premultiply_overflow(S::max_id(), requested));
+ }
+ Ok(())
+ }
+
+ /// Allocate the next sequential identifier for a fresh state given
+ /// the previously constructed state identified by `current`. If the
+ /// next sequential identifier would overflow `usize` or the chosen
+ /// representation indicated by `S`, then an error is returned.
+ pub fn next_state_id<S: StateID>(current: S) -> Result<S> {
+ let next = match current.to_usize().checked_add(1) {
+ Some(next) => next,
+ None => return Err(Error::state_id_overflow(::std::usize::MAX)),
+ };
+ if next > S::max_id() {
+ return Err(Error::state_id_overflow(S::max_id()));
+ }
+ Ok(S::from_usize(next))
+ }
+
+ /// Convert the given `usize` to the chosen state identifier
+ /// representation. If the given value cannot fit in the chosen
+ /// representation, then an error is returned.
+ pub fn usize_to_state_id<S: StateID>(value: usize) -> Result<S> {
+ if value > S::max_id() {
+ Err(Error::state_id_overflow(S::max_id()))
+ } else {
+ Ok(S::from_usize(value))
+ }
+ }
+
+ /// Write the given identifier to the given slice of bytes using the
+ /// specified endianness. The given slice must have length at least
+ /// `size_of::<S>()`.
+ ///
+ /// The given state identifier representation must have size 1, 2, 4 or 8.
+ pub fn write_state_id_bytes<E: ByteOrder, S: StateID>(
+ slice: &mut [u8],
+ id: S,
+ ) {
+ assert!(
+ 1 == size_of::<S>()
+ || 2 == size_of::<S>()
+ || 4 == size_of::<S>()
+ || 8 == size_of::<S>()
+ );
+
+ match size_of::<S>() {
+ 1 => slice[0] = id.to_usize() as u8,
+ 2 => E::write_u16(slice, id.to_usize() as u16),
+ 4 => E::write_u32(slice, id.to_usize() as u32),
+ 8 => E::write_u64(slice, id.to_usize() as u64),
+ _ => unreachable!(),
+ }
+ }
+}
+
+/// Return the unique identifier for a DFA's dead state in the chosen
+/// representation indicated by `S`.
+pub fn dead_id<S: StateID>() -> S {
+ S::from_usize(0)
+}
+
+/// A trait describing the representation of a DFA's state identifier.
+///
+/// The purpose of this trait is to safely express both the possible state
+/// identifier representations that can be used in a DFA and to convert between
+/// state identifier representations and types that can be used to efficiently
+/// index memory (such as `usize`).
+///
+/// In general, one should not need to implement this trait explicitly. In
+/// particular, this crate provides implementations for `u8`, `u16`, `u32`,
+/// `u64` and `usize`. (`u32` and `u64` are only provided for targets that can
+/// represent all corresponding values in a `usize`.)
+///
+/// # Safety
+///
+/// This trait is unsafe because the correctness of its implementations may be
+/// relied upon by other unsafe code. For example, one possible way to
+/// implement this trait incorrectly would be to return a maximum identifier
+/// in `max_id` that is greater than the real maximum identifier. This will
+/// likely result in wrap-on-overflow semantics in release mode, which can in
+/// turn produce incorrect state identifiers. Those state identifiers may then
+/// in turn access out-of-bounds memory in a DFA's search routine, where bounds
+/// checks are explicitly elided for performance reasons.
+pub unsafe trait StateID:
+ Clone + Copy + Debug + Eq + Hash + PartialEq + PartialOrd + Ord
+{
+ /// Convert from a `usize` to this implementation's representation.
+ ///
+ /// Implementors may assume that `n <= Self::max_id`. That is, implementors
+ /// do not need to check whether `n` can fit inside this implementation's
+ /// representation.
+ fn from_usize(n: usize) -> Self;
+
+ /// Convert this implementation's representation to a `usize`.
+ ///
+ /// Implementors must not return a `usize` value greater than
+ /// `Self::max_id` and must not permit overflow when converting between the
+ /// implementor's representation and `usize`. In general, the preferred
+ /// way for implementors to achieve this is to simply not provide
+ /// implementations of `StateID` that cannot fit into the target platform's
+ /// `usize`.
+ fn to_usize(self) -> usize;
+
+ /// Return the maximum state identifier supported by this representation.
+ ///
+ /// Implementors must return a correct bound. Doing otherwise may result
+ /// in memory unsafety.
+ fn max_id() -> usize;
+
+ /// Read a single state identifier from the given slice of bytes in native
+ /// endian format.
+ ///
+ /// Implementors may assume that the given slice has length at least
+ /// `size_of::<Self>()`.
+ fn read_bytes(slice: &[u8]) -> Self;
+
+ /// Write this state identifier to the given slice of bytes in native
+ /// endian format.
+ ///
+ /// Implementors may assume that the given slice has length at least
+ /// `size_of::<Self>()`.
+ fn write_bytes(self, slice: &mut [u8]);
+}
+
+unsafe impl StateID for usize {
+ #[inline]
+ fn from_usize(n: usize) -> usize {
+ n
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::core::usize::MAX
+ }
+
+ #[inline]
+ fn read_bytes(slice: &[u8]) -> Self {
+ NativeEndian::read_uint(slice, size_of::<usize>()) as usize
+ }
+
+ #[inline]
+ fn write_bytes(self, slice: &mut [u8]) {
+ NativeEndian::write_uint(slice, self as u64, size_of::<usize>())
+ }
+}
+
+unsafe impl StateID for u8 {
+ #[inline]
+ fn from_usize(n: usize) -> u8 {
+ n as u8
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::core::u8::MAX as usize
+ }
+
+ #[inline]
+ fn read_bytes(slice: &[u8]) -> Self {
+ slice[0]
+ }
+
+ #[inline]
+ fn write_bytes(self, slice: &mut [u8]) {
+ slice[0] = self;
+ }
+}
+
+unsafe impl StateID for u16 {
+ #[inline]
+ fn from_usize(n: usize) -> u16 {
+ n as u16
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::core::u16::MAX as usize
+ }
+
+ #[inline]
+ fn read_bytes(slice: &[u8]) -> Self {
+ NativeEndian::read_u16(slice)
+ }
+
+ #[inline]
+ fn write_bytes(self, slice: &mut [u8]) {
+ NativeEndian::write_u16(slice, self)
+ }
+}
+
+#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+unsafe impl StateID for u32 {
+ #[inline]
+ fn from_usize(n: usize) -> u32 {
+ n as u32
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::core::u32::MAX as usize
+ }
+
+ #[inline]
+ fn read_bytes(slice: &[u8]) -> Self {
+ NativeEndian::read_u32(slice)
+ }
+
+ #[inline]
+ fn write_bytes(self, slice: &mut [u8]) {
+ NativeEndian::write_u32(slice, self)
+ }
+}
+
+#[cfg(target_pointer_width = "64")]
+unsafe impl StateID for u64 {
+ #[inline]
+ fn from_usize(n: usize) -> u64 {
+ n as u64
+ }
+
+ #[inline]
+ fn to_usize(self) -> usize {
+ self as usize
+ }
+
+ #[inline]
+ fn max_id() -> usize {
+ ::core::u64::MAX as usize
+ }
+
+ #[inline]
+ fn read_bytes(slice: &[u8]) -> Self {
+ NativeEndian::read_u64(slice)
+ }
+
+ #[inline]
+ fn write_bytes(self, slice: &mut [u8]) {
+ NativeEndian::write_u64(slice, self)
+ }
+}