aboutsummaryrefslogtreecommitdiff
path: root/src/buffer.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/buffer.rs')
-rw-r--r--src/buffer.rs130
1 files changed, 130 insertions, 0 deletions
diff --git a/src/buffer.rs b/src/buffer.rs
new file mode 100644
index 0000000..1008196
--- /dev/null
+++ b/src/buffer.rs
@@ -0,0 +1,130 @@
+use std::cmp;
+use std::io;
+use std::ptr;
+
+/// The default buffer capacity that we use for the stream buffer.
+const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1 << 10); // 8 KB
+
+/// A fairly simple roll buffer for supporting stream searches.
+///
+/// This buffer acts as a temporary place to store a fixed amount of data when
+/// reading from a stream. Its central purpose is to allow "rolling" some
+/// suffix of the data to the beginning of the buffer before refilling it with
+/// more data from the stream. For example, let's say we are trying to match
+/// "foobar" on a stream. When we report the match, we'd like to not only
+/// report the correct offsets at which the match occurs, but also the matching
+/// bytes themselves. So let's say our stream is a file with the following
+/// contents: `test test foobar test test`. Now assume that we happen to read
+/// the aforementioned file in two chunks: `test test foo` and `bar test test`.
+/// Naively, it would not be possible to report a single contiguous `foobar`
+/// match, but this roll buffer allows us to do that. Namely, after the second
+/// read, the contents of the buffer should be `st foobar test test`, where the
+/// search should ultimately resume immediately after `foo`. (The prefix `st `
+/// is included because the roll buffer saves N bytes at the end of the buffer,
+/// where N is the maximum possible length of a match.)
+///
+/// A lot of the logic for dealing with this is unfortunately split out between
+/// this roll buffer and the `StreamChunkIter`.
+#[derive(Debug)]
+pub struct Buffer {
+ /// The raw buffer contents. This has a fixed size and never increases.
+ buf: Vec<u8>,
+ /// The minimum size of the buffer, which is equivalent to the maximum
+ /// possible length of a match. This corresponds to the amount that we
+ /// roll
+ min: usize,
+ /// The end of the contents of this buffer.
+ end: usize,
+}
+
+impl Buffer {
+ /// Create a new buffer for stream searching. The minimum buffer length
+ /// given should be the size of the maximum possible match length.
+ pub fn new(min_buffer_len: usize) -> Buffer {
+ let min = cmp::max(1, min_buffer_len);
+ // The minimum buffer amount is also the amount that we roll our
+ // buffer in order to support incremental searching. To this end,
+ // our actual capacity needs to be at least 1 byte bigger than our
+ // minimum amount, otherwise we won't have any overlap. In actuality,
+ // we want our buffer to be a bit bigger than that for performance
+ // reasons, so we set a lower bound of `8 * min`.
+ //
+ // TODO: It would be good to find a way to test the streaming
+ // implementation with the minimal buffer size.
+ let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
+ Buffer { buf: vec![0; capacity], min, end: 0 }
+ }
+
+ /// Return the contents of this buffer.
+ #[inline]
+ pub fn buffer(&self) -> &[u8] {
+ &self.buf[..self.end]
+ }
+
+ /// Return the minimum size of the buffer. The only way a buffer may be
+ /// smaller than this is if the stream itself contains less than the
+ /// minimum buffer amount.
+ #[inline]
+ pub fn min_buffer_len(&self) -> usize {
+ self.min
+ }
+
+ /// Return the total length of the contents in the buffer.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.end
+ }
+
+ /// Return all free capacity in this buffer.
+ fn free_buffer(&mut self) -> &mut [u8] {
+ &mut self.buf[self.end..]
+ }
+
+ /// Refill the contents of this buffer by reading as much as possible into
+ /// this buffer's free capacity. If no more bytes could be read, then this
+ /// returns false. Otherwise, this reads until it has filled the buffer
+ /// past the minimum amount.
+ pub fn fill<R: io::Read>(&mut self, mut rdr: R) -> io::Result<bool> {
+ let mut readany = false;
+ loop {
+ let readlen = rdr.read(self.free_buffer())?;
+ if readlen == 0 {
+ return Ok(readany);
+ }
+ readany = true;
+ self.end += readlen;
+ if self.len() >= self.min {
+ return Ok(true);
+ }
+ }
+ }
+
+ /// Roll the contents of the buffer so that the suffix of this buffer is
+ /// moved to the front and all other contents are dropped. The size of the
+ /// suffix corresponds precisely to the minimum buffer length.
+ ///
+ /// This should only be called when the entire contents of this buffer have
+ /// been searched.
+ pub fn roll(&mut self) {
+ let roll_start = self
+ .end
+ .checked_sub(self.min)
+ .expect("buffer capacity should be bigger than minimum amount");
+ let roll_len = self.min;
+
+ assert!(roll_start + roll_len <= self.end);
+ unsafe {
+ // SAFETY: A buffer contains Copy data, so there's no problem
+ // moving it around. Safety also depends on our indices being in
+ // bounds, which they always should be, given the assert above.
+ //
+ // TODO: Switch to [T]::copy_within once our MSRV is high enough.
+ ptr::copy(
+ self.buf[roll_start..].as_ptr(),
+ self.buf.as_mut_ptr(),
+ roll_len,
+ );
+ }
+ self.end = roll_len;
+ }
+}