aboutsummaryrefslogtreecommitdiff
path: root/src/compile.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/compile.rs')
-rw-r--r--src/compile.rs42
1 files changed, 30 insertions, 12 deletions
diff --git a/src/compile.rs b/src/compile.rs
index 9ffd347..9a2ed5e 100644
--- a/src/compile.rs
+++ b/src/compile.rs
@@ -4,16 +4,16 @@ use std::iter;
use std::result;
use std::sync::Arc;
-use syntax::hir::{self, Hir};
-use syntax::is_word_byte;
-use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
+use regex_syntax::hir::{self, Hir};
+use regex_syntax::is_word_byte;
+use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
-use prog::{
+use crate::prog::{
EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
InstSave, InstSplit, Program,
};
-use Error;
+use crate::Error;
type Result = result::Result<Patch, Error>;
type ResultOrEmpty = result::Result<Option<Patch>, Error>;
@@ -38,6 +38,7 @@ pub struct Compiler {
suffix_cache: SuffixCache,
utf8_seqs: Option<Utf8Sequences>,
byte_classes: ByteClassSet,
+ extra_inst_bytes: usize,
}
impl Compiler {
@@ -54,6 +55,7 @@ impl Compiler {
suffix_cache: SuffixCache::new(1000),
utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
byte_classes: ByteClassSet::new(),
+ extra_inst_bytes: 0,
}
}
@@ -253,8 +255,8 @@ impl Compiler {
/// Ok(None) is returned when an expression is compiled to no
/// instruction, and so no patch.entry value makes sense.
fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
- use prog;
- use syntax::hir::HirKind::*;
+ use crate::prog;
+ use regex_syntax::hir::HirKind::*;
self.check_size()?;
match *expr.kind() {
@@ -316,6 +318,13 @@ impl Compiler {
}
self.compiled.has_unicode_word_boundary = true;
self.byte_classes.set_word_boundary();
+ // We also make sure that all ASCII bytes are in a different
+ // class from non-ASCII bytes. Otherwise, it's possible for
+ // ASCII bytes to get lumped into the same class as non-ASCII
+ // bytes. This in turn may cause the lazy DFA to falsely start
+ // when it sees an ASCII byte that maps to a byte class with
+ // non-ASCII bytes. This ensures that never happens.
+ self.byte_classes.set_range(0, 0x7F);
self.c_empty_look(prog::EmptyLook::WordBoundary)
}
WordBoundary(hir::WordBoundary::UnicodeNegate) => {
@@ -328,6 +337,8 @@ impl Compiler {
}
self.compiled.has_unicode_word_boundary = true;
self.byte_classes.set_word_boundary();
+ // See comments above for why we set the ASCII range here.
+ self.byte_classes.set_range(0, 0x7F);
self.c_empty_look(prog::EmptyLook::NotWordBoundary)
}
WordBoundary(hir::WordBoundary::Ascii) => {
@@ -420,6 +431,8 @@ impl Compiler {
}
fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
+ use std::mem::size_of;
+
assert!(!ranges.is_empty());
if self.compiled.uses_bytes() {
Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
@@ -429,6 +442,8 @@ impl Compiler {
let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
self.push_hole(InstHole::Char { c: ranges[0].0 })
} else {
+ self.extra_inst_bytes +=
+ ranges.len() * (size_of::<char>() * 2);
self.push_hole(InstHole::Ranges { ranges: ranges })
};
Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
@@ -548,7 +563,7 @@ impl Compiler {
}
fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
- use syntax::hir::RepetitionKind::*;
+ use regex_syntax::hir::RepetitionKind::*;
match rep.kind {
ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
@@ -795,7 +810,9 @@ impl Compiler {
fn check_size(&self) -> result::Result<(), Error> {
use std::mem::size_of;
- if self.insts.len() * size_of::<Inst>() > self.size_limit {
+ let size =
+ self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
+ if size > self.size_limit {
Err(Error::CompiledTooBig(self.size_limit))
} else {
Ok(())
@@ -927,9 +944,10 @@ impl InstHole {
Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
}
InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
- InstHole::Ranges { ref ranges } => {
- Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() })
- }
+ InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
+ goto: goto,
+ ranges: ranges.clone().into_boxed_slice(),
+ }),
InstHole::Bytes { start, end } => {
Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
}