diff options
Diffstat (limited to 'ucs2.diff')
-rw-r--r-- | ucs2.diff | 567 |
1 files changed, 567 insertions, 0 deletions
diff --git a/ucs2.diff b/ucs2.diff new file mode 100644 index 0000000..57aec04 --- /dev/null +++ b/ucs2.diff @@ -0,0 +1,567 @@ +This is a dump from Google's source control system of the change +that removed UCS-2 support from RE2. As the explanation below +says, UCS-2 mode is fundamentally at odds with things like ^ and $, +so it never really worked very well. But if you are interested in using +it without those operators, it did work for that. It assumed that the +UCS-2 data was in the native host byte order. + +If you are interested in adding UCS-2 mode back, this patch might +be a good starting point. + + +Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15 + + Retire UCS-2 mode. + + I added it as an experiment for V8, but it + requires 2-byte lookahead to do completely, + and RE2 has 1-byte lookahead (enough for UTF-8) + as a fairly deep fundamental assumption, + so it did not support ^ or $. + +==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ==== +re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319 + cap_[0] = p; + if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. + return true; +- if (prog_->flags() & Regexp::UCS2) +- p++; + } + return false; + } +==== re2/compile.cc#17 - re2/compile.cc#18 ==== +re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100 + // Input encodings. + enum Encoding { + kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) +- kEncodingUCS2, // UCS-2 (0-FFFF), native byte order + kEncodingLatin1, // Latin1 (0-FF) + }; + +re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172 + void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); + void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); + void Add_80_10ffff(); +- void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase); +- void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1, +- uint8 lo2, uint8 hi2, bool fold2); + + // New suffix that matches the byte range lo-hi, then goes to next. + Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next); +re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477 + + // Converts rune range lo-hi into a fragment that recognizes + // the bytes that would make up those runes in the current +- // encoding (Latin 1, UTF-8, or UCS-2). ++ // encoding (Latin 1 or UTF-8). + // This lets the machine work byte-by-byte even when + // using multibyte encodings. + +re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489 + case kEncodingLatin1: + AddRuneRangeLatin1(lo, hi, foldcase); + break; +- case kEncodingUCS2: +- AddRuneRangeUCS2(lo, hi, foldcase); +- break; + } + } + +re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501 + AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL)); + } + +- // Test whether 16-bit values are big or little endian. +- static bool BigEndian() { +- union { +- char byte[2]; +- int16 endian; +- } u; +- +- u.byte[0] = 1; +- u.byte[1] = 2; +- return u.endian == 0x0102; +- } +- +- void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1, +- uint8 lo2, uint8 hi2, bool fold2) { +- Inst* ip; +- if (reversed_) { +- ip = RuneByteSuffix(lo1, hi1, fold1, NULL); +- ip = RuneByteSuffix(lo2, hi2, fold2, ip); +- } else { +- ip = RuneByteSuffix(lo2, hi2, fold2, NULL); +- ip = RuneByteSuffix(lo1, hi1, fold1, ip); +- } +- AddSuffix(ip); +- } +- +- void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) { +- if (lo > hi || lo > 0xFFFF) +- return; +- if (hi > 0xFFFF) +- hi = 0xFFFF; +- +- // We'll assemble a pattern assuming big endian. +- // If the machine isn't, tell Cat to reverse its arguments. +- bool oldreversed = reversed_; +- if (!BigEndian()) { +- reversed_ = !oldreversed; +- } +- +- // Split into bytes. +- int lo1 = lo >> 8; +- int lo2 = lo & 0xFF; +- int hi1 = hi >> 8; +- int hi2 = hi & 0xFF; +- +- if (lo1 == hi1) { +- // Easy case: high bits are same in both. +- // Only do ASCII case folding on the second byte if the top byte is 00. +- AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase); +- } else { +- // Harder case: different second byte ranges depending on first byte. +- +- // Initial fragment. +- if (lo2 > 0) { +- AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase); +- lo1++; +- } +- +- // Trailing fragment. +- if (hi2 < 0xFF) { +- AddUCS2Pair(hi1, hi1, false, 0, hi2, false); +- hi1--; +- } +- +- // Inner ranges. +- if (lo1 <= hi1) { +- AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false); +- } +- } +- +- // Restore reverse setting. +- reversed_ = oldreversed; +- } +- + // Table describing how to make a UTF-8 matching machine + // for the rune range 80-10FFFF (Runeself-Runemax). + // This range happens frequently enough (for example /./ and /[^a-z]/) +re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634 + + Frag Compiler::Literal(Rune r, bool foldcase) { + switch (encoding_) { +- default: // UCS-2 or something new +- BeginRange(); +- AddRuneRange(r, r, foldcase); +- return EndRange(); ++ default: ++ return kNullFrag; + + case kEncodingLatin1: + return ByteRange(r, r, foldcase); +re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850 + + if (re->parse_flags() & Regexp::Latin1) + c.encoding_ = kEncodingLatin1; +- else if (re->parse_flags() & Regexp::UCS2) +- c.encoding_ = kEncodingUCS2; + c.reversed_ = reversed; + if (max_mem <= 0) { + c.max_inst_ = 100000; // more than enough +re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905 + c.prog_->set_start_unanchored(c.prog_->start()); + } else { + Frag dot; +- if (c.encoding_ == kEncodingUCS2) { +- dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false)); +- } else { +- dot = c.ByteRange(0x00, 0xFF, false); +- } ++ dot = c.ByteRange(0x00, 0xFF, false); + Frag dotloop = c.Star(dot, true); + Frag unanchored = c.Cat(dotloop, all); + c.prog_->set_start_unanchored(unanchored.begin); +==== re2/nfa.cc#8 - re2/nfa.cc#9 ==== +re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431 + const char* bp = context.begin(); + int c = -1; + int wasword = 0; +- bool ucs2 = prog_->flags() & Regexp::UCS2; + + if (text.begin() > context.begin()) { + c = text.begin()[-1] & 0xFF; +re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497 + // If there's a required first byte for an unanchored search + // and we're not in the middle of any possible matches, + // use memchr to search for the byte quickly. +- if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 && ++ if (!anchored && first_byte_ >= 0 && runq->size() == 0 && + p < text.end() && (p[0] & 0xFF) != first_byte_) { + p = reinterpret_cast<const char*>(memchr(p, first_byte_, + text.end() - p)); +re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514 + flag = Prog::EmptyFlags(context, p); + } + +- // In UCS-2 mode, if we need to start a new thread, +- // make sure to do it on an even boundary. +- if(ucs2 && runq->size() == 0 && +- (p - context.begin()) % 2 && p < text.end()) { +- p++; +- flag = Prog::EmptyFlags(context, p); +- } +- + // Steal match storage (cleared but unused as of yet) + // temporarily to hold match boundaries for new thread. +- // In UCS-2 mode, only start the thread on a 2-byte boundary. +- if(!ucs2 || (p - context.begin()) % 2 == 0) { +- match_[0] = p; +- AddToThreadq(runq, start_, flag, p, match_); +- match_[0] = NULL; +- } ++ match_[0] = p; ++ AddToThreadq(runq, start_, flag, p, match_); ++ match_[0] = NULL; + } + + // If all the threads have died, stop early. +==== re2/parse.cc#22 - re2/parse.cc#23 ==== +re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165 + status_(status), stacktop_(NULL), ncap_(0) { + if (flags_ & Latin1) + rune_max_ = 0xFF; +- else if (flags & UCS2) +- rune_max_ = 0xFFFF; + else + rune_max_ = Runemax; + } +re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374 + bool Regexp::ParseState::PushCarat() { + if (flags_ & OneLine) { + return PushSimpleOp(kRegexpBeginText); +- } else { +- if (flags_ & UCS2) { +- status_->set_code(kRegexpUnsupported); +- status_->set_error_arg("multiline ^ in UCS-2 mode"); +- return false; +- } +- return PushSimpleOp(kRegexpBeginLine); + } ++ return PushSimpleOp(kRegexpBeginLine); + } + + // Pushes a \b or \B onto the stack. + bool Regexp::ParseState::PushWordBoundary(bool word) { +- if (flags_ & UCS2) { +- status_->set_code(kRegexpUnsupported); +- status_->set_error_arg("\\b or \\B in UCS-2 mode"); +- return false; +- } + if (word) + return PushSimpleOp(kRegexpWordBoundary); + return PushSimpleOp(kRegexpNoWordBoundary); +re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389 + bool ret = PushSimpleOp(kRegexpEndText); + flags_ = oflags; + return ret; +- } +- if (flags_ & UCS2) { +- status_->set_code(kRegexpUnsupported); +- status_->set_error_arg("multiline $ in UCS-2 mode"); +- return false; + } + return PushSimpleOp(kRegexpEndLine); + } +==== re2/re2.cc#34 - re2/re2.cc#35 ==== +re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84 + return RE2::ErrorBadUTF8; + case re2::kRegexpBadNamedCapture: + return RE2::ErrorBadNamedCapture; +- case re2::kRegexpUnsupported: +- return RE2::ErrorUnsupported; + } + return RE2::ErrorInternal; + } +re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125 + break; + case RE2::Options::EncodingLatin1: + flags |= Regexp::Latin1; +- break; +- case RE2::Options::EncodingUCS2: +- flags |= Regexp::UCS2; + break; + } + +==== re2/re2.h#36 - re2/re2.h#37 ==== +re2/re2.h#36:246,252 - re2/re2.h#37:246,251 + ErrorBadUTF8, // invalid UTF-8 in regexp + ErrorBadNamedCapture, // bad named capture group + ErrorPatternTooLarge, // pattern too large (compile failed) +- ErrorUnsupported, // unsupported feature (in UCS-2 mode) + }; + + // Predefined common options. +re2/re2.h#36:570,576 - re2/re2.h#37:569,574 + + enum Encoding { + EncodingUTF8 = 1, +- EncodingUCS2, // 16-bit Unicode 0-FFFF only + EncodingLatin1 + }; + +==== re2/regexp.cc#15 - re2/regexp.cc#16 ==== +re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329 + // the regexp that remains after the prefix. The prefix might + // be ASCII case-insensitive. + bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { +- // Don't even bother for UCS-2; it's time to throw that code away. +- if (parse_flags_ & UCS2) +- return false; +- + // No need for a walker: the regexp must be of the form + // 1. some number of ^ anchors + // 2. a literal char or string +==== re2/regexp.h#20 - re2/regexp.h#21 ==== +re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192 + kRegexpBadPerlOp, // bad perl operator + kRegexpBadUTF8, // invalid UTF-8 in regexp + kRegexpBadNamedCapture, // bad named capture +- kRegexpUnsupported, // unsupported operator + }; + + // Error status for certain operations. +re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314 + // \Q and \E to disable/enable metacharacters + // (?P<name>expr) for named captures + // \C to match any single byte +- UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8. +- UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group ++ UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group + // and \P{Han} for its negation. +- NeverNL = 1<<12, // Never match NL, even if the regexp mentions ++ NeverNL = 1<<11, // Never match NL, even if the regexp mentions + // it explicitly. + + // As close to Perl as we can get. +==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ==== +re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139 + cap_[0] = p; + if (Visit(prog_->start(), p)) // Match must be leftmost; done. + return true; +- if (prog_->flags() & Regexp::UCS2) +- p++; + } + return false; + } +==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ==== +re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152 + static ParseMode parse_modes[] = { + { single_line, "single-line" }, + { single_line|Regexp::Latin1, "single-line, latin1" }, +- { single_line|Regexp::UCS2, "single-line, ucs2" }, + { multi_line, "multiline" }, + { multi_line|Regexp::NonGreedy, "multiline, nongreedy" }, + { multi_line|Regexp::Latin1, "multiline, latin1" }, +- { multi_line|Regexp::UCS2, "multiline, ucs2" }, + }; + + static string FormatMode(Regexp::ParseFlags flags) { +re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185 + RegexpStatus status; + regexp_ = Regexp::Parse(regexp_str, flags, &status); + if (regexp_ == NULL) { +- if (status.code() != kRegexpUnsupported) { +- LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) +- << " mode: " << FormatMode(flags); +- error_ = true; +- } ++ LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) ++ << " mode: " << FormatMode(flags); ++ error_ = true; + return; + } + prog_ = regexp_->CompileToProg(0); +re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231 + RE2::Options options; + if (flags & Regexp::Latin1) + options.set_encoding(RE2::Options::EncodingLatin1); +- else if (flags & Regexp::UCS2) +- options.set_encoding(RE2::Options::EncodingUCS2); + if (kind_ == Prog::kLongestMatch) + options.set_longest_match(true); + re2_ = new RE2(re, options); +re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280 + delete re2_; + } + +- // Converts UTF-8 string in text into UCS-2 string in new_text. +- static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) { +- const char* p = text.begin(); +- const char* ep = text.end(); +- uint16* q = new uint16[ep - p]; +- uint16* q0 = q; +- +- int n; +- Rune r; +- for (; p < ep; p += n) { +- if (!fullrune(p, ep - p)) { +- delete[] q0; +- return false; +- } +- n = chartorune(&r, p); +- if (r > 0xFFFF) { +- delete[] q0; +- return false; +- } +- *q++ = r; +- } +- *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0)); +- return true; +- } +- +- // Rewrites *sp from being a pointer into text8 (UTF-8) +- // to being a pointer into text16 (equivalent text but in UCS-2). +- static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16, +- StringPiece *sp) { +- if (sp->begin() == NULL && text8.begin() != NULL) +- return; +- +- int nrune = 0; +- int n; +- Rune r; +- const char* p = text8.begin(); +- const char* ep = text8.end(); +- const char* spbegin = NULL; +- const char* spend = NULL; +- for (;;) { +- if (p == sp->begin()) +- spbegin = text16.begin() + sizeof(uint16)*nrune; +- if (p == sp->end()) +- spend = text16.begin() + sizeof(uint16)*nrune; +- if (p >= ep) +- break; +- n = chartorune(&r, p); +- p += n; +- nrune++; +- } +- if (spbegin == NULL || spend == NULL) { +- LOG(FATAL) << "Error in AdjustUTF8ToUCS2 " +- << CEscape(text8) << " " +- << (int)(sp->begin() - text8.begin()) << " " +- << (int)(sp->end() - text8.begin()); +- } +- *sp = StringPiece(spbegin, spend - spbegin); +- } +- +- // Rewrites *sp from begin a pointer into text16 (UCS-2) +- // to being a pointer into text8 (equivalent text but in UTF-8). +- static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8, +- StringPiece* sp) { +- if (sp->begin() == NULL) +- return; +- +- int nrune = 0; +- int n; +- Rune r; +- const char* p = text8.begin(); +- const char* ep = text8.end(); +- const char* spbegin = NULL; +- const char* spend = NULL; +- for (;;) { +- if (nrune == (sp->begin() - text16.begin())/2) +- spbegin = p; +- if (nrune == (sp->end() - text16.begin())/2) +- spend = p; +- if (p >= ep) +- break; +- n = chartorune(&r, p); +- p += n; +- nrune++; +- } +- if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) { +- LOG(FATAL) << "Error in AdjustUCS2ToUTF8 " +- << CEscape(text16) << " " +- << (int)(sp->begin() - text16.begin()) << " " +- << (int)(sp->end() - text16.begin()); +- } +- *sp = StringPiece(spbegin, spend - spbegin); +- } +- + // Runs a single search using the named engine type. + // This interface hides all the irregularities of the various + // engine interfaces from the rest of this file. +re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300 + + StringPiece text = orig_text; + StringPiece context = orig_context; +- bool ucs2 = false; + +- if ((flags() & Regexp::UCS2) && type != kEnginePCRE) { +- if (!ConvertUTF8ToUCS2(orig_context, &context)) { +- result->skipped = true; +- return; +- } +- +- // Rewrite context to refer to new text. +- AdjustUTF8ToUCS2(orig_context, context, &text); +- ucs2 = true; +- } +- + switch (type) { + default: + LOG(FATAL) << "Bad RunSearch type: " << (int)type; +re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451 + } + } + +- // If we did UCS-2 matching, rewrite the matches to refer +- // to the original UTF-8 text. +- if (ucs2) { +- if (result->matched) { +- if (result->have_submatch0) { +- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]); +- } else if (result->have_submatch) { +- for (int i = 0; i < nsubmatch; i++) { +- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]); +- } +- } +- } +- delete[] context.begin(); +- } +- + if (!result->matched) + memset(result->submatch, 0, sizeof result->submatch); + } +re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475 + return true; + } + +- // Check whether text uses only Unicode points <= 0xFFFF +- // (in the BMP). +- static bool IsBMP(const StringPiece& text) { +- const char* p = text.begin(); +- const char* ep = text.end(); +- while (p < ep) { +- if (!fullrune(p, ep - p)) +- return false; +- Rune r; +- p += chartorune(&r, p); +- if (r > 0xFFFF) +- return false; +- } +- return true; +- } +- + // Runs a single test. + bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor) { +re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483 + Result correct; + RunSearch(kEngineBacktrack, text, context, anchor, &correct); + if (correct.skipped) { +- if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode ++ if (regexp_ == NULL) + return true; + LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_) + << " " << FormatMode(flags_); |