From 1ad5e5bc944bfb46689d87ace2773109cb54f5e1 Mon Sep 17 00:00:00 2001 From: "philip.liard@gmail.com" Date: Fri, 1 Jul 2011 08:22:06 +0000 Subject: CPP: Move non-base source code to src/phonenumbers/ git-svn-id: http://libphonenumber.googlecode.com/svn/trunk/cpp/src/phonenumbers@282 ee073f10-1060-11df-b6a4-87a95322a99c --- regexp_adapter_icu.cc | 209 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 regexp_adapter_icu.cc (limited to 'regexp_adapter_icu.cc') diff --git a/regexp_adapter_icu.cc b/regexp_adapter_icu.cc new file mode 100644 index 0000000..bada8e3 --- /dev/null +++ b/regexp_adapter_icu.cc @@ -0,0 +1,209 @@ +// Copyright (C) 2011 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: George Yakovlev +// Philippe Liard + +#include "phonenumbers/regexp_adapter.h" + +#include + +#include +#include + +#include "base/basictypes.h" +#include "base/logging.h" +#include "base/scoped_ptr.h" +#include "phonenumbers/default_logger.h" + +namespace i18n { +namespace phonenumbers { + +using icu::RegexMatcher; +using icu::RegexPattern; +using icu::UnicodeString; + +namespace { + +// Converts UnicodeString 'source' to a UTF8-formatted std::string. +string UnicodeStringToUtf8String(const UnicodeString& source) { + string data; + source.toUTF8String(data); + return data; +} + +} // namespace + +// Implementation of the abstract classes RegExpInput and RegExp using ICU +// regular expression capabilities. + +// ICU implementation of the RegExpInput abstract class. +class IcuRegExpInput : public RegExpInput { + public: + explicit IcuRegExpInput(const string& utf8_input) + : utf8_input_(UnicodeString::fromUTF8(utf8_input)), + position_(0) {} + + virtual ~IcuRegExpInput() {} + + virtual string ToString() const { + return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_)); + } + + UnicodeString* Data() { + return &utf8_input_; + } + + // The current start position. For a newly created input, position is 0. Each + // call to ConsumeRegExp() or RegExp::Consume() advances the position in the + // case of the successful match to be after the match. + int position() const { + return position_; + } + + void set_position(int position) { + DCHECK(position >= 0 && position <= utf8_input_.length()); + position_ = position; + } + + private: + UnicodeString utf8_input_; + int position_; + + DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput); +}; + +// ICU implementation of the RegExp abstract class. +class IcuRegExp : public RegExp { + public: + explicit IcuRegExp(const string& utf8_regexp) { + UParseError parse_error; + UErrorCode status = U_ZERO_ERROR; + utf8_regexp_.reset(RegexPattern::compile( + UnicodeString::fromUTF8(utf8_regexp), 0, parse_error, status)); + if (U_FAILURE(status)) { + // The provided regular expressions should compile correctly. + LOG(ERROR) << "Error compiling regular expression: " << utf8_regexp; + utf8_regexp_.reset(NULL); + } + } + + virtual ~IcuRegExp() {} + + virtual bool Consume(RegExpInput* input_string, + bool anchor_at_start, + string* matched_string1, + string* matched_string2, + string* matched_string3) const { + DCHECK(input_string); + if (!utf8_regexp_.get()) { + return false; + } + IcuRegExpInput* const input = static_cast(input_string); + UErrorCode status = U_ZERO_ERROR; + const scoped_ptr matcher( + utf8_regexp_->matcher(*input->Data(), status)); + bool match_succeeded = anchor_at_start + ? matcher->lookingAt(input->position(), status) + : matcher->find(input->position(), status); + if (!match_succeeded || U_FAILURE(status)) { + return false; + } + string* const matched_strings[] = { + matched_string1, matched_string2, matched_string3 + }; + // If less matches than expected - fail. + for (size_t i = 0; i < arraysize(matched_strings); ++i) { + if (matched_strings[i]) { + // Groups are counted from 1 rather than 0. + const int group_index = i + 1; + if (group_index > matcher->groupCount()) { + return false; + } + *matched_strings[i] = + UnicodeStringToUtf8String(matcher->group(group_index, status)); + } + } + input->set_position(matcher->end(status)); + return !U_FAILURE(status); + } + + bool Match(const string& input_string, + bool full_match, + string* matched_string) const { + if (!utf8_regexp_.get()) { + return false; + } + IcuRegExpInput input(input_string); + UErrorCode status = U_ZERO_ERROR; + const scoped_ptr matcher( + utf8_regexp_->matcher(*input.Data(), status)); + bool match_succeeded = full_match + ? matcher->matches(input.position(), status) + : matcher->find(input.position(), status); + if (!match_succeeded || U_FAILURE(status)) { + return false; + } + if (matcher->groupCount() > 0 && matched_string) { + *matched_string = UnicodeStringToUtf8String(matcher->group(1, status)); + } + return !U_FAILURE(status); + } + + bool Replace(string* string_to_process, + bool global, + const string& replacement_string) const { + DCHECK(string_to_process); + if (!utf8_regexp_.get()) { + return false; + } + IcuRegExpInput input(*string_to_process); + UErrorCode status = U_ZERO_ERROR; + const scoped_ptr matcher( + utf8_regexp_->matcher(*input.Data(), status)); + if (U_FAILURE(status)) { + return false; + } + UnicodeString result = global + ? matcher->replaceAll( + UnicodeString::fromUTF8(replacement_string), status) + : matcher->replaceFirst( + UnicodeString::fromUTF8(replacement_string), status); + if (U_FAILURE(status)) { + return false; + } + const string replaced_string = UnicodeStringToUtf8String(result); + if (replaced_string == *string_to_process) { + return false; + } + *string_to_process = replaced_string; + return true; + } + + private: + scoped_ptr utf8_regexp_; + + DISALLOW_COPY_AND_ASSIGN(IcuRegExp); +}; + +RegExpInput* RegExpInput::Create(const string& utf8_input) { + return new IcuRegExpInput(utf8_input); +} + +RegExp* RegExp::Create(const string& utf8_regexp) { + return new IcuRegExp(utf8_regexp); +} + +} // namespace phonenumbers +} // namespace i18n -- cgit v1.2.3